## Importing the essential libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

## Importing the dataset

In [2]:
# Read the raw sales records from the CSV file into a DataFrame.
csv_path="colorado_motor_vehicle_sales.csv"
data=pd.read_csv(csv_path)

In [3]:
# Preview the first five rows to see the columns and typical values.
data.head(n=5)

Unnamed: 0,year,quarter,county,sales
0,2008,1,Adams,231609000
1,2008,1,Arapahoe,550378000
2,2008,1,Boulder/Broomfield,176771000
3,2008,1,Denver,200103000
4,2008,1,Douglas,93259000


## Checking for duplicate observations

In [4]:
# Count the rows that exactly repeat an earlier row (the first occurrence is not counted).
data.duplicated(keep="first").sum()

0

## Checking for missing values

In [5]:
# Count the missing entries in each column.
data.isna().sum()

year       0
quarter    0
county     0
sales      0
dtype: int64

## Identifying the numerical features

In [6]:
# Collect every column that does not hold text.
# Comparing the dtype against "O" alone only recognises object-dtype columns;
# a text column stored with pandas' dedicated string dtype would not compare
# equal to "O" and would wrongly end up in this list, so both cases are
# checked explicitly.
numerical_features=[
    feature for feature in data.columns
    if not (pd.api.types.is_object_dtype(data[feature])
            or pd.api.types.is_string_dtype(data[feature]))
]
# Print one column name per line.
for feature in numerical_features:
  print(feature)

year
quarter
sales


In [7]:
# Show only the columns listed in numerical_features.
data.loc[:,numerical_features]

Unnamed: 0,year,quarter,sales
0,2008,1,231609000
1,2008,1,550378000
2,2008,1,176771000
3,2008,1,200103000
4,2008,1,93259000
...,...,...,...
496,2015,4,244327000
497,2015,4,88202000
498,2015,4,94606000
499,2015,4,157059000


## Identifying the categorical features

In [8]:
# Collect every column that holds text.
# Comparing the dtype against "O" alone only recognises object-dtype columns;
# a text column stored with pandas' dedicated string dtype would not compare
# equal to "O" and would be skipped (and therefore never encoded), so both
# cases are checked explicitly.
cat_features=[
    feature for feature in data.columns
    if (pd.api.types.is_object_dtype(data[feature])
        or pd.api.types.is_string_dtype(data[feature]))
]
# Print one column name per line.
for feature in cat_features:
  print(feature)

county


In [9]:
# Show only the columns listed in cat_features.
data.loc[:,cat_features]

Unnamed: 0,county
0,Adams
1,Arapahoe
2,Boulder/Broomfield
3,Denver
4,Douglas
...,...
496,Larimer
497,Mesa
498,Pueblo
499,Rest of State


## Encoding the categorical features as integer codes

In [10]:
# Replace each category with an integer code assigned in order of first
# appearance (first distinct value -> 0, next -> 1, ...).
# The column in `data` is overwritten with the codes.
for feature in cat_features:
  categories=data[feature].unique()
  feature_mapping=dict(zip(categories,range(len(categories))))
  data[feature]=data[feature].map(feature_mapping)

## Inspecting the number of observations per year

In [11]:
# Count how many rows fall in each year, most frequent first.
data.loc[:,"year"].value_counts()

year
2010    64
2011    64
2012    64
2013    64
2014    64
2015    64
2009    61
2008    56
Name: count, dtype: int64

In [12]:
# Display the frame after encoding: `county` now holds integer codes.
data

Unnamed: 0,year,quarter,county,sales
0,2008,1,0,231609000
1,2008,1,1,550378000
2,2008,1,2,176771000
3,2008,1,3,200103000
4,2008,1,4,93259000
...,...,...,...,...
496,2015,4,10,244327000
497,2015,4,11,88202000
498,2015,4,12,94606000
499,2015,4,16,157059000


## Creating the feature matrix and the target vector

In [14]:
# Every column except the last one is a feature; the last column is the target.
feature_columns=data.columns[:-1]
target_column=data.columns[-1]
X=data[feature_columns].values
y=data[target_column].values

In [15]:
# Display the feature matrix built from every column of `data` except the last.
X

array([[2008,    1,    0],
       [2008,    1,    1],
       [2008,    1,    2],
       ...,
       [2015,    4,   12],
       [2015,    4,   16],
       [2015,    4,   13]])

## Splitting the dataset into a training set and a testing set so that overfitting can be detected on held-out data

In [16]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
# NOTE(review): the split is taken across all rows regardless of year/quarter,
# so the test set is not a future hold-out - confirm this is intended.
split_options={"test_size":0.2,"random_state":0}
X_train,X_test,y_train,y_test=train_test_split(X,y,**split_options)

## Training the model on the training set

In [17]:
from sklearn.ensemble import RandomForestRegressor

# A forest of 10 trees with a fixed seed so repeated runs build the same model.
forest_params={"n_estimators":10,"random_state":0}
regressor=RandomForestRegressor(**forest_params)
# Fit on the training rows only; kept as the last expression so the cell
# still displays the fitted estimator.
regressor.fit(X_train,y_train)

## Evaluating the performance of the model on the testing set

In [18]:
# Predict sales for the held-out rows.
y_pred=regressor.predict(X_test)
# Print with two decimals (this changes NumPy's print settings globally).
np.set_printoptions(precision=2)
# Left column: predicted value; right column: actual value.
print(np.column_stack((y_pred,y_test)))

[[1.88e+08 1.94e+08]
 [2.72e+08 2.45e+08]
 [1.02e+07 9.65e+06]
 [3.79e+08 3.39e+08]
 [1.45e+08 1.43e+08]
 [4.30e+08 5.85e+08]
 [2.13e+08 2.19e+08]
 [6.55e+08 6.80e+08]
 [1.86e+07 1.94e+07]
 [1.93e+08 1.94e+08]
 [9.92e+07 7.05e+07]
 [9.46e+07 8.43e+07]
 [7.25e+07 7.16e+07]
 [1.99e+08 2.03e+08]
 [2.89e+08 3.03e+08]
 [2.33e+08 1.94e+08]
 [3.81e+08 4.04e+08]
 [8.87e+07 8.46e+07]
 [7.40e+07 7.16e+07]
 [2.42e+07 2.46e+07]
 [4.27e+07 4.31e+07]
 [2.63e+07 3.09e+07]
 [1.56e+08 1.48e+08]
 [8.63e+06 7.68e+06]
 [1.75e+08 1.68e+08]
 [2.92e+08 3.60e+08]
 [5.19e+07 5.52e+07]
 [2.34e+08 2.31e+08]
 [1.79e+07 1.94e+07]
 [3.10e+08 1.85e+08]
 [8.85e+07 8.52e+07]
 [4.30e+08 4.24e+08]
 [8.23e+06 7.13e+06]
 [2.61e+07 2.64e+07]
 [1.85e+08 1.86e+08]
 [3.89e+07 4.03e+07]
 [3.94e+07 3.58e+07]
 [8.79e+07 9.44e+07]
 [1.56e+08 1.70e+08]
 [2.90e+08 2.51e+08]
 [3.04e+08 2.74e+08]
 [8.76e+07 7.95e+07]
 [8.23e+08 9.17e+08]
 [4.32e+07 5.12e+07]
 [7.35e+07 7.51e+07]
 [4.04e+08 5.50e+08]
 [3.28e+08 4.04e+08]
 [6.83e+08 7.

## Checking the R² score evaluation metric

In [19]:
from sklearn.metrics import r2_score

# Coefficient of determination of the predictions against the held-out targets.
r2_score(y_true=y_test,y_pred=y_pred)

0.9670779245050919