## Import Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## Load Dataset

In [2]:
df_final = pd.read_csv('clean_dataset.csv')
df_final.head()

Unnamed: 0,sellingprice,year,condition,odometer,date,make_acura,make_aston martin,make_audi,make_bentley,make_bmw,...,day_Tue,day_Wed,month_Apr,month_Dec,month_Feb,month_Jan,month_Jul,month_Jun,month_Mar,month_May
0,21500,2015,5.0,16639.0,16,0,0,0,0,0,...,1,0,0,1,0,0,0,0,0,0
1,21500,2015,5.0,9393.0,16,0,0,0,0,0,...,1,0,0,1,0,0,0,0,0,0
2,30000,2014,4.5,1331.0,15,0,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0
3,27750,2015,4.1,14282.0,29,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,10900,2015,1.0,5554.0,30,0,0,0,0,0,...,1,0,0,1,0,0,0,0,0,0


## Modelling

### Feature/Target

In [3]:
x = df_final.iloc[:,1:]
y = df_final.iloc[:,0]

In [4]:
x, y

(        year  condition  odometer  date  make_acura  make_aston martin  \
 0       2015        5.0   16639.0    16           0                  0   
 1       2015        5.0    9393.0    16           0                  0   
 2       2014        4.5    1331.0    15           0                  0   
 3       2015        4.1   14282.0    29           0                  0   
 4       2015        1.0    5554.0    30           0                  0   
 ...      ...        ...       ...   ...         ...                ...   
 517754  2015        4.5   18255.0     9           0                  0   
 517755  2012        5.0   54393.0     8           0                  0   
 517756  2012        4.8   50561.0     8           0                  0   
 517757  2015        3.8   16658.0     9           0                  0   
 517758  2014        3.4   15008.0    28           0                  0   
 
         make_audi  make_bentley  make_bmw  make_buick  ...  day_Tue  day_Wed  \
 0               

### train, test split

In [5]:
from sklearn.model_selection import train_test_split
xtrain, xtest, ytrain, ytest = train_test_split(x,y,test_size=0.2, random_state=69) #Splitting the data into Train and Test

In [6]:
xtrain, xtest, ytrain, ytest

(        year  condition  odometer  date  make_acura  make_aston martin  \
 394086  2014        4.5   15731.0    22           0                  0   
 341747  2014        4.9   16985.0     3           0                  0   
 266349  2013        4.4   25896.0    17           0                  0   
 241178  2012        4.1   21671.0    12           0                  0   
 249000  2007        3.0   85415.0     9           0                  0   
 ...      ...        ...       ...   ...         ...                ...   
 25015   2012        4.7   14130.0    18           0                  0   
 140890  2004        3.1  164837.0    22           0                  0   
 462793  2014        3.5   41906.0     4           0                  0   
 384203  2014        3.8   13759.0    29           0                  0   
 457782  2004        2.7  103795.0     1           0                  0   
 
         make_audi  make_bentley  make_bmw  make_buick  ...  day_Tue  day_Wed  \
 394086          

### Fitting

In [7]:
from sklearn.linear_model import LinearRegression
reg = LinearRegression()
reg.fit(xtrain, ytrain)

LinearRegression()

In [8]:
from sklearn.linear_model import Ridge
ridge = Ridge()
ridge.fit(xtrain, ytrain)

Ridge()

In [9]:
from sklearn.linear_model import Lasso
lasso = Lasso()
lasso.fit(xtrain, ytrain)

Lasso()

In [10]:
from sklearn.linear_model import ElasticNet
en = ElasticNet()
en.fit(xtrain, ytrain)

ElasticNet()

In [11]:
from xgboost import XGBRegressor
xg = XGBRegressor(verbosity=2)
xg.fit(xtrain,ytrain)

[15:15:32] INFO: C:/Users/Administrator/workspace/xgboost-win64_release_1.5.1/src/tree/updater_prune.cc:101: tree pruning end, 122 extra nodes, 0 pruned nodes, max_depth=6
[15:15:32] INFO: C:/Users/Administrator/workspace/xgboost-win64_release_1.5.1/src/tree/updater_prune.cc:101: tree pruning end, 126 extra nodes, 0 pruned nodes, max_depth=6
[15:15:33] INFO: C:/Users/Administrator/workspace/xgboost-win64_release_1.5.1/src/tree/updater_prune.cc:101: tree pruning end, 126 extra nodes, 0 pruned nodes, max_depth=6
[15:15:33] INFO: C:/Users/Administrator/workspace/xgboost-win64_release_1.5.1/src/tree/updater_prune.cc:101: tree pruning end, 126 extra nodes, 0 pruned nodes, max_depth=6
[15:15:33] INFO: C:/Users/Administrator/workspace/xgboost-win64_release_1.5.1/src/tree/updater_prune.cc:101: tree pruning end, 122 extra nodes, 0 pruned nodes, max_depth=6
[15:15:34] INFO: C:/Users/Administrator/workspace/xgboost-win64_release_1.5.1/src/tree/updater_prune.cc:101: tree pruning end, 124 extra nod

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.300000012,
             max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=100, n_jobs=12,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',
             validate_parameters=1, verbosity=2)

### Model Evaluation

In [12]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

model = [reg, ridge, lasso, en, xg]

for i in model:
    score = round(r2_score(ytest, i.predict(xtest)),4)
    score_train = round(r2_score(ytrain, i.predict(xtrain)),4)
    rmse = round(mean_squared_error(ytest, i.predict(xtest), squared=False),4)
    mae = round(mean_absolute_error(ytest, i.predict(xtest)),4)
    print(i, score, score_train, rmse, mae)

LinearRegression() 0.7434 0.7439 4164.7171 3094.4917
Ridge() 0.7433 0.7439 4165.0058 3094.7625
Lasso() 0.7425 0.7432 4171.6736 3097.9868
ElasticNet() 0.5432 0.5438 5556.382 4228.5245
XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.300000012,
             max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=100, n_jobs=12,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',
             validate_parameters=1, verbosity=2) 0.836 0.844 3329.1142 2294.3902


### Hyperparameter Tuning

In [13]:
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

hyperparameters = dict(
    learning_rate=[0.05, 0.1, 0.2], 
    n_estimators=[100, 400, 800],
    max_depth=[3, 6, 9],
    min_child_weight=[1, 10, 100]
)

rs = RandomizedSearchCV(xg, hyperparameters, n_jobs=3, cv=5, verbose=3, n_iter=30, refit='r2', scoring=['r2','neg_root_mean_squared_error'])
rs.fit(xtrain, ytrain)

Fitting 5 folds for each of 30 candidates, totalling 150 fits
[19:51:01] INFO: C:/Users/Administrator/workspace/xgboost-win64_release_1.5.1/src/tree/updater_prune.cc:101: tree pruning end, 738 extra nodes, 0 pruned nodes, max_depth=9
[19:51:02] INFO: C:/Users/Administrator/workspace/xgboost-win64_release_1.5.1/src/tree/updater_prune.cc:101: tree pruning end, 750 extra nodes, 0 pruned nodes, max_depth=9
[19:51:02] INFO: C:/Users/Administrator/workspace/xgboost-win64_release_1.5.1/src/tree/updater_prune.cc:101: tree pruning end, 750 extra nodes, 0 pruned nodes, max_depth=9
[19:51:03] INFO: C:/Users/Administrator/workspace/xgboost-win64_release_1.5.1/src/tree/updater_prune.cc:101: tree pruning end, 748 extra nodes, 0 pruned nodes, max_depth=9
[19:51:03] INFO: C:/Users/Administrator/workspace/xgboost-win64_release_1.5.1/src/tree/updater_prune.cc:101: tree pruning end, 744 extra nodes, 0 pruned nodes, max_depth=9
[19:51:04] INFO: C:/Users/Administrator/workspace/xgboost-win64_release_1.5.1/

RandomizedSearchCV(cv=5,
                   estimator=XGBRegressor(base_score=0.5, booster='gbtree',
                                          colsample_bylevel=1,
                                          colsample_bynode=1,
                                          colsample_bytree=1,
                                          enable_categorical=False, gamma=0,
                                          gpu_id=-1, importance_type=None,
                                          interaction_constraints='',
                                          learning_rate=0.300000012,
                                          max_delta_step=0, max_depth=6,
                                          min_child_weight=1, missing=nan,
                                          monotone_constraints='()',
                                          n_estimato...
                                          num_parallel_tree=1, predictor='auto',
                                          random_state=0, reg_alpha

In [14]:
rs.best_params_

{'n_estimators': 800,
 'min_child_weight': 10,
 'max_depth': 9,
 'learning_rate': 0.05}

In [15]:
rs.best_score_

0.8545668622389361

In [16]:
y_predict_rs = rs.predict(xtest)

In [18]:
r2_score(ytest, y_predict_rs)

0.8542245134707244

In [20]:
mean_squared_error(ytest, y_predict_rs, squared=False)

3138.898405743762