In [22]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Ridge
from sklearn.svm import LinearSVR, SVR
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.compose import ColumnTransformer
from sklearn.metrics import explained_variance_score, max_error, mean_squared_error, r2_score
#from scipy.stats import uniform

In [2]:
data = pd.read_csv('processedData.csv')
data.head()

Unnamed: 0,year,price,mileage,tax,mpg,engineSize,make_audi,make_bmw,make_ford,make_hyundai,...,transmission_Automatic,transmission_Manual,transmission_Other,transmission_Semi-Auto,fuelType_Diesel,fuelType_Electric,fuelType_Hybrid,fuelType_Other,fuelType_Petrol,avgModelPrice
0,2017,12500,15735,150,55.4,1.4,1,0,0,0,...,0,1,0,0,0,0,0,0,1,14272.146067
1,2016,16500,36203,20,64.2,2.0,1,0,0,0,...,1,0,0,0,1,0,0,0,0,22703.678715
2,2016,11000,29946,30,55.4,1.4,1,0,0,0,...,0,1,0,0,0,0,0,0,1,14273.269663
3,2017,16800,25952,145,67.3,2.0,1,0,0,0,...,1,0,0,0,1,0,0,0,0,20248.330663
4,2019,17300,1998,145,49.6,1.0,1,0,0,0,...,0,1,0,0,0,0,0,0,1,17419.136197


In [3]:
data.shape

(97443, 25)

In [28]:
#split data
X = data.drop(columns='price')
y = data.price
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=191)

In [5]:
#first model: Ridge regression
stdsclr = StandardScaler()
colScaler = ColumnTransformer(remainder='passthrough', transformers=[('scale',stdsclr,['year','mileage','tax','mpg','engineSize','avgModelPrice'])])
ridgePipeline = Pipeline([('scale', colScaler),('model', Ridge())])
params = {'model__alpha':np.logspace(-6, 6, 13)}
#version with SelectKBest, not working/useful
#ridgePipeline = Pipeline([('scale', colScaler),
#                          ('selKB', SelectKBest(score_func=f_regression)),
#                          ('model', Ridge())])
#params = {'selKB__k':range(1,25),'model__alpha':np.logspace(-6, 6, 13)}
ridgeSearch = GridSearchCV(ridgePipeline, params, scoring='neg_mean_absolute_error', verbose = 3, cv = 5)

In [6]:
ridgeSearch.fit(X_train, y_train)

Fitting 5 folds for each of 13 candidates, totalling 65 fits
[CV 1/5] END ............model__alpha=1e-06;, score=-2413.635 total time=   0.0s
[CV 2/5] END ............model__alpha=1e-06;, score=-2400.872 total time=   0.0s
[CV 3/5] END ............model__alpha=1e-06;, score=-2405.677 total time=   0.0s
[CV 4/5] END ............model__alpha=1e-06;, score=-2409.441 total time=   0.0s
[CV 5/5] END ............model__alpha=1e-06;, score=-2406.214 total time=   0.0s
[CV 1/5] END ............model__alpha=1e-05;, score=-2413.635 total time=   0.0s
[CV 2/5] END ............model__alpha=1e-05;, score=-2400.872 total time=   0.0s
[CV 3/5] END ............model__alpha=1e-05;, score=-2405.677 total time=   0.0s
[CV 4/5] END ............model__alpha=1e-05;, score=-2409.441 total time=   0.0s
[CV 5/5] END ............model__alpha=1e-05;, score=-2406.214 total time=   0.0s
[CV 1/5] END ...........model__alpha=0.0001;, score=-2413.635 total time=   0.0s
[CV 2/5] END ...........model__alpha=0.0001;, sc

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('scale',
                                        ColumnTransformer(remainder='passthrough',
                                                          transformers=[('scale',
                                                                         StandardScaler(),
                                                                         ['year',
                                                                          'mileage',
                                                                          'tax',
                                                                          'mpg',
                                                                          'engineSize',
                                                                          'avgModelPrice'])])),
                                       ('model', Ridge())]),
             param_grid={'model__alpha': array([1.e-06, 1.e-05, 1.e-04, 1.e-03, 1.e-02, 1.e-01, 1.e+00

In [7]:
ridgeSearch.best_params_

{'model__alpha': 1000.0}

In [5]:
#keep model for future comparison
ridgeModel = Ridge(alpha=1000)

In [9]:
#second model: SVR
svr = LinearSVR(dual=False, loss='squared_epsilon_insensitive')
svrPipeline = Pipeline([('scale', colScaler),('model', svr)])
params = {'model__C':np.logspace(-6, 6, 13), 'model__tol':np.logspace(-8, -2, 7)}
svrSearch = GridSearchCV(svrPipeline, params, scoring='neg_mean_absolute_error', verbose = 3, cv = 5)

In [10]:
svrSearch.fit(X_train, y_train)

Fitting 5 folds for each of 91 candidates, totalling 455 fits
[CV 1/5] END model__C=1e-06, model__tol=1e-08;, score=-13758.477 total time=   0.0s
[CV 2/5] END model__C=1e-06, model__tol=1e-08;, score=-13863.589 total time=   0.0s
[CV 3/5] END model__C=1e-06, model__tol=1e-08;, score=-13883.509 total time=   0.0s
[CV 4/5] END model__C=1e-06, model__tol=1e-08;, score=-13872.352 total time=   0.0s
[CV 5/5] END model__C=1e-06, model__tol=1e-08;, score=-13777.194 total time=   0.0s
[CV 1/5] END model__C=1e-06, model__tol=1e-07;, score=-13758.477 total time=   0.0s
[CV 2/5] END model__C=1e-06, model__tol=1e-07;, score=-13863.589 total time=   0.0s
[CV 3/5] END model__C=1e-06, model__tol=1e-07;, score=-13883.509 total time=   0.0s
[CV 4/5] END model__C=1e-06, model__tol=1e-07;, score=-13872.352 total time=   0.0s
[CV 5/5] END model__C=1e-06, model__tol=1e-07;, score=-13777.194 total time=   0.0s
[CV 1/5] END model__C=1e-06, model__tol=1e-06;, score=-13758.477 total time=   0.0s
[CV 2/5] END m

[CV 5/5] END model__C=0.0001, model__tol=0.001;, score=-2451.458 total time=   0.0s
[CV 1/5] END model__C=0.0001, model__tol=0.01;, score=-2465.088 total time=   0.0s
[CV 2/5] END model__C=0.0001, model__tol=0.01;, score=-2456.722 total time=   0.0s
[CV 3/5] END model__C=0.0001, model__tol=0.01;, score=-2491.846 total time=   0.0s
[CV 4/5] END model__C=0.0001, model__tol=0.01;, score=-2479.116 total time=   0.0s
[CV 5/5] END model__C=0.0001, model__tol=0.01;, score=-2455.755 total time=   0.0s
[CV 1/5] END model__C=0.001, model__tol=1e-08;, score=-2401.133 total time=   0.0s
[CV 2/5] END model__C=0.001, model__tol=1e-08;, score=-2392.470 total time=   0.0s
[CV 3/5] END model__C=0.001, model__tol=1e-08;, score=-2408.264 total time=   0.0s
[CV 4/5] END model__C=0.001, model__tol=1e-08;, score=-2402.874 total time=   0.0s
[CV 5/5] END model__C=0.001, model__tol=1e-08;, score=-2403.309 total time=   0.1s
[CV 1/5] END model__C=0.001, model__tol=1e-07;, score=-2401.133 total time=   0.0s
[CV

[CV 5/5] END model__C=0.1, model__tol=0.0001;, score=-2406.289 total time=   0.0s
[CV 1/5] END model__C=0.1, model__tol=0.001;, score=-2413.203 total time=   0.0s
[CV 2/5] END model__C=0.1, model__tol=0.001;, score=-2400.974 total time=   0.0s
[CV 3/5] END model__C=0.1, model__tol=0.001;, score=-2405.236 total time=   0.0s
[CV 4/5] END model__C=0.1, model__tol=0.001;, score=-2409.504 total time=   0.0s
[CV 5/5] END model__C=0.1, model__tol=0.001;, score=-2406.347 total time=   0.0s
[CV 1/5] END .model__C=0.1, model__tol=0.01;, score=-2430.394 total time=   0.0s
[CV 2/5] END .model__C=0.1, model__tol=0.01;, score=-2426.320 total time=   0.0s
[CV 3/5] END .model__C=0.1, model__tol=0.01;, score=-2435.653 total time=   0.0s
[CV 4/5] END .model__C=0.1, model__tol=0.01;, score=-2434.252 total time=   0.0s
[CV 5/5] END .model__C=0.1, model__tol=0.01;, score=-2439.869 total time=   0.0s
[CV 1/5] END model__C=1.0, model__tol=1e-08;, score=-2413.965 total time=   0.0s
[CV 2/5] END model__C=1.0, 

[CV 1/5] END model__C=100.0, model__tol=0.0001;, score=-2414.068 total time=   0.0s
[CV 2/5] END model__C=100.0, model__tol=0.0001;, score=-2400.868 total time=   0.0s
[CV 3/5] END model__C=100.0, model__tol=0.0001;, score=-2405.593 total time=   0.0s
[CV 4/5] END model__C=100.0, model__tol=0.0001;, score=-2409.522 total time=   0.0s
[CV 5/5] END model__C=100.0, model__tol=0.0001;, score=-2406.305 total time=   0.0s
[CV 1/5] END model__C=100.0, model__tol=0.001;, score=-2413.650 total time=   0.0s
[CV 2/5] END model__C=100.0, model__tol=0.001;, score=-2401.287 total time=   0.0s
[CV 3/5] END model__C=100.0, model__tol=0.001;, score=-2405.397 total time=   0.0s
[CV 4/5] END model__C=100.0, model__tol=0.001;, score=-2409.898 total time=   0.0s
[CV 5/5] END model__C=100.0, model__tol=0.001;, score=-2406.645 total time=   0.0s
[CV 1/5] END model__C=100.0, model__tol=0.01;, score=-2430.563 total time=   0.0s
[CV 2/5] END model__C=100.0, model__tol=0.01;, score=-2426.485 total time=   0.0s
[

[CV 5/5] END model__C=100000.0, model__tol=1e-06;, score=-2406.214 total time=   0.0s
[CV 1/5] END model__C=100000.0, model__tol=1e-05;, score=-2414.075 total time=   0.0s
[CV 2/5] END model__C=100000.0, model__tol=1e-05;, score=-2400.891 total time=   0.0s
[CV 3/5] END model__C=100000.0, model__tol=1e-05;, score=-2405.677 total time=   0.0s
[CV 4/5] END model__C=100000.0, model__tol=1e-05;, score=-2409.440 total time=   0.0s
[CV 5/5] END model__C=100000.0, model__tol=1e-05;, score=-2406.214 total time=   0.0s
[CV 1/5] END model__C=100000.0, model__tol=0.0001;, score=-2414.075 total time=   0.0s
[CV 2/5] END model__C=100000.0, model__tol=0.0001;, score=-2400.866 total time=   0.0s
[CV 3/5] END model__C=100000.0, model__tol=0.0001;, score=-2405.593 total time=   0.0s
[CV 4/5] END model__C=100000.0, model__tol=0.0001;, score=-2409.524 total time=   0.0s
[CV 5/5] END model__C=100000.0, model__tol=0.0001;, score=-2406.304 total time=   0.0s
[CV 1/5] END model__C=100000.0, model__tol=0.001;

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('scale',
                                        ColumnTransformer(remainder='passthrough',
                                                          transformers=[('scale',
                                                                         StandardScaler(),
                                                                         ['year',
                                                                          'mileage',
                                                                          'tax',
                                                                          'mpg',
                                                                          'engineSize',
                                                                          'avgModelPrice'])])),
                                       ('model',
                                        LinearSVR(dual=False,
                                                  lo

In [11]:
svrSearch.best_params_

{'model__C': 0.001, 'model__tol': 1e-06}

In [6]:
#keep model for future comparison
svrModel = LinearSVR(dual=False, loss='squared_epsilon_insensitive', C=.001, tol=1e-6)

In [23]:
#third model: Random forest regression
rfr = RandomForestRegressor()
rfrPipeline = Pipeline([('scale', colScaler),('model', rfr)])
params = {'model__n_estimators':[10,50,100],'model__min_samples_split':[2]+np.arange(50,501,50).tolist()}
#trading verbose doesn't work with n_jobs which we need for this one
rfrSearch = GridSearchCV(rfrPipeline, params, scoring='neg_mean_absolute_error', error_score='raise', n_jobs=-1, cv = 5)

In [24]:
rfrSearch.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise',
             estimator=Pipeline(steps=[('scale',
                                        ColumnTransformer(remainder='passthrough',
                                                          transformers=[('scale',
                                                                         StandardScaler(),
                                                                         ['year',
                                                                          'mileage',
                                                                          'tax',
                                                                          'mpg',
                                                                          'engineSize',
                                                                          'avgModelPrice'])])),
                                       ('model', RandomForestRegressor())]),
             n_jobs=-1,
             param_grid={'model__min_samp

In [25]:
rfrSearch.best_params_

{'model__min_samples_split': 2, 'model__n_estimators': 100}

In [41]:
#setting larger leaves doesn't make much impact and doesn't speed significantly
#if we have very small max depth we can have more estimators
params = {'model__n_estimators':[100,1000],'model__max_depth':np.arange(1,4)}
rfrStumpSearch = GridSearchCV(rfrPipeline, params, scoring='neg_mean_absolute_error', error_score='raise', n_jobs=-1, cv = 5)

In [42]:
rfrStumpSearch.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise',
             estimator=Pipeline(steps=[('scale',
                                        ColumnTransformer(remainder='passthrough',
                                                          transformers=[('scale',
                                                                         StandardScaler(),
                                                                         ['year',
                                                                          'mileage',
                                                                          'tax',
                                                                          'mpg',
                                                                          'engineSize',
                                                                          'avgModelPrice'])])),
                                       ('model', RandomForestRegressor())]),
             n_jobs=-1,
             param_grid={'model__max_dept

In [43]:
rfrStumpSearch.best_params_

{'model__max_depth': 3, 'model__n_estimators': 1000}

In [7]:
#save both models, but check if we should even bother with the stumps
rfrStumpModel = RandomForestRegressor(max_depth=3, n_estimators=1000)
rfrModel = RandomForestRegressor()

In [10]:
rfrStumpScore = rfrStumpModel.score(X_train,y_train)
rfrScore = rfrModel.score(X_train,y_train)
rfrStumpScore>rfrScore
False

False

In [62]:
#nope. Uncomment this instead of running the searches again if needed:
#rfrModel = RandomForestRegressor()

#fourth model: Gradient boost regression
gbr = GradientBoostingRegressor()
gbrPipeline = Pipeline([('scale', colScaler),('model', gbr)])
params = {'model__loss':['squared_error','absolute_error','huber','quantile']}
gbrSearch = GridSearchCV(gbrPipeline, params, scoring='neg_mean_absolute_error', error_score='raise', n_jobs=-1, cv = 5)

In [63]:
gbrSearch.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise',
             estimator=Pipeline(steps=[('scale',
                                        ColumnTransformer(remainder='passthrough',
                                                          transformers=[('scale',
                                                                         StandardScaler(),
                                                                         ['year',
                                                                          'mileage',
                                                                          'tax',
                                                                          'mpg',
                                                                          'engineSize',
                                                                          'avgModelPrice'])])),
                                       ('model', GradientBoostingRegressor())]),
             n_jobs=-1,
             param_grid={'model__loss

In [64]:
gbrSearch.best_params_

{'model__loss': 'huber'}

In [8]:
#keep model for future comparison
gbrModel = GradientBoostingRegressor(loss='huber')

In [29]:
#uncomment and run to refit data if needed
ridgeModel.fit(X_train,y_train)
svrModel.fit(X_train,y_train)
rfrModel.fit(X_train,y_train)
gbrModel.fit(X_train,y_train)

GradientBoostingRegressor(loss='huber')

In [31]:
#initial comparison on test data
ridgeScore = ridgeModel.score(X_test,y_test)
svrScore = svrModel.score(X_test,y_test)
rfrScore = rfrModel.score(X_test,y_test)
gbrScore = gbrModel.score(X_test,y_test)
allScores = pd.DataFrame({'Scores':[ridgeScore,svrScore,rfrScore,gbrScore]}, index=['Ridge','SVR','RandomForest','GradientBoost'])
allScores

Unnamed: 0,Scores
Ridge,0.84777
SVR,0.811553
RandomForest,0.986738
GradientBoost,0.923424


In [32]:
#random forest seems promising, dig more
models = {'ridge':ridgeModel, 'SVR':svrModel, 'RandomForest':rfrModel, 'GradientBoost':gbrModel}
modelData = pd.DataFrame(columns=['explainedVariance','maxError','MSE','r2'], index=models.keys())
for mod in models.keys():
    y_pred = models[mod].predict(X_test)
    modelData.loc[mod,'explainedVariance'] = explained_variance_score(y_test, y_pred)
    modelData.loc[mod,'maxError'] = max_error(y_test, y_pred)
    modelData.loc[mod,'MSE'] = mean_squared_error(y_test, y_pred)
    modelData.loc[mod,'r2'] = r2_score(y_test, y_pred)
modelData

Unnamed: 0,explainedVariance,maxError,MSE,r2
ridge,0.84777,100575.31161,14997268.358761,0.84777
SVR,0.811553,104391.173484,18565238.680877,0.811553
RandomForest,0.986739,67040.58,1306581.474119,0.986738
GradientBoost,0.923621,82720.587871,7544041.068188,0.923424


In [34]:
%%time
#random forest performs best across the board but is the most costly
rfrModel.fit(X_train,y_train)

Wall time: 19.1 s


RandomForestRegressor()

In [39]:
%%time
gbrModel.fit(X_train,y_train)

Wall time: 7.31 s


GradientBoostingRegressor(loss='huber')

Given that gradient boosting gives a fairly close result with less than half the power/time required, it would be the best alternative if needed.