In [3]:
""" importing packages """

""" general computing packages """
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

""" packages for data preperation and transformations"""
import sklearn.impute as sk_imputer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler, FunctionTransformer, PowerTransformer, RobustScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline

""" packages for machine learning algorithms """
from sklearn.linear_model import SGDRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from xgboost import XGBRegressor

""" packages for model metrics and model tuning and selection """
from sklearn.metrics import auc, accuracy_score, confusion_matrix, mean_squared_error, r2_score, mean_absolute_error, explained_variance_score
from sklearn.model_selection import cross_val_score, GridSearchCV, KFold, RandomizedSearchCV, train_test_split, learning_curve, cross_validate
from sklearn.base import BaseEstimator, TransformerMixin
%matplotlib inline

In [4]:
""" Getting processed data """

data_imputed = pd.read_csv('cleaned_imputed_data.csv')

In [5]:
""" seperating features from the labels """

features = data_imputed.drop(columns=['selling_price'], inplace=False)
label = data_imputed['selling_price']



'\nX_train, X_test, y_train, y_test = train_test_split(features, \n                                                                label, train_size=0.8)\n'

In [6]:
"""
Column Transformer - categorical encoding and scaling and transformation of numeric variables

use for regression and any other distance based algorithms, no harm in using it with others
"""

pre_processing = ColumnTransformer(transformers= [ 
    ('winsorization', RobustScaler(with_centering=False, with_scaling=False, quantile_range=(5.0,95.0), copy=False) ,['km_driven']),
    ('num_transform', PowerTransformer(copy=False), ['year','km_driven','mileage','engine_cc','max_power_bhp']),
    ('categorical_enc',OneHotEncoder(drop='first', sparse=False),['Company','fuel','owner']),
    ('transmission_binary', OneHotEncoder(drop='if_binary', sparse=False), ['transmission','seller_type'])    
], remainder='passthrough', n_jobs=-1)


In [7]:
""" model scoring metrics to evaluate hyperparameters """

scores = ['neg_mean_absolute_error','r2']

# Random Forest Classifier

In [15]:
""" hyperparameters to search on the grid """

parameters = {
    'n_estimators' : [100,200,300,350,400,450,500],
    'max_depth': [4,5,6,7,8,9,10],
    'criterion' : ["squared_error", "poisson"],
    'oob_score' : [True],
    'max_samples': [0.9]
}

In [16]:
""" applying transformations on the data and creating GridSearcg object """

features_processed = pre_processing.fit_transform(features)
randForest = RandomForestRegressor()

parameter_search = GridSearchCV(estimator = randForest, 
                                    param_grid= parameters,
                                    scoring=scores,
                                    n_jobs=-1,
                                    refit=False,
                                    cv=5,
                                    verbose=2)

In [17]:
""" searching for best fit parameters """

RandForest_parameters = parameter_search.fit(features_processed, label)

Fitting 5 folds for each of 98 candidates, totalling 490 fits


In [20]:
RF_result = pd.DataFrame(RandForest_parameters.cv_results_)

RF_result[RF_result.rank_test_neg_mean_absolute_error.isin([1,2,3,4,5])]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_depth,param_max_samples,param_n_estimators,param_oob_score,params,...,std_test_neg_mean_absolute_error,rank_test_neg_mean_absolute_error,split0_test_r2,split1_test_r2,split2_test_r2,split3_test_r2,split4_test_r2,mean_test_r2,std_test_r2,rank_test_r2
92,10.021508,0.206506,0.11271,0.007627,poisson,10,0.9,200,True,"{'criterion': 'poisson', 'max_depth': 10, 'max...",...,2549.572809,5,0.959842,0.966695,0.978182,0.925544,0.965198,0.959092,0.017808,29
93,15.450038,0.310727,0.174056,0.009704,poisson,10,0.9,300,True,"{'criterion': 'poisson', 'max_depth': 10, 'max...",...,2315.716623,4,0.959001,0.966718,0.978245,0.932122,0.965386,0.960294,0.015393,16
94,17.645932,0.265636,0.237142,0.044436,poisson,10,0.9,350,True,"{'criterion': 'poisson', 'max_depth': 10, 'max...",...,2481.987991,3,0.959449,0.967083,0.978767,0.927103,0.966708,0.959822,0.017491,25
95,21.44483,0.666559,0.259136,0.022848,poisson,10,0.9,400,True,"{'criterion': 'poisson', 'max_depth': 10, 'max...",...,2429.078297,1,0.959203,0.96706,0.978913,0.930367,0.965423,0.960193,0.01622,19
97,26.447549,1.01167,0.285862,0.039336,poisson,10,0.9,500,True,"{'criterion': 'poisson', 'max_depth': 10, 'max...",...,2513.407021,2,0.96092,0.966923,0.978612,0.928635,0.965245,0.960067,0.016772,20


In [21]:
RF_result.to_csv('randomForestresults.csv')

# XGBoost Regressor

In [27]:
""" hyperparameters to search on the grid """

parameters_xg = {
    'n_estimators' : [250, 300, 350, 400, 500, 700],
    'max_depth' : [8,9,10,11,12,13],
    'learning_rate' : [0.01, 0.1, 0.2, 0.4],
    'booster' : ['gbtree','gblinear'],
    'objective': ["reg:squarederror"],
    'n_jobs' : [-1]
}

In [28]:
""" creating GridSearch object and xgbregressor object """

XGBoost = XGBRegressor()

parameter_search_xgb = GridSearchCV(estimator = XGBoost, 
                                    param_grid= parameters_xg,
                                    scoring=scores,
                                    n_jobs=-1,
                                    refit=False,
                                    cv=5,
                                    verbose=2)

In [29]:
xgb_parameters = parameter_search_xgb.fit(features_processed, label)

Fitting 5 folds for each of 288 candidates, totalling 1440 fits


In [30]:
xgb_result = pd.DataFrame(xgb_parameters.cv_results_)

xgb_result.to_csv('XGBresults.csv')

In [31]:
xgb_result

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_booster,param_learning_rate,param_max_depth,param_n_estimators,param_n_jobs,param_objective,...,std_test_neg_mean_absolute_error,rank_test_neg_mean_absolute_error,split0_test_r2,split1_test_r2,split2_test_r2,split3_test_r2,split4_test_r2,mean_test_r2,std_test_r2,rank_test_r2
0,5.498080,0.105503,0.031530,0.002313,gbtree,0.01,8,250,-1,reg:squarederror,...,2554.341141,144,0.944449,0.949333,0.968869,0.914946,0.952307,0.945981,0.017554,144
1,7.868291,1.225429,0.063617,0.015991,gbtree,0.01,8,300,-1,reg:squarederror,...,2487.086864,138,0.953723,0.958424,0.976791,0.924736,0.962164,0.955167,0.017060,138
2,11.482382,0.347530,0.061914,0.002021,gbtree,0.01,8,350,-1,reg:squarederror,...,2802.841892,132,0.957774,0.962533,0.979594,0.928990,0.966561,0.959090,0.016708,125
3,13.986926,0.276564,0.070937,0.005377,gbtree,0.01,8,400,-1,reg:squarederror,...,2956.645872,126,0.959460,0.964393,0.980887,0.931301,0.968436,0.960895,0.016411,67
4,17.191923,0.325524,0.086853,0.005063,gbtree,0.01,8,500,-1,reg:squarederror,...,2996.876506,122,0.960824,0.965406,0.981275,0.933809,0.969965,0.962256,0.015763,22
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
283,1.532077,0.040699,0.013389,0.004036,gblinear,0.4,13,300,-1,reg:squarederror,...,2827.032088,175,0.701860,0.688872,0.740340,0.640742,0.690453,0.692453,0.031851,175
284,1.808085,0.098817,0.011301,0.000611,gblinear,0.4,13,350,-1,reg:squarederror,...,2731.758646,163,0.707104,0.693160,0.749041,0.643073,0.695706,0.697617,0.033861,163
285,2.125230,0.087595,0.016812,0.008475,gblinear,0.4,13,400,-1,reg:squarederror,...,2675.702544,157,0.711233,0.696410,0.756112,0.644803,0.699873,0.701686,0.035553,157
286,3.003710,0.037149,0.015657,0.003548,gblinear,0.4,13,500,-1,reg:squarederror,...,2646.640593,151,0.717115,0.700810,0.766655,0.647086,0.705894,0.707512,0.038181,151
