<a href="https://colab.research.google.com/github/olufunmiruth/g05-used-cars/blob/new_branch/main_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
pd.set_option('max_columns', None)
pd.set_option('max_rows', None)
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import catboost
import xgboost
from sklearn.metrics import mean_squared_error, r2_score
from category_encoders.cat_boost import CatBoostEncoder

from sklearn.ensemble import RandomForestRegressor

import datetime

In [None]:
# load the processed data (Full_data)
data = pd.read_csv('clean_df.csv')
data.head(2)

Unnamed: 0,odometer_trans,car_age_trans,point_x,point_y,point_z,price_log,region,year,manufacturer,model,condition,cylinders,fuel,title_status,transmission,drive,type,paint_color,state,year_Imputed,manufacturer_Imputed,model_Imputed,condition_Imputed,cylinders_Imputed,fuel_Imputed,title_status_Imputed,transmission_Imputed,drive_Imputed,type_Imputed,paint_color_Imputed
0,-0.828596,0.375,-0.305961,0.23599,0.922332,10.491024,auburn,2010.0,chevrolet,corvette grand sport,good,8 cylinders,gas,clean,other,rwd,other,white,al,0,0,0,0,0,0,0,0,0,0,1
1,-0.069958,-0.125,-0.339755,0.255719,0.905082,8.922792,auburn,2014.0,hyundai,sonata,excellent,4 cylinders,gas,clean,automatic,fwd,sedan,white,al,0,0,0,0,0,0,0,0,0,0,1


In [None]:
data.shape

(458213, 30)

In [None]:
# Load the data and pick the target variable because the preprocesed data does not contain the target variable
data2 = pd.read_csv('new_vehicle.csv')
data2.shape

(458213, 19)

In [None]:
target = data2[['price']]

In [None]:
# Perform a log transform on the target variable
target = np.log((target + 1))

In [None]:
target.tail(2)

Unnamed: 0,price
458211,7.170888
458212,10.038936


In [None]:
# Combine features and target into one dataset
dataset = pd.concat([data, target], axis=1)

In [None]:
dataset.head(2)

Unnamed: 0,odometer_trans,car_age_trans,point_x,point_y,point_z,price_log,region,year,manufacturer,model,condition,cylinders,fuel,title_status,transmission,drive,type,paint_color,state,year_Imputed,manufacturer_Imputed,model_Imputed,condition_Imputed,cylinders_Imputed,fuel_Imputed,title_status_Imputed,transmission_Imputed,drive_Imputed,type_Imputed,paint_color_Imputed,price
0,-0.828596,0.375,-0.305961,0.23599,0.922332,10.491024,auburn,2010.0,chevrolet,corvette grand sport,good,8 cylinders,gas,clean,other,rwd,other,white,al,0,0,0,0,0,0,0,0,0,0,1,10.491024
1,-0.069958,-0.125,-0.339755,0.255719,0.905082,8.922792,auburn,2014.0,hyundai,sonata,excellent,4 cylinders,gas,clean,automatic,fwd,sedan,white,al,0,0,0,0,0,0,0,0,0,0,1,8.922792


In [None]:
# Split data into train and test set
from sklearn.model_selection import train_test_split

train, test = train_test_split(dataset, test_size=0.2, random_state=2020)

In [None]:
# Split into features and targets
X_train_ = train.drop(['price'], axis=1)
y_train_ = train['price']

X_test_ = test.drop(['price'], axis=1)
y_test_ = test['price']


print(X_train_.shape)
print(y_train_.shape)
print(X_test_.shape)
print(y_test_.shape)

(366570, 30)
(366570,)
(91643, 30)
(91643,)


In [None]:
# Check for missing value
X_train_.isnull().sum()

odometer_trans            0
car_age_trans             0
point_x                   0
point_y                   0
point_z                   0
price_log                 0
region                    0
year                    839
manufacturer              0
model                     0
condition                 0
cylinders                 0
fuel                      0
title_status              0
transmission              0
drive                     0
type                      0
paint_color               0
state                     0
year_Imputed              0
manufacturer_Imputed      0
model_Imputed             0
condition_Imputed         0
cylinders_Imputed         0
fuel_Imputed              0
title_status_Imputed      0
transmission_Imputed      0
drive_Imputed             0
type_Imputed              0
paint_color_Imputed       0
dtype: int64

# Catboost

In [None]:
# import library for cross validation 
from sklearn.model_selection import KFold

In [None]:
# select categorical features
cat_indices = np.where(X_train_.dtypes != np.float)[0]

In [None]:
from math import sqrt

# Function for training the model and calculating the root mean squared error
def Training_Validation_Error_Catboost(model,train_data,train_labels,test_data,test_labels):
    model.fit(train_data,train_labels ,cat_features=cat_indices)
    
    predicted = model.predict(test_data)
    
    rmse = sqrt(mean_squared_error(test_labels,predicted))
    
    print ('rmse = '+str(rmse))
           
    return(rmse)

In [None]:
kf = KFold(n_splits=10)  # Splits the training data into 10 folds

from catboost import CatBoostRegressor
#KFOLD CROSSVAL FOR CATBOOST ALGORITHM

model=CatBoostRegressor(iterations=1500, depth=4, learning_rate=0.03,l2_leaf_reg=10,
                        loss_function='RMSE', random_state=350, verbose=0, early_stopping_rounds=300)
rmse_list=[]

for train_index, val_index in kf.split(X_train_):
    X = X_train_
    Y = y_train_
    X_train,X_val=X.iloc[train_index], X.iloc[val_index]
    Y_train,Y_val=Y.iloc[train_index], Y.iloc[val_index]
    
    # Train the model
    rmse=Training_Validation_Error_Catboost(model,train_data = X_train,train_labels = Y_train,test_data = X_val,test_labels = Y_val)
    rmse_list.append(rmse)
    

print("mean rmse is :",np.mean(rmse_list))
print("range is :", max(rmse_list) - min(rmse_list))

rmse = 0.0634753555175912
rmse = 0.07501508094295523
rmse = 0.05390886731770203
rmse = 0.06356417634271415
rmse = 0.07691602970328418
rmse = 0.055839610050888416
rmse = 0.06717161261877638
rmse = 0.05500574293223206
rmse = 0.07346704679923638
rmse = 0.07273832026405047
mean rmse is : 0.06571018424894305
range is : 0.023007162385582147


In [None]:
from sklearn.metrics import r2_score

In [None]:
# Predict on the test data
pred = model.predict(X_test_)

print("R2 training", model.score(X_train_, y_train_))

print("R2 test", model.score(X_test_, y_test_))

print("R2 prediction", r2_score(y_test_, pred))

R2 training 0.9995274398938019
R2 test 0.9994683009515066
R2 prediction 0.9994683009515066


# XGBoost

In [None]:
#view the columns 
X_train_.columns

Index(['odometer_trans', 'car_age_trans', 'point_x', 'point_y', 'point_z',
       'price_log', 'region', 'year', 'manufacturer', 'model', 'condition',
       'cylinders', 'fuel', 'title_status', 'transmission', 'drive', 'type',
       'paint_color', 'state', 'year_Imputed', 'manufacturer_Imputed',
       'model_Imputed', 'condition_Imputed', 'cylinders_Imputed',
       'fuel_Imputed', 'title_status_Imputed', 'transmission_Imputed',
       'drive_Imputed', 'type_Imputed', 'paint_color_Imputed'],
      dtype='object')

In [None]:
# categorical features with few groups
col = ['condition', 'cylinders','fuel', 'title_status', 'transmission', 'drive']

# Creating dummy variables for the features in the list 'col'
X = pd.get_dummies(X_train_, columns=col, prefix=col, prefix_sep=':', drop_first=True)

X_test = pd.get_dummies(X_test_, columns=col, prefix=col, prefix_sep=':', drop_first=True)

In [None]:
# features with high dimensionality
feature_list = ['region', 'year', 'manufacturer', 'model', 'type', 'paint_color', 'state']

target_ = y_train_

# Perform encoding on the high dimensionality features
CBE_encoder = CatBoostEncoder()

train_cbe = CBE_encoder.fit_transform(X_train_[feature_list], target_)
print(train_cbe.shape)
test_cbe = CBE_encoder.transform(X_test_[feature_list])
print(test_cbe.shape)

(366570, 7)
(91643, 7)


In [None]:
# Drop the original features
X_d = X.drop(feature_list, axis=1) 
X_te = X_test.drop(feature_list, axis=1)

In [None]:
# Concatenate both dataframes
X_T = pd.concat([X_d, train_cbe], axis=1)
X_Test = pd.concat([X_te, test_cbe], axis=1)

In [None]:
X_T.head(2)

Unnamed: 0,odometer_trans,car_age_trans,point_x,point_y,point_z,price_log,year_Imputed,manufacturer_Imputed,model_Imputed,condition_Imputed,cylinders_Imputed,fuel_Imputed,title_status_Imputed,transmission_Imputed,drive_Imputed,type_Imputed,paint_color_Imputed,condition:fair,condition:good,condition:like new,condition:new,condition:salvage,cylinders:12 cylinders,cylinders:3 cylinders,cylinders:4 cylinders,cylinders:5 cylinders,cylinders:6 cylinders,cylinders:8 cylinders,cylinders:other,fuel:electric,fuel:gas,fuel:hybrid,fuel:other,title_status:lien,title_status:missing,title_status:parts only,title_status:rebuilt,title_status:salvage,transmission:manual,transmission:other,drive:fwd,drive:rwd,region,year,manufacturer,model,type,paint_color,state
100626,0.030638,-0.75,-0.942061,-0.277648,-0.188236,6.685861,0,1,0,1,1,0,0,0,1,1,1,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,8.558749,2019.0,8.558749,8.558749,8.558749,8.558749,8.558749
444588,1.256395,1.625,-0.339575,-0.842393,0.418405,6.908755,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,8.558749,2000.0,8.558749,8.558749,7.622305,7.622305,8.558749


In [None]:
# create a function for training the model and calculate the root mean squared error
def Training_Validation_Error(model,train_data,train_labels,test_data,test_labels):
    model.fit(train_data,train_labels)
    
    predicted = model.predict(test_data)
    
    rmse = sqrt(mean_squared_error(test_labels,predicted))
    
    print ('rmse = '+str(rmse))
           
    return(rmse)

In [None]:
kf = KFold(n_splits=10)

# XGBoost model
model_x = xgboost.XGBRegressor(colsample_bytree=0.4,
                 gamma=0,                 
                 learning_rate=0.03,
                 max_depth=10,
                 min_child_weight=1.5,
                 n_estimators=1500,                                                                    
                 reg_alpha=0.75,
                 reg_lambda=0.45,
                 subsample=0.6,
                 seed=42, early_stopping_rounds=300)
#KFOLD CROSSVAL FOR XGBOOST ALGORITHM


rmse_list=[]

for train_index, test_index in kf.split(X_T):
    X = X_T
    Y = y_train_
    X_train,X_val=X.iloc[train_index], X.iloc[test_index]
    Y_train,Y_val=Y.iloc[train_index], Y.iloc[test_index]   
    rmse=Training_Validation_Error(model_x,train_data = X_train,train_labels = Y_train,test_data = X_val,test_labels = Y_val)
    rmse_list.append(rmse)
    
print("mean rmse is :",np.mean(rmse_list))
print("range is :", max(rmse_list) - min(rmse_list))

Parameters: { early_stopping_rounds } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


rmse = 0.1278561202696727
Parameters: { early_stopping_rounds } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


rmse = 0.09704355187477365
Parameters: { early_stopping_rounds } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


rmse = 0.09056927624958307
Parameters: { early_stopping_

In [None]:
#prediction
predi = model_x.predict(X_Test)

print("R2 training", model_x.score(X_T, y_train_))

print("R2 test", model_x.score(X_Test, y_test_))

print("R2 prediction", r2_score(y_test_, predi))

R2 training 0.999589353714514
R2 test 0.9989882818762329
R2 prediction 0.9989882818762329


# Random Forest

In [None]:
#check top rows
X_T.head(2)

Unnamed: 0,odometer_trans,car_age_trans,point_x,point_y,point_z,price_log,year_Imputed,manufacturer_Imputed,model_Imputed,condition_Imputed,cylinders_Imputed,fuel_Imputed,title_status_Imputed,transmission_Imputed,drive_Imputed,type_Imputed,paint_color_Imputed,condition:fair,condition:good,condition:like new,condition:new,condition:salvage,cylinders:12 cylinders,cylinders:3 cylinders,cylinders:4 cylinders,cylinders:5 cylinders,cylinders:6 cylinders,cylinders:8 cylinders,cylinders:other,fuel:electric,fuel:gas,fuel:hybrid,fuel:other,title_status:lien,title_status:missing,title_status:parts only,title_status:rebuilt,title_status:salvage,transmission:manual,transmission:other,drive:fwd,drive:rwd,region,year,manufacturer,model,type,paint_color,state
100626,0.030638,-0.75,-0.942061,-0.277648,-0.188236,6.685861,0,1,0,1,1,0,0,0,1,1,1,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,8.558749,2019.0,8.558749,8.558749,8.558749,8.558749,8.558749
444588,1.256395,1.625,-0.339575,-0.842393,0.418405,6.908755,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,8.558749,2000.0,8.558749,8.558749,7.622305,7.622305,8.558749


In [None]:
#check for null values
X_T.isnull().sum()

odometer_trans               0
car_age_trans                0
point_x                      0
point_y                      0
point_z                      0
price_log                    0
year_Imputed                 0
manufacturer_Imputed         0
model_Imputed                0
condition_Imputed            0
cylinders_Imputed            0
fuel_Imputed                 0
title_status_Imputed         0
transmission_Imputed         0
drive_Imputed                0
type_Imputed                 0
paint_color_Imputed          0
condition:fair               0
condition:good               0
condition:like new           0
condition:new                0
condition:salvage            0
cylinders:12 cylinders       0
cylinders:3 cylinders        0
cylinders:4 cylinders        0
cylinders:5 cylinders        0
cylinders:6 cylinders        0
cylinders:8 cylinders        0
cylinders:other              0
fuel:electric                0
fuel:gas                     0
fuel:hybrid                  0
fuel:oth

In [None]:
# Create a duplicate of the training dataframe
X_ = X_T.copy()

In [None]:
# fill missing values in year with mode 
X_['year'] = X_T['year'].fillna(X_T['year'].mode()[0])

In [None]:
X_.isnull().sum()

odometer_trans             0
car_age_trans              0
point_x                    0
point_y                    0
point_z                    0
price_log                  0
year_Imputed               0
manufacturer_Imputed       0
model_Imputed              0
condition_Imputed          0
cylinders_Imputed          0
fuel_Imputed               0
title_status_Imputed       0
transmission_Imputed       0
drive_Imputed              0
type_Imputed               0
paint_color_Imputed        0
condition:fair             0
condition:good             0
condition:like new         0
condition:new              0
condition:salvage          0
cylinders:12 cylinders     0
cylinders:3 cylinders      0
cylinders:4 cylinders      0
cylinders:5 cylinders      0
cylinders:6 cylinders      0
cylinders:8 cylinders      0
cylinders:other            0
fuel:electric              0
fuel:gas                   0
fuel:hybrid                0
fuel:other                 0
title_status:lien          0
title_status:m

In [None]:
kf = KFold(n_splits=10)

from sklearn.ensemble import RandomForestRegressor

model_rf = RandomForestRegressor()
#KFOLD CROSSVAL FOR XGBOOST ALGORITHM


rmse_list=[]

for train_index, test_index in kf.split(X_):
    X = X_
    Y = y_train_
    X_train,X_val=X.iloc[train_index], X.iloc[test_index]
    Y_train,Y_val=Y.iloc[train_index], Y.iloc[test_index]   
    rmse=Training_Validation_Error(model_rf,train_data = X_train,train_labels = Y_train,test_data = X_val,test_labels = Y_val)
    rmse_list.append(rmse)
    
print("mean rmse is :",np.mean(rmse_list))
print("range is :", max(rmse_list) - min(rmse_list))

rmse = 0.005755925182066736
rmse = 0.011194106837176762
rmse = 0.0007698114801286997
rmse = 0.0017925266159340395
rmse = 0.012583982854946852
rmse = 0.00040082876422286163
rmse = 0.0015909197212371962
rmse = 0.00030609711868356023
rmse = 0.01092437649380766
rmse = 0.0019506783717651455
mean rmse is : 0.004726925343996951
range is : 0.012277885736263292


In [None]:
# Fill missing values for year in the test data
X_Test['year'] = X_Test['year'].fillna(X_Test['year'].mode()[0])

In [None]:
#prediction
predic = model_rf.predict(X_Test)

print("R2 training", model_rf.score(X_, y_train_))

print("R2 test", model_rf.score(X_Test, y_test_))

print("R2 prediction", r2_score(y_test_, predic))

R2 training 0.9999991862642162
R2 test 0.9999983725822464
R2 prediction 0.9999983725822464


In [None]:
# Overall RMSE on test data for the different models
print("Catboost RMSE test_data :", sqrt(mean_squared_error(y_test_, pred)))
print("XGboost RMSE test_data :", sqrt(mean_squared_error(y_test_, predi)))
print("Random Forest RMSE test_data :", sqrt(mean_squared_error(y_test_, predic)))

Catboost RMSE test_data : 0.0615340254190468
XGboost RMSE test_data : 0.08488132494440145
Random Forest RMSE test_data : 0.003404332020741455


In [None]:
# Transforming the predictions made by each model
catboost_predictions = np.expm1(pred)
xgboost_predictions = np.expm1(predi)
random_forest_predictions = np.expm1(predic)

In [None]:
pd.set_option('max_rows', None)

In [None]:
# convert prediction arrays to lists
catb = list(catboost_predictions)
xgbb = list(xgboost_predictions)
rf_m = list(random_forest_predictions)

In [None]:
model_dictionary = {'Catboost': catb, 'XGBoost':xgbb, 'Random_forest':rf_m}

In [None]:
# Create a dataframe of the predictions made by each model
df = pd.DataFrame(model_dictionary)

In [None]:
df.head()

Unnamed: 0,Catboost,XGBoost,Random_forest
0,10032.704965,10015.484375,9995.0
1,27981.156583,28404.640625,27990.0
2,24711.558628,24992.650391,25500.0
3,7858.420043,8244.301758,7995.0
4,2540.786855,2484.983887,2500.0
