In [40]:
import numpy as np
import pandas as pd
import os

import pickle
import warnings
warnings.filterwarnings("ignore")

from datetime import datetime

In [41]:
PROJ_ROOT = os.path.join(os.pardir)
raw_data=os.path.join(PROJ_ROOT, "data", "interim", "data_cleaned.csv")
data = pd.read_csv(raw_data, index_col=None, header=0, dtype='string')

In [42]:
columns_for_analysis=['odolbl','Price',	'Location',	'Vehicle_Name',	'Vehicle_Model','Year','Condition']
data["Price"] = pd.to_numeric(data["Price"], downcast="float")
data["odolbl"] = pd.to_numeric(data["odolbl"], downcast="float")
data["Year"] = pd.to_numeric(data["Year"], downcast="integer")
df=data[columns_for_analysis]
df.head(5)

Unnamed: 0,odolbl,Price,Location,Vehicle_Name,Vehicle_Model,Year,Condition
0,48500.0,23995.0,"Barrie, ON",Kia,Forte,2019,Used
1,130000.0,12995.0,"Barrie, ON",Kia,Forte-Koup,2016,Used
2,37006.0,32426.0,"Gloucester, ON",Subaru,Crosstrek,2018,Used
3,67367.0,42826.0,"Gloucester, ON",Volkswagen,Atlas,2019,Used
4,65591.0,23994.0,"Thornhill, ON",Volkswagen,Jetta,2019,Used


In [43]:
df['Location']=df['Location'].str.replace(", ON","")

In [44]:
df.dtypes

odolbl           float32
Price            float32
Location          string
Vehicle_Name      string
Vehicle_Model     string
Year               int16
Condition         string
dtype: object

In [45]:
string_features= df.select_dtypes(include='string')
numeric_features= df.select_dtypes(exclude='string')


df=pd.concat([string_features,numeric_features], axis=1)

In [46]:
# shift column 'Price' to last position
last_column = df.pop('Price')
df['Price']=last_column

In [47]:
df=df.drop(['Location'], axis=1)

In [48]:
save_path=os.path.join(PROJ_ROOT, "data", "processed", "data_for_model.csv")
df.to_csv(save_path,index=False)

In [49]:
print(f'Length of dataframe before taking a random sample = {len(df)}')
df_sample=df.sample(frac=1.0, random_state=101)
print(f'Length of dataframe after taking a random sample = {len(df_sample)}')
df_sample.head(3)

Length of dataframe before taking a random sample = 139101
Length of dataframe after taking a random sample = 139101


Unnamed: 0,Vehicle_Name,Vehicle_Model,Condition,odolbl,Year,Price
130144,Toyota,Tacoma,Used,94754.0,2016,36998.0
96003,Chevrolet,Silverado-1500,Used,22848.0,2021,74990.0
13122,Honda,Accord-Sedan,Used,50025.0,2018,29990.0


In [50]:
X = df_sample.iloc[:, :-1].values
y = df_sample.iloc[:, -1].values

In [51]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [0,1,2])], remainder='passthrough')

ct.fit(X)


with open("final_encoder.pkl", 'wb') as file:  
    pickle.dump(ct, file)

X = ct.fit_transform(X).toarray()

In [52]:
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
X_train, X_other, y_train, y_other = train_test_split(X, y, test_size = 0.3, random_state = 101)

In [53]:
# test_size = 0.5(50% of 30% other -----> test = 15% of all data)
X_val, X_test, y_val, y_test = train_test_split(X_other, y_other, test_size = 0.3, random_state = 101)

In [54]:
#Feature Scaling...attempt
# from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

sc_X = StandardScaler(with_mean=False)
sc_X.fit(X_train[:,-2:])

with open("final_scaler.pkl", 'wb') as file:  
    pickle.dump(sc_X, file)

X_train[:,-2:]=sc_X.transform(X_train[:,-2:])
X_val[:,-2:]=sc_X.transform(X_val[:,-2:])
X_test[:,-2:]=sc_X.transform(X_test[:,-2:])


In [16]:
# from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.metrics import r2_score, mean_absolute_error



#function for models

name = []
r_squared = []
r_squared_std_dev=[]

mae=[]
mae_std_dev=[]

time=[]

def model_evaluation(model,Name):
    cycle_start = datetime.now()
    model_instance = model()
    model_instance.fit(X_train,y_train)
    y_val = model_instance.predict(X_val)
    
    scores = cross_validate(estimator = model_instance, X = X_val, y =  y_val, cv = 12, 
                            scoring=['r2','neg_mean_absolute_error'])

    cycle_end = datetime.now()
    cycle_duration=cycle_end - cycle_start

    mins, secs = divmod(cycle_duration.total_seconds(), 60)
    hrs, mins = divmod(mins, 60)
    duration=f'{hrs:.2f}:{mins:.2f}:{secs:.2f}'


    name.append(Name)
    time.append(duration)

    r_squared_mean=scores['test_r2'].mean()
    r_squared_std=scores['test_r2'].std()
    r_squared.append(r_squared_mean)
    r_squared_std_dev.append(r_squared_std)

    mae_mean=scores['test_neg_mean_absolute_error'].mean()
    mae_std=scores['test_neg_mean_absolute_error'].std()
    mae.append(mae_mean)
    mae_std_dev.append(mae_std)

    print(f'RMSE = {r_squared_mean*100:.2f}%')
    print(f'RMSE_std = {r_squared_std*100:.2f}')
    print()
    print(f'MAE = {abs(mae_mean):.2f}')
    print(f'MAE_std = {mae_std*100:.2f}')
    print()
    print(f'Cycle Time = {cycle_duration}')

    return model_instance
    

In [17]:
from xgboost import XGBRegressor
XGB_model = model_evaluation(XGBRegressor,'XGBRegressor')

RMSE = 87.77%
RMSE_std = 0.44

MAE = 4272.89
MAE_std = 5011.15

Cycle Time = 1:13:07.315497


In [18]:
from sklearn.ensemble import RandomForestRegressor
RandForest_model = model_evaluation(RandomForestRegressor,'RandomForestRegressor')

RMSE = 93.45%
RMSE_std = 0.34

MAE = 2566.44
MAE_std = 3564.29

Cycle Time = 16:26:34.463182


In [19]:
pd.options.display.float_format = "{:,.2f}".format
df_results = pd.DataFrame([name,time,r_squared,r_squared_std_dev,mae,mae_std_dev])
df_results = df_results.transpose()
df_results = df_results.rename(columns={0:'Model',1:'Time',2:'RMSE_mean',3:'RMSE_std',4:'MAE_mean',5:'MAE_std'}).sort_values(by='RMSE_mean',ascending=False)
df_results

Unnamed: 0,Model,Time,RMSE_mean,RMSE_std,MAE_mean,MAE_std
1,RandomForestRegressor,16.00:26.00:34.46,0.93,0.0,-2566.44,35.64
0,XGBRegressor,1.00:13.00:7.32,0.88,0.0,-4272.89,50.11


# Final Performance Metrics

In [20]:
score = XGB_model.score(X_test, y_test)  
# Print the Score
print("Test score: {0:.2f} %".format(100 * score))  

# Predict the Labels using the reloaded Model
Ypredict = XGB_model.predict(X_test)  

Test score: 87.72 %


In [21]:
score = RandForest_model.score(X_test, y_test)  
# Print the Score
print("Test score: {0:.2f} %".format(100 * score))  

# Predict the Labels using the reloaded Model
Ypredict = RandForest_model.predict(X_test)  


Test score: 93.82 %


In [22]:
final_XGB_model=model_evaluation(XGBRegressor,'XGBRegressor')
final_RandForest_model = model_evaluation(RandomForestRegressor,'RandomForestRegressor')


final_XGB_model.fit(X,y)
final_RandForest_model.fit(X,y)

RMSE = 87.77%
RMSE_std = 0.44

MAE = 4272.89
MAE_std = 5011.15

Cycle Time = 1:02:38.193431
RMSE = 93.45%
RMSE_std = 0.35

MAE = 2564.42
MAE_std = 3761.64

Cycle Time = 19:01:01.325158


RandomForestRegressor()

In [23]:
XGB_model_path=os.path.join(PROJ_ROOT, "models", "final_XGB_model.pkl")
RandForest_model_path=os.path.join(PROJ_ROOT, "models", "final_RandForest_model.pkl")

with open(XGB_model_path, 'wb') as file:  
    pickle.dump(XGB_model, file)

with open(RandForest_model_path, 'wb') as file:  
    pickle.dump(RandForest_model, file)

In [39]:
final_XGB_model.save_model("final_XGB_model.json")