In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as s
import pickle

import warnings
warnings.filterwarnings('ignore')

import datetime as dt

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_log_error, mean_absolute_error, mean_absolute_percentage_error, mean_squared_error
from sklearn.preprocessing import OrdinalEncoder
from scipy.stats import shapiro

import joblib

import AlgoData as alg

## Finding best parameters for the model

In [2]:
df = pd.read_csv('TrainData_Processed.csv', index_col=[0])
X = df.drop(['Price','EngineVolume_BIN'], axis=1)
y = df['Price']
#Randomized Search CV for searching the best parameters

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 1200, num = 12)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(5, 30, num = 6)]
# max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10, 15, 100]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 5, 10]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf}
rf = RandomForestRegressor()
rf_random_model = RandomizedSearchCV(estimator = rf,
                                     param_distributions = random_grid,
                                     scoring='neg_mean_squared_error',
                                    n_iter = 10,
                                    cv = 5,
                                    verbose=2,
                                    random_state=42,
                                    n_jobs = 1)
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.3, random_state=10)


## Training the model

In [36]:
rf_random_model.fit(Xtrain, ytrain)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=5, min_samples_split=5, n_estimators=900; total time=   9.3s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=5, min_samples_split=5, n_estimators=900; total time=   7.2s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=5, min_samples_split=5, n_estimators=900; total time=   7.2s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=5, min_samples_split=5, n_estimators=900; total time=   7.1s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=5, min_samples_split=5, n_estimators=900; total time=   7.2s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=1100; total time=  11.1s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=1100; total time=  11.2s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimator

RandomizedSearchCV(cv=5, estimator=RandomForestRegressor(), n_jobs=1,
                   param_distributions={'max_depth': [5, 10, 15, 20, 25, 30],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 5, 10],
                                        'min_samples_split': [2, 5, 10, 15,
                                                              100],
                                        'n_estimators': [100, 200, 300, 400,
                                                         500, 600, 700, 800,
                                                         900, 1000, 1100,
                                                         1200]},
                   random_state=42, scoring='neg_mean_squared_error',
                   verbose=2)

## Scoring the model 

In [37]:
y_pred = rf_random_model.predict(Xtest)
MAPE = mean_absolute_percentage_error(np.exp(ytest), np.exp(y_pred))
RMSE = mean_squared_error(np.exp(ytest), np.exp(y_pred))
MAE = mean_absolute_error(np.exp(ytest), np.exp(y_pred))

## Storing the model (1.0.1)

In [38]:
joblib.dump(rf_random_model,"C:\\Users\\91748\\Desktop\\RF_final_1_0_1.obj")

['C:\\Users\\91748\\Desktop\\RF_final_1_0_1.obj']

In [39]:
print(f'{MAPE =}')
print(f'{RMSE =}')
print(f'{MAE =}')

MAPE =0.9972735855898415
RMSE =71050969.36153123
MAE =4126.268940904072


In [41]:
Xtest

Unnamed: 0,Levy,Prod. year,Leather interior,Engine volume,Mileage,Cylinders,Airbags,Turbo,Mileage_BIN,Manufacturer,Model,Category,Fuel type,Gear box type,Drive wheels,Doors,Wheel,Color
6217,779.0,2013,0,2.5,75000.0,4.0,12,0,2.0,36.0,412.0,9.0,2.0,0.0,1.0,1.0,0.0,7.0
6075,781.0,2012,1,2.5,90000.0,4.0,6,0,2.0,36.0,412.0,9.0,5.0,0.0,1.0,1.0,0.0,12.0
3031,891.0,2016,1,2.0,134413.0,4.0,4,0,3.0,15.0,1238.0,4.0,1.0,0.0,1.0,1.0,0.0,12.0
7241,915.0,2014,1,3.0,343306.2,6.0,0,0,9.0,1.0,223.0,3.0,5.0,0.0,0.0,1.0,0.0,7.0
2762,1091.0,2016,1,2.5,124708.0,4.0,4,0,3.0,15.0,793.0,10.0,1.0,0.0,1.0,1.0,0.0,7.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13697,0.0,2011,0,1.8,74900.0,4.0,10,0,2.0,15.0,607.0,9.0,5.0,0.0,1.0,1.0,0.0,1.0
16424,0.0,2010,1,1.5,180000.0,4.0,5,1,5.0,27.0,986.0,9.0,1.0,1.0,1.0,1.0,0.0,1.0
357,697.0,2015,0,1.8,85000.0,4.0,12,0,2.0,15.0,607.0,9.0,5.0,2.0,1.0,1.0,0.0,1.0
15287,1018.0,2011,1,3.0,275862.0,6.0,12,0,8.0,24.0,940.0,4.0,1.0,0.0,0.0,1.0,0.0,12.0


## Accessing the model


In [3]:
rf_model = joblib.load("C:\\Users\\91748\\Desktop\\RF_final_1_0_1.obj")

In [4]:
y_pred = rf_model.predict(Xtest)

## Getting output from input

Levy	

Prod. year

Leather interior	

Engine volume	

Mileage	

Cylinders	

Airbags	

Turbo	

Mileage_BIN	

Manufacturer	

Model	

Category	

Fuel type	

Gear box type	

Drive wheels	

Doors	

Wheel	

Color

In [5]:
levy = 0
year = 2015
leather = 1
engine_vol = 1.5
mileage = 100000
mileage_BIN = alg.GetBINVal(mileage)
cylinders = 4
airbags = 10
turbo = 1

manufacturer = 'HONDA'
model = 'FIT'
category = 'Hatchback'
fuel_type = 'Petrol'
gear_box_type = 'Variator'
drive_wheels = 'Front'
doors = ''
wheel = 'Right-hand drive'
color = 'Silver'