In [53]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, StratifiedKFold, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error, accuracy_score
from sklearn.linear_model import LinearRegression

In [35]:
data = pd.read_csv('/Users/rayanesahi/Car Price Predictor/predictor/data/car data.csv.xls')

In [125]:
data.head(40)

Unnamed: 0,Car_Name,Year,Selling_Price,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner
0,ritz,2014,4522.5,7546.5,27000,Petrol,Dealer,Manual,0
1,sx4,2013,6412.5,12879.0,43000,Diesel,Dealer,Manual,0
2,ciaz,2017,9787.5,13297.5,6900,Petrol,Dealer,Manual,0
3,wagon r,2011,3847.5,5602.5,5200,Petrol,Dealer,Manual,0
4,swift,2014,6210.0,9274.5,42450,Diesel,Dealer,Manual,0
5,vitara brezza,2018,12487.5,13270.5,2071,Diesel,Dealer,Manual,0
6,ciaz,2015,9112.5,10962.0,18796,Petrol,Dealer,Manual,0
7,s cross,2015,8775.0,11623.5,33429,Diesel,Dealer,Manual,0
8,ciaz,2016,11812.5,12001.5,20273,Diesel,Dealer,Manual,0
9,ciaz,2015,10057.5,12042.0,42367,Diesel,Dealer,Manual,0


In [36]:
# converting price (lakhs --> usd)
rate = 1350
data['Selling_Price'] = data['Selling_Price']*rate
data['Present_Price'] = data['Present_Price']*rate
data

Unnamed: 0,Car_Name,Year,Selling_Price,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner
0,ritz,2014,4522.5,7546.5,27000,Petrol,Dealer,Manual,0
1,sx4,2013,6412.5,12879.0,43000,Diesel,Dealer,Manual,0
2,ciaz,2017,9787.5,13297.5,6900,Petrol,Dealer,Manual,0
3,wagon r,2011,3847.5,5602.5,5200,Petrol,Dealer,Manual,0
4,swift,2014,6210.0,9274.5,42450,Diesel,Dealer,Manual,0
...,...,...,...,...,...,...,...,...,...
296,city,2016,12825.0,15660.0,33988,Diesel,Dealer,Manual,0
297,brio,2015,5400.0,7965.0,60000,Petrol,Dealer,Manual,0
298,city,2009,4522.5,14850.0,87934,Petrol,Dealer,Manual,0
299,city,2017,15525.0,16875.0,9000,Diesel,Dealer,Manual,0


In [235]:
#selecting columns
df = data[['Selling_Price', 'Year', 'Present_Price', 'Kms_Driven', 'Fuel_Type', 'Seller_Type', 'Transmission', 'Owner']].dropna()
df

Unnamed: 0,Selling_Price,Year,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner
0,4522.5,2014,7546.5,27000,Petrol,Dealer,Manual,0
1,6412.5,2013,12879.0,43000,Diesel,Dealer,Manual,0
2,9787.5,2017,13297.5,6900,Petrol,Dealer,Manual,0
3,3847.5,2011,5602.5,5200,Petrol,Dealer,Manual,0
4,6210.0,2014,9274.5,42450,Diesel,Dealer,Manual,0
...,...,...,...,...,...,...,...,...
296,12825.0,2016,15660.0,33988,Diesel,Dealer,Manual,0
297,5400.0,2015,7965.0,60000,Petrol,Dealer,Manual,0
298,4522.5,2009,14850.0,87934,Petrol,Dealer,Manual,0
299,15525.0,2017,16875.0,9000,Diesel,Dealer,Manual,0


In [236]:
#encoding string coloumns
le = LabelEncoder()

df['Fuel_Type'] = le.fit_transform(df['Fuel_Type'])
df['Seller_Type'] = le.fit_transform(df['Seller_Type'])
df['Transmission'] = le.fit_transform(df['Transmission'])

#changing year of fabrication to age of the car
df['Age'] = 2018 - df['Year']
del df['Year']

df.head()

Unnamed: 0,Selling_Price,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner,Age
0,4522.5,7546.5,27000,2,0,1,0,4
1,6412.5,12879.0,43000,1,0,1,0,5
2,9787.5,13297.5,6900,2,0,1,0,1
3,3847.5,5602.5,5200,2,0,1,0,7
4,6210.0,9274.5,42450,1,0,1,0,4


In [275]:
#adding depreciation columns
df['Depreciation'] = df['Present_Price']-df['Selling_Price']
df['Depreciation/Year'] = (df['Present_Price']-df['Selling_Price'])/(df['Age'])
df['Depreciation%'] = ((df['Present_Price']-df['Selling_Price'])/df['Present_Price'])*100

df

Unnamed: 0,Selling_Price,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner,Age,Depreciation,Depreciation/Year,Depreciation%,Depreciation%/Year
0,4522.5,7546.5,27000,2,0,1,0,4,3024.0,756.000000,40.071556,10.017889
1,6412.5,12879.0,43000,1,0,1,0,5,6466.5,1293.300000,50.209644,10.041929
2,9787.5,13297.5,6900,2,0,1,0,1,3510.0,3510.000000,26.395939,26.395939
3,3847.5,5602.5,5200,2,0,1,0,7,1755.0,250.714286,31.325301,4.475043
4,6210.0,9274.5,42450,1,0,1,0,4,3064.5,766.125000,33.042213,8.260553
...,...,...,...,...,...,...,...,...,...,...,...,...
296,12825.0,15660.0,33988,1,0,1,0,2,2835.0,1417.500000,18.103448,9.051724
297,5400.0,7965.0,60000,2,0,1,0,3,2565.0,855.000000,32.203390,10.734463
298,4522.5,14850.0,87934,2,0,1,0,9,10327.5,1147.500000,69.545455,7.727273
299,15525.0,16875.0,9000,1,0,1,0,1,1350.0,1350.000000,8.000000,8.000000


In [266]:
df.loc[(df.Seller_Type == 0) & (df.Age == 1)]

Unnamed: 0,Selling_Price,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner,Age,Depreciation,Depreciation/Year,Depreciation%,Depreciation%/Year
2,9787.5,13297.5,6900,2,0,1,0,1,3510.0,3510.0,26.395939,26.395939
10,3847.5,4860.0,2135,2,0,1,0,1,1012.5,1012.5,20.833333,20.833333
21,6615.0,7708.5,2400,2,0,1,0,1,1093.5,1093.5,14.185639,14.185639
49,10462.5,12541.5,37000,2,0,0,0,1,2079.0,2079.0,16.576964,16.576964
52,24300.0,26689.5,15000,1,0,0,0,1,2389.5,2389.5,8.952959,8.952959
64,44550.0,48910.5,6000,1,0,0,0,1,4360.5,4360.5,8.915264,8.915264
66,26662.5,31252.5,11000,2,0,0,0,1,4590.0,4590.0,14.686825,14.686825
82,31050.0,34276.5,15000,1,0,0,0,1,3226.5,3226.5,9.413155,9.413155
97,22950.0,25164.0,8700,2,0,1,0,1,2214.0,2214.0,8.798283,8.798283
206,7762.5,9625.5,12479,2,0,1,0,1,1863.0,1863.0,19.354839,19.354839


In [267]:
df.corr()

Unnamed: 0,Selling_Price,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner,Age,Depreciation,Depreciation/Year,Depreciation%,Depreciation%/Year
Selling_Price,1.0,0.878983,0.029187,-0.509467,-0.550724,-0.367128,-0.088344,-0.236141,0.520881,0.824017,-0.232544,-0.017972
Present_Price,0.878983,1.0,0.203647,-0.440415,-0.51203,-0.348715,0.008057,0.047584,0.864902,0.925182,0.102439,-0.000376
Kms_Driven,0.029187,0.203647,1.0,-0.166801,-0.101419,-0.16251,0.089216,0.524342,0.333832,0.152904,0.505896,-0.119817
Fuel_Type,-0.509467,-0.440415,-0.166801,1.0,0.352415,0.080466,0.055705,0.053643,-0.252129,-0.343088,0.085611,0.128419
Seller_Type,-0.550724,-0.51203,-0.101419,0.352415,1.0,0.06324,0.124269,0.039896,-0.336902,-0.48807,0.037513,0.036591
Transmission,-0.367128,-0.348715,-0.16251,0.080466,0.06324,1.0,-0.050316,-0.000394,-0.237802,-0.34043,0.009392,-0.006338
Owner,-0.088344,0.008057,0.089216,0.055705,0.124269,-0.050316,1.0,0.182104,0.107415,-0.014648,0.222572,-0.010661
Age,-0.236141,0.047584,0.524342,0.053643,0.039896,-0.000394,0.182104,1.0,0.333746,-0.053614,0.848699,-0.354665
Depreciation,0.520881,0.864902,0.333832,-0.252129,-0.336902,-0.237802,0.107415,0.333746,1.0,0.790151,0.428156,0.018227
Depreciation/Year,0.824017,0.925182,0.152904,-0.343088,-0.48807,-0.34043,-0.014648,-0.053614,0.790151,1.0,0.119371,0.27773


In [186]:
#selecting model variables
X = df.iloc[:,1:]
y = df.iloc[:,0]

In [187]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [141]:
gb = GradientBoostingRegressor()

# Rate at which correcting is being made
learning_rate = [0.001, 0.01, 0.1, 0.2]
# Number of trees in Gradient boosting
n_estimators=list(range(500,1000,100))
# Maximum number of levels in a tree
max_depth=list(range(4,9,4))
# Minimum number of samples required to split an internal node
min_samples_split=list(range(4,9,2))
# Minimum number of samples required to be at a leaf node.
min_samples_leaf=[1,2,5,7]
# Number of fearures to be considered at each split
max_features=['auto','sqrt']

# Hyperparameters dict
param_grid = {"learning_rate":learning_rate,
              "n_estimators":n_estimators,
              "max_depth":max_depth,
              "min_samples_split":min_samples_split,
              "min_samples_leaf":min_samples_leaf,
              "max_features":max_features}

In [80]:
#creating model and running rcv to find the best parameters
gb_rs = RandomizedSearchCV(estimator = gb, param_distributions = param_grid, random_state=1)

gb_rs.fit(X_train,y_train)

RandomizedSearchCV(estimator=GradientBoostingRegressor(),
                   param_distributions={'learning_rate': [0.001, 0.01, 0.1,
                                                          0.2],
                                        'max_depth': [4, 8],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 5, 7],
                                        'min_samples_split': [4, 6, 8],
                                        'n_estimators': [500, 600, 700, 800,
                                                         900]},
                   random_state=1)

In [81]:
print("\n The best parameters across ALL searched params:\n", gb_rs.best_params_)


 The best parameters across ALL searched params:
 {'n_estimators': 600, 'min_samples_split': 4, 'min_samples_leaf': 1, 'max_features': 'auto', 'max_depth': 4, 'learning_rate': 0.01}


In [270]:
#fiting model with best parameters
priceModel = GradientBoostingRegressor(n_estimators=600, min_samples_split= 4, min_samples_leaf= 1, max_features= 'auto', max_depth=4, learning_rate=0.01)
priceModel.fit(X_train, y_train)

GradientBoostingRegressor(learning_rate=0.01, max_depth=4, max_features='auto',
                          min_samples_split=4, n_estimators=600)

In [271]:
predictions = priceModel.predict(X_test)
predictions

array([10111.73942516,  3997.08088168,   713.02976843,  4418.91657176,
        1416.25243359,  6897.52405424,  1123.53965639,  4125.51615225,
       12624.84453494,   859.65680664, 11678.63820464,  4941.80264655,
        6971.62523022,  5804.43577093,   301.70668009,   670.66768789,
       10044.56214165,  7165.9707    ,  6906.0768665 , 10044.56214165,
        7316.28001719,  5040.60195493,  8574.82033587, 10959.61158451,
       12278.75607399,  4647.56018803,  3575.34530647,   929.96486142,
         397.94456475,   699.82279249,   482.63695248,   325.67272758,
        6600.07799945, 32084.7485142 , 30696.14204573,  5804.43577093,
        4468.37501948,  1179.25768152,   452.85194125,  5454.03219978,
       12635.58815653, 15893.04659945,   514.12639171,  6081.35405193,
        6326.33553499,  6088.9476166 ,  8250.54999965,  9685.12626231,
        9950.9315344 ,  1766.23489828,  6324.88991479,   735.33343066,
        3538.65130235,  5978.56425527,  1849.79775476,   408.64304272,
      

In [272]:
print('r2:', r2_score(y_test, predictions))
print('MAE:', mean_absolute_error(y_test, predictions))
print('MSE:', mean_squared_error(y_test, predictions))

r2: 0.9411897284525446
MAE: 771.0538526149886
MSE: 2325219.7378466018


In [273]:
import pickle
file = open("file.pkl", "wb") # opening a new file in write mode
pickle.dump(priceModel, file) # dumping created model into a pickle file

In [342]:
# depreciation model
df2 = df[['Depreciation', 'Age', 'Present_Price', 'Kms_Driven', 'Fuel_Type', 'Seller_Type', 'Transmission', 'Owner']]

X_1 = df2.iloc[:,1:]
y_1 = df2.iloc[:,0]

X_1

Unnamed: 0,Age,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner
0,4,7546.5,27000,2,0,1,0
1,5,12879.0,43000,1,0,1,0
2,1,13297.5,6900,2,0,1,0
3,7,5602.5,5200,2,0,1,0
4,4,9274.5,42450,1,0,1,0
...,...,...,...,...,...,...,...
296,2,15660.0,33988,1,0,1,0
297,3,7965.0,60000,2,0,1,0
298,9,14850.0,87934,2,0,1,0
299,1,16875.0,9000,1,0,1,0


In [343]:
X_1_train, X_1_test, y_1_train, y_1_test = train_test_split(X_1, y_1, test_size=0.2, random_state=1)

In [344]:
gb = GradientBoostingRegressor()

# Rate at which correcting is being made
learning_rate = [0.001, 0.01, 0.1, 0.2]
# Number of trees in Gradient boosting
n_estimators=list(range(500,1000,100))
# Maximum number of levels in a tree
max_depth=list(range(4,9,4))
# Minimum number of samples required to split an internal node
min_samples_split=list(range(4,9,2))
# Minimum number of samples required to be at a leaf node.
min_samples_leaf=[1,2,5,7]
# Number of fearures to be considered at each split
max_features=['auto','sqrt']

# Hyperparameters dict
param_grid = {"learning_rate":learning_rate,
              "n_estimators":n_estimators,
              "max_depth":max_depth,
              "min_samples_split":min_samples_split,
              "min_samples_leaf":min_samples_leaf,
              "max_features":max_features}

In [345]:
pred_sc = RandomizedSearchCV(estimator = gb, param_distributions = param_grid, random_state=1)

pred_sc.fit(X_1_train,y_1_train)

RandomizedSearchCV(estimator=GradientBoostingRegressor(),
                   param_distributions={'learning_rate': [0.001, 0.01, 0.1,
                                                          0.2],
                                        'max_depth': [4, 8],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 5, 7],
                                        'min_samples_split': [4, 6, 8],
                                        'n_estimators': [500, 600, 700, 800,
                                                         900]},
                   random_state=1)

In [346]:
print("\n The best parameters across ALL searched params:\n", pred_sc.best_params_)


 The best parameters across ALL searched params:
 {'n_estimators': 600, 'min_samples_split': 4, 'min_samples_leaf': 1, 'max_features': 'auto', 'max_depth': 4, 'learning_rate': 0.01}


In [347]:
depModel = GradientBoostingRegressor(n_estimators=600, min_samples_split= 4, min_samples_leaf= 1, max_features= 'auto', max_depth=4, learning_rate=0.01)

depModel.fit(X_1_train, y_1_train)

GradientBoostingRegressor(learning_rate=0.01, max_depth=4, max_features='auto',
                          min_samples_split=4, n_estimators=600)

In [348]:
predictions1 = depModel.predict(X_1_test)

In [358]:
print('r2:', r2_score(y_1_test, predictions1))
print('MAE:', mean_absolute_error(y_1_test, predictions1))
print('MSE:', mean_squared_error(y_1_test, predictions1))

r2: 0.9178174166441134
MAE: 861.6817459825231
MSE: 2853282.39065122


In [363]:
# X = [['Age', 'Present_Price', 'Kms_Driven', 'Fuel_Type', 'Seller_Type', 'Transmission', 'Owner']]
X = [[1, 12000, 100000, 1, 1, 1, 0]]
X_2 = [[3, 12000, 100000, 1, 1, 1, 0]]

In [364]:
import pickle
file = open("file.pkl", "wb") # opening a new file in write mode
pickle.dump(depModel, file) # dumping created model into a pickle file