In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV
import pickle
import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('Data/cars24fullclean.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4490 entries, 0 to 4489
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Unnamed: 0        4490 non-null   int64 
 1   cars_name         4490 non-null   object
 2   cars_brand        4490 non-null   object
 3   model             4490 non-null   object
 4   model_year        4490 non-null   int64 
 5   car_type          4490 non-null   object
 6   kms               4490 non-null   int64 
 7   owner             4490 non-null   int64 
 8   gasoliene_type    4490 non-null   object
 9   price             4490 non-null   int64 
 10  emi_per_month     4490 non-null   int64 
 11  zero_downpayment  4490 non-null   object
 12  city              4490 non-null   object
 13  state             4490 non-null   object
dtypes: int64(6), object(8)
memory usage: 491.2+ KB


In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,cars_name,cars_brand,model,model_year,car_type,kms,owner,gasoliene_type,price,emi_per_month,zero_downpayment,city,state
0,0,Ford Ecosport,Ford,1.5 TITANIUMTDCI OPT,2013,Manual,179045,1,Diesel,507599,11291,Zero downpayment,Hyderabad,Telangana
1,1,Maruti Swift,Maruti,ZDI,2012,Manual,78819,2,Diesel,476699,10604,Zero downpayment,Hyderabad,Telangana
2,2,Honda Mobilio,Honda,1.5 V OPT I DTEC,2014,Manual,97289,2,Diesel,494799,11007,Zero downpayment,Hyderabad,Telangana
3,3,Maruti Wagon R 1.0,Maruti,VXI,2013,Manual,45693,2,Petrol,331999,7385,Zero downpayment,Hyderabad,Telangana
4,4,Hyundai i20 Active,Hyundai,1.2 S,2016,Manual,40586,1,Petrol,620299,13798,Zero downpayment,Hyderabad,Telangana


In [4]:
newdf=df[['cars_name', 'cars_brand', 'model']]
# newdf.drop_duplicates().to_csv("Data/brands.csv")
newdf=df[['state', 'city']]
# newdf.drop_duplicates().to_csv("Data/cities.csv")
newdf=df['model_year']
# newdf.drop_duplicates().to_csv("Data/years.csv")

In [5]:
labels = ['cars_name', 'cars_brand', 'model', 'model_year',
       'car_type', 'gasoliene_type',  'city', 'state']

for i in labels:
    label = LabelEncoder()
    tr =  label.fit_transform(df[i])
    np.save('Data/'+str('classes'+i+'.npy'), label.classes_)
    df[i] = tr 
print(df.head())

   Unnamed: 0  cars_name  cars_brand  model  model_year  car_type     kms  \
0           0         13           5    105           9         1  179045   
1           1         80          16    586           8         1   78819   
2           2         27           7    111          10         1   97289   
3           3         84          16    521           9         1   45693   
4           4         45           8     33          12         1   40586   

   owner  gasoliene_type   price  emi_per_month  zero_downpayment  city  state  
0      1               0  507599          11291  Zero downpayment     5      6  
1      2               0  476699          10604  Zero downpayment     5      6  
2      2               0  494799          11007  Zero downpayment     5      6  
3      2               1  331999           7385  Zero downpayment     5      6  
4      1               1  620299          13798  Zero downpayment     5      6  


In [6]:
x = df.drop(columns='price')
x = x.drop(columns='Unnamed: 0')
x = x.drop(columns='emi_per_month')
x = x.drop(columns='zero_downpayment')
x

Unnamed: 0,cars_name,cars_brand,model,model_year,car_type,kms,owner,gasoliene_type,city,state
0,13,5,105,9,1,179045,1,0,5,6
1,80,16,586,8,1,78819,2,0,5,6
2,27,7,111,10,1,97289,2,0,5,6
3,84,16,521,9,1,45693,2,1,5,6
4,45,8,33,12,1,40586,1,1,5,6
...,...,...,...,...,...,...,...,...,...,...
4485,80,16,501,9,1,35212,1,0,9,3
4486,71,16,203,14,1,23573,1,1,9,3
4487,64,16,353,14,1,33004,1,1,9,3
4488,65,16,356,15,1,57875,1,1,9,3


In [7]:
y = df['price']
y

0       507599
1       476699
2       494799
3       331999
4       620299
         ...  
4485     32799
4486     27399
4487      9699
4488     31299
4489     23199
Name: price, Length: 4490, dtype: int64

In [8]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size = 0.2, random_state = 1)

In [26]:
x_train

Unnamed: 0,cars_name,cars_brand,model,model_year,car_type,kms,owner,gasoliene_type,city,state
1526,65,16,523,14,0,26297,1,1,2,5
2539,80,16,530,14,0,18224,1,1,6,4
2065,80,16,501,10,1,39011,1,0,4,1
2515,64,16,353,13,1,54041,1,1,4,1
1959,121,23,229,10,1,166755,1,0,4,1
...,...,...,...,...,...,...,...,...,...,...
2895,28,7,69,14,1,26293,1,1,7,7
2763,71,16,203,15,1,7947,1,1,6,4
905,129,24,297,9,0,26983,3,1,1,2
3980,82,16,501,15,1,42680,1,0,9,3


In [9]:
regressor_rf = RandomForestRegressor()
regressor_rf.fit(x_train,y_train)
y_train_pred = regressor_rf.predict(x_train)
y_test_pred = regressor_rf.predict(x_test)

mse_rf = metrics.mean_squared_error(y_test,y_test_pred)
rmse_rf = np.sqrt(metrics.mean_squared_error(y_test,y_test_pred))
print('MSE: ',mse_rf)
print('RMSE: ',rmse_rf)

r2_train_rf = r2_score(y_train,y_train_pred)
print('train_r2_score: ',r2_train_rf)
r2_test_rf = r2_score(y_test,y_test_pred)
print('test_r2_score: ',r2_test_rf)

MSE:  9403861330.94974
RMSE:  96973.50839765332
train_r2_score:  0.9863503176500852
test_r2_score:  0.9259406757283595


In [10]:
pickle.dump(regressor_rf, open('Data/Resale_RF_Model.sav', 'wb'))

In [24]:
print(regressor_rf.get_params(True))

{'bootstrap': True, 'ccp_alpha': 0.0, 'criterion': 'mse', 'max_depth': None, 'max_features': 'auto', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}
