### Import required libraries

In [28]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
from sklearn import preprocessing 
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
import pickle

###  Load the dataset

In [2]:
df=pd.read_csv("pp_data.csv")

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,price,vehicleType,yearOfRegistration,gearbox,powerPS,model,kilometer,monthOfRegistration,fuelType,brand,notRepairedDamage
0,1,18300,coupe,2011,manual,190,not-declared,125000,5,diesel,audi,Yes
1,2,9800,suv,2004,automatic,163,grand,125000,8,diesel,jeep,not-declared
2,3,1500,small car,2001,manual,75,golf,150000,6,petrol,volkswagen,No
3,4,3600,small car,2008,manual,69,fabia,90000,7,diesel,skoda,No
4,5,650,limousine,1995,manual,102,3er,150000,10,petrol,bmw,Yes


In [4]:
df.drop(columns=df.columns[0],axis=1,inplace=True)

In [5]:
df.head()

Unnamed: 0,price,vehicleType,yearOfRegistration,gearbox,powerPS,model,kilometer,monthOfRegistration,fuelType,brand,notRepairedDamage
0,18300,coupe,2011,manual,190,not-declared,125000,5,diesel,audi,Yes
1,9800,suv,2004,automatic,163,grand,125000,8,diesel,jeep,not-declared
2,1500,small car,2001,manual,75,golf,150000,6,petrol,volkswagen,No
3,3600,small car,2008,manual,69,fabia,90000,7,diesel,skoda,No
4,650,limousine,1995,manual,102,3er,150000,10,petrol,bmw,Yes


In [6]:
ndf=df.copy()

### Label Encoding

In [7]:
le=preprocessing.LabelEncoder()

In [8]:
labels = ['gearbox', 'notRepairedDamage', 'model', 'brand', 'fuelType', 'vehicleType']
for i in labels:
    ndf[i]=le.fit_transform(ndf[i])
    np.save(str('classes'+i+'.npy'), ndf[i])

In [9]:
ndf.head()

Unnamed: 0,price,vehicleType,yearOfRegistration,gearbox,powerPS,model,kilometer,monthOfRegistration,fuelType,brand,notRepairedDamage
0,18300,3,2011,1,190,162,125000,5,1,1,1
1,9800,8,2004,0,163,118,125000,8,1,14,2
2,1500,7,2001,1,75,117,150000,6,7,38,0
3,3600,7,2008,1,69,102,90000,7,1,31,0
4,650,4,1995,1,102,11,150000,10,7,2,1


In [10]:
ndf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 278575 entries, 0 to 278574
Data columns (total 11 columns):
 #   Column               Non-Null Count   Dtype
---  ------               --------------   -----
 0   price                278575 non-null  int64
 1   vehicleType          278575 non-null  int32
 2   yearOfRegistration   278575 non-null  int64
 3   gearbox              278575 non-null  int32
 4   powerPS              278575 non-null  int64
 5   model                278575 non-null  int32
 6   kilometer            278575 non-null  int64
 7   monthOfRegistration  278575 non-null  int64
 8   fuelType             278575 non-null  int32
 9   brand                278575 non-null  int32
 10  notRepairedDamage    278575 non-null  int32
dtypes: int32(6), int64(5)
memory usage: 17.0 MB


In [11]:
df.isnull().sum()

price                  0
vehicleType            0
yearOfRegistration     0
gearbox                0
powerPS                0
model                  0
kilometer              0
monthOfRegistration    0
fuelType               0
brand                  0
notRepairedDamage      0
dtype: int64

In [12]:
ndf.describe()

Unnamed: 0,price,vehicleType,yearOfRegistration,gearbox,powerPS,model,kilometer,monthOfRegistration,fuelType,brand,notRepairedDamage
count,278575.0,278575.0,278575.0,278575.0,278575.0,278575.0,278575.0,278575.0,278575.0,278575.0,278575.0
mean,6421.448163,3.680582,2003.366006,0.790703,129.264586,101.153381,125330.664991,6.004703,4.952648,20.154242,0.37214
std,8353.587326,2.479063,6.610599,0.446262,61.746715,71.840166,39353.667823,3.567359,2.746956,13.416768,0.715436
min,100.0,0.0,1950.0,0.0,51.0,0.0,5000.0,0.0,0.0,0.0,0.0
25%,1500.0,1.0,1999.0,1.0,86.0,39.0,100000.0,3.0,1.0,9.0,0.0
50%,3600.0,4.0,2004.0,1.0,116.0,96.0,150000.0,6.0,7.0,21.0,0.0
75%,8200.0,7.0,2008.0,1.0,150.0,162.0,150000.0,9.0,7.0,32.0,0.0
max,150000.0,8.0,2016.0,2.0,871.0,249.0,150000.0,12.0,7.0,39.0,2.0


### Split the data into dependent and independent variables.

In [13]:
Y = ndf.iloc[:,0].values
X = ndf.iloc[:,1:].values

In [14]:
X

array([[   3, 2011,    1, ...,    1,    1,    1],
       [   8, 2004,    0, ...,    1,   14,    2],
       [   7, 2001,    1, ...,    7,   38,    0],
       ...,
       [   0, 1996,    1, ...,    1,   38,    0],
       [   1, 2002,    1, ...,    1,   38,    2],
       [   4, 2013,    1, ...,    7,    2,    0]], dtype=int64)

In [15]:
Y

array([18300,  9800,  1500, ...,  9200,  3400, 28990], dtype=int64)

In [16]:
Y = Y.reshape(-1,1)

In [17]:
Y

array([[18300],
       [ 9800],
       [ 1500],
       ...,
       [ 9200],
       [ 3400],
       [28990]], dtype=int64)

In [18]:
print(X.shape,Y.shape)

(278575, 10) (278575, 1)


### Train test split

In [19]:
X_train, X_test, Y_train, Y_test = train_test_split(X ,Y ,test_size = 0.3,random_state =42)

In [20]:
X_train

array([[   7, 1997,    1, ...,    7,   27,    2],
       [   4, 2000,    0, ...,    7,   38,    0],
       [   1, 1997,    0, ...,    7,   24,    1],
       ...,
       [   1, 2004,    1, ...,    1,   38,    0],
       [   7, 2001,    1, ...,    7,   10,    2],
       [   7, 2000,    1, ...,    7,   24,    2]], dtype=int64)

### Choosing appropriate model

### Multilinear Regression

In [21]:
multiple_lin_reg = LinearRegression()
multiple_lin_reg.fit(X_train,Y_train)

LinearRegression()

In [22]:
y_pred_mlr = multiple_lin_reg.predict(X_test)

### Metrics Evaluation

In [23]:
mae = mean_absolute_error(Y_test, y_pred_mlr)
mse = mean_squared_error(Y_test, y_pred_mlr)
rmse = np.sqrt(mse)
rmsle = np.log(rmse)
n,k = X_train.shape
r2=r2_score(Y_test,y_pred_mlr)
adj_r2= 1 - ((1-r2)*(n-1)/(n-k-1))
print(mae,mse,rmse,rmsle,r2,adj_r2)

3157.4973204078387 31031568.420813024 5570.598569347196 8.625257790226078 0.568129176812601 0.5681070285686725


### Random Forest Regressor

In [24]:
regressor = RandomForestRegressor(n_estimators = 300,max_depth=10,random_state=42)

In [25]:
regressor.fit(X_train,np.ravel(Y_train,order='C'))

RandomForestRegressor(max_depth=10, n_estimators=300, random_state=42)

In [26]:
y_pred = regressor.predict(X_test)

### Metrics Evaluation

In [27]:
mae = mean_absolute_error(Y_test, y_pred)
mse = mean_squared_error(Y_test, y_pred)
rmse = np.sqrt(mse)
rmsle = np.log(rmse)
n,k = X_train.shape
r2=r2_score(Y_test,y_pred)
adj_r2= 1 - ((1-r2)*(n-1)/(n-k-1))
print(mae,mse,rmse,rmsle,r2,adj_r2)

1633.657823398074 11287655.732912757 3359.7106620827867 8.119610136773996 0.8429080635847128 0.8429000072161412


#### R^2 score is an indicator of accuracy of Regression Models, and the accuracy is measured as close to 1 of this value. Therefore, as seen, Random Forest Regression is better than  Multiple Linear Regression Model  on this dataset when comparing their R^2 scores.

### Save the model

In [30]:
filename = 'crvp.sav'
pickle.dump(regressor,open(filename,'wb'))