In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import pickle


In [2]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import r2_score
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score

In [3]:
#Importing the cleaned dataset

In [4]:
# Load the cleaned car listings from the CSV sitting next to the notebook.
data = pd.read_csv(filepath_or_buffer='Cleaned car.csv')

In [5]:
# Preview the first five rows of the freshly loaded frame.
data.head(n=5)

Unnamed: 0.1,Unnamed: 0,name,company,year,Price,kms_driven,fuel_type
0,0,Hyundai Santro Xing,Hyundai,2007,80000,45000,Petrol
1,1,Mahindra Jeep CL550,Mahindra,2006,425000,40,Diesel
2,2,Hyundai Grand i10,Hyundai,2014,325000,28000,Petrol
3,3,Ford EcoSport Titanium,Ford,2014,575000,36000,Diesel
4,4,Ford Figo,Ford,2012,175000,41000,Diesel


In [6]:
# (rows, columns) of the frame as loaded, before any column is dropped.
data.shape

(815, 7)

In [7]:
#Removing an extra unnamed column

In [8]:
# Flag every column whose label starts with "Unnamed" (left-over index
# columns from an earlier CSV export) and keep only the remaining ones.
unnamed_cols = data.columns.str.contains('^Unnamed')
data = data.loc[:, ~unnamed_cols]

In [9]:
# (rows, columns) after the "Unnamed" columns were filtered out.
data.shape

(815, 6)

In [10]:
# Preview the first five rows after the column clean-up.
data.head(n=5)

Unnamed: 0,name,company,year,Price,kms_driven,fuel_type
0,Hyundai Santro Xing,Hyundai,2007,80000,45000,Petrol
1,Mahindra Jeep CL550,Mahindra,2006,425000,40,Diesel
2,Hyundai Grand i10,Hyundai,2014,325000,28000,Petrol
3,Ford EcoSport Titanium,Ford,2014,575000,36000,Diesel
4,Ford Figo,Ford,2012,175000,41000,Diesel


In [11]:
#Splitting the dataset into train and test dataset and extracting features

In [12]:
# Separate the predictors (every column except Price) from the target.
x=data.drop(columns='Price')
y=data['Price']
# Hold out 20% of the rows for evaluation. The seed is fixed so the split,
# and every score computed from it, is identical on each Restart & Run All.
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=42)

In [13]:
#Preprocessing the Categorical Data

In [14]:
# Fit a throw-away encoder on the categorical columns of the FULL feature
# frame (not just the training rows) so that its `categories_` lists every
# value present in the data set.
o = OneHotEncoder()
o.fit(x.loc[:, ['name', 'company', 'fuel_type']])

OneHotEncoder()

In [15]:
# One-hot encode the three categorical columns using the category lists
# collected by `o`; all other columns are passed through unchanged.
encoder = OneHotEncoder(categories=o.categories_)
column_trans = make_column_transformer(
    (encoder, ['name', 'company', 'fuel_type']),
    remainder='passthrough',
)

In [16]:
# Linear Regression Model

In [17]:
from sklearn.linear_model import LinearRegression
# Baseline linear regression on a single 80/20 split. The split seed is fixed
# so this baseline R2 is reproducible instead of changing on every run.
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=42)
model1= LinearRegression()
# Pipeline = column transformer (one-hot + passthrough) followed by the model.
pipeLR=make_pipeline(column_trans,model1)
pipeLR.fit(x_train,y_train)
y_pred=pipeLR.predict(x_test)
# R2 on the held-out rows (displayed as the cell output).
r2_score(y_test,y_pred)

0.7167729502540936

In [18]:
# Try split random states 0-999 for Linear Regression and keep the one that gives the highest test R2 score

In [19]:
# Seed search for the linear model: fit once per split seed 0..999 and record
# the held-out R2 of each fit.
# NOTE: the seed is picked by its test-set R2, so the final score shown below
# is the best of 1000 splits rather than an unbiased estimate.
score = []
for i in range(1000):
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.2, random_state=i
    )
    pipeLR = make_pipeline(column_trans, model1).fit(x_train, y_train)
    score.append(r2_score(y_test, pipeLR.predict(x_test)))

# Rebuild the split and the pipeline with the best-scoring seed; these are
# the objects left in scope for the following cells.
best_state = np.argmax(score)
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=best_state
)
pipeLR = make_pipeline(column_trans, model1).fit(x_train, y_train)
y_pred = pipeLR.predict(x_test)
r2_score(y_test, y_pred)

0.8900436342356015

In [20]:
# DecisionTree Regression Model

In [21]:
from sklearn.tree import DecisionTreeRegressor
# Depth-limited regression tree. random_state is fixed because scikit-learn
# permutes the candidate features at every split, so an unseeded tree can
# give different results from run to run on the same data.
model2= DecisionTreeRegressor(max_depth=15, random_state=0)
pipeDT=make_pipeline(column_trans,model2)
# x_train / x_test are whatever split the previously executed cell left in
# scope; this cell does not create its own split.
pipeDT.fit(x_train,y_train)
y_pred=pipeDT.predict(x_test)
# R2 on the held-out rows (displayed as the cell output).
r2_score(y_test,y_pred)

0.7593580650446885

In [22]:
# Try split random states 0-999 for the Decision Tree and keep the one that gives the highest test R2 score

In [23]:
# Seed search for the decision tree: fit once per split seed 0..999 and
# record the held-out R2 of each fit.
# NOTE: the seed is picked by its test-set R2, so the final score shown below
# is the best of 1000 splits rather than an unbiased estimate.
score = []
for i in range(1000):
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.2, random_state=i
    )
    pipeDT = make_pipeline(column_trans, model2).fit(x_train, y_train)
    score.append(r2_score(y_test, pipeDT.predict(x_test)))

# Rebuild the split and the pipeline with the best-scoring seed; these are
# the objects left in scope for the following cells.
best_state = np.argmax(score)
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=best_state
)
pipeDT = make_pipeline(column_trans, model2).fit(x_train, y_train)
y_pred = pipeDT.predict(x_test)
r2_score(y_test, y_pred)

0.8503040649021445

In [24]:
# Random Forest Model

In [25]:
from sklearn.ensemble import RandomForestRegressor
# Forest hyper-parameters gathered in one place: 1000 trees, each grown on
# half of the training rows and 75% of the features, capped at depth 15,
# with a fixed seed.
rf_params = dict(
    n_estimators=1000,
    random_state=3,
    max_samples=0.5,
    max_features=0.75,
    max_depth=15,
)
model3 = RandomForestRegressor(**rf_params)
# Fit on the split currently in scope and report R2 on its held-out rows.
pipeRF = make_pipeline(column_trans, model3).fit(x_train, y_train)
y_pred = pipeRF.predict(x_test)
r2_score(y_test, y_pred)

0.7885067505057635

In [26]:
# Seed search for the random forest: fit once per split seed 0..99 and record
# the held-out R2 of each fit.
# NOTE: the seed is picked by its test-set R2, so the final score shown below
# is the best of 100 splits rather than an unbiased estimate.
score = []
for i in range(100):
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.2, random_state=i
    )
    pipeRF = make_pipeline(column_trans, model3).fit(x_train, y_train)
    score.append(r2_score(y_test, pipeRF.predict(x_test)))

# Rebuild the split and the pipeline with the best-scoring seed; these are
# the objects left in scope for the following cells.
best_state = np.argmax(score)
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=best_state
)
pipeRF = make_pipeline(column_trans, model3).fit(x_train, y_train)
y_pred = pipeRF.predict(x_test)
r2_score(y_test, y_pred)

0.8099410876728591

In [37]:
from sklearn.metrics import mean_squared_error, mean_absolute_error


In [39]:
# Absolute and squared error of the linear-regression pipeline.
# NOTE(review): x_test / y_test are whatever the most recently executed split
# cell left in scope. In top-to-bottom order that is the Random Forest seed
# search, not the split pipeLR was fitted on, so some of these rows may have
# been part of pipeLR's training data. This cell's execution count is also
# out of sequence. Confirm which split was intended before quoting the numbers.
y_pred = pipeLR.predict(x_test)

# scikit-learn metrics take (y_true, y_pred) in that order.
print('MAE:', mean_absolute_error(y_test, y_pred))
print('MSE:', mean_squared_error(y_test, y_pred))

MAE: 49373.99439641225
MSE: 6170001442.712499


In [27]:
# The Random Forest search above checked split random states 0-99 for the maximum R2 score

In [28]:
# The Linear Regression model had the best R2 score of the three models, so save it using pickle

In [29]:
# Serialize the fitted linear-regression pipeline to disk. The context manager
# closes the file handle (and flushes the write) even if pickling fails.
with open('LinearRegressionModel.pkl','wb') as model_file:
    pickle.dump(pipeLR,model_file)

In [30]:
# Show the first five rows again before building prediction inputs.
data.head(n=5)

Unnamed: 0,name,company,year,Price,kms_driven,fuel_type
0,Hyundai Santro Xing,Hyundai,2007,80000,45000,Petrol
1,Mahindra Jeep CL550,Mahindra,2006,425000,40,Diesel
2,Hyundai Grand i10,Hyundai,2014,325000,28000,Petrol
3,Ford EcoSport Titanium,Ford,2014,575000,36000,Diesel
4,Ford Figo,Ford,2012,175000,41000,Diesel


In [31]:
# Using the model to predict

In [32]:
# Build a one-row frame with the same feature columns the pipeline was
# trained on, then ask the fitted pipeline for its price estimate.
sample_car = pd.DataFrame(
    [['Maruti Suzuki Swift', 'Maruti', 2019, 100, 'Petrol']],
    columns=['name', 'company', 'year', 'kms_driven', 'fuel_type'],
)
pipeLR.predict(sample_car)

array([401350.63170536])

In [33]:
# Build a one-row frame with the same feature columns the pipeline was
# trained on, then ask the fitted pipeline for its price estimate.
sample_car = pd.DataFrame(
    [['Ford Endeavor 4x4', 'Ford', 2009, 50000, 'Petrol']],
    columns=['name', 'company', 'year', 'kms_driven', 'fuel_type'],
)
pipeLR.predict(sample_car)

array([2696324.86666457])