Import Libraries

In [51]:
import os

import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

Data Preprocessing

In [52]:
#load car file in pandas dataframe
#The CSV location used to be fixed to one machine's absolute path. Setting the
#FORD_CSV_PATH environment variable points the notebook at another copy of the
#file; when it is unset the original location is still used.
DATA_PATH = os.environ.get("FORD_CSV_PATH", "/Users/HP/Desktop/Car Price Predection/ford.csv")
dataset = pd.read_csv(DATA_PATH)
dataset

Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize
0,Fiesta,2017,12000,Automatic,15944,Petrol,150,57.7,1.0
1,Focus,2018,14000,Manual,9083,Petrol,150,57.7,1.0
2,Focus,2017,13000,Manual,12456,Petrol,150,57.7,1.0
3,Fiesta,2019,17500,Manual,10460,Petrol,145,40.3,1.5
4,Fiesta,2019,16500,Automatic,1482,Petrol,145,48.7,1.0
...,...,...,...,...,...,...,...,...,...
17961,B-MAX,2017,8999,Manual,16700,Petrol,150,47.1,1.4
17962,B-MAX,2014,7499,Manual,40700,Petrol,30,57.7,1.0
17963,Focus,2015,9999,Manual,7010,Diesel,20,67.3,1.6
17964,KA,2018,8299,Manual,5007,Petrol,145,57.7,1.2


In [53]:
#count the missing (null) entries in every column
missing_per_column = dataset.isnull().sum()
missing_per_column

model           0
year            0
price           0
transmission    0
mileage         0
fuelType        0
tax             0
mpg             0
engineSize      0
dtype: int64

In [54]:
#list the distinct values present in each categorical column
for categorical_column in ('transmission', 'fuelType'):
    print(dataset[categorical_column].unique())

['Automatic' 'Manual' 'Semi-Auto']
['Petrol' 'Diesel' 'Hybrid' 'Electric' 'Other']


In [55]:
#integer codes for the categorical transmission column
transmission_codes = {'Automatic':0,'Manual':1,'Semi-Auto':2}

#integer codes for the categorical fuelType column
fuel_type_codes = {'Petrol':0,'Diesel':1,'Hybrid':2,'Electric':3,'Other':4}

#encode both columns in place; each inner dict is applied only to its own column
dataset.replace(
    {'transmission': transmission_codes, 'fuelType': fuel_type_codes},
    inplace=True,
)

dataset

Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize
0,Fiesta,2017,12000,0,15944,0,150,57.7,1.0
1,Focus,2018,14000,1,9083,0,150,57.7,1.0
2,Focus,2017,13000,1,12456,0,150,57.7,1.0
3,Fiesta,2019,17500,1,10460,0,145,40.3,1.5
4,Fiesta,2019,16500,0,1482,0,145,48.7,1.0
...,...,...,...,...,...,...,...,...,...
17961,B-MAX,2017,8999,1,16700,0,150,47.1,1.4
17962,B-MAX,2014,7499,1,40700,0,30,57.7,1.0
17963,Focus,2015,9999,1,7010,1,20,67.3,1.6
17964,KA,2018,8299,1,5007,0,145,57.7,1.2


In [56]:
#preview the feature columns: everything except the 'model' and 'price' columns
feature_frame = dataset.drop(columns=['model','price'])
x = feature_frame
x

Unnamed: 0,year,transmission,mileage,fuelType,tax,mpg,engineSize
0,2017,0,15944,0,150,57.7,1.0
1,2018,1,9083,0,150,57.7,1.0
2,2017,1,12456,0,150,57.7,1.0
3,2019,1,10460,0,145,40.3,1.5
4,2019,0,1482,0,145,48.7,1.0
...,...,...,...,...,...,...,...
17961,2017,1,16700,0,150,47.1,1.4
17962,2014,1,40700,0,30,57.7,1.0
17963,2015,1,7010,1,20,67.3,1.6
17964,2018,1,5007,0,145,57.7,1.2


Splitting data into features and labels

In [57]:
#features: every column except 'model' and 'price', taken as a NumPy array
non_feature_columns = ['model','price']
x = dataset.drop(columns=non_feature_columns).values

#labels: the price column
y = dataset['price']

In [58]:
#display the feature array as a quick sanity check
x

array([[2.0170e+03, 0.0000e+00, 1.5944e+04, ..., 1.5000e+02, 5.7700e+01,
        1.0000e+00],
       [2.0180e+03, 1.0000e+00, 9.0830e+03, ..., 1.5000e+02, 5.7700e+01,
        1.0000e+00],
       [2.0170e+03, 1.0000e+00, 1.2456e+04, ..., 1.5000e+02, 5.7700e+01,
        1.0000e+00],
       ...,
       [2.0150e+03, 1.0000e+00, 7.0100e+03, ..., 2.0000e+01, 6.7300e+01,
        1.6000e+00],
       [2.0180e+03, 1.0000e+00, 5.0070e+03, ..., 1.4500e+02, 5.7700e+01,
        1.2000e+00],
       [2.0150e+03, 1.0000e+00, 5.0070e+03, ..., 2.2000e+01, 5.7700e+01,
        1.0000e+00]])

Let's standardize the data

In [59]:
#create the scaler and fit it on the feature array
#NOTE(review): this fit runs on every row of x, before train_test_split is
#called further down, so rows that later land in the test split also
#contribute to the fitted statistics - confirm this is intended
scalar = StandardScaler()
scalar.fit(x)


StandardScaler()

In [60]:
#apply the fitted scaler to the features and display the transformed array
standarsized_x = scalar.transform(x)
standarsized_x

array([[ 0.06512772, -2.67003231, -0.38099808, ...,  0.59135805,
        -0.02044162, -0.81138621],
       [ 0.55286624,  0.04135139, -0.73335899, ...,  0.59135805,
        -0.02044162, -0.81138621],
       [ 0.06512772,  0.04135139, -0.56013157, ...,  0.59135805,
        -0.02044162, -0.81138621],
       ...,
       [-0.91034931,  0.04135139, -0.83982222, ..., -1.50505332,
         0.92766777,  0.57636151],
       [ 0.55286624,  0.04135139, -0.94269045, ...,  0.51072684,
        -0.02044162, -0.34880364],
       [-0.91034931,  0.04135139, -0.94269045, ..., -1.47280084,
        -0.02044162, -0.81138621]])

In [61]:
#from here on x refers to the standardized features (the unscaled array is overwritten)
x = standarsized_x
#labels are the price column
y = dataset['price']

In [62]:
#Split the UNSCALED features so the scaler can be fitted on training rows only.
#Fitting it on every row (train + test) leaks test-set statistics into the
#features the model is trained and evaluated on.
x_unscaled = dataset.drop(['model','price'],axis=1).values
x_train_unscaled,x_test_unscaled,y_train,y_test=train_test_split(x_unscaled,y,test_size=.1,random_state=42)

#Refit the scaler on the training split and apply that same transform to both splits.
#`scalar` is rebound here, so later cells that call scalar.transform (or save it)
#use the training-only statistics as well.
scalar = StandardScaler().fit(x_train_unscaled)
x_train = scalar.transform(x_train_unscaled)
x_test = scalar.transform(x_test_unscaled)

#same shape report as before: full data, training split, test split
print(x.shape,x_train.shape,x_test.shape)
print(y.shape,y_train.shape,y_test.shape)

(17966, 7) (16169, 7) (1797, 7)
(17966,) (16169,) (1797,)


In [63]:
#create an unfitted linear regression model
model = LinearRegression()

In [64]:
#fit the model on the training features and the training prices
model.fit(x_train,y_train)

LinearRegression()

Model Evaluation

In [65]:
#Predict prices for the rows the model was trained on
training_data_pred = model.predict(x_train)

#R2 score and mean absolute error on the training split.
#Note: `mse` holds the mean ABSOLUTE error, not a squared error; the name is kept as-is.
score_1 = metrics.r2_score(y_true=y_train, y_pred=training_data_pred)
mse = mean_absolute_error(y_true=y_train, y_pred=training_data_pred)

#report both metrics, one per line
for label, value in (
    ("R2 score on training data : ", score_1),
    ("Mean absolute error on training data : ", mse),
):
    print(label, value)


R2 score on training data :  0.7368957498477838
Mean absolute error on training data :  1743.2031376013817


In [66]:
#Predict prices for the held-out test rows
testing_data_pred = model.predict(x_test)

#R2 score and mean absolute error on the testing split.
#Note: `score_1` and `mse` are rebound here, replacing the training-split values,
#and `mse` holds the mean ABSOLUTE error, not a squared error.
score_1 = metrics.r2_score(y_true=y_test, y_pred=testing_data_pred)
mse = mean_absolute_error(y_true=y_test, y_pred=testing_data_pred)

#report both metrics, one per line
for label, value in (
    ("R2 score on testing data : ", score_1),
    ("Mean absolute error on testing data : ", mse),
):
    print(label, value)


R2 score on testing data :  0.7284562376019509
Mean absolute error on testing data :  1799.5811063879883


Making Predictions

In [69]:
#One car to price, with values in the same column order as the feature array:
#year, transmission, mileage, fuelType, tax, mpg, engineSize
input_data = (2017,0,15944,0,150,57.7,1.0)

#wrap the tuple in a list so NumPy builds a single-row 2-D array
input_changed = np.array([input_data])

#scale the row with the fitted scaler, then ask the model for a price
std_input = scalar.transform(input_changed)
prediction = model.predict(std_input)

#show the raw prediction array, then the same array with a label
print(prediction)
print("This is car price estimation prediction = ",prediction)

[11296.37758226]
This is car price estimation prediction =  [11296.37758226]


Saving our model and scaler

In [70]:
import joblib

#write the fitted regression model to model.pkl
joblib.dump(value=model, filename='model.pkl')

#write the fitted standard scaler to scalar.pkl
joblib.dump(value=scalar, filename='scalar.pkl')

['scalar.pkl']