# Predicting the Car Prices by using Linear Regression

Created API of car price prediction as well, link to github: https://github.com/Lovepreet12a/Car_Price_Predictor.github.io

Following are points covered in this project:
    
        - Data Analyzed 
        - Data Cleaned 
        - Data explored
        - Use of OneHotEncoder 
        - Model building
        - Finding of best random state 
        - Improving the accuracy of the model\


In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
%matplotlib inline 
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

In [2]:
df = pd.read_csv("Car_prediction.csv")

In [3]:
df.head()

Unnamed: 0,name,company,year,Price,kms_driven,fuel_type
0,Hyundai Santro Xing XO eRLX Euro III,Hyundai,2007,80000,"45,000 kms",Petrol
1,Mahindra Jeep CL550 MDI,Mahindra,2006,425000,40 kms,Diesel
2,Maruti Suzuki Alto 800 Vxi,Maruti,2018,Ask For Price,"22,000 kms",Petrol
3,Hyundai Grand i10 Magna 1.2 Kappa VTVT,Hyundai,2014,325000,"28,000 kms",Petrol
4,Ford EcoSport Titanium 1.5L TDCi,Ford,2014,575000,"36,000 kms",Diesel


In [4]:
df.shape

(892, 6)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 892 entries, 0 to 891
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   name        892 non-null    object
 1   company     892 non-null    object
 2   year        892 non-null    object
 3   Price       892 non-null    object
 4   kms_driven  840 non-null    object
 5   fuel_type   837 non-null    object
dtypes: object(6)
memory usage: 41.9+ KB


In [6]:
df.isnull().sum()

name           0
company        0
year           0
Price          0
kms_driven    52
fuel_type     55
dtype: int64

In [7]:
#Data Cleaning

In [8]:
# Checking for the year column
df.year.unique()

array(['2007', '2006', '2018', '2014', '2015', '2012', '2013', '2016',
       '2010', '2017', '2008', '2011', '2019', '2009', '2005', '2000',
       '...', '150k', 'TOUR', '2003', 'r 15', '2004', 'Zest', '/-Rs',
       'sale', '1995', 'ara)', '2002', 'SELL', '2001', 'tion', 'odel',
       '2 bs', 'arry', 'Eon', 'o...', 'ture', 'emi', 'car', 'able', 'no.',
       'd...', 'SALE', 'digo', 'sell', 'd Ex', 'n...', 'e...', 'D...',
       ', Ac', 'go .', 'k...', 'o c4', 'zire', 'cent', 'Sumo', 'cab',
       't xe', 'EV2', 'r...', 'zest'], dtype=object)

As we can see that the year column has so many non-numeric values.

In [9]:
df = df[df.year.str.isnumeric()]

In [10]:
df.year.dtypes

dtype('O')

Need to convert object data type of year into integer

In [11]:
df["year"] = df.year.astype("int64")

In [12]:
df.year.dtypes

dtype('int64')

In [13]:
# Checking for the price column

In [14]:
df.Price.unique()

array(['80,000', '4,25,000', 'Ask For Price', '3,25,000', '5,75,000',
       '1,75,000', '1,90,000', '8,30,000', '2,50,000', '1,82,000',
       '3,15,000', '4,15,000', '3,20,000', '10,00,000', '5,00,000',
       '3,50,000', '1,60,000', '3,10,000', '75,000', '1,00,000',
       '2,90,000', '95,000', '1,80,000', '3,85,000', '1,05,000',
       '6,50,000', '6,89,999', '4,48,000', '5,49,000', '5,01,000',
       '4,89,999', '2,80,000', '3,49,999', '2,84,999', '3,45,000',
       '4,99,999', '2,35,000', '2,49,999', '14,75,000', '3,95,000',
       '2,20,000', '1,70,000', '85,000', '2,00,000', '5,70,000',
       '1,10,000', '4,48,999', '18,91,111', '1,59,500', '3,44,999',
       '4,49,999', '8,65,000', '6,99,000', '3,75,000', '2,24,999',
       '12,00,000', '1,95,000', '3,51,000', '2,40,000', '90,000',
       '1,55,000', '6,00,000', '1,89,500', '2,10,000', '3,90,000',
       '1,35,000', '16,00,000', '7,01,000', '2,65,000', '5,25,000',
       '3,72,000', '6,35,000', '5,50,000', '4,85,000', '3,29,5

As we can see that there is one value called "Ask For Price" in price column, so need to remove it.

In [15]:
df = df[df.Price != "Ask For Price"]

In [16]:
df.Price.dtypes

dtype('O')

In [17]:
# Now, need to remove commas and change the data type

In [18]:
df["Price"] = df.Price.str.replace(",", "").astype(int)

In [19]:
df.Price

0       80000
1      425000
3      325000
4      575000
6      175000
        ...  
886    300000
888    260000
889    390000
890    180000
891    160000
Name: Price, Length: 819, dtype: int32

In [20]:
# Need to work on the column kms_driven
df.kms_driven

0        45,000 kms
1            40 kms
3        28,000 kms
4        36,000 kms
6        41,000 kms
           ...     
886    1,32,000 kms
888      27,000 kms
889      40,000 kms
890          Petrol
891          Petrol
Name: kms_driven, Length: 819, dtype: object

In [21]:
df.kms_driven.unique()

array(['45,000 kms', '40 kms', '28,000 kms', '36,000 kms', '41,000 kms',
       '25,000 kms', '24,530 kms', '60,000 kms', '30,000 kms',
       '32,000 kms', '48,660 kms', '4,000 kms', '16,934 kms',
       '43,000 kms', '35,550 kms', '39,522 kms', '39,000 kms',
       '55,000 kms', '72,000 kms', '15,975 kms', '70,000 kms',
       '23,452 kms', '35,522 kms', '48,508 kms', '15,487 kms',
       '82,000 kms', '20,000 kms', '68,000 kms', '38,000 kms',
       '27,000 kms', '33,000 kms', '46,000 kms', '16,000 kms',
       '47,000 kms', '35,000 kms', '30,874 kms', '15,000 kms',
       '29,685 kms', '1,30,000 kms', '19,000 kms', '54,000 kms',
       '13,000 kms', '38,200 kms', '22,000 kms', '50,000 kms',
       '13,500 kms', '3,600 kms', '45,863 kms', '60,500 kms',
       '12,500 kms', '18,000 kms', '13,349 kms', '29,000 kms',
       '44,000 kms', '42,000 kms', '14,000 kms', '49,000 kms',
       '36,200 kms', '51,000 kms', '1,04,000 kms', '33,333 kms',
       '33,600 kms', '5,600 kms', '7,500 km

Need to remove commas and kms from the kms_driven column

In [22]:
df["kms_driven"] = df.kms_driven.str.replace(",", "")

In [23]:
df.kms_driven

0       45000 kms
1          40 kms
3       28000 kms
4       36000 kms
6       41000 kms
          ...    
886    132000 kms
888     27000 kms
889     40000 kms
890        Petrol
891        Petrol
Name: kms_driven, Length: 819, dtype: object

In [24]:
df["kms_driven"] = df.kms_driven.str.split(" ").str.get(0)

In [25]:
# Now we will be keeping only the numeric values in the column "kms_driven"

df = df[df.kms_driven.str.isnumeric()]

In [26]:
# changing the data types of kms_driven to interegr

df["kms_driven"] = df["kms_driven"].astype(int)

In [27]:
df.kms_driven.dtypes

dtype('int32')

In [28]:
# Checking on the column fuel_type

df.fuel_type.unique()

array(['Petrol', 'Diesel', nan, 'LPG'], dtype=object)

As we can see that there are some nan values in fuel_type column.

In [29]:
df.fuel_type.value_counts(dropna = False)

Petrol    428
Diesel    386
LPG         2
NaN         1
Name: fuel_type, dtype: int64

In [30]:
# Going to remove this NaN value 

df = df[~df.fuel_type.isnull()]

In [31]:
df.fuel_type.unique()

array(['Petrol', 'Diesel', 'LPG'], dtype=object)

As we see that fuel_type column has no null values anymore.

In [32]:
# Now, checking with the name column

In [33]:
df.name

0        Hyundai Santro Xing XO eRLX Euro III
1                     Mahindra Jeep CL550 MDI
3      Hyundai Grand i10 Magna 1.2 Kappa VTVT
4            Ford EcoSport Titanium 1.5L TDCi
6                                   Ford Figo
                        ...                  
883                Maruti Suzuki Ritz VXI ABS
885                 Tata Indica V2 DLE BS III
886                      Toyota Corolla Altis
888                       Tata Zest XM Diesel
889                        Mahindra Quanto C8
Name: name, Length: 816, dtype: object

As we can see that there are quite long names are mentioned so going to pick first 3 words to the name. 

In [34]:
df["name"] = df.name.str.split(" ").str.slice(0, 3).str.join(" ")

In [35]:
df = df.reset_index(drop = True)

In [36]:
df

Unnamed: 0,name,company,year,Price,kms_driven,fuel_type
0,Hyundai Santro Xing,Hyundai,2007,80000,45000,Petrol
1,Mahindra Jeep CL550,Mahindra,2006,425000,40,Diesel
2,Hyundai Grand i10,Hyundai,2014,325000,28000,Petrol
3,Ford EcoSport Titanium,Ford,2014,575000,36000,Diesel
4,Ford Figo,Ford,2012,175000,41000,Diesel
...,...,...,...,...,...,...
811,Maruti Suzuki Ritz,Maruti,2011,270000,50000,Petrol
812,Tata Indica V2,Tata,2009,110000,30000,Diesel
813,Toyota Corolla Altis,Toyota,2009,300000,132000,Petrol
814,Tata Zest XM,Tata,2018,260000,27000,Diesel


In [37]:
df.describe()

Unnamed: 0,year,Price,kms_driven
count,816.0,816.0,816.0
mean,2012.444853,411717.6,46275.531863
std,4.002992,475184.4,34297.428044
min,1995.0,30000.0,0.0
25%,2010.0,175000.0,27000.0
50%,2013.0,299999.0,41000.0
75%,2015.0,491250.0,56818.5
max,2019.0,8500003.0,400000.0


As we can see here that 75% of the cars come under the price 5 hundred thousand INR but maximum is showing under 8.5 hundred thousand INR, so we need to see if there is any outlier or not.  

In [38]:
# we are checking that how many cars we have above the price of 6 hundred thousands INR.

df[df.Price > 6e6]

Unnamed: 0,name,company,year,Price,kms_driven,fuel_type
534,Mahindra XUV500 W6,Mahindra,2014,8500003,45000,Diesel


So, here we can see that this is an only car above 6 hundred thousand INR or rather this is an outlier..

In [39]:
# Need to remove this outlier

df = df[df.Price < 6e6]

In [40]:
df = df.reset_index(drop = True)

In [41]:
df.to_csv("Cleaned_car_data")

In [42]:
# Making the Machine Learning model

In [43]:
df.columns

Index(['name', 'company', 'year', 'Price', 'kms_driven', 'fuel_type'], dtype='object')

In [44]:
X = df[['name', 'company', 'year', 'kms_driven', 'fuel_type']]
y = df[["Price"]]

In [45]:
from sklearn.model_selection import train_test_split

In [46]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

In [47]:
from sklearn.preprocessing import OneHotEncoder

In [48]:
One = OneHotEncoder()

In [49]:
One.fit(X[["name", "company", "fuel_type"]])

OneHotEncoder()

In [50]:
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline

In [51]:
column_trans = make_column_transformer((OneHotEncoder(categories = One.categories_), ["name", "company", "fuel_type"]), 
                                      remainder = "passthrough")

In [52]:
from sklearn.linear_model import LinearRegression

lr = LinearRegression()

In [53]:
pipe = make_pipeline(column_trans, lr)

In [54]:
pipe.fit(X_train, y_train)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('onehotencoder',
                                                  OneHotEncoder(categories=[array(['Audi A3 Cabriolet', 'Audi A4 1.8', 'Audi A4 2.0', 'Audi A6 2.0',
       'Audi A8', 'Audi Q3 2.0', 'Audi Q5 2.0', 'Audi Q7', 'BMW 3 Series',
       'BMW 5 Series', 'BMW 7 Series', 'BMW X1', 'BMW X1 sDrive20d',
       'BMW X1 xDrive20d', 'Chevrolet Beat', 'Chevrolet Beat...
                                                                            array(['Audi', 'BMW', 'Chevrolet', 'Datsun', 'Fiat', 'Force', 'Ford',
       'Hindustan', 'Honda', 'Hyundai', 'Jaguar', 'Jeep', 'Land',
       'Mahindra', 'Maruti', 'Mercedes', 'Mini', 'Mitsubishi', 'Nissan',
       'Renault', 'Skoda', 'Tata', 'Toyota', 'Volkswagen', 'Volvo'],
      dtype=object),
                                                                            array(['Diesel', 'LPG', 'Pe

In [55]:
# Finding predictions
y_predict = pipe.predict(X_test)
y_predict 

array([[ 242278.46494771],
       [ 281381.15743522],
       [ 188082.93996611],
       [ 301602.70792273],
       [1478593.58193655],
       [ 334348.9594737 ],
       [ 222475.55222018],
       [ 509371.20628317],
       [  66242.71995426],
       [ 115536.97749334],
       [ 434392.37713209],
       [ 225128.17404114],
       [ 239451.29342207],
       [ 401874.20734149],
       [ 106745.01426569],
       [ 403524.94005951],
       [ 150307.64063273],
       [ 768170.67642561],
       [  76512.62918875],
       [ 263142.47371422],
       [ 403218.78407267],
       [ 314300.04708116],
       [ 544251.69151879],
       [  24943.20933362],
       [ 282570.85500082],
       [ 282766.19840299],
       [ 233809.57171794],
       [ 457051.12346502],
       [ 617915.49576996],
       [ 415251.37011954],
       [ 172198.77108075],
       [ 522504.49995596],
       [ 461405.88051365],
       [ 330352.11278456],
       [ 108207.10776451],
       [ 436821.76181563],
       [ 238658.78573877],
 

In [56]:
from sklearn.metrics import r2_score 

In [57]:
score = r2_score(y_test, y_predict)
score 

0.7101477080500824

In [58]:
# Finding the best random state to improve the score 

scores = []
for i in range(1000):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state = i)
    lr = LinearRegression()
    pipe = make_pipeline(column_trans, lr)
    pipe.fit(X_train, y_train)
    y_predict = pipe.predict(X_test)
    scores.append(r2_score(y_test, y_predict))

In [59]:
# This is the random score which we can use to have the good accuracy.  
np.argmax(scores)

661

In [60]:
scores[np.argmax(scores)]

0.8897680821375169

In [61]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state = np.argmax(scores))
lr = LinearRegression()
pipe = make_pipeline(column_trans, lr)
pipe.fit(X_train, y_train)
y_predict = pipe.predict(X_test)
r2_score(y_test, y_predict)

0.8897680821375169

In [62]:
import pickle 

In [63]:
pickle.dump(pipe, open("LinearRegressionModel.pkl", "wb"))

## Summary 

       - This model's accuracy is around 89%. 
       - 80% of the data is trained to test on 20% of the data. 
       - The model is able to predict the price of the cars. 