In [21]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split,cross_val_score,GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import MinMaxScaler,StandardScaler,MaxAbsScaler, OneHotEncoder, LabelEncoder
from sklearn.metrics import *

In [22]:
df= pd.read_csv('car.csv')
df.head

<bound method NDFrame.head of                                         name  year          Price  kms_driven  \
0       Hyundai Santro Xing XO eRLX Euro III  2007          80000  45,000 kms   
1                    Mahindra Jeep CL550 MDI  2006       4,25,000      40 kms   
2                 Maruti Suzuki Alto 800 Vxi  2018  Ask For Price  22,000 kms   
3     Hyundai Grand i10 Magna 1.2 Kappa VTVT  2014       3,25,000  28,000 kms   
4           Ford EcoSport Titanium 1.5L TDCi  2014       5,75,000  36,000 kms   
...                                      ...   ...            ...         ...   
8123                       Hyundai i20 Magna  2013         320000      110000   
8124                   Hyundai Verna CRDi SX  2007         135000      119000   
8125                  Maruti Swift Dzire ZDi  2009         382000      120000   
8126                         Tata Indigo CR4  2013         290000       25000   
8127                         Tata Indigo CR4  2013         290000       25000  

In [23]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8128 entries, 0 to 8127
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   name          8128 non-null   object
 1   year          8128 non-null   object
 2   Price         8128 non-null   object
 3   kms_driven    8076 non-null   object
 4   fuel_type     8073 non-null   object
 5   transmission  8128 non-null   object
 6   owner         8128 non-null   object
dtypes: object(7)
memory usage: 444.6+ KB


Cleaning Data

In [24]:
df.year.unique()

array(['2007', '2006', '2018', '2014', '2015', '2012', '2013', '2016',
       '2010', '2017', '2008', '2011', '2019', '2009', '2005', '2000',
       '...', '150k', 'TOUR', '2003', 'r 15', '2004', 'Zest', '/-Rs',
       'sale', '1995', 'ara)', '2002', 'SELL', '2001', 'tion', 'odel',
       '2 bs', 'arry', 'Eon', 'o...', 'ture', 'emi', 'car', 'able', 'no.',
       'd...', 'SALE', 'digo', 'sell', 'd Ex', 'n...', 'e...', 'D...',
       ', Ac', 'go .', 'k...', 'o c4', 'zire', 'cent', 'Sumo', 'cab',
       't xe', 'EV2', 'r...', 'zest', '2020', '1999', '1996', '1994',
       '1998', '1997', '1991'], dtype=object)

In [25]:
df=df[df.year.str.isnumeric()]
df.year = df['year'].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.year = df['year'].astype(int)


In [26]:
df= df[df['Price'] != 'Ask For Price' ]

In [27]:
df.Price = df['Price'].str.replace(',','')
df.Price

0        80000
1       425000
3       325000
4       575000
6       175000
         ...  
8123    320000
8124    135000
8125    382000
8126    290000
8127    290000
Name: Price, Length: 8055, dtype: object

In [28]:
df.Price = df.Price.astype(int)

In [29]:
df['kms_driven']=df['kms_driven'].str.split(' ').str.get(0).str.replace(',','') 

In [30]:
df= df[df['kms_driven'].str.isnumeric()]

In [31]:
df['kms_driven']= df['kms_driven'].astype(int)

In [32]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8053 entries, 0 to 8127
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   name          8053 non-null   object
 1   year          8053 non-null   int32 
 2   Price         8053 non-null   int32 
 3   kms_driven    8053 non-null   int32 
 4   fuel_type     8052 non-null   object
 5   transmission  8053 non-null   object
 6   owner         8053 non-null   object
dtypes: int32(3), object(4)
memory usage: 408.9+ KB


In [33]:
df=df[~df.fuel_type.isna()]

In [34]:
df.name=df.name.str.split(' ').str.slice(0,3).str.join(' ')

In [35]:
df.head()

Unnamed: 0,name,year,Price,kms_driven,fuel_type,transmission,owner
0,Hyundai Santro Xing,2007,80000,45000,Petrol,Manual,First Owner
1,Mahindra Jeep CL550,2006,425000,40,Diesel,Manual,First Owner
3,Hyundai Grand i10,2014,325000,28000,Petrol,Manual,First Owner
4,Ford EcoSport Titanium,2014,575000,36000,Diesel,Manual,First Owner
6,Ford Figo,2012,175000,41000,Diesel,Manual,First Owner


In [36]:
df.reset_index(drop = True)

Unnamed: 0,name,year,Price,kms_driven,fuel_type,transmission,owner
0,Hyundai Santro Xing,2007,80000,45000,Petrol,Manual,First Owner
1,Mahindra Jeep CL550,2006,425000,40,Diesel,Manual,First Owner
2,Hyundai Grand i10,2014,325000,28000,Petrol,Manual,First Owner
3,Ford EcoSport Titanium,2014,575000,36000,Diesel,Manual,First Owner
4,Ford Figo,2012,175000,41000,Diesel,Manual,First Owner
...,...,...,...,...,...,...,...
8047,Hyundai i20 Magna,2013,320000,110000,Petrol,Manual,First Owner
8048,Hyundai Verna CRDi,2007,135000,119000,Diesel,Manual,Fourth & Above Owner
8049,Maruti Swift Dzire,2009,382000,120000,Diesel,Manual,First Owner
8050,Tata Indigo CR4,2013,290000,25000,Diesel,Manual,First Owner


In [37]:
df.describe()

Unnamed: 0,year,Price,kms_driven
count,8052.0,8052.0,8052.0
mean,2013.662196,614167.0,67742.75
std,4.071909,772227.1,56401.58
min,1991.0,29999.0,0.0
25%,2011.0,250000.0,33015.0
50%,2014.0,434499.5,60000.0
75%,2017.0,655000.0,90000.0
max,2020.0,8500003.0,2360457.0


In [38]:
df=df[df.Price <= 7200000]

In [39]:
df.reset_index(drop= True, inplace= True)

In [40]:
#df.to_csv('car_clean.csv')

In [41]:
#lb= LabelEncoder()
		
#df['fuel_type']= df['fuel_type'].map({'Petrol':1, 'Diesel':2, 'LPG':3, 'CNG':4})
#df['transmission']= df['transmission'].map({'Manual':1, 'Automatic':2})
#df['owner']= df['owner'].map({'First Owner':1, 'Second Owner':2, 'Third Owner':4,'Fourth & Above Owner':4, 'Test Drive Car':0})
#df.head()

Model

In [42]:
x= df.drop('Price',axis=1)
y= df.Price

In [43]:
xtrain,xtest,ytrain,ytest = train_test_split(x,y,test_size= 0.2)

In [44]:
ohe = OneHotEncoder()
ohe.fit(x[['name','fuel_type','transmission','owner']])

OneHotEncoder()

In [45]:
ohe.categories_

[array(['Ambassador CLASSIC 1500', 'Ambassador Classic 2000',
        'Ambassador Grand 1500', 'Ambassador Grand 2000',
        'Ashok Leyland Stile', 'Audi A3 35', 'Audi A3 40',
        'Audi A3 Cabriolet', 'Audi A4 1.8', 'Audi A4 2.0', 'Audi A4 35',
        'Audi A6 2.0', 'Audi A6 35', 'Audi A8', 'Audi Q3 2.0',
        'Audi Q3 35', 'Audi Q5 2.0', 'Audi Q5 3.0', 'Audi Q5 35TDI',
        'Audi Q5 45', 'Audi Q7', 'Audi Q7 3.0', 'BMW 3 Series',
        'BMW 5 Series', 'BMW 6 Series', 'BMW 7 Series', 'BMW X1',
        'BMW X1 sDrive', 'BMW X1 sDrive20d', 'BMW X1 sDrive20i',
        'BMW X1 xDrive20d', 'BMW X3 xDrive20d', 'BMW X4 M', 'BMW X5 3.0d',
        'BMW X6 xDrive30d', 'BMW X7 xDrive', 'Chevrolet Aveo 1.4',
        'Chevrolet Aveo U-VA', 'Chevrolet Beat', 'Chevrolet Beat Diesel',
        'Chevrolet Beat LS', 'Chevrolet Beat LT', 'Chevrolet Beat PS',
        'Chevrolet Captiva 2.2', 'Chevrolet Captiva LT',
        'Chevrolet Cruze LT', 'Chevrolet Cruze LTZ', 'Chevrolet Enjoy',
     

#Creating Pipeline

In [46]:
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline

In [47]:
column_trans= make_column_transformer((OneHotEncoder(categories= ohe.categories_),['name','fuel_type','transmission','owner']), remainder= 'passthrough')

In [48]:
lr= LinearRegression()
scale= MaxAbsScaler()

In [49]:
pipeline = make_pipeline(column_trans,scale,lr)

In [50]:
pipeline.fit(xtrain,ytrain)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('onehotencoder',
                                                  OneHotEncoder(categories=[array(['Ambassador CLASSIC 1500', 'Ambassador Classic 2000',
       'Ambassador Grand 1500', 'Ambassador Grand 2000',
       'Ashok Leyland Stile', 'Audi A3 35', 'Audi A3 40',
       'Audi A3 Cabriolet', 'Audi A4 1.8', 'Audi A4 2.0', 'Audi A4 35',
       'Audi A6 2.0', 'Audi A6 3...
       'Volvo XC60 Inscription'], dtype=object),
                                                                            array(['CNG', 'Diesel', 'LPG', 'Petrol'], dtype=object),
                                                                            array(['Automatic', 'Manual'], dtype=object),
                                                                            array(['First Owner', 'Fourth & Above Owner', 'Second Owner',
       'Test Drive Car', 'Third O

In [51]:
ytrainpred = pipeline.predict(xtrain)
r2_score(ytrain,ytrainpred)

0.9724602470979565

In [52]:
ypred = pipeline.predict(xtest)

In [53]:
r2_score(ytest,ypred)

0.9055101691980644

In [54]:
score=[]
for i in range(1000):
    xtrain,xtest,ytrain,ytest = train_test_split(x,y,test_size= 0.2,random_state= i)
    lr= LinearRegression()
    pipeline = make_pipeline(column_trans,scale,lr)
    pipeline.fit(xtrain,ytrain)
    ypred = pipeline.predict(xtest)
    score.append(r2_score(ytest,ypred))


In [55]:
np.argmax(score)

295

In [56]:
score[np.argmax(score)]

0.9641643637766034

In [57]:
xtrain,xtest,ytrain,ytest = train_test_split(x,y,test_size= 0.2,random_state= np.argmax(score))
lr= LinearRegression()
pipeline = make_pipeline(column_trans,scale,lr)
pipeline.fit(xtrain,ytrain)
ytrain_pred= pipeline.predict(xtrain)
print('R2 score of train data:',r2_score(ytrain,ytrain_pred))
ypred = pipeline.predict(xtest)
print('R2 score of test data:',r2_score(ytest,ypred))

R2 score of train data: 0.9709189746652515
R2 score of test data: 0.9641643637766034


In [58]:
validation= cross_val_score(pipeline,xtrain,ytrain,cv=5)
np.mean(validation)

0.8948043529455745

In [59]:
validation= cross_val_score(pipeline,xtest,ytest,cv=5)
np.mean(validation)

0.876429349660443

In [60]:
import pickle
pickle.dump(pipeline,open('carpriceprediction.pkl','wb'))