# Preprocessing Data

## To maintain quality we have Drop Rows.
- Year Column having some alpha value and they are object we have to convert it to int.
- Price Column having some alpha value and they are object we have to convert it to int and remove commas.
- Kms_driven Column having some alpha value and they are object we have to convert it to int and remove commas and kms at the end.
- Fuel having nan data we have to remove.
- Keep first three words of name


In [1]:
import pandas as pd

In [2]:
# Reading CSV
carData=pd.read_csv("quikr_car.csv")

In [3]:
backup=carData.copy()

## Cleaning

Cleaning Year Columns

In [4]:
carData=carData[carData['year'].str.isnumeric()]
carData['year']=carData['year'].astype(int)

Clenaing Price Column

In [5]:
carData=carData[carData['Price']!="Ask For Price"]
carData['Price']=carData['Price'].str.replace(',','').astype(int)

Cleaning Kms_Driven Data

In [6]:
carData['kms_driven']=carData['kms_driven'].str.replace(',','')
carData['kms_driven']=carData['kms_driven'].str.replace(' kms','')
carData=carData[carData['kms_driven'].str.isnumeric()]
carData['kms_driven']=carData['kms_driven'].astype(int)

Clenaing Fuel Type

In [7]:
carData=carData[~carData['fuel_type'].isna()]

Cleaning Name

In [8]:
carData['name']=carData['name'].str.split(' ').str.slice(0,3).str.join(' ')

In [9]:
carData.reset_index(drop=True)

Unnamed: 0,name,company,year,Price,kms_driven,fuel_type
0,Hyundai Santro Xing,Hyundai,2007,80000,45000,Petrol
1,Mahindra Jeep CL550,Mahindra,2006,425000,40,Diesel
2,Hyundai Grand i10,Hyundai,2014,325000,28000,Petrol
3,Ford EcoSport Titanium,Ford,2014,575000,36000,Diesel
4,Ford Figo,Ford,2012,175000,41000,Diesel
...,...,...,...,...,...,...
811,Maruti Suzuki Ritz,Maruti,2011,270000,50000,Petrol
812,Tata Indica V2,Tata,2009,110000,30000,Diesel
813,Toyota Corolla Altis,Toyota,2009,300000,132000,Petrol
814,Tata Zest XM,Tata,2018,260000,27000,Diesel


# Now Analyzing preprocessed Data to find lies

In [10]:
carData.describe()

Unnamed: 0,year,Price,kms_driven
count,816.0,816.0,816.0
mean,2012.444853,411717.6,46275.531863
std,4.002992,475184.4,34297.428044
min,1995.0,30000.0,0.0
25%,2010.0,175000.0,27000.0
50%,2013.0,299999.0,41000.0
75%,2015.0,491250.0,56818.5
max,2019.0,8500003.0,400000.0


In [11]:
carData[carData['Price']>6e6] #There is only  one car which is more than 60 lakhs so this data should be removed

Unnamed: 0,name,company,year,Price,kms_driven,fuel_type
562,Mahindra XUV500 W6,Mahindra,2014,8500003,45000,Diesel


In [12]:
carData=carData[carData['Price']<=6e6] 

In [13]:
carData.reset_index(drop=True)

Unnamed: 0,name,company,year,Price,kms_driven,fuel_type
0,Hyundai Santro Xing,Hyundai,2007,80000,45000,Petrol
1,Mahindra Jeep CL550,Mahindra,2006,425000,40,Diesel
2,Hyundai Grand i10,Hyundai,2014,325000,28000,Petrol
3,Ford EcoSport Titanium,Ford,2014,575000,36000,Diesel
4,Ford Figo,Ford,2012,175000,41000,Diesel
...,...,...,...,...,...,...
810,Maruti Suzuki Ritz,Maruti,2011,270000,50000,Petrol
811,Tata Indica V2,Tata,2009,110000,30000,Diesel
812,Toyota Corolla Altis,Toyota,2009,300000,132000,Petrol
813,Tata Zest XM,Tata,2018,260000,27000,Diesel


In [14]:
carData.to_csv("cleanCarData.csv")

# Model

In this on x axis we have all data then price and on y axis we have data of price

In [15]:
x=carData.drop(columns='Price')
y=carData['Price']

# Spilting into test and train set
We have split data into test and train with ratio of 4:1

In [31]:
from sklearn.model_selection import train_test_split
X_train,X_test,Y_train,Y_Test=train_test_split(x,y,test_size=0.2,random_state=7314)

In [17]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline

# OneHotEncoder
We use one hot encoder for classifying the Category by numbers because machine learning is all about mathematical equation.It only understand numeric value.

In [32]:
ohe=OneHotEncoder()
ohe.fit(x[['name','company','fuel_type']])

In [33]:
colums_trans=make_column_transformer((OneHotEncoder(categories=ohe.categories_),['name','company','fuel_type']),remainder="passthrough")

In [34]:
colums_trans

In [35]:
lr=LinearRegression()

In [36]:
pipe=make_pipeline(colums_trans,lr)

In [37]:
pipe.fit(X_train,Y_train)

In [38]:
y_predict=pipe.predict(X_test)

# Calculating Accuracy

In [39]:
r2_score(y_predict,Y_Test)*100

87.41546464609924

# This stage is done to find max r2 score at ith random_state

In [28]:
scores=[]
for i in range(10000):
    X_train,X_test,Y_train,Y_Test=train_test_split(x,y,test_size=0.2,random_state=i)
    lr=LinearRegression()
    pipe=make_pipeline(colums_trans,lr)
    pipe.fit(X_train,Y_train)
    y_predict=pipe.predict(X_test)
    scores.append(r2_score(y_predict,Y_Test)*100)

In [43]:
max(scores)

87.41546464609924

We Reached at 87.41 % of accuracy

# Saving Model

In [44]:
import pickle

In [46]:
pickle.dump(pipe,open("CarPricePredictionModel.pkl","wb"))

# Prediction

In [61]:
data=['Hyundai Santro Xing','Hyundai',2020,45000,'Diesel']
columns=['name','company','year','kms_driven','fuel_type']
pipe.predict(pd.DataFrame([data],columns=columns))

array([673031.16824959])

In [62]:
carData['fuel_type'].unique()

array(['Petrol', 'Diesel', 'LPG'], dtype=object)

In [63]:
carData['company'].unique()

array(['Hyundai', 'Mahindra', 'Ford', 'Maruti', 'Skoda', 'Audi', 'Toyota',
       'Renault', 'Honda', 'Datsun', 'Mitsubishi', 'Tata', 'Volkswagen',
       'Chevrolet', 'Mini', 'BMW', 'Nissan', 'Hindustan', 'Fiat', 'Force',
       'Mercedes', 'Land', 'Jaguar', 'Jeep', 'Volvo'], dtype=object)