In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
car=pd.read_csv('train-data.csv')
car.head()

Unnamed: 0.1,Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,New_Price,Price
0,0,Maruti Wagon R LXI CNG,Mumbai,2010,72000,CNG,Manual,First,26.6 km/kg,998 CC,58.16 bhp,5.0,,1.75
1,1,Hyundai Creta 1.6 CRDi SX Option,Pune,2015,41000,Diesel,Manual,First,19.67 kmpl,1582 CC,126.2 bhp,5.0,,12.5
2,2,Honda Jazz V,Chennai,2011,46000,Petrol,Manual,First,18.2 kmpl,1199 CC,88.7 bhp,5.0,8.61 Lakh,4.5
3,3,Maruti Ertiga VDI,Chennai,2012,87000,Diesel,Manual,First,20.77 kmpl,1248 CC,88.76 bhp,7.0,,6.0
4,4,Audi A4 New 2.0 TDI Multitronic,Coimbatore,2013,40670,Diesel,Automatic,Second,15.2 kmpl,1968 CC,140.8 bhp,5.0,,17.74


In [None]:
## How much data we have
car.shape

(6019, 14)

We have 6019 row entries and 13 column entries.

In [None]:
## Checking the features of the dataset
car.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6019 entries, 0 to 6018
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Unnamed: 0         6019 non-null   int64  
 1   Name               6019 non-null   object 
 2   Location           6019 non-null   object 
 3   Year               6019 non-null   int64  
 4   Kilometers_Driven  6019 non-null   int64  
 5   Fuel_Type          6019 non-null   object 
 6   Transmission       6019 non-null   object 
 7   Owner_Type         6019 non-null   object 
 8   Mileage            6017 non-null   object 
 9   Engine             5983 non-null   object 
 10  Power              5983 non-null   object 
 11  Seats              5977 non-null   float64
 12  New_Price          824 non-null    object 
 13  Price              6019 non-null   float64
dtypes: float64(2), int64(3), object(9)
memory usage: 658.5+ KB


There are total of 14 features, but we are going to use only limited features to predict car price accurately.
So, we are going to drop some extra columns from the dataset.

In [None]:
## Creating a Backup file first.
backup=car.copy()

## Dropping the extra columns
car.drop(columns=['Power','Seats','New_Price','Engine','Owner_Type','Mileage'], inplace=True)

In [None]:
car.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6019 entries, 0 to 6018
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Unnamed: 0         6019 non-null   int64  
 1   Name               6019 non-null   object 
 2   Location           6019 non-null   object 
 3   Year               6019 non-null   int64  
 4   Kilometers_Driven  6019 non-null   int64  
 5   Fuel_Type          6019 non-null   object 
 6   Transmission       6019 non-null   object 
 7   Price              6019 non-null   float64
dtypes: float64(1), int64(3), object(4)
memory usage: 376.3+ KB


## Checking the Qualtiy of the data


In [None]:
car['Name'].unique()
## Name values are too big and contains some non-significant character, so we will keep only the first 3 words of the name.

array(['Maruti Wagon R LXI CNG', 'Hyundai Creta 1.6 CRDi SX Option',
       'Honda Jazz V', ..., 'Volkswagen Polo IPL II 1.2 Petrol Highline',
       'Tata Bolt Revotron XT', 'Mahindra Xylo D4 BSIV'], dtype=object)

In [None]:
car['Location'].unique()
## Location column does not have any anomaly and also there is no NaN values included

array(['Mumbai', 'Pune', 'Chennai', 'Coimbatore', 'Hyderabad', 'Jaipur',
       'Kochi', 'Kolkata', 'Delhi', 'Bangalore', 'Ahmedabad'],
      dtype=object)

In [None]:
car['Year'].unique()
## Year values does not have any anomalies as all values corresponds to year values and are stored in int64 format.

array([2010, 2015, 2011, 2012, 2013, 2016, 2018, 2014, 2017, 2007, 2009,
       2008, 2019, 2006, 2005, 2004, 2002, 2000, 2003, 1999, 2001, 1998])

In [None]:
car['Kilometers_Driven'].unique()
## Kilometers_Driven values does not have nay anomalies and are stored in int64 format.

array([72000, 41000, 46000, ..., 45004, 70602, 27365])

In [None]:
car['Fuel_Type'].unique()
## Fuel type values does not have any anomalies or any Nan values.

array(['CNG', 'Diesel', 'Petrol', 'LPG', 'Electric'], dtype=object)

In [None]:
car['Transmission'].unique()
## Transmission column does not have any anomaly and also there is no NaN values included

array(['Manual', 'Automatic'], dtype=object)

In [None]:
car['Price'].unique()
## price values does not have any anomalies and are stored in float64 format and these amount are stored in lakhs

array([ 1.75, 12.5 ,  4.5 , ...,  2.27, 17.56,  7.43])

## Removing the Name column Anomaly

In [None]:
## Removing the anomaly from Name column by keeping only first three words.
car['Name']=car['Name'].str.split(' ').str.slice(0,3).str.join(' ') 
## Will split the name and then we slice the name accordingly and then we will join it.
car

Unnamed: 0.1,Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Price
0,0,Maruti Wagon R,Mumbai,2010,72000,CNG,Manual,1.75
1,1,Hyundai Creta 1.6,Pune,2015,41000,Diesel,Manual,12.50
2,2,Honda Jazz V,Chennai,2011,46000,Petrol,Manual,4.50
3,3,Maruti Ertiga VDI,Chennai,2012,87000,Diesel,Manual,6.00
4,4,Audi A4 New,Coimbatore,2013,40670,Diesel,Automatic,17.74
...,...,...,...,...,...,...,...,...
6014,6014,Maruti Swift VDI,Delhi,2014,27365,Diesel,Manual,4.75
6015,6015,Hyundai Xcent 1.1,Jaipur,2015,100000,Diesel,Manual,4.00
6016,6016,Mahindra Xylo D4,Jaipur,2012,55000,Diesel,Manual,2.90
6017,6017,Maruti Wagon R,Kolkata,2013,46000,Petrol,Manual,2.65


In [None]:
car.describe()

Unnamed: 0,Year,Kilometers_Driven,Price
count,6019.0,6019.0,6019.0
mean,2013.358199,58738.38,9.479468
std,3.269742,91268.84,11.187917
min,1998.0,171.0,0.44
25%,2011.0,34000.0,3.5
50%,2014.0,53000.0,5.64
75%,2016.0,73000.0,9.95
max,2019.0,6500000.0,160.0


In [None]:
car.to_csv('Cleaned_data.csv')

In [None]:
from google.colab import files
files.download('Cleaned_data.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

### MODEL


In [None]:
#Building the model
#selecting columns for x variable except price column.
X = car.drop(columns= 'Price')
#selecting columns for y variable and price will be output of our model.
y=car['Price']

In [None]:
X

Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission
0,Maruti Wagon R,Mumbai,2010,72000,CNG,Manual
1,Hyundai Creta 1.6,Pune,2015,41000,Diesel,Manual
2,Honda Jazz V,Chennai,2011,46000,Petrol,Manual
3,Maruti Ertiga VDI,Chennai,2012,87000,Diesel,Manual
4,Audi A4 New,Coimbatore,2013,40670,Diesel,Automatic
...,...,...,...,...,...,...
6014,Maruti Swift VDI,Delhi,2014,27365,Diesel,Manual
6015,Hyundai Xcent 1.1,Jaipur,2015,100000,Diesel,Manual
6016,Mahindra Xylo D4,Jaipur,2012,55000,Diesel,Manual
6017,Maruti Wagon R,Kolkata,2013,46000,Petrol,Manual


In [None]:
y

0        1.75
1       12.50
2        4.50
3        6.00
4       17.74
        ...  
6014     4.75
6015     4.00
6016     2.90
6017     2.65
6018     2.50
Name: Price, Length: 6019, dtype: float64

In [None]:
#Now putting data for training and testing.

from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=1/3,random_state=0)

In [None]:
#Importing libraries.
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline

In [None]:
#Creating object for one hot encoder.

ohe = OneHotEncoder()
ohe.fit(X[['Name','Location','Fuel_Type','Transmission']])
ct = make_column_transformer((OneHotEncoder(categories = ohe.categories_),['Name','Location','Fuel_Type','Transmission']),remainder='passthrough')


In [None]:
#Creating object for linear regression.
lr=LinearRegression()

In [None]:
## Pipelines function by allowing a linear series of data transforms to be linked together, resulting in a measurable modeling process.
pipe = make_pipeline(ct,lr)

In [None]:
#Fitting training data.
pipe.fit(X_train,y_train)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('onehotencoder',
                                                  OneHotEncoder(categories=[array(['Ambassador Classic Nova', 'Audi A3 35', 'Audi A4 1.8',
       'Audi A4 2.0', 'Audi A4 3.0', 'Audi A4 3.2', 'Audi A4 30',
       'Audi A4 35', 'Audi A4 New', 'Audi A6 2.0', 'Audi A6 2.7',
       'Audi A6 2.8', 'Audi A6 2011-2015', 'Audi A6 3.0', 'Audi A6 35',
       'Audi A7 2011-2...
       'Volvo XC60 D5', 'Volvo XC90 2007-2015'], dtype=object),
                                                                            array(['Ahmedabad', 'Bangalore', 'Chennai', 'Coimbatore', 'Delhi',
       'Hyderabad', 'Jaipur', 'Kochi', 'Kolkata', 'Mumbai', 'Pune'],
      dtype=object),
                                                                            array(['CNG', 'Diesel', 'Electric', 'LPG', 'Petrol'], dtype=object),
                        

In [None]:
y_pred = pipe.predict(X_test)

In [None]:
y_pred

array([ 7.47924523,  1.36138508,  3.83880064, ...,  4.02903913,
       10.24427677,  6.00652044])

In [None]:
r2_score(y_test,y_pred)

0.7555138632085819

In [None]:
#for loop running for selecting appropriate random state to get minimum error. 
scores=[]
for i in range(1000):
    X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.1,random_state=i)
    lr=LinearRegression()
    pipe=make_pipeline(ct,lr)
    pipe.fit(X_train,y_train)
    y_pred=pipe.predict(X_test)
    scores.append(r2_score(y_test,y_pred))

In [None]:
np.argmax(scores)

29

In [None]:
scores[np.argmax(scores)]

0.9033630870286415

In [None]:
#Calculating best training data on the random state calculated above.
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.1,random_state=np.argmax(scores))
lr=LinearRegression()
pipe=make_pipeline(ct,lr)
pipe.fit(X_train,y_train)
y_pred=pipe.predict(X_test)
r2_score(y_test,y_pred)

0.9033630870286415

In [None]:
## Converts a Python object hierarchy into a byte stream. This converted byte stream can be written to a buffer
import pickle

In [None]:
pickle.dump(pipe,open('LinearRegression_model.pkl','wb'))

In [None]:
#Output of car price in lakhs.

pipe.predict(pd.DataFrame([['Chevrolet Beat Diesel','Mumbai',2019,100,'Petrol','Manual']], columns=['Name','Location','Year','Kilometers_Driven','Fuel_Type','Transmission']))

array([7.26259833])