In [39]:
import pandas as pd
import numpy as np

In [2]:
# Load the raw car listings; the relative path is resolved against the working directory
car = pd.read_csv('quikr_car.csv')

In [3]:
# Peek at the first row to see the columns and the raw value formats
car.head(1)

Unnamed: 0,name,company,year,Price,kms_driven,fuel_type
0,Hyundai Santro Xing XO eRLX Euro III,Hyundai,2007,80000,"45,000 kms",Petrol


In [4]:
# Summarise column dtypes and non-null counts to see what needs cleaning
car.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 892 entries, 0 to 891
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   name        892 non-null    object
 1   company     892 non-null    object
 2   year        892 non-null    object
 3   Price       892 non-null    object
 4   kms_driven  840 non-null    object
 5   fuel_type   837 non-null    object
dtypes: object(6)
memory usage: 41.9+ KB


# Cleaning the data 

In [5]:
# Keep an untouched copy of the raw frame; `car` itself is overwritten by the cleaning steps
backup = car.copy()

In [6]:
# (rows, columns) of the raw data, for comparison after cleaning
backup.shape

(892, 6)

In [7]:
# Keep only the rows whose 'year' value consists entirely of numeric characters
year_is_numeric = car['year'].str.isnumeric()
car = car[year_is_numeric]

In [8]:
# (rows, columns) remaining after dropping rows with a non-numeric year
car.shape

(842, 6)

In [9]:
# Convert 'year' from object to int.
# `car` is a boolean-filtered slice at this point, so a plain column assignment
# would trigger pandas' SettingWithCopyWarning; `assign` returns a new frame instead.
car = car.assign(year=car['year'].astype(int))

In [10]:
# Drop listings whose price is the placeholder text 'Ask For Price'
car = car[car['Price'].ne('Ask For Price')]

In [11]:
# Strip the thousands separators from 'Price' and convert it to int.
# `assign` builds a new frame rather than writing into the filtered slice,
# which avoids pandas' SettingWithCopyWarning.
car = car.assign(Price=car['Price'].str.replace(',', '').astype(int))

In [12]:
# Keep the leading token of 'kms_driven' (e.g. "45,000 kms" -> "45,000") and strip its commas.
# `assign` builds a new frame rather than writing into a filtered slice,
# which avoids pandas' SettingWithCopyWarning.
car = car.assign(
    kms_driven=car['kms_driven'].str.split(' ').str.get(0).str.replace(',', '')
)

In [13]:
# Keep only rows whose 'kms_driven' is purely numeric, then convert the column to int.
# The conversion uses `assign` so it does not write into the filtered slice
# created on the line above (which would raise SettingWithCopyWarning).
car = car[car['kms_driven'].str.isnumeric()]
car = car.assign(kms_driven=car['kms_driven'].astype(int))

In [14]:
# Re-check the first row after the numeric conversions
car.head(1)

Unnamed: 0,name,company,year,Price,kms_driven,fuel_type
0,Hyundai Santro Xing XO eRLX Euro III,Hyundai,2007,80000,45000,Petrol


In [15]:
# Keep only the rows that have a 'fuel_type' value (drops NaN entries)
car = car[car['fuel_type'].notna()]

In [16]:
# Shorten each name to its first three space-separated words.
# `assign` returns a new frame instead of writing into the filtered slice from
# the previous step, which avoids pandas' SettingWithCopyWarning.
car = car.assign(name=car['name'].str.split(' ').str.slice(0, 3).str.join(' '))

In [17]:
# Filtering left gaps in the index, so renumber the rows from 0 and discard the old index
car=car.reset_index(drop=True)

In [18]:
# Treat listings priced at 6,000,000 or more as outliers: keep only the cheaper
# ones, then renumber the rows from 0
is_below_cap = car['Price'].lt(6e6)
car = car[is_below_cap].reset_index(drop=True)

In [19]:
# Save the cleaned data to a CSV file.
# index=False: the index is just the freshly reset 0..n-1 row numbers, and writing
# it would come back as a spurious 'Unnamed: 0' column when the file is re-read.
car.to_csv('cleaned_data.csv', index=False)

# Model

In [20]:
# 'Price' is the prediction target; every other column is a feature
y = car['Price']
X = car.drop(columns=['Price'])

In [21]:
# Split the dataset into training (80%) and testing (20%) sets.
# A fixed random_state makes the split, and therefore the score computed from it,
# reproducible across kernel restarts.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [22]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline

In [23]:
# Learn the category lists from the full feature frame (not just one split);
# ohe.categories_ is later handed to the encoder used inside the pipeline.
categorical_cols = ['name', 'company', 'fuel_type']
ohe = OneHotEncoder()
ohe.fit(X[categorical_cols])

In [27]:
# Regressor used as the final pipeline step
lr = LinearRegression()

# One-hot encode the three text columns using the pre-computed category lists;
# the remaining columns are passed through unchanged.
encoder = OneHotEncoder(categories=ohe.categories_)
column_trans = make_column_transformer(
    (encoder, ['name', 'company', 'fuel_type']),
    remainder='passthrough',
    force_int_remainder_cols=False,
)
pipe = make_pipeline(column_trans, lr)

In [28]:
# Fit the column transformer and the regressor on the training split
pipe.fit(X_train, y_train)

In [29]:
# Predict prices for the held-out test split
y_pred = pipe.predict(X_test)

In [30]:
# Inspect the raw predictions.
# NOTE(review): the recorded output contains a negative value, i.e. the model can
# predict a negative price for some listings.
y_pred

array([ 2.40584004e+05,  2.26070250e+06,  4.42310222e+05,  5.85574679e+05,
        3.36235627e+05,  2.67444562e+05,  1.66365644e+05,  6.12808218e+05,
        6.47300576e+05,  5.36445802e+05,  2.91542851e+05,  6.78741158e+04,
        5.40281337e+05,  5.91864499e+04,  2.26339154e+05,  2.29129829e+05,
        1.38741893e+06,  4.52063571e+05,  9.94636021e+04,  4.53042027e+05,
        5.86210068e+05,  2.80494132e+05,  5.68301034e+05,  5.40329046e+05,
        7.28747976e+05,  4.90421196e+05,  6.26003217e+05,  2.12431588e+05,
        4.77751451e+05,  2.58428730e+05,  1.08582213e+06,  3.25491983e+05,
        8.03771218e+05,  2.43138672e+05,  2.65090075e+05, -2.07993455e+04,
        5.17046109e+05,  4.69868457e+05,  2.66135460e+05,  5.59792853e+05,
        2.26090975e+06,  7.74718610e+04,  3.98911062e+05,  3.01857087e+05,
        5.29813821e+05,  3.29023861e+05,  2.57449550e+04,  6.74596170e+04,
        5.98870970e+05,  1.41921195e+06,  2.85166575e+04,  6.55289468e+05,
        6.12808218e+05,  

In [36]:
# Coefficient of determination (R^2) of the predictions on the test split
r2_score(y_test, y_pred)

0.5926684087890187

In [37]:
# Refit the pipeline on 1000 different train/test splits (one per random_state)
# and record the test-split R^2 of each; scores[k] belongs to random_state=k.
# NOTE(review): choosing a seed by its test score makes that score an optimistic
# estimate of real-world performance.
scores = []
for seed in range(1000):
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=seed
    )
    lr = LinearRegression()
    pipe = make_pipeline(column_trans, lr).fit(X_train, y_train)
    y_pred = pipe.predict(X_test)
    split_r2 = r2_score(y_test, y_pred)
    scores.append(split_r2)


In [42]:
# Position of the highest score; positions in `scores` correspond to the random_state values tried
np.argmax(scores)

433

In [43]:
# The highest R^2 obtained across all the splits tried
scores[np.argmax(scores)]

0.8457046438151008

In [44]:
# Rebuild the split and the model with random_state=433, the best-scoring seed
# found by the search, and report its test-split R^2.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=433
)
lr = LinearRegression()
pipe = make_pipeline(column_trans, lr).fit(X_train, y_train)
y_pred = pipe.predict(X_test)
r2_score(y_test, y_pred)

0.8457046438151008

In [45]:
import pickle

In [46]:
# Persist the fitted pipeline to disk.
# The context manager guarantees the file handle is flushed and closed,
# even if pickling raises.
with open('LRmodel.pkl', 'wb') as model_file:
    pickle.dump(pipe, model_file)

In [49]:
# Smoke-test the fitted pipeline on one hand-written listing; the columns are in
# the same order as the feature frame used for training.
sample = pd.DataFrame({
    'name': ['Maruti Suzuki Swift'],
    'company': ['Maruti'],
    'year': [2019],
    'kms_driven': [100],
    'fuel_type': ['Petrol'],
})
pipe.predict(sample)

array([458899.18891749])