In [1]:
# libraries imported
import pandas as pd 
import numpy as np
import pickle

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.metrics import r2_score

In [2]:
# read the raw Quikr car listings from disk, then preview the first five rows
df = pd.read_csv(filepath_or_buffer="quikr_car.csv")
df.head(n=5)

Unnamed: 0,name,company,year,Price,kms_driven,fuel_type
0,Hyundai Santro Xing XO eRLX Euro III,Hyundai,2007,80000,"45,000 kms",Petrol
1,Mahindra Jeep CL550 MDI,Mahindra,2006,425000,40 kms,Diesel
2,Maruti Suzuki Alto 800 Vxi,Maruti,2018,Ask For Price,"22,000 kms",Petrol
3,Hyundai Grand i10 Magna 1.2 Kappa VTVT,Hyundai,2014,325000,"28,000 kms",Petrol
4,Ford EcoSport Titanium 1.5L TDCi,Ford,2014,575000,"36,000 kms",Diesel


In [3]:
# (number of rows, number of columns) of the freshly loaded dataset
df.shape

(892, 6)

In [4]:
# per-column non-null counts and dtypes, to spot missing values and wrongly typed columns
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 892 entries, 0 to 891
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   name        892 non-null    object
 1   company     892 non-null    object
 2   year        892 non-null    object
 3   Price       892 non-null    object
 4   kms_driven  840 non-null    object
 5   fuel_type   837 non-null    object
dtypes: object(6)
memory usage: 41.9+ KB


In [5]:
# Disabled exploration helper: prints the unique values of every column.
# Uncomment the loop below to inspect the raw data before cleaning.
#for column in df.columns:
    # Get unique values of the current column
    #unique_values = df[column].unique()
    
    # Print the column name and its unique values
    #print(f'Column: {column}')
    #print(f'Unique Values: {unique_values}')
    #print('---')

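## Problems with the data
- `year`: the datatype must be changed to integer
- `year`: some rows do not contain a year value
- `Price`: the datatype must be changed to integer
- `Price`: string entries (e.g. "Ask For Price") must be removed
- `kms_driven`: the datatype must be changed to integer
- `kms_driven`: NaN values, the "kms" unit and commas must be removed
- `fuel_type`: NaN values must be removed
- `name`: inconsistent; shorten it to the first few words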


# cleaning 

In [6]:
# keep an independent copy of the raw data so cleaning steps cannot affect it
backup = df.copy(deep=True)

In [7]:
# drop every row whose 'year' entry is not made up purely of digits
df = df.loc[df['year'].str.isnumeric()]


In [8]:
# Convert the year strings to integers.
# `df` is the result of a boolean filter here, so writing a column into it in place
# triggers pandas' SettingWithCopyWarning; `assign` returns a new frame instead.
df = df.assign(year=df['year'].astype(int))


In [9]:
# discard listings whose price is the placeholder text rather than a number
df = df.loc[df['Price'].ne('Ask For Price')]

In [10]:
# Strip the thousands separators from Price and convert it to integers.
# regex=False treats ',' as a literal on every pandas version (the default changed
# between releases); `assign` avoids writing into a filtered slice
# (SettingWithCopyWarning).
df = df.assign(Price=df['Price'].str.replace(',', '', regex=False).astype(int))

In [11]:
# Keep only the first space-separated token of kms_driven (dropping the unit)
# and remove the ',' separators. The values remain strings at this point.
# `assign` avoids writing into a filtered slice (SettingWithCopyWarning) and
# regex=False makes ',' a literal regardless of the pandas version.
df = df.assign(
    kms_driven=df['kms_driven'].str.split(' ').str.get(0).str.replace(',', '', regex=False)
)

In [12]:
# Keep only the rows whose kms_driven is purely numeric.
# str.isnumeric() yields NaN for missing values, and a boolean mask that contains
# NaN raises when used for indexing; eq(True) turns NaN into False so rows with a
# missing kms_driven are dropped instead of crashing the cell.
df = df[df['kms_driven'].str.isnumeric().eq(True)]

In [13]:
# Convert the cleaned kms_driven strings to integers.
# `assign` returns a new frame, avoiding an in-place write into a filtered slice
# (SettingWithCopyWarning).
df = df.assign(kms_driven=df['kms_driven'].astype(int))

In [14]:
# retain only the rows that actually have a fuel_type
df = df[df['fuel_type'].notna()]

In [15]:
# Shorten each car name to its first three space-separated words.
# `assign` returns a new frame, avoiding an in-place write into a filtered slice
# (SettingWithCopyWarning).
df = df.assign(name=df['name'].str.split(' ').str.slice(0, 3).str.join(' '))

In [16]:
# resetting the index so it is contiguous again after the row filters; drop=True discards the old index
df=df.reset_index(drop=True)

In [17]:
# eyeball ten randomly chosen rows of the cleaned data
df.sample(n=10)

Unnamed: 0,name,company,year,Price,kms_driven,fuel_type
434,Hyundai Elite i20,Hyundai,2015,400000,30000,Petrol
194,Maruti Suzuki Alto,Maruti,2013,125000,39000,Petrol
135,Toyota Corolla Altis,Toyota,2012,349999,59000,Petrol
638,Mahindra KUV100,Mahindra,2017,360000,35000,Diesel
440,Hyundai Elite i20,Hyundai,2015,419000,20000,Petrol
598,Nissan Sunny XL,Nissan,2011,230000,52000,Petrol
179,Mahindra Quanto C8,Mahindra,2013,340000,37000,Diesel
444,Hyundai i10 Magna,Hyundai,2008,275000,100200,Petrol
506,Ford Fiesta SXi,Ford,2009,250000,56400,Petrol
634,Honda City SV,Honda,2014,475000,34000,Diesel


In [18]:
# re-check dtypes and non-null counts after cleaning
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 816 entries, 0 to 815
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   name        816 non-null    object
 1   company     816 non-null    object
 2   year        816 non-null    int32 
 3   Price       816 non-null    int32 
 4   kms_driven  816 non-null    int32 
 5   fuel_type   816 non-null    object
dtypes: int32(3), object(3)
memory usage: 28.8+ KB


In [19]:
# summary statistics of the numeric columns, used to look for implausible values
df.describe()

Unnamed: 0,year,Price,kms_driven
count,816.0,816.0,816.0
mean,2012.444853,411717.6,46275.531863
std,4.002992,475184.4,34297.428044
min,1995.0,30000.0,0.0
25%,2010.0,175000.0,27000.0
50%,2013.0,299999.0,41000.0
75%,2015.0,491250.0,56818.5
max,2019.0,8500003.0,400000.0


In [20]:
# drop the listings priced at 6,000,000 or more and renumber the remaining rows
df = df.loc[df['Price'] < 6_000_000].reset_index(drop=True)

In [21]:
# Persist the cleaned dataset to disk.
# DataFrame.to_csv returns None when it is given a path, so the former
# `new_df = ...` binding only ever held None; the call is made purely for its
# side effect of writing the file.
df.to_csv('carDataset.csv')

In [22]:
# separate the predictor columns (X) from the target the model should learn (y)
X = df.loc[:, ['name', 'company', 'year', 'kms_driven', 'fuel_type']]
y = df.loc[:, 'Price']

In [23]:
# Split the data into training and test sets (80% / 20%).
# A fixed random_state makes the split, and every score derived from it,
# reproducible across kernel restarts; without it each run shuffles differently.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [24]:
# learn the category vocabulary of the three categorical columns from the whole of X
# (not just one training split); the fitted categories are reused by the column transformer
ohe = OneHotEncoder()
ohe.fit(X.loc[:, ['name', 'company', 'fuel_type']])


In [25]:
# one-hot encode the three categorical columns using the pre-computed categories;
# all remaining columns are passed through unchanged
column_trans = make_column_transformer(
    (
        OneHotEncoder(categories=ohe.categories_),
        ['name', 'company', 'fuel_type'],
    ),
    remainder='passthrough',
)


In [26]:
# the estimator: a linear regression model with default settings
lr=LinearRegression()

In [27]:
# creating the pipeline: the column transformer runs first, then the regressor
pipe=make_pipeline(column_trans,lr)

In [28]:
# fitting the pipeline (encoding + regression) on the training split
pipe.fit(X_train,y_train)


In [29]:
# predicting prices for the held-out test split
y_pred=pipe.predict(X_test)

In [30]:
# checking the r2 score of the predictions against the true test prices
r2_score(y_test,y_pred)

0.745818540825665

In [31]:
# Try 1000 different split seeds (random_state = 0..999) with a 90/10 split and
# record the test-set r2 score obtained with each; scores[i] belongs to seed i.
# NOTE(review): choosing the seed by its test-set score makes the reported r2
# optimistic - it reflects the most favourable split, not expected performance.
# After the loop, X_train/X_test/y_train/y_test, lr, pipe and y_pred hold the values
# from the last iteration (random_state=999).
scores=[]
for i in range(1000):
    X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.1,random_state=i)
    lr=LinearRegression()
    pipe=make_pipeline(column_trans,lr)
    pipe.fit(X_train,y_train)
    y_pred=pipe.predict(X_test)
    scores.append(r2_score(y_test,y_pred))

In [32]:
# Sanity-check prediction for a single hand-written car.
# The row is built from a plain list so that year and kms_driven stay integers;
# wrapping the mixed values in np.array() coerces every field to a string, which
# hands string-typed numeric features to the model.
pipe.predict(
    pd.DataFrame(
        [['Maruti Suzuki Swift', 'Maruti', 2019, 100, 'Petrol']],
        columns=X_test.columns,
    )
)

array([430345.10228051])

In [33]:
# position of the highest score in `scores`
np.argmax(scores)

302

In [34]:
# the highest r2 score recorded in `scores`
scores[np.argmax(scores)]

0.8959285359819742

In [35]:
# redo the 90/10 split with the index of the best score as random_state, fit a fresh
# pipeline on it and report the r2 score on the corresponding test split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=np.argmax(scores),
)
lr = LinearRegression()
pipe = make_pipeline(column_trans, lr).fit(X_train, y_train)
y_pred = pipe.predict(X_test)
r2_score(y_test, y_pred)

0.8959285359819742

In [36]:
# Save the fitted pipeline to disk.
# The context manager closes the file even if pickling fails; the bare open() used
# previously never closed the handle explicitly. `pickle` is already imported in the
# notebook's import cell, so it is not imported again here.
with open('carModel.pkl', 'wb') as model_file:
    pickle.dump(pipe, model_file)