In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv('https://raw.githubusercontent.com/Manju410/MLPractice/main/data/Cardetails.csv')
df.head()

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,torque,seats
0,Maruti Swift Dzire VDI,2014,450000,145500,Diesel,Individual,Manual,First Owner,23.4 kmpl,1248 CC,74 bhp,190Nm@ 2000rpm,5.0
1,Skoda Rapid 1.5 TDI Ambition,2014,370000,120000,Diesel,Individual,Manual,Second Owner,21.14 kmpl,1498 CC,103.52 bhp,250Nm@ 1500-2500rpm,5.0
2,Honda City 2017-2020 EXi,2006,158000,140000,Petrol,Individual,Manual,Third Owner,17.7 kmpl,1497 CC,78 bhp,"12.7@ 2,700(kgm@ rpm)",5.0
3,Hyundai i20 Sportz Diesel,2010,225000,127000,Diesel,Individual,Manual,First Owner,23.0 kmpl,1396 CC,90 bhp,22.4 kgm at 1750-2750rpm,5.0
4,Maruti Swift VXI BSIII,2007,130000,120000,Petrol,Individual,Manual,First Owner,16.1 kmpl,1298 CC,88.2 bhp,"11.5@ 4,500(kgm@ rpm)",5.0


- Car Details Dataset has 8128 rows and 13 columns

In [3]:
df.shape

(8128, 13)

In [4]:
df.columns

Index(['name', 'year', 'selling_price', 'km_driven', 'fuel', 'seller_type',
       'transmission', 'owner', 'mileage', 'engine', 'max_power', 'torque',
       'seats'],
      dtype='object')

- mileage, engine, max power, torque columns has null values

In [5]:
df.isna().sum()

name               0
year               0
selling_price      0
km_driven          0
fuel               0
seller_type        0
transmission       0
owner              0
mileage          221
engine           221
max_power        215
torque           222
seats            221
dtype: int64

- many of the features has object data type even they have numbers

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8128 entries, 0 to 8127
Data columns (total 13 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   name           8128 non-null   object 
 1   year           8128 non-null   int64  
 2   selling_price  8128 non-null   int64  
 3   km_driven      8128 non-null   int64  
 4   fuel           8128 non-null   object 
 5   seller_type    8128 non-null   object 
 6   transmission   8128 non-null   object 
 7   owner          8128 non-null   object 
 8   mileage        7907 non-null   object 
 9   engine         7907 non-null   object 
 10  max_power      7913 non-null   object 
 11  torque         7906 non-null   object 
 12  seats          7907 non-null   float64
dtypes: float64(1), int64(3), object(9)
memory usage: 825.6+ KB


1. identify or seperate target variable (create X,y)
  - 1.1: y = selling price
2. split the data into train and test
  - test:approx 400 - train: remaining(include dev)
  - random state: 578
3. Clean up Dataset(Missing values ,etc)
  - fill by mean(if no outliers: by distribution)
  - fill by median(if outliers are present)
  - fill by mode (if categorical)
4. Convert categorical to numerical
  - One hot encoding(dummies)
  - Label encoding (DONT USE THIS): It is oly for target variable
  - ordinal encoding(range or scaling data)
5. Normalization and standardization if required.
6. Build model
7. Evalute the model 

In [7]:
df.head()

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,torque,seats
0,Maruti Swift Dzire VDI,2014,450000,145500,Diesel,Individual,Manual,First Owner,23.4 kmpl,1248 CC,74 bhp,190Nm@ 2000rpm,5.0
1,Skoda Rapid 1.5 TDI Ambition,2014,370000,120000,Diesel,Individual,Manual,Second Owner,21.14 kmpl,1498 CC,103.52 bhp,250Nm@ 1500-2500rpm,5.0
2,Honda City 2017-2020 EXi,2006,158000,140000,Petrol,Individual,Manual,Third Owner,17.7 kmpl,1497 CC,78 bhp,"12.7@ 2,700(kgm@ rpm)",5.0
3,Hyundai i20 Sportz Diesel,2010,225000,127000,Diesel,Individual,Manual,First Owner,23.0 kmpl,1396 CC,90 bhp,22.4 kgm at 1750-2750rpm,5.0
4,Maruti Swift VXI BSIII,2007,130000,120000,Petrol,Individual,Manual,First Owner,16.1 kmpl,1298 CC,88.2 bhp,"11.5@ 4,500(kgm@ rpm)",5.0


- Seperating name column into Brand and Model columns and droping out name column after seperated

In [8]:
df['Brand'] = df.name.str.split(' ').str[0]
df['Model'] = df.name.str.split(' ').str[1]
df.drop('name',axis=1,inplace=True)

In [9]:
df.head()

Unnamed: 0,year,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,torque,seats,Brand,Model
0,2014,450000,145500,Diesel,Individual,Manual,First Owner,23.4 kmpl,1248 CC,74 bhp,190Nm@ 2000rpm,5.0,Maruti,Swift
1,2014,370000,120000,Diesel,Individual,Manual,Second Owner,21.14 kmpl,1498 CC,103.52 bhp,250Nm@ 1500-2500rpm,5.0,Skoda,Rapid
2,2006,158000,140000,Petrol,Individual,Manual,Third Owner,17.7 kmpl,1497 CC,78 bhp,"12.7@ 2,700(kgm@ rpm)",5.0,Honda,City
3,2010,225000,127000,Diesel,Individual,Manual,First Owner,23.0 kmpl,1396 CC,90 bhp,22.4 kgm at 1750-2750rpm,5.0,Hyundai,i20
4,2007,130000,120000,Petrol,Individual,Manual,First Owner,16.1 kmpl,1298 CC,88.2 bhp,"11.5@ 4,500(kgm@ rpm)",5.0,Maruti,Swift


- Checking for torque column to additional modification on splitting out torque and speed

In [10]:
df.torque.value_counts()

190Nm@ 2000rpm             530
200Nm@ 1750rpm             445
90Nm@ 3500rpm              405
113Nm@ 4200rpm             223
114Nm@ 4000rpm             171
                          ... 
128Nm@ 3100rpm               1
72.9Nm@ 2250rpm              1
155 Nm at 1600-2800 rpm      1
510Nm@ 1600-2800rpm          1
96  Nm at 3000  rpm          1
Name: torque, Length: 441, dtype: int64

- Below integer function will give numerical values from string if not return nan value

In [11]:
from numpy.core.numeric import NaN
import re
def integer(val):
  z = re.match("[0-9.]+",str(val));
  if z: return str(z.group())
  else: return NaN

- applying integer function to mileage, engine, maxpower, torque columns to get numeric data

In [12]:
df.mileage = df.mileage.apply(integer)
df['mileage'] = df['mileage'].astype(float)

In [13]:
df.engine = df.engine.apply(integer)
df['engine'] = df['engine'].astype(float)

In [14]:
df.max_power = df.max_power.apply(integer)
df['max_power'] = df['max_power'].astype(float)

In [15]:
df['Torque'] = df.torque.apply(integer)
df['Torque'] = df['Torque'].astype(float)

- splitting out torque column into torque and speed by below technique and droping out torque column after seperated

In [16]:
df['Speed'] = df['torque'].str.findall(r'([,0-9]+)').str[-1]
df['Speed']= df['Speed'].str.replace(",", "", case = False)
df['Speed']= df['Speed'].astype(float)
df.drop('torque',axis=1,inplace=True)

In [17]:
df.head()

Unnamed: 0,year,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,seats,Brand,Model,Torque,Speed
0,2014,450000,145500,Diesel,Individual,Manual,First Owner,23.4,1248.0,74.0,5.0,Maruti,Swift,190.0,2000.0
1,2014,370000,120000,Diesel,Individual,Manual,Second Owner,21.14,1498.0,103.52,5.0,Skoda,Rapid,250.0,2500.0
2,2006,158000,140000,Petrol,Individual,Manual,Third Owner,17.7,1497.0,78.0,5.0,Honda,City,12.7,2700.0
3,2010,225000,127000,Diesel,Individual,Manual,First Owner,23.0,1396.0,90.0,5.0,Hyundai,i20,22.4,2750.0
4,2007,130000,120000,Petrol,Individual,Manual,First Owner,16.1,1298.0,88.2,5.0,Maruti,Swift,11.5,4500.0


In [18]:
df['Age'] = 2022 - df['year']
df.drop('year',axis=1,inplace=True)

In [19]:
df.head()

Unnamed: 0,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,seats,Brand,Model,Torque,Speed,Age
0,450000,145500,Diesel,Individual,Manual,First Owner,23.4,1248.0,74.0,5.0,Maruti,Swift,190.0,2000.0,8
1,370000,120000,Diesel,Individual,Manual,Second Owner,21.14,1498.0,103.52,5.0,Skoda,Rapid,250.0,2500.0,8
2,158000,140000,Petrol,Individual,Manual,Third Owner,17.7,1497.0,78.0,5.0,Honda,City,12.7,2700.0,16
3,225000,127000,Diesel,Individual,Manual,First Owner,23.0,1396.0,90.0,5.0,Hyundai,i20,22.4,2750.0,12
4,130000,120000,Petrol,Individual,Manual,First Owner,16.1,1298.0,88.2,5.0,Maruti,Swift,11.5,4500.0,15


In [20]:
#df.drop('Model',axis=1,inplace=True)
#df.drop('Brand',axis=1,inplace=True)

In [21]:
#df.Brand.unique()

In [22]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8128 entries, 0 to 8127
Data columns (total 15 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   selling_price  8128 non-null   int64  
 1   km_driven      8128 non-null   int64  
 2   fuel           8128 non-null   object 
 3   seller_type    8128 non-null   object 
 4   transmission   8128 non-null   object 
 5   owner          8128 non-null   object 
 6   mileage        7907 non-null   float64
 7   engine         7907 non-null   float64
 8   max_power      7912 non-null   float64
 9   seats          7907 non-null   float64
 10  Brand          8128 non-null   object 
 11  Model          8128 non-null   object 
 12  Torque         7906 non-null   float64
 13  Speed          7906 non-null   float64
 14  Age            8128 non-null   int64  
dtypes: float64(6), int64(3), object(6)
memory usage: 952.6+ KB


- Creating X and y vaiables

In [23]:
target = 'selling_price'
X = df.drop(target,axis=1)
y = df.loc[:,target]

In [24]:
y.isna().sum()

0

- spliting entire dataset into train and test data which contains 400 records for test and 7.7k for train which will divide train data further into train and dev dataset

In [25]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test =  train_test_split(X,y,test_size=0.05,random_state=578)
X_train.shape, X_test.shape,y_train.shape

((7721, 14), (407, 14), (7721,))

- Creating numerical columns and Categorical columns using make column selector

In [26]:
from sklearn.compose import make_column_selector
num_col = make_column_selector(dtype_exclude=object)
cat_col = make_column_selector(dtype_include=object)

- importing imputation, column transformer, encoding & pipeline libraries

In [27]:
from sklearn.impute import SimpleImputer
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder,OrdinalEncoder,LabelEncoder
from sklearn.pipeline import make_pipeline

- Creating imputation strategies for mean , median, mode

In [28]:
imp_mean = SimpleImputer(strategy='mean')
imp_median = SimpleImputer(strategy='median')
imp_cat = SimpleImputer(strategy='most_frequent')

- Creating one hot encoding and ordinal encoding strategies

In [29]:
onehot = OneHotEncoder(handle_unknown='ignore',categories='auto')
ordencode = OrdinalEncoder(handle_unknown='use_encoded_value',unknown_value=-1)
labelcoder = LabelEncoder()

- Creating column transformer strategy for mean imputaion with one hot encoding and median imputation with ordinal encoding

-https://stackoverflow.com/questions/55600774/valueerror-could-not-convert-string-to-float-while-using-onehotencoder-for-ma

In [30]:
col_transform_mean = make_column_transformer(
    (make_pipeline(imp_mean),num_col),
    (make_pipeline(imp_cat,onehot),cat_col),
    remainder='passthrough'
)

col_transform_median = make_column_transformer(
    (make_pipeline(imp_median),num_col),
    (make_pipeline(imp_cat,ordencode),cat_col),
    remainder='passthrough'
)

- Libraries for normalization and standardization

In [31]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

- Creating instance for linear regression, normalization, standardization

In [32]:
lrmodel = LinearRegression()
minmax = MinMaxScaler()
Stdscaler = StandardScaler()

- Creating KFOLD cross validation with 10 splits

In [33]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
kfold = KFold(n_splits=10,shuffle=True,random_state=578)

In [34]:
from sklearn.feature_selection import RFE
rfe1 = RFE(lrmodel, n_features_to_select= 73, step = 1)

In [35]:
pipe_mean = make_pipeline(col_transform_mean,rfe1,lrmodel)

In [36]:
X_train.shape, X_test.shape

((7721, 14), (407, 14))

In [37]:
pipe_mean.fit(X_train,y_train)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('pipeline-1',
                                                  Pipeline(steps=[('simpleimputer',
                                                                   SimpleImputer())]),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x7f65ad6293d0>),
                                                 ('pipeline-2',
                                                  Pipeline(steps=[('simpleimputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('onehotencoder',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                                  <sklearn.compose._column_trans

In [44]:
pipe_mean.get_params

<bound method Pipeline.get_params of Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('pipeline-1',
                                                  Pipeline(steps=[('simpleimputer',
                                                                   SimpleImputer())]),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x7f65ad6293d0>),
                                                 ('pipeline-2',
                                                  Pipeline(steps=[('simpleimputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('onehotencoder',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                           

In [38]:
pipe_mean.score(X_test,y_test)

0.8302366738826223

In [47]:
col_transform_mean

ColumnTransformer(remainder='passthrough',
                  transformers=[('pipeline-1',
                                 Pipeline(steps=[('simpleimputer',
                                                  SimpleImputer())]),
                                 <sklearn.compose._column_transformer.make_column_selector object at 0x7f65ad6293d0>),
                                ('pipeline-2',
                                 Pipeline(steps=[('simpleimputer',
                                                  SimpleImputer(strategy='most_frequent')),
                                                 ('onehotencoder',
                                                  OneHotEncoder(handle_unknown='ignore'))]),
                                 <sklearn.compose._column_transformer.make_column_selector object at 0x7f65ad629710>)])

In [48]:
col_transform_mean.transformers[1]

('pipeline-2',
 Pipeline(steps=[('simpleimputer', SimpleImputer(strategy='most_frequent')),
                 ('onehotencoder', OneHotEncoder(handle_unknown='ignore'))]),
 <sklearn.compose._column_transformer.make_column_selector at 0x7f65ad629710>)

In [49]:
col_transform_mean.transformers[1][1]

Pipeline(steps=[('simpleimputer', SimpleImputer(strategy='most_frequent')),
                ('onehotencoder', OneHotEncoder(handle_unknown='ignore'))])

In [50]:
col_transform_mean.transformers[1][1][1]

OneHotEncoder(handle_unknown='ignore')

In [54]:
col_transform_mean.transformers_[1][1][1].get_feature_names_out().shape

(249,)

In [56]:
rfe1.estimator

LinearRegression()

In [73]:
rfe1.estimator_

LinearRegression()

In [66]:
rfe1.n_features_in_

257

In [60]:
rfe1.get_feature_names_out()

array(['x20', 'x22', 'x23', 'x24', 'x25', 'x26', 'x27', 'x28', 'x29',
       'x31', 'x33', 'x35', 'x36', 'x38', 'x39', 'x42', 'x45', 'x46',
       'x47', 'x49', 'x51', 'x52', 'x53', 'x55', 'x56', 'x60', 'x69',
       'x80', 'x83', 'x85', 'x91', 'x99', 'x101', 'x107', 'x114', 'x118',
       'x121', 'x122', 'x123', 'x132', 'x134', 'x147', 'x149', 'x150',
       'x151', 'x156', 'x158', 'x160', 'x162', 'x165', 'x170', 'x179',
       'x183', 'x184', 'x185', 'x187', 'x191', 'x192', 'x195', 'x196',
       'x220', 'x223', 'x233', 'x234', 'x235', 'x236', 'x237', 'x238',
       'x239', 'x240', 'x241', 'x242', 'x245'], dtype=object)

In [57]:
rfe1.estimator.coef_

array([ 2684012.16664647,  -543058.60393907,  -367057.22127071,
        1818688.17036448,  2259478.30914771,  -396182.95074399,
        -295029.93909175,  -355297.07185201,  -366946.18957866,
        -194104.63857504,  -208861.52954361,  2244307.87071879,
        1484902.20532867,  1631470.13659964,  2241470.1085001 ,
        -264357.21218859,  -200249.70081062,  -299531.10522943,
        -214457.02751387,  -345044.98664444,  -192423.54436856,
        3632204.84853147, -1106538.07873967,  3005462.92823189,
       -1551538.46963321,  -732122.91260416,  -299531.10522943,
        1882939.59234209,  1281377.71955166,   629897.62829799,
        -463136.01936617,  1694725.94308354,  2241470.1085001 ,
         830338.97444562,  -422659.66221674,  1284599.30999692,
        3467940.45148205,  1591106.31801075,  2632941.27320894,
        1344653.95737951,   970128.01621203,  2358439.93772156,
        -513059.83111638,  2832941.17284091,  -417060.47925477,
        1767939.92734983,  2032939.70129

In [62]:
df_coeff = pd.DataFrame({'columns':rfe1.get_feature_names_out(),'coeff':rfe1.estimator.coef_})
df_coeff

Unnamed: 0,columns,coeff
0,x20,2.684012e+06
1,x22,-5.430586e+05
2,x23,-3.670572e+05
3,x24,1.818688e+06
4,x25,2.259478e+06
...,...,...
68,x239,4.273459e+06
69,x240,-5.259312e+05
70,x241,1.200735e+06
71,x242,5.700733e+06


In [85]:
df_coeff.sort_values('coeff',key=abs,ascending=False).round(3).head()


Unnamed: 0,columns,coeff
71,x242,5700732.878
57,x192,5295440.021
68,x239,4273458.894
21,x52,3632204.849
36,x121,3467940.451


In [86]:
df_coeff.sort_values('coeff',key=abs,ascending=False).round(3).tail()

Unnamed: 0,columns,coeff
16,x45,-200249.701
9,x31,-194104.639
20,x51,-192423.544
50,x170,-130295.069
55,x187,-83314.022
