In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Raw car-listing data, read directly from the CSV hosted on GitHub.
DATA_URL = 'https://raw.githubusercontent.com/Manju410/MLPractice/main/data/Cardetails.csv'
df = pd.read_csv(DATA_URL)
df.head()

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,torque,seats
0,Maruti Swift Dzire VDI,2014,450000,145500,Diesel,Individual,Manual,First Owner,23.4 kmpl,1248 CC,74 bhp,190Nm@ 2000rpm,5.0
1,Skoda Rapid 1.5 TDI Ambition,2014,370000,120000,Diesel,Individual,Manual,Second Owner,21.14 kmpl,1498 CC,103.52 bhp,250Nm@ 1500-2500rpm,5.0
2,Honda City 2017-2020 EXi,2006,158000,140000,Petrol,Individual,Manual,Third Owner,17.7 kmpl,1497 CC,78 bhp,"12.7@ 2,700(kgm@ rpm)",5.0
3,Hyundai i20 Sportz Diesel,2010,225000,127000,Diesel,Individual,Manual,First Owner,23.0 kmpl,1396 CC,90 bhp,22.4 kgm at 1750-2750rpm,5.0
4,Maruti Swift VXI BSIII,2007,130000,120000,Petrol,Individual,Manual,First Owner,16.1 kmpl,1298 CC,88.2 bhp,"11.5@ 4,500(kgm@ rpm)",5.0


- Car Details Dataset has 8128 rows and 13 columns

In [None]:
# Check the dimensions of the loaded frame as (rows, columns).
df.shape

(8128, 13)

In [None]:
# List the column labels to see which fields are available.
df.columns

Index(['name', 'year', 'selling_price', 'km_driven', 'fuel', 'seller_type',
       'transmission', 'owner', 'mileage', 'engine', 'max_power', 'torque',
       'seats'],
      dtype='object')

- The mileage, engine, max_power, torque and seats columns have null values

In [None]:
# Count the missing values in each column.
df.isna().sum()

name               0
year               0
selling_price      0
km_driven          0
fuel               0
seller_type        0
transmission       0
owner              0
mileage          221
engine           221
max_power        215
torque           222
seats            221
dtype: int64

- Many of the features have the object data type even though they hold numeric values

In [None]:
# Summarise the dtype and non-null count of every column.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8128 entries, 0 to 8127
Data columns (total 13 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   name           8128 non-null   object 
 1   year           8128 non-null   int64  
 2   selling_price  8128 non-null   int64  
 3   km_driven      8128 non-null   int64  
 4   fuel           8128 non-null   object 
 5   seller_type    8128 non-null   object 
 6   transmission   8128 non-null   object 
 7   owner          8128 non-null   object 
 8   mileage        7907 non-null   object 
 9   engine         7907 non-null   object 
 10  max_power      7913 non-null   object 
 11  torque         7906 non-null   object 
 12  seats          7907 non-null   float64
dtypes: float64(1), int64(3), object(9)
memory usage: 825.6+ KB


1. Identify and separate the target variable (create X, y)
  - 1.1: y = selling price
2. split the data into train and test
  - test:approx 400 - train: remaining(include dev)
  - random state: 578
3. Clean up Dataset(Missing values ,etc)
  - fill by mean(if no outliers: by distribution)
  - fill by median(if outliers are present)
  - fill by mode (if categorical)
4. Convert categorical to numerical
  - One hot encoding(dummies)
  - Label encoding (DON'T USE THIS for features): it is only for the target variable
  - ordinal encoding(range or scaling data)
5. Normalization and standardization if required.
6. Build model
7. Evaluate the model

In [None]:
# Look at the first rows again before deriving new columns from them.
df.head()

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,torque,seats
0,Maruti Swift Dzire VDI,2014,450000,145500,Diesel,Individual,Manual,First Owner,23.4 kmpl,1248 CC,74 bhp,190Nm@ 2000rpm,5.0
1,Skoda Rapid 1.5 TDI Ambition,2014,370000,120000,Diesel,Individual,Manual,Second Owner,21.14 kmpl,1498 CC,103.52 bhp,250Nm@ 1500-2500rpm,5.0
2,Honda City 2017-2020 EXi,2006,158000,140000,Petrol,Individual,Manual,Third Owner,17.7 kmpl,1497 CC,78 bhp,"12.7@ 2,700(kgm@ rpm)",5.0
3,Hyundai i20 Sportz Diesel,2010,225000,127000,Diesel,Individual,Manual,First Owner,23.0 kmpl,1396 CC,90 bhp,22.4 kgm at 1750-2750rpm,5.0
4,Maruti Swift VXI BSIII,2007,130000,120000,Petrol,Individual,Manual,First Owner,16.1 kmpl,1298 CC,88.2 bhp,"11.5@ 4,500(kgm@ rpm)",5.0


- Separating the name column into Brand and Model columns, then dropping the name column

In [None]:
# The first word of the listing name is used as the brand and the second word
# as the model; the free-text name column is then discarded.
name_parts = df['name'].str.split(' ')
df['Brand'] = name_parts.str[0]
df['Model'] = name_parts.str[1]
df.drop(columns=['name'], inplace=True)

In [None]:
# Confirm that Brand and Model were added and that name was dropped.
df.head()

Unnamed: 0,year,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,torque,seats,Brand,Model
0,2014,450000,145500,Diesel,Individual,Manual,First Owner,23.4 kmpl,1248 CC,74 bhp,190Nm@ 2000rpm,5.0,Maruti,Swift
1,2014,370000,120000,Diesel,Individual,Manual,Second Owner,21.14 kmpl,1498 CC,103.52 bhp,250Nm@ 1500-2500rpm,5.0,Skoda,Rapid
2,2006,158000,140000,Petrol,Individual,Manual,Third Owner,17.7 kmpl,1497 CC,78 bhp,"12.7@ 2,700(kgm@ rpm)",5.0,Honda,City
3,2010,225000,127000,Diesel,Individual,Manual,First Owner,23.0 kmpl,1396 CC,90 bhp,22.4 kgm at 1750-2750rpm,5.0,Hyundai,i20
4,2007,130000,120000,Petrol,Individual,Manual,First Owner,16.1 kmpl,1298 CC,88.2 bhp,"11.5@ 4,500(kgm@ rpm)",5.0,Maruti,Swift


- Inspecting the torque column to see which formats must be handled when splitting it into torque and speed

In [None]:
# Frequency of each distinct raw torque string, to see how many formats need parsing.
df.torque.value_counts()

190Nm@ 2000rpm             530
200Nm@ 1750rpm             445
90Nm@ 3500rpm              405
113Nm@ 4200rpm             223
114Nm@ 4000rpm             171
                          ... 
128Nm@ 3100rpm               1
72.9Nm@ 2250rpm              1
155 Nm at 1600-2800 rpm      1
510Nm@ 1600-2800rpm          1
96  Nm at 3000  rpm          1
Name: torque, Length: 441, dtype: int64

- The `integer` function below extracts the leading number from a value (returned as a string); if the value does not start with a number, it returns NaN

In [None]:
import re

# A number at the very start of the text: digits with an optional fractional
# part ("74", "23.4"), or a bare fraction (".5"). The previous pattern
# "[0-9.]+" also accepted non-numbers such as "." or "1.2.3", which made the
# later astype(float) fail.
_LEADING_NUMBER = re.compile(r"[0-9]+(?:\.[0-9]+)?|\.[0-9]+")


def integer(val):
  """Return the number that ``str(val)`` starts with, or NaN if there is none.

  Despite the name, the result is the matched text (a ``str`` that may contain
  a decimal point), e.g. ``"23.4 kmpl" -> "23.4"``; callers convert the column
  with ``astype(float)`` afterwards.

  Args:
    val: Any value; it is converted with ``str()`` before matching, so missing
      values (``None`` / NaN) simply fail to match.

  Returns:
    The leading number as a string, or ``np.nan`` when the text does not begin
    with a number (no leading whitespace is skipped).
  """
  found = _LEADING_NUMBER.match(str(val))
  if found:
    return found.group()
  # np.nan replaces `from numpy.core.numeric import NaN`: that is a private
  # module path and the `NaN` alias was removed in NumPy 2.0.
  return np.nan

- Applying the `integer` function to the mileage, engine, max_power and torque columns to get numeric data

In [None]:
# Keep only the leading number of values such as '23.4 kmpl' and store it as float.
df['mileage'] = df['mileage'].apply(integer).astype(float)

In [None]:
# Keep only the leading number of values such as '1248 CC' and store it as float.
df['engine'] = df['engine'].apply(integer).astype(float)

In [None]:
# Keep only the leading number of values such as '74 bhp' and store it as float.
df['max_power'] = df['max_power'].apply(integer).astype(float)

In [None]:
# New numeric column holding the leading number of the raw torque string
# (e.g. '190Nm@ 2000rpm' -> 190.0). The raw column is kept for the speed
# extraction that follows.
# NOTE(review): the raw strings mix units ('190Nm@ ...' vs '22.4 kgm at ...');
# the number is taken as-is, so Nm and kgm values end up on different scales.
df['Torque'] = df['torque'].apply(integer).astype(float)

- Splitting the torque column into torque and speed with the technique below, then dropping the original torque column

In [None]:
# The last run of digits/commas in the raw torque string is taken as the
# speed; for a range such as '1500-2500rpm' that is the upper bound. Thousands
# separators ('2,700') are removed before converting to float.
last_number = df['torque'].str.findall(r'([,0-9]+)').str[-1]
df['Speed'] = last_number.str.replace(",", "", case = False).astype(float)
# The raw text column is no longer needed once Torque and Speed exist.
df.drop(columns=['torque'], inplace=True)

In [None]:
# Check the parsed numeric columns and the new Torque / Speed columns.
df.head()

Unnamed: 0,year,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,seats,Brand,Model,Torque,Speed
0,2014,450000,145500,Diesel,Individual,Manual,First Owner,23.4,1248.0,74.0,5.0,Maruti,Swift,190.0,2000.0
1,2014,370000,120000,Diesel,Individual,Manual,Second Owner,21.14,1498.0,103.52,5.0,Skoda,Rapid,250.0,2500.0
2,2006,158000,140000,Petrol,Individual,Manual,Third Owner,17.7,1497.0,78.0,5.0,Honda,City,12.7,2700.0
3,2010,225000,127000,Diesel,Individual,Manual,First Owner,23.0,1396.0,90.0,5.0,Hyundai,i20,22.4,2750.0
4,2007,130000,120000,Petrol,Individual,Manual,First Owner,16.1,1298.0,88.2,5.0,Maruti,Swift,11.5,4500.0


In [None]:
# Vehicle age in years, measured against a fixed reference year.
REFERENCE_YEAR = 2022
df['Age'] = REFERENCE_YEAR - df['year']
df.drop(columns=['year'], inplace=True)

In [None]:
# Confirm that Age was added and that year was dropped.
df.head()

Unnamed: 0,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,seats,Brand,Model,Torque,Speed,Age
0,450000,145500,Diesel,Individual,Manual,First Owner,23.4,1248.0,74.0,5.0,Maruti,Swift,190.0,2000.0,8
1,370000,120000,Diesel,Individual,Manual,Second Owner,21.14,1498.0,103.52,5.0,Skoda,Rapid,250.0,2500.0,8
2,158000,140000,Petrol,Individual,Manual,Third Owner,17.7,1497.0,78.0,5.0,Honda,City,12.7,2700.0,16
3,225000,127000,Diesel,Individual,Manual,First Owner,23.0,1396.0,90.0,5.0,Hyundai,i20,22.4,2750.0,12
4,130000,120000,Petrol,Individual,Manual,First Owner,16.1,1298.0,88.2,5.0,Maruti,Swift,11.5,4500.0,15


In [None]:
#df.drop('Model',axis=1,inplace=True)
#df.drop('Brand',axis=1,inplace=True)

In [None]:
#df.Brand.unique()

In [None]:
# Re-check dtypes and non-null counts after the numeric conversions.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8128 entries, 0 to 8127
Data columns (total 15 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   selling_price  8128 non-null   int64  
 1   km_driven      8128 non-null   int64  
 2   fuel           8128 non-null   object 
 3   seller_type    8128 non-null   object 
 4   transmission   8128 non-null   object 
 5   owner          8128 non-null   object 
 6   mileage        7907 non-null   float64
 7   engine         7907 non-null   float64
 8   max_power      7912 non-null   float64
 9   seats          7907 non-null   float64
 10  Brand          8128 non-null   object 
 11  Model          8128 non-null   object 
 12  Torque         7906 non-null   float64
 13  Speed          7906 non-null   float64
 14  Age            8128 non-null   int64  
dtypes: float64(6), int64(3), object(6)
memory usage: 952.6+ KB


- Creating the X and y variables

In [None]:
# y is the column to predict; X is every other column.
target = 'selling_price'
y = df[target]
X = df.drop(columns=[target])

In [None]:
# Check the target for missing values.
y.isna().sum()

0

- Splitting the entire dataset into train and test data: about 400 records for test and about 7.7k for train; the train data will later be divided further into train and dev sets

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 5% of the rows as the test set; the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.05,
    random_state=578,
)
# Show the resulting sizes of the train features, test features and train target.
tuple(part.shape for part in (X_train, X_test, y_train))

((7721, 14), (407, 14), (7721,))

- Creating numerical columns and Categorical columns using make column selector

In [None]:
from sklearn.compose import make_column_selector
# Selectors are resolved against the frame's dtypes when the transformer is
# fitted: object columns are treated as categorical, everything else as numeric.
cat_col = make_column_selector(dtype_include=object)
num_col = make_column_selector(dtype_exclude=object)

- importing imputation, column transformer, encoding & pipeline libraries

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder,OrdinalEncoder
from sklearn.pipeline import make_pipeline

- Creating imputation strategies for mean , median, mode

In [None]:
# One imputer per fill strategy: mean and median for numeric columns,
# most-frequent value (mode) for categorical columns.
imp_mean, imp_median, imp_cat = (
    SimpleImputer(strategy=strategy_name)
    for strategy_name in ('mean', 'median', 'most_frequent')
)

- Creating one hot encoding and ordinal encoding strategies

In [None]:
# Ordinal encoder: categories not seen during fit are encoded as -1.
ordencode = OrdinalEncoder(
    handle_unknown='use_encoded_value',
    unknown_value=-1,
)
# One-hot encoder: categories not seen during fit are ignored instead of raising.
onehot = OneHotEncoder(handle_unknown='ignore')

- Creating column transformers: mean imputation with one-hot encoding, and median imputation with ordinal encoding

In [None]:
# Variant 1: numeric columns -> mean imputation;
#            categorical columns -> mode imputation, then one-hot encoding.
numeric_mean = make_pipeline(imp_mean)
categorical_onehot = make_pipeline(imp_cat, onehot)
col_transform_mean = make_column_transformer(
    (numeric_mean, num_col),
    (categorical_onehot, cat_col),
    remainder='passthrough',
)

# Variant 2: numeric columns -> median imputation;
#            categorical columns -> mode imputation, then ordinal encoding.
numeric_median = make_pipeline(imp_median)
categorical_ordinal = make_pipeline(imp_cat, ordencode)
col_transform_median = make_column_transformer(
    (numeric_median, num_col),
    (categorical_ordinal, cat_col),
    remainder='passthrough',
)

- Libraries for normalization and standardization

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

- Creating instance for linear regression, normalization, standardization

In [None]:
# Plain linear regression plus the two scalers (min-max normalization and
# standardization), created once for reuse in the pipelines.
lrmodel, minmax, Stdscaler = LinearRegression(), MinMaxScaler(), StandardScaler()

- Creating KFOLD cross validation with 10 splits

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
# 10-fold cross-validation; rows are shuffled with a fixed seed so the folds
# are the same on every run.
kfold = KFold(
    n_splits=10,
    shuffle=True,
    random_state=578,
)

- Creating Ridge and Lasso models with various strategies

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
# Regularized linear models with default settings; alpha is tuned later by grid search.
ridge_model, lasso_model = Ridge(), Lasso()

- Reference: https://stackoverflow.com/questions/45261459/invalid-parameter-loss-for-estimator-pipeline/45261491

In [None]:
# Preprocessing (mean imputation + one-hot encoding) followed by Lasso.
pipe_lasso_mean = make_pipeline(col_transform_mean,lasso_model)
# Candidate alphas 0.02, 0.04, ..., 0.2. This is the original grid minus its
# first point, alpha=0: scikit-learn advises against Lasso(alpha=0) (it is
# ordinary least squares solved by coordinate descent) and it triggers
# convergence warnings during the search.
# NOTE(review): the recorded best alpha (0.2) sits on the upper edge of this
# grid, so larger values may be worth exploring.
lasso_alphas = np.linspace(0, 0.2, 11)[1:]
# Grid keys are '<pipeline step name>__<parameter>'; make_pipeline names the
# step 'lasso' after the estimator class.
grid = {'lasso__alpha': lasso_alphas}
Lasso_grid_model = GridSearchCV(pipe_lasso_mean, param_grid=grid, cv=kfold, n_jobs=-1)

In [None]:
# List the pipeline's parameter names; the grid-search keys (e.g. 'lasso__alpha')
# have to match these names.
pipe_lasso_mean.get_params()

{'columntransformer': ColumnTransformer(remainder='passthrough',
                   transformers=[('pipeline-1',
                                  Pipeline(steps=[('simpleimputer',
                                                   SimpleImputer())]),
                                  <sklearn.compose._column_transformer.make_column_selector object at 0x7f926cf0da90>),
                                 ('pipeline-2',
                                  Pipeline(steps=[('simpleimputer',
                                                   SimpleImputer(strategy='most_frequent')),
                                                  ('onehotencoder',
                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                  <sklearn.compose._column_transformer.make_column_selector object at 0x7f926cf0dc50>)]),
 'columntransformer__n_jobs': None,
 'columntransformer__pipeline-1': Pipeline(steps=[('simpleimputer', SimpleImputer())]),
 'columntra

In [None]:
# Run the cross-validated grid search over the Lasso alphas on the training split.
Lasso_grid_model.fit(X_train,y_train)

  positive,


GridSearchCV(cv=KFold(n_splits=10, random_state=578, shuffle=True),
             estimator=Pipeline(steps=[('columntransformer',
                                        ColumnTransformer(remainder='passthrough',
                                                          transformers=[('pipeline-1',
                                                                         Pipeline(steps=[('simpleimputer',
                                                                                          SimpleImputer())]),
                                                                         <sklearn.compose._column_transformer.make_column_selector object at 0x7f926cf0da90>),
                                                                        ('pipeline-2',
                                                                         Pipeline(steps=[('simpleimputer',
                                                                                          SimpleImputer(strategy='most_frequent')),

In [None]:
# Alpha value selected by the Lasso grid search.
Lasso_grid_model.best_params_

{'lasso__alpha': 0.2}

In [None]:
# Best cross-validation score reached by the Lasso grid search.
Lasso_grid_model.best_score_

0.9311601972572261

In [None]:
# Show the pipeline that the Lasso grid search selected.
Lasso_grid_model.best_estimator_

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('pipeline-1',
                                                  Pipeline(steps=[('simpleimputer',
                                                                   SimpleImputer())]),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x7f926d092c50>),
                                                 ('pipeline-2',
                                                  Pipeline(steps=[('simpleimputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('onehotencoder',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                                  <sklearn.compose._column_trans

In [None]:
# Same preprocessing as the Lasso pipeline, with Ridge as the final step.
pipe_ridge_mean = make_pipeline(col_transform_mean, ridge_model)
# 11 evenly spaced candidate alphas from 100 to 500 (step 40).
ridge_alphas = np.linspace(100, 500, 11)
# The key addresses the 'alpha' parameter of the pipeline step named 'ridge'.
grid = {'ridge__alpha': ridge_alphas}
ridge_grid_model = GridSearchCV(
    pipe_ridge_mean,
    param_grid=grid,
    cv=kfold,
    n_jobs=-1,
)

In [None]:
# Run the cross-validated grid search over the Ridge alphas on the training split.
ridge_grid_model.fit(X_train,y_train)

GridSearchCV(cv=KFold(n_splits=10, random_state=578, shuffle=True),
             estimator=Pipeline(steps=[('columntransformer',
                                        ColumnTransformer(remainder='passthrough',
                                                          transformers=[('pipeline-1',
                                                                         Pipeline(steps=[('simpleimputer',
                                                                                          SimpleImputer())]),
                                                                         <sklearn.compose._column_transformer.make_column_selector object at 0x7f926cf0da90>),
                                                                        ('pipeline-2',
                                                                         Pipeline(steps=[('simpleimputer',
                                                                                          SimpleImputer(strategy='most_frequent')),

In [None]:
# Alpha value selected by the Ridge grid search.
ridge_grid_model.best_params_

{'ridge__alpha': 140.0}

In [None]:
# Best cross-validation score reached by the Ridge grid search.
# NOTE(review): the recorded value (~0.48) is far below the Lasso search's
# (~0.93) on the same preprocessing; the alpha range may be too large - confirm.
ridge_grid_model.best_score_

0.4803892222451968

In [None]:
# Show the pipeline that the Ridge grid search selected.
ridge_grid_model.best_estimator_

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('pipeline-1',
                                                  Pipeline(steps=[('simpleimputer',
                                                                   SimpleImputer())]),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x7f926d034790>),
                                                 ('pipeline-2',
                                                  Pipeline(steps=[('simpleimputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('onehotencoder',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                                  <sklearn.compose._column_trans

In [1]:
import pickle

In [2]:
# Load a previously saved pipeline from the working directory.
# NOTE(review): 'lrpipe.pkl' is not written anywhere in the visible notebook,
# so it must already exist before this cell runs.
# SECURITY: unpickling can execute arbitrary code - only load files from a
# trusted source.
with open('lrpipe.pkl','rb') as FH:
  lr_model_mean = pickle.load(FH)

In [3]:
# Display the unpickled pipeline to check its steps.
lr_model_mean

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('pipeline-1',
                                                  Pipeline(steps=[('simpleimputer',
                                                                   SimpleImputer())]),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x7f7396067510>),
                                                 ('pipeline-2',
                                                  Pipeline(steps=[('simpleimputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('onehotencoder',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                                  <sklearn.compose._column_trans

In [4]:
# Record the OS / kernel details of the machine running this session.
!uname -a

Linux aedd9660a955 5.4.144+ #1 SMP Tue Dec 7 09:58:10 PST 2021 x86_64 x86_64 x86_64 GNU/Linux


In [5]:
# Record the date and time of this session.
!date

Sun Feb 20 06:05:49 UTC 2022


In [6]:
# List the working directory to see which saved model files are present.
!ls -l

total 8
-rw-r--r-- 1 root root 1300 Feb 20 05:59 lrpipe.pkl
drwxr-xr-x 1 root root 4096 Feb  1 14:32 sample_data


In [7]:
import joblib

In [8]:
# Load the joblib-serialized copy of the pipeline.
# NOTE(review): 'lrpipe.jlb' is not created in the visible notebook and does
# not appear in the recorded `ls -l` output, so it must be supplied before
# this cell runs.
# SECURITY: joblib files are pickle-based - only load files from a trusted source.
lr_model2 = joblib.load('lrpipe.jlb')

In [9]:
# Display the joblib-loaded pipeline to check its steps.
lr_model2

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('pipeline-1',
                                                  Pipeline(steps=[('simpleimputer',
                                                                   SimpleImputer())]),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x7f73951a67d0>),
                                                 ('pipeline-2',
                                                  Pipeline(steps=[('simpleimputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('onehotencoder',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                                  <sklearn.compose._column_trans