In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

In [2]:
df = sns.load_dataset('mpg')

In [3]:
df


Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
0,18.0,8,307.0,130.0,3504,12.0,70,usa,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693,11.5,70,usa,buick skylark 320
2,18.0,8,318.0,150.0,3436,11.0,70,usa,plymouth satellite
3,16.0,8,304.0,150.0,3433,12.0,70,usa,amc rebel sst
4,17.0,8,302.0,140.0,3449,10.5,70,usa,ford torino
...,...,...,...,...,...,...,...,...,...
393,27.0,4,140.0,86.0,2790,15.6,82,usa,ford mustang gl
394,44.0,4,97.0,52.0,2130,24.6,82,europe,vw pickup
395,32.0,4,135.0,84.0,2295,11.6,82,usa,dodge rampage
396,28.0,4,120.0,79.0,2625,18.6,82,usa,ford ranger


In [4]:
df.drop('name',axis = 1 , inplace = True)

In [5]:
df.head(5)


Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin
0,18.0,8,307.0,130.0,3504,12.0,70,usa
1,15.0,8,350.0,165.0,3693,11.5,70,usa
2,18.0,8,318.0,150.0,3436,11.0,70,usa
3,16.0,8,304.0,150.0,3433,12.0,70,usa
4,17.0,8,302.0,140.0,3449,10.5,70,usa


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float64
 1   cylinders     398 non-null    int64  
 2   displacement  398 non-null    float64
 3   horsepower    392 non-null    float64
 4   weight        398 non-null    int64  
 5   acceleration  398 non-null    float64
 6   model_year    398 non-null    int64  
 7   origin        398 non-null    object 
dtypes: float64(4), int64(3), object(1)
memory usage: 25.0+ KB


In [7]:
df.isna().sum()

Unnamed: 0,0
mpg,0
cylinders,0
displacement,0
horsepower,6
weight,0
acceleration,0
model_year,0
origin,0


In [8]:
df.dtypes

Unnamed: 0,0
mpg,float64
cylinders,int64
displacement,float64
horsepower,float64
weight,int64
acceleration,float64
model_year,int64
origin,object


In [9]:
df.horsepower.median()

np.float64(93.5)

In [10]:
df['horsepower'] = df['horsepower'].fillna(df['horsepower'].median())

In [11]:
df.isna().sum()

Unnamed: 0,0
mpg,0
cylinders,0
displacement,0
horsepower,0
weight,0
acceleration,0
model_year,0
origin,0


In [12]:
df['origin'].value_counts()

Unnamed: 0_level_0,count
origin,Unnamed: 1_level_1
usa,249
japan,79
europe,70


In [13]:
df['origin'] = df['origin'].map({'usa': 1 , 'japan' : 2 , 'europe' : 3})

In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float64
 1   cylinders     398 non-null    int64  
 2   displacement  398 non-null    float64
 3   horsepower    398 non-null    float64
 4   weight        398 non-null    int64  
 5   acceleration  398 non-null    float64
 6   model_year    398 non-null    int64  
 7   origin        398 non-null    int64  
dtypes: float64(4), int64(4)
memory usage: 25.0 KB


In [15]:
X = df.drop('mpg',axis = 1 )
y = df.mpg

In [16]:
from sklearn.model_selection import train_test_split

In [17]:
X_train , X_test , y_train , y_test = train_test_split(X , y , test_size = 0.2 , random_state=1)

In [18]:
from sklearn.linear_model import LinearRegression

In [19]:
model = LinearRegression()

In [20]:
model.fit(X_train, y_train)

In [21]:
model.coef_

array([-0.2165378 ,  0.01987994, -0.01270482, -0.0071647 ,  0.09047637,
        0.84604207,  1.42800007])

In [22]:
for i , col_name in enumerate(X_train.columns):
  print(f"The coefficient for {col_name} is {model.coef_[i]}")

The coefficient for cylinders is -0.2165378021683322
The coefficient for displacement is 0.01987993620887507
The coefficient for horsepower is -0.012704816241841353
The coefficient for weight is -0.00716469815766328
The coefficient for acceleration is 0.09047636715999363
The coefficient for model_year is 0.8460420672591425
The coefficient for origin is 1.4280000651600893


In [23]:
#coeffiecient is very small if one if one independent variable changes there will be not much effect on the model that is known is a smoother model
# the features might not be contributing much

In [24]:
from sklearn.metrics import r2_score

In [25]:
y_pred = model.predict(X_test)

In [26]:
r2_linear = r2_score(y_test,y_pred)
print(f"The r2 of this model is {r2_linear}")

The r2 of this model is 0.856301245145641


In [27]:
#ridge_regression


In [28]:
from sklearn.linear_model import Ridge

In [35]:
Ridge_regression_model = Ridge(alpha = 0.1 )
Ridge_regression_model

In [36]:
Ridge_regression_model.fit(X_train,y_train)

In [37]:
for i , col_name in enumerate(X_train.columns):
  print(f"The coefficient for {col_name} is {Ridge_regression_model.coef_[i]}")

The coefficient for cylinders is -0.2161530502602711
The coefficient for displacement is 0.019862215811114647
The coefficient for horsepower is -0.01269732411659296
The coefficient for weight is -0.007164428584491182
The coefficient for acceleration is 0.09045740164184107
The coefficient for model_year is 0.8459747074062308
The coefficient for origin is 1.4266691004781444


In [38]:
y_pred_ridge = Ridge_regression_model.predict(X_test)

In [39]:
r2_ridge = r2_score(y_test,y_pred_ridge)

In [40]:
r2_ridge

0.8563026607885915

In [41]:
# there is not much change in the r2 coz the data was already smooth

In [43]:
#Lasso
from sklearn.linear_model import Lasso

In [44]:
Lasso_reg_model = Lasso(alpha =0.5)

In [45]:
Lasso_reg_model.fit(X_train,y_train)

In [46]:
for i , col_name in enumerate(X_train.columns):
  print(f"The coefficient for {col_name} is {Lasso_reg_model.coef_[i]}")

The coefficient for cylinders is -0.0
The coefficient for displacement is 0.0019772716551741116
The coefficient for horsepower is -0.00929169211171875
The coefficient for weight is -0.0065899285654646925
The coefficient for acceleration is 0.0
The coefficient for model_year is 0.7532080062488489
The coefficient for origin is 0.0


In [47]:
y_pred_Lasso = Lasso_reg_model.predict(X_test)

In [48]:
r2_score_Lasso = r2_score(y_test,y_pred_Lasso)

In [49]:
r2_score_Lasso

0.8457765614927952

In [52]:
from sklearn.linear_model import ElasticNet

In [53]:
Elasticnet_model = ElasticNet(alpha = 1 , l1_ratio = 0.5)

In [54]:
Elasticnet_model.fit(X_train,y_train)

In [55]:
y_pred_Elasticnet_model = Elasticnet_model.predict(X_test)

In [56]:
r2_elasticnet = r2_score(y_test,y_pred_Elasticnet_model)

In [59]:
#regularization with cross-validation


In [60]:
from sklearn.linear_model import LassoCV

In [65]:
Lasso_cv = LassoCV(cv=5 , verbose=3)

In [66]:
Lasso_cv.fit(X_train,y_train)

(array([-0.00000000e+00, -0.00000000e+00, -0.00000000e+00, -2.05159276e-05,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00]), -1.8189894035458565e-12, 1.553369358267717, 2)
(array([-0.        , -0.        , -0.        , -0.00053551,  0.        ,
        0.        ,  0.        ]), 0.0, 1.553369358267717, 2)
(array([-0.       , -0.       , -0.       , -0.0010158,  0.       ,
        0.       ,  0.       ]), 0.0, 1.553369358267717, 2)
(array([-0.        , -0.        , -0.        , -0.00146371,  0.        ,
        0.        ,  0.        ]), 1.8189894035458565e-12, 1.553369358267717, 2)
(array([-0.        , -0.        , -0.        , -0.00188144,  0.        ,
        0.        ,  0.        ]), 0.0, 1.553369358267717, 2)
(array([-0.        , -0.        , -0.        , -0.00227102,  0.        ,
        0.        ,  0.        ]), 0.0, 1.553369358267717, 2)
(array([-0.        , -0.        , -0.        , -0.00263434,  0.        ,
        0.        ,  0.        ]), 0.0, 1.553369358267717

[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.2s finished


In [67]:
y_pred_lasso_cv = Lasso_cv.predict(X_test)

In [68]:
r2_Lasso_cv = r2_score(y_test,y_pred_lasso_cv)

In [73]:
from sklearn.linear_model import RidgeCV

ridge_cv = RidgeCV(cv=5)
ridge_cv.fit(X_train, y_train)
y_pred_ridge_cv = ridge_cv.predict(X_test)
r2_ridge_cv = r2_score(y_test, y_pred_ridge_cv)

In [74]:
from sklearn.linear_model import ElasticNetCV

elasticnet_cv = ElasticNetCV(cv=5)
elasticnet_cv.fit(X_train, y_train)
y_pred_elasticnet_cv = elasticnet_cv.predict(X_test)
r2_elasticnet_cv = r2_score(y_test, y_pred_elasticnet_cv)

In [75]:
print(f'R2 score of normal model: {r2_linear}')
print(f'R2 score of Ridge model: {r2_ridge}')
print(f'R2 score of Lasso model: {r2_score_Lasso}')
print(f'R2 score of ElasticNet model: {r2_elasticnet}')
print(f'R2 score of LassoCV model: {r2_Lasso_cv}')
print(f'R2 score of RidgeCV model: {r2_ridge_cv}')
print(f'R2 score of ElasticNetCV model: {r2_elasticnet_cv}')

R2 score of normal model: 0.856301245145641
R2 score of Ridge model: 0.8563026607885915
R2 score of Lasso model: 0.8457765614927952
R2 score of ElasticNet model: 0.8477301010951965
R2 score of LassoCV model: 0.8410032034906063
R2 score of RidgeCV model: 0.8563494248817898
R2 score of ElasticNetCV model: 0.8269714661183815
