In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation and convergence
# warnings from the libraries used below; consider filtering specific
# categories instead.
warnings.filterwarnings("ignore")

In [2]:
# Load seaborn's "mpg" example dataset into a DataFrame.
df = sns.load_dataset('mpg')

In [3]:
# Drop the free-text `name` column; it is not used as a model feature below.
# Reassign instead of mutating in place, and tolerate an already-missing
# column, so re-running this cell does not raise KeyError once `name` is gone.
df = df.drop(columns='name', errors='ignore')

In [4]:
# Preview the frame; the rendered table elides the middle rows.
df

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin
0,18.0,8,307.0,130.0,3504,12.0,70,usa
1,15.0,8,350.0,165.0,3693,11.5,70,usa
2,18.0,8,318.0,150.0,3436,11.0,70,usa
3,16.0,8,304.0,150.0,3433,12.0,70,usa
4,17.0,8,302.0,140.0,3449,10.5,70,usa
...,...,...,...,...,...,...,...,...
393,27.0,4,140.0,86.0,2790,15.6,82,usa
394,44.0,4,97.0,52.0,2130,24.6,82,europe
395,32.0,4,135.0,84.0,2295,11.6,82,usa
396,28.0,4,120.0,79.0,2625,18.6,82,usa


In [5]:
# Count the missing values in each column.
df.isna().sum()

mpg             0
cylinders       0
displacement    0
horsepower      6
weight          0
acceleration    0
model_year      0
origin          0
dtype: int64

In [6]:
# Median horsepower; pandas skips missing values by default (skipna=True).
df['horsepower'].median()

93.5

In [7]:
# Fill the missing horsepower values with the column median.
# NOTE(review): the median is computed on the full dataset, i.e. before the
# train/test split further down, so test rows influence the fill value.
horsepower_median = df['horsepower'].median()
df['horsepower'] = df['horsepower'].fillna(horsepower_median)

In [8]:
# Re-check: no column should report missing values after the imputation.
df.isna().sum()

mpg             0
cylinders       0
displacement    0
horsepower      0
weight          0
acceleration    0
model_year      0
origin          0
dtype: int64

In [9]:
df.info()   # inspect dtypes: any object (string) column must be converted to numeric before modelling

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float64
 1   cylinders     398 non-null    int64  
 2   displacement  398 non-null    float64
 3   horsepower    398 non-null    float64
 4   weight        398 non-null    int64  
 5   acceleration  398 non-null    float64
 6   model_year    398 non-null    int64  
 7   origin        398 non-null    object 
dtypes: float64(4), int64(3), object(1)
memory usage: 25.0+ KB


In [10]:
# List the distinct labels in the `origin` column.
df['origin'].unique()

array(['usa', 'japan', 'europe'], dtype=object)

In [11]:
# Number of rows per `origin` label, most frequent first.
df['origin'].value_counts()

origin
usa       249
japan      79
europe     70
Name: count, dtype: int64

In [12]:
# Encode the `origin` labels as integer codes so the column is numeric.
# NOTE(review): these codes impose an arbitrary order (usa < japan < europe)
# on a nominal feature; one-hot encoding is usually a better fit for linear
# models. The integer coding is kept so downstream cells are unchanged.
origin_codes = {'usa': 1, 'japan': 2, 'europe': 3}

# Only encode while the column still holds labels: Series.map returns NaN for
# values missing from the dict, so re-running an unguarded map on the
# already-encoded integers would wipe the whole column.
if not pd.api.types.is_numeric_dtype(df['origin']):
    unknown_labels = set(df['origin'].dropna().unique()) - set(origin_codes)
    if unknown_labels:
        # Fail loudly instead of silently turning unexpected labels into NaN.
        raise ValueError(f"Unmapped origin labels: {sorted(unknown_labels)}")
    df['origin'] = df['origin'].map(origin_codes)

In [13]:
# Inspect the encoded `origin` column.
df['origin']

0      1
1      1
2      1
3      1
4      1
      ..
393    1
394    3
395    1
396    1
397    1
Name: origin, Length: 398, dtype: int64

In [14]:
# Confirm that every column is now numeric.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float64
 1   cylinders     398 non-null    int64  
 2   displacement  398 non-null    float64
 3   horsepower    398 non-null    float64
 4   weight        398 non-null    int64  
 5   acceleration  398 non-null    float64
 6   model_year    398 non-null    int64  
 7   origin        398 non-null    int64  
dtypes: float64(4), int64(4)
memory usage: 25.0 KB


In [15]:
# Separate the regression target (mpg) from the predictor columns.
target_column = "mpg"
y = df[target_column]
X = df.drop(columns=[target_column])

In [16]:
from sklearn.model_selection import train_test_split

In [17]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
)

In [18]:
# (rows, columns) of the training and test feature matrices.
X_train.shape, X_test.shape

((278, 7), (120, 7))

In [19]:
from sklearn.linear_model import LinearRegression
# Unregularised linear regression, used as the baseline for the penalised
# models fitted later; it is not fitted yet at this point.
model_regression = LinearRegression()
model_regression

In [20]:
# Fit the baseline model on the training split only.
model_regression.fit(X_train, y_train)

In [21]:
# Report each fitted coefficient next to the feature it belongs to.
# coef_ is ordered like the training columns, so the two can be paired directly.
for col_name, coef in zip(X_train.columns, model_regression.coef_):
    print(f"The coefficient for {col_name} is {coef}")

The coefficient for cylinders is -0.3176142302799369
The coefficient for displacement is 0.026237482599078946
The coefficient for horsepower is -0.018270764913124595
The coefficient for weight is -0.007487750398361897
The coefficient for acceleration is 0.0504067346197138
The coefficient for model_year is 0.8470951427061365
The coefficient for origin is 1.5190958387975024


In [22]:
# Demo of what enumerate does: it yields (index, item) pairs, here the
# position of each column together with its name.
for i,col_name in enumerate(X_train.columns):
    print(i , col_name)

0 cylinders
1 displacement
2 horsepower
3 weight
4 acceleration
5 model_year
6 origin


In [23]:
from sklearn.metrics import r2_score

# R^2 of the baseline linear model on the held-out test split.
y_pred_linear = model_regression.predict(X_test)
r2_score(y_true=y_test, y_pred=y_pred_linear)

0.8348001123742285

In [24]:
# ridge regression
from sklearn.linear_model import Ridge

In [25]:
# Ridge (L2-penalised) linear regression with penalty strength alpha=0.1.
# NOTE(review): the features are not standardised, so the penalty acts
# unevenly on columns with very different scales (e.g. weight vs. cylinders);
# consider scaling the features before fitting penalised models.
ridge_regression_model = Ridge(alpha=0.1)
ridge_regression_model.fit(X_train, y_train)

# Report each fitted coefficient next to its feature name.
for col_name, coef in zip(X_train.columns, ridge_regression_model.coef_):
    print(f"The coefficient for {col_name} is {coef}")

The coefficient for cylinders is -0.317003210100688
The coefficient for displacement is 0.026213249757983868
The coefficient for horsepower is -0.018263252481448933
The coefficient for weight is -0.007487326050213144
The coefficient for acceleration is 0.05036896947442996
The coefficient for model_year is 0.8470062938903175
The coefficient for origin is 1.5174528285653952


In [26]:
from sklearn.metrics import r2_score

# R^2 of the ridge model on the held-out test split.
y_pred = ridge_regression_model.predict(X_test)
r2_score(y_true=y_test, y_pred=y_pred)

0.8348084889168356

In [27]:
# lasso regression
from sklearn.linear_model import Lasso

In [28]:
# Lasso (L1-penalised) linear regression with penalty strength alpha=0.1.
# NOTE(review): the features are not standardised, so which coefficients are
# shrunk to zero depends on each column's scale; consider scaling first.
lasso_regression_model = Lasso(alpha=0.1)
lasso_regression_model.fit(X_train, y_train)

# Report each fitted coefficient next to its feature name.
for col_name, coef in zip(X_train.columns, lasso_regression_model.coef_):
    print(f"The coefficient for {col_name} is {coef}")

The coefficient for cylinders is -0.0
The coefficient for displacement is 0.017751964528123634
The coefficient for horsepower is -0.019307657818043437
The coefficient for weight is -0.007285668218708932
The coefficient for acceleration is 0.011179016975864573
The coefficient for model_year is 0.8258205650724195
The coefficient for origin is 1.1922557769714675


In [29]:
from sklearn.metrics import r2_score

# R^2 of the lasso model on the held-out test split.
# Note: this overwrites the `y_pred` produced for the ridge model.
y_pred = lasso_regression_model.predict(X_test)
r2_score(y_true=y_test, y_pred=y_pred)

0.8345318641232303

When the goal is to reduce overfitting, use ridge regression; when you also want feature selection, use lasso, whose penalty can shrink some coefficients exactly to zero.

In [30]:
# elastic net regression
from sklearn.linear_model import ElasticNet

In [31]:
# Elastic net: a mix of the L1 and L2 penalties (l1_ratio=0.5 weights them
# equally) with overall penalty strength alpha=1.
# NOTE(review): the features are not standardised, so the penalty acts
# unevenly on columns with different scales; consider scaling first.
elastic_net_regression_model = ElasticNet(alpha=1, l1_ratio=0.5)
elastic_net_regression_model.fit(X_train, y_train)

# Report each fitted coefficient next to its feature name.
for col_name, coef in zip(X_train.columns, elastic_net_regression_model.coef_):
    print(f"The coefficient for {col_name} is {coef}")

The coefficient for cylinders is -0.0
The coefficient for displacement is 0.005888869953667563
The coefficient for horsepower is -0.012403874933570126
The coefficient for weight is -0.006934550516257631
The coefficient for acceleration is 0.0
The coefficient for model_year is 0.7133150744603874
The coefficient for origin is 0.0


In [32]:
from sklearn.metrics import r2_score

# R^2 of the elastic-net model on the held-out test split.
# Note: this overwrites the `y_pred` produced for the lasso model.
y_pred = elastic_net_regression_model.predict(X_test)
r2_score(y_true=y_test, y_pred=y_pred)

0.8284840073256804