In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, PolynomialFeatures

In [None]:
# Load the data set from a CSV file expected in the notebook's working directory,
# then display it for a first look.
df = pd.read_csv("auto-mpg.csv")
df

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,1,ford torino
...,...,...,...,...,...,...,...,...,...
393,27.0,4,140.0,86,2790,15.6,82,1,ford mustang gl
394,44.0,4,97.0,52,2130,24.6,82,2,vw pickup
395,32.0,4,135.0,84,2295,11.6,82,1,dodge rampage
396,28.0,4,120.0,79,2625,18.6,82,1,ford ranger


In [None]:
# List the column names to see which fields are available.
df.columns

Index(['mpg', 'cylinders', 'displacement', 'horsepower', 'weight',
       'acceleration', 'model year', 'origin', 'car name'],
      dtype='object')

In [None]:
# Remove the free-text `car name` column; the remaining columns are kept as-is.
# `columns=` already names the axis, so no separate `axis` argument is needed.
df = df.drop(columns="car name")
df

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin
0,18.0,8,307.0,130,3504,12.0,70,1
1,15.0,8,350.0,165,3693,11.5,70,1
2,18.0,8,318.0,150,3436,11.0,70,1
3,16.0,8,304.0,150,3433,12.0,70,1
4,17.0,8,302.0,140,3449,10.5,70,1
...,...,...,...,...,...,...,...,...
393,27.0,4,140.0,86,2790,15.6,82,1
394,44.0,4,97.0,52,2130,24.6,82,2
395,32.0,4,135.0,84,2295,11.6,82,1
396,28.0,4,120.0,79,2625,18.6,82,1


In [None]:
# Check dtypes and non-null counts. A column of numbers reported as `object`
# indicates that it contains at least one non-numeric entry.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float64
 1   cylinders     398 non-null    int64  
 2   displacement  398 non-null    float64
 3   horsepower    398 non-null    object 
 4   weight        398 non-null    int64  
 5   acceleration  398 non-null    float64
 6   model year    398 non-null    int64  
 7   origin        398 non-null    int64  
dtypes: float64(3), int64(4), object(1)
memory usage: 25.0+ KB


In [None]:
# Turn every "?" cell into NaN, then drop each row that contains a NaN.
# NOTE(review): "?" is presumably the file's missing-value marker - confirm against the raw CSV.
df = (
    df
    .replace("?", np.nan)
    .dropna()
)

In [None]:
# Inspect the frame after the rows with missing values were dropped.
df

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin
0,18.0,8,307.0,130,3504,12.0,70,1
1,15.0,8,350.0,165,3693,11.5,70,1
2,18.0,8,318.0,150,3436,11.0,70,1
3,16.0,8,304.0,150,3433,12.0,70,1
4,17.0,8,302.0,140,3449,10.5,70,1
...,...,...,...,...,...,...,...,...
393,27.0,4,140.0,86,2790,15.6,82,1
394,44.0,4,97.0,52,2130,24.6,82,2
395,32.0,4,135.0,84,2295,11.6,82,1
396,28.0,4,120.0,79,2625,18.6,82,1


In [None]:
# Count how often each distinct horsepower value occurs.
df.horsepower.value_counts()

150    22
90     20
88     19
110    18
100    17
       ..
61      1
93      1
148     1
152     1
82      1
Name: horsepower, Length: 93, dtype: int64

In [None]:
# Cast horsepower to integers, then re-check the dtypes.
# Bracket assignment is used so the write always targets the column.
df["horsepower"] = df["horsepower"].astype(int)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 392 entries, 0 to 397
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           392 non-null    float64
 1   cylinders     392 non-null    int64  
 2   displacement  392 non-null    float64
 3   horsepower    392 non-null    int64  
 4   weight        392 non-null    int64  
 5   acceleration  392 non-null    float64
 6   model year    392 non-null    int64  
 7   origin        392 non-null    int64  
dtypes: float64(3), int64(5)
memory usage: 27.6 KB


In [None]:
# `mpg` is the regression target; every other remaining column is a feature.
y = df["mpg"]
X = df.drop(columns="mpg")

In [None]:
# Hold out 10% of the rows as a test set; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.10,
    random_state=0,
)

In [None]:
# linear models

In [None]:
def model_inplace(scaler, model, X_train, X_test, y_train, y_test):
    """Preprocess the split, fit ``model`` on the training part and print both scores.

    The preprocessor is fitted on the training features only and then applied
    to the test features. Both ``scaler`` and ``model`` are fitted by this call
    (the objects passed in are modified); the caller's ``X_train`` / ``X_test``
    are not reassigned.

    Parameters
    ----------
    scaler : object with ``fit_transform`` and ``transform`` methods.
    model : object with ``fit(X, y)`` and ``score(X, y)`` methods.
    X_train, X_test : training and test features.
    y_train, y_test : training and test targets.

    Returns
    -------
    None. The two scores are printed, train first, then test.
    """
    train_features = scaler.fit_transform(X_train)
    test_features = scaler.transform(X_test)
    model.fit(train_features, y_train)

    report = (
        ("score on train set:", train_features, y_train),
        ("score on test set:", test_features, y_test),
    )
    for label, features, target in report:
        print(label, model.score(features, target))


In [None]:
# The three linear regressors compared below.
baseline_model = LinearRegression()  # ordinary least squares, no penalty
ridge = Ridge(alpha=0.3)  # L2 penalty; alpha is a hyperparameter
lasso = Lasso(alpha=0.1)  # L1 penalty; alpha is a hyperparameter


In [None]:
# Preprocessors compared below; each is used through fit_transform / transform.
scaler1 = StandardScaler()
scaler2 = MinMaxScaler()
# Not a scaler despite the name: it expands the features with a bias column and
# the pairwise products of distinct features (interaction_only=True leaves out
# the squared terms).
scaler3 = PolynomialFeatures(interaction_only=True, degree=2)

In [None]:
# Inspect the training features (raw, unscaled values at this point).
X_train

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,model year,origin
1,8,350.0,165,3693,11.5,70,1
329,4,91.0,67,1850,13.8,80,3
319,4,120.0,75,2542,17.5,80,3
136,8,302.0,140,4141,14.0,74,1
233,4,97.0,78,1940,14.5,77,2
...,...,...,...,...,...,...,...
325,4,90.0,48,2085,21.7,80,2
194,6,232.0,90,3085,17.6,76,1
118,4,116.0,75,2158,15.5,73,2
48,6,250.0,88,3139,14.5,71,1


In [None]:
# Expand the training features once, only to inspect what PolynomialFeatures produces.
# Note that this also fits scaler3 on X_train.
X_poly = scaler3.fit_transform(X_train)

In [None]:
# Rows x generated columns. For 7 input features, degree=2 with interaction_only=True
# gives 1 bias + 7 originals + 21 pairwise products = 29 columns.
X_poly.shape

(352, 29)

In [None]:
# Peek at the expanded matrix; the leading column of ones is the bias term.
X_poly

array([[1.0000e+00, 8.0000e+00, 3.5000e+02, ..., 8.0500e+02, 1.1500e+01,
        7.0000e+01],
       [1.0000e+00, 4.0000e+00, 9.1000e+01, ..., 1.1040e+03, 4.1400e+01,
        2.4000e+02],
       [1.0000e+00, 4.0000e+00, 1.2000e+02, ..., 1.4000e+03, 5.2500e+01,
        2.4000e+02],
       ...,
       [1.0000e+00, 4.0000e+00, 1.1600e+02, ..., 1.1315e+03, 3.1000e+01,
        1.4600e+02],
       [1.0000e+00, 6.0000e+00, 2.5000e+02, ..., 1.0295e+03, 1.4500e+01,
        7.1000e+01],
       [1.0000e+00, 6.0000e+00, 1.7100e+02, ..., 1.0875e+03, 1.4500e+01,
        7.5000e+01]])

In [None]:
# baseline model with different preprocessors

In [None]:
# LinearRegression on standardized features: the reference scores for the comparisons below.
model_inplace(scaler1, baseline_model, X_train, X_test, y_train, y_test)

score on train set: 0.8215414802243143
score on test set: 0.817850404380267


In [None]:
# LinearRegression on min-max scaled features. The recorded scores equal the
# StandardScaler run: an unpenalized least-squares fit with an intercept is not
# affected by a per-feature linear rescaling.
model_inplace(scaler2, baseline_model, X_train, X_test, y_train, y_test)

score on train set: 0.8215414802243143
score on test set: 0.817850404380267


In [None]:
# LinearRegression on degree-2 interaction features: the recorded train/test
# scores rise from about 0.82 to about 0.89 / 0.87.
model_inplace(scaler3, baseline_model, X_train, X_test, y_train, y_test)

score on train set: 0.885080201794461
score on test set: 0.8716383348185536


In [None]:
## takeaway: FEATURE ENGINEERING IS EVERYTHING

In [None]:
# ridge model with different preprocessors

In [None]:
model_inplace(scaler1, ridge, X_train, X_test, y_train, y_test)
# Recorded scores are almost identical to the unregularized baseline (about 0.82
# on both sets): with alpha=0.3 the L2 penalty barely changes the fit.


score on train set: 0.8215346398114114
score on test set: 0.8180349664361533


In [None]:
model_inplace(scaler2, ridge, X_train, X_test, y_train, y_test)
# Recorded result: train score slightly below and test score slightly above the
# baseline; the differences are in the third decimal place.


score on train set: 0.820401151479893
score on test set: 0.8205811870078932


In [None]:
model_inplace(scaler3, ridge, X_train, X_test, y_train, y_test)
# Recorded result: on the interaction features Ridge scores higher than
# LinearRegression on the same features, on both the train and the test set.


score on train set: 0.8890855287285576
score on test set: 0.883558810665925


In [None]:
# Lasso model with different preprocessors

In [None]:
model_inplace(scaler1, lasso, X_train, X_test, y_train, y_test)
# Recorded result: Lasso (alpha=0.1) on standardized features scores slightly
# below the baseline on both sets, so the L1 penalty gives no gain here.

score on train set: 0.8180767064757348
score on test set: 0.8139619983614261


In [None]:
model_inplace(scaler2, lasso, X_train, X_test, y_train, y_test)
# Recorded result: no improvement over the baseline either (train score lower,
# test score about the same).

score on train set: 0.8130781830408886
score on test set: 0.816720470399861


In [None]:
model_inplace(scaler3, lasso, X_train, X_test, y_train, y_test)
# Recorded result: highest test score of all runs above (about 0.895).
# NOTE(review): the output shows a warning raised from the coordinate-descent
# solver - presumably a convergence warning caused by the unscaled interaction
# features; consider scaling them or raising max_iter, then confirm.

score on train set: 0.8769415063317563
score on test set: 0.8954746097461636


  model = cd_fast.enet_coordinate_descent(


In [None]:
# hyperparameter tuning: how to choose the hyperparameters (here: alpha)

## technique: scikit-learn GridSearchCV

In [None]:
# Full (unsplit) feature frame, shown for reference.
X

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,model year,origin
0,8,307.0,130,3504,12.0,70,1
1,8,350.0,165,3693,11.5,70,1
2,8,318.0,150,3436,11.0,70,1
3,8,304.0,150,3433,12.0,70,1
4,8,302.0,140,3449,10.5,70,1
...,...,...,...,...,...,...,...
393,4,140.0,86,2790,15.6,82,1
394,4,97.0,52,2130,24.6,82,2
395,4,135.0,84,2295,11.6,82,1
396,4,120.0,79,2625,18.6,82,1


In [None]:
# Tune Lasso's regularization strength (alpha) with a 10-fold cross-validated
# grid search on standardized features.
# GridSearchCV is imported in the notebook's import cell.

# Base estimator; GridSearchCV clones it and sets alpha for every candidate.
lasso = Lasso()  # default is alpha=1.0

# Ten strictly positive alphas, log-spaced from 0.001 to 1.
# alpha=0 is left out on purpose: it makes Lasso an unpenalized least-squares
# fit, which scikit-learn advises against for numerical reasons (it was the
# source of the warnings in the earlier runs). The previous grid
# (0, 100, ..., 900) otherwise only held very large penalties, so the search
# could do nothing but pick alpha=0.
params = {"alpha": np.logspace(-3, 0, 10)}
grid = GridSearchCV(lasso, param_grid=params, cv=10, verbose=1)

# Keep the raw split untouched: writing the transformed arrays back into
# X_train / X_test made this cell non-idempotent and silently changed the
# input of every cell executed afterwards.
X_train_std = scaler1.fit_transform(X_train)
X_test_std = scaler1.transform(X_test)

grid.fit(X_train_std, y_train)
best_model = grid.best_estimator_
print("coefficients model", best_model.coef_)
print("score on train set:", best_model.score(X_train_std, y_train))
print("score on test set:", best_model.score(X_test_std, y_test))

# Lasso performs feature selection: features it considers irrelevant get a
# coefficient of exactly zero. StandardScaler keeps one column per original
# feature, so coef_ lines up with X.columns.
print("columns that were removed by Lasso", X.columns[best_model.coef_ == 0])
print("columns that were NOT removed by Lasso", X.columns[best_model.coef_ != 0])
print("best alpha", grid.best_params_)



Fitting 10 folds for each of 10 candidates, totalling 100 fits
coefficients model [-0.68971103  1.95588825 -0.44694631 -5.69563662  0.37190035  2.7135699
  1.11009439]
score on train set: 0.8215414802243143
score on test set: 0.817850404380267
columns that were removed by Lasso Index([], dtype='object')
columns that were NOT removed by Lasso Index(['cylinders', 'displacement', 'horsepower', 'weight', 'acceleration',
       'model year', 'origin'],
      dtype='object')
best alpha {'alpha': 0}


  estimator.fit(X_train, y_train, **fit_params)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  estimator.fit(X_train, y_train, **fit_params)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  estimator.fit(X_train, y_train, **fit_params)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  estimator.fit(X_train, y_train, **fit_params)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  estimator.fit(X_train, y_train, **fit_params)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  estimator.fit(X_train, y_train, **fit_params)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  estimator.fit(X_train, y_train, **fit_params)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  estimator.fit(X_train, y_train, **fit_params)
  model = cd_f

In [None]:
# Number of coefficients = number of input columns the best model was fitted on.
# NOTE(review): the recorded value 436 equals 1 + 29 + 406, i.e. a degree-2
# interaction expansion of 29 columns. That suggests X_train had presumably
# already been expanded once when the model was fitted (cells run out of order
# or twice) - confirm with Restart & Run All.
best_model.coef_.shape

(436,)

In [None]:
# X_poly is the single expansion of the training features computed earlier.
X_poly

array([[1.0000e+00, 8.0000e+00, 3.5000e+02, ..., 8.0500e+02, 1.1500e+01,
        7.0000e+01],
       [1.0000e+00, 4.0000e+00, 9.1000e+01, ..., 1.1040e+03, 4.1400e+01,
        2.4000e+02],
       [1.0000e+00, 4.0000e+00, 1.2000e+02, ..., 1.4000e+03, 5.2500e+01,
        2.4000e+02],
       ...,
       [1.0000e+00, 4.0000e+00, 1.1600e+02, ..., 1.1315e+03, 3.1000e+01,
        1.4600e+02],
       [1.0000e+00, 6.0000e+00, 2.5000e+02, ..., 1.0295e+03, 1.4500e+01,
        7.1000e+01],
       [1.0000e+00, 6.0000e+00, 1.7100e+02, ..., 1.0875e+03, 1.4500e+01,
        7.5000e+01]])

In [None]:
# The 7 original feature names. They only line up with coef_ when the model was
# fitted on one column per original feature.
X.columns

Index(['cylinders', 'displacement', 'horsepower', 'weight', 'acceleration',
       'model year', 'origin'],
      dtype='object')

In [None]:
# Same alpha search as for the standardized features, now on the degree-2
# interaction features produced by scaler3 (PolynomialFeatures).
# GridSearchCV is imported in the notebook's import cell.

# Base estimator; GridSearchCV clones it and sets alpha for every candidate.
lasso = Lasso()  # default is alpha=1.0

# Ten strictly positive alphas, log-spaced from 0.001 to 1. alpha=0 is left out
# because scikit-learn advises against using Lasso without a penalty, and the
# previous grid (0, 100, ..., 900) offered no useful non-zero candidate.
params = {"alpha": np.logspace(-3, 0, 10)}
grid = GridSearchCV(lasso, param_grid=params, cv=10, verbose=1)

# Do not write the expanded arrays back into X_train / X_test: re-running the
# cell would expand the already expanded matrix again (the recorded coef_ shape
# of 436 = 1 + 29 + 406 is consistent with exactly that).
X_train_poly = scaler3.fit_transform(X_train)
X_test_poly = scaler3.transform(X_test)

# NOTE(review): the interaction features are not scaled here; if the solver
# reports convergence warnings, consider scaling them or raising max_iter.
grid.fit(X_train_poly, y_train)
best_model = grid.best_estimator_
print("score on train set:", best_model.score(X_train_poly, y_train))
print("score on test set:", best_model.score(X_test_poly, y_test))

# The removed / kept column report used for the standardized features does not
# apply here: coef_ holds one entry per generated polynomial feature, so it
# cannot be used to index X.columns.
print("best alpha", grid.best_params_)


Fitting 10 folds for each of 10 candidates, totalling 100 fits


  estimator.fit(X_train, y_train, **fit_params)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  estimator.fit(X_train, y_train, **fit_params)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  estimator.fit(X_train, y_train, **fit_params)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  estimator.fit(X_train, y_train, **fit_params)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  estimator.fit(X_train, y_train, **fit_params)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  estimator.fit(X_train, y_train, **fit_params)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  estimator.fit(X_train, y_train, **fit_params)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  estimator.fit(X_train, y_train, **fit_params)
  model = cd_f

  model = cd_fast.enet_coordinate_descent(


score on train set: 0.9545279901175678
score on test set: 0.8503437417021518
best alpha {'alpha': 0}


  self.best_estimator_.fit(X, y, **fit_params)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
