In [119]:
import seaborn as sns


# Load the "mpg" dataset through seaborn and preview its first rows.
mpg = sns.load_dataset("mpg")
mpg.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
0,18.0,8,307.0,130.0,3504,12.0,70,usa,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693,11.5,70,usa,buick skylark 320
2,18.0,8,318.0,150.0,3436,11.0,70,usa,plymouth satellite
3,16.0,8,304.0,150.0,3433,12.0,70,usa,amc rebel sst
4,17.0,8,302.0,140.0,3449,10.5,70,usa,ford torino


In [120]:
# Discard rows with missing values, then remove the two text columns
# ("origin", "name") so that only numeric columns are left.
mpg = mpg.dropna().drop(columns=["origin", "name"])

In [121]:
from sklearn.model_selection import train_test_split
# Predictors are every column except the target; the target is "mpg".
X = mpg.drop(columns="mpg")
y = mpg["mpg"]

# Hold out 20% of the rows for evaluation; the fixed seed makes the
# partition repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [122]:
from sklearn.linear_model import LinearRegression

# Ordinary least squares on the raw (unscaled) training features.
model_OLS = LinearRegression().fit(X_train, y_train)

# Report the feature order together with the fitted coefficients and
# intercept, one item per line.
print(
    X.columns,
    f"Parameters {model_OLS.coef_}",
    f"Intercept {model_OLS.intercept_}",
    sep="\n",
)

Index(['cylinders', 'displacement', 'horsepower', 'weight', 'acceleration',
       'model_year'],
      dtype='object')
Parameters [-0.116173    0.00101347 -0.00227634 -0.00656101  0.06173551  0.76063644]
Intercept -15.057758585282404


In [123]:
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np


def test_metrics(y_test, y_pred):
    """Print and return MAE, MSE and RMSE for a set of predictions.

    Parameters
    ----------
    y_test : array-like
        Ground-truth target values.
    y_pred : array-like
        Predicted target values, aligned with ``y_test``.

    Returns
    -------
    tuple of float
        ``(mae, mse, rmse)`` as built-in floats.
    """
    # Cast to built-in float: the f-string "=" specifier prints repr(), and
    # NumPy >= 2.0 renders scalars as "np.float64(...)", which would clutter
    # the report and make it differ between environments.
    mae = float(mean_absolute_error(y_test, y_pred))
    mse = float(mean_squared_error(y_test, y_pred))
    rmse = float(np.sqrt(mse))

    print(f"{mae  = } \n{mse  = } \n{rmse = }")
    return mae, mse, rmse

# Predict on the held-out test split with the OLS model and record its
# (mae, mse, rmse) tuple.
y_pred_OLS = model_OLS.predict(X_test)
metrics_OLS = test_metrics(y_test, y_pred_OLS)



mae  = 2.503860089776125 
mse  = 10.502370329417303 
rmse = 3.2407360783342574


In [130]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Standardise the features using statistics learned from the training split
# only, then apply that same transformation to the test split.
scaler = StandardScaler()
scaled_X_train = scaler.fit_transform(X_train)
scaled_X_test = scaler.transform(X_test)

# Use a dedicated name so the unscaled `model_OLS` is not overwritten:
# re-running the earlier evaluation cell would otherwise feed raw features to
# a model that was fitted on standardised ones.
model_OLS_scaled = LinearRegression()
model_OLS_scaled.fit(scaled_X_train, y_train)

print(f"Parameters {model_OLS_scaled.coef_}")
print(f"Intercept {model_OLS_scaled.intercept_}")

y_pred_OLS_scaled = model_OLS_scaled.predict(scaled_X_test)

metrics_OLS_scaled = test_metrics(y_test, y_pred_OLS_scaled)

Parameters [-0.19723011  0.10499657 -0.08700746 -5.5098621   0.17368415  2.75679957]
Intercept 23.599361022364224
mae  = 2.5038600897761234 
mse  = 10.502370329417294 
rmse = 3.240736078334256


In [139]:
from sklearn.linear_model import SGDRegressor

# Linear model trained by stochastic gradient descent on the standardised
# training features; the seed fixes the optimiser's randomness.
model_SGD = SGDRegressor(max_iter=10_000, random_state=42).fit(scaled_X_train, y_train)

# Evaluate on the standardised test split.
y_pred_SGD = model_SGD.predict(scaled_X_test)
metrics_SGD = test_metrics(y_test, y_pred_SGD)

mae  = 2.5245014610707175 
mse  = 10.869267167985122 
rmse = 3.296857165238604


In [145]:
from sklearn.preprocessing import PolynomialFeatures

# Degree-1 expansion without a bias column; used as a baseline for the
# higher-degree fits.
polynomial_instance = PolynomialFeatures(degree=1, include_bias=False)
polynomial_features = polynomial_instance.fit_transform(X)

# Split into degree-specific names: rebinding X_train / X_test / y_train /
# y_test here would leave the earlier OLS, scaling and SGD cells operating on
# polynomial features whenever they are re-run. The same seed and test size
# as the original split are used.
X_train_poly_1, X_test_poly_1, y_train_poly_1, y_test_poly_1 = train_test_split(
    polynomial_features, y, test_size=0.2, random_state=42
)

model_POLY_1 = LinearRegression()
model_POLY_1.fit(X_train_poly_1, y_train_poly_1)
y_pred_POLY_1 = model_POLY_1.predict(X_test_poly_1)

metrics_POLY_1 = test_metrics(y_test_poly_1, y_pred_POLY_1)

mae  = 2.5038600897761247 
mse  = 10.502370329417301 
rmse = 3.2407360783342574


In [148]:
# Degree-2 expansion (squares and pairwise products) without a bias column.
polynomial_instance = PolynomialFeatures(degree=2, include_bias=False)
polynomial_features = polynomial_instance.fit_transform(X)

# Split into degree-specific names: rebinding X_train / X_test / y_train /
# y_test here would leave the earlier OLS, scaling and SGD cells operating on
# polynomial features whenever they are re-run. The same seed and test size
# as the original split are used.
X_train_poly_2, X_test_poly_2, y_train_poly_2, y_test_poly_2 = train_test_split(
    polynomial_features, y, test_size=0.2, random_state=42
)

model_POLY_2 = LinearRegression()
model_POLY_2.fit(X_train_poly_2, y_train_poly_2)
y_pred_POLY_2 = model_POLY_2.predict(X_test_poly_2)

metrics_POLY_2 = test_metrics(y_test_poly_2, y_pred_POLY_2)

mae  = 1.980477209601935 
mse  = 7.419858147786743 
rmse = 2.723941656457925


In [149]:
# Degree-3 expansion without a bias column.
polynomial_instance = PolynomialFeatures(degree=3, include_bias=False)
polynomial_features = polynomial_instance.fit_transform(X)

# Split into degree-specific names: rebinding X_train / X_test / y_train /
# y_test here would leave the earlier OLS, scaling and SGD cells operating on
# polynomial features whenever they are re-run. The same seed and test size
# as the original split are used.
X_train_poly_3, X_test_poly_3, y_train_poly_3, y_test_poly_3 = train_test_split(
    polynomial_features, y, test_size=0.2, random_state=42
)

model_POLY_3 = LinearRegression()
model_POLY_3.fit(X_train_poly_3, y_train_poly_3)
y_pred_POLY_3 = model_POLY_3.predict(X_test_poly_3)

metrics_POLY_3 = test_metrics(y_test_poly_3, y_pred_POLY_3)

mae  = 2.117882182816585 
mse  = 9.273542051306416 
rmse = 3.0452490951162625


In [156]:
import pandas as pd

# Collect every model's metrics into one comparison table. Rows are labelled
# with the metric names, in the order test_metrics returns them, so the table
# can be read without knowing the tuple layout.
df = pd.DataFrame(
    {
        "Linear regr. SVD": metrics_OLS,
        "Linear regr. SVD Scaled": metrics_OLS_scaled,
        "Linear regr. SGD": metrics_SGD,
        "Polynom. regr. deg 1": metrics_POLY_1,
        "Polynom. regr. deg 2": metrics_POLY_2,
        "Polynom. regr. deg 3": metrics_POLY_3,
    },
    index=["mae", "mse", "rmse"],
)
df


Unnamed: 0,Linear regr. SVD,Linear regr. SVD Scaled,Linear regr. SGD,Polynom. regr. deg 1,Polynom. regr. deg 2,Polynom. regr. deg 3
0,2.50386,2.50386,2.524501,2.50386,1.980477,2.117882
1,10.50237,10.50237,10.869267,10.50237,7.419858,9.273542
2,3.240736,3.240736,3.296857,3.240736,2.723942,3.045249
