# Regularisering

In [54]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split

In [55]:
# Load the advertising dataset; the first CSV column is used as the row index.
df = pd.read_csv("../data/Advertising.csv", index_col=0)

# Feature matrix: every column except the target. Target: the "Sales" column.
X = df.drop(columns="Sales")
y = df["Sales"]



In [56]:
# Polynomial feature expansion (degree 3, no bias column) applied to the full
# dataset before it is split into train and test parts.
model_poly = PolynomialFeatures(degree=3, include_bias=False)
poly_features = model_poly.fit_transform(X)

# Hold out a third of the rows for testing; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    poly_features,
    y,
    test_size=0.33,
    random_state=42,
)

# Display the shape of every split as a quick sanity check.
tuple(split.shape for split in (X_train, X_test, y_train, y_test))



((134, 19), (66, 19), (134,), (66,))

In [57]:
from sklearn.preprocessing import StandardScaler

# Ridge/Lasso penalties act on the size of the coefficients, so the features
# should be on a common scale before a regularized model is fitted.
scaler = StandardScaler()

# Fit the scaler on the training split only and reuse those statistics for the
# test split, so no information from the test data influences the scaling.
scaled_X_train = scaler.fit_transform(X_train)
scaled_X_test = scaler.transform(X_test)

print(f"Scaled X_train mean {scaled_X_train.mean():.2f} std: {scaled_X_train.std():.2f}")
# The test split is scaled with the training statistics, so its own mean/std
# are reported here and need not be exactly 0 and 1.
print(f"Scaled X_test mean {scaled_X_test.mean():.2f} std: {scaled_X_test.std():.2f}")

Scaled X_train mean -0.00 std: 1.00
Scaled X_test mean -0.00 std: 1.12


In [58]:
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, mean_absolute_error, root_mean_squared_error


def ridge_regression(X, penalty=0):
    """Fit a ridge model on the scaled training split and predict for ``X``.

    Parameters
    ----------
    X : array-like
        Feature matrix to predict on; passed directly to ``Ridge.predict``.
    penalty : float, default 0
        Regularization strength, forwarded to ``Ridge`` as its ``alpha``
        argument.

    Returns
    -------
    The predictions of the fitted model for ``X``.

    Notes
    -----
    Training always uses the notebook-level ``scaled_X_train`` and ``y_train``,
    so the scaling cell must have been run first.
    """
    model_ridge = Ridge(alpha=penalty)
    model_ridge.fit(scaled_X_train, y_train)
    return model_ridge.predict(X)


# Evaluate the same model with a small penalty and with no penalty at all.
# The label is built from the value actually used, so it cannot drift from it.
for penalty in (0.1, 0):
    y_pred = ridge_regression(scaled_X_test, penalty=penalty)

    MSE = mean_squared_error(y_test, y_pred)
    RMSE = root_mean_squared_error(y_test, y_pred)
    MAE = mean_absolute_error(y_test, y_pred)

    print(f"penalty {penalty}: MSE: {MSE}, RMSE: {RMSE}, MAE:{MAE}")



penalty 0.2: MSE: 0.3176335944841091, RMSE: 0.5635899169468073, MAE:0.43430757663876474
penalty 0: MSE: 0.26504659505538464, RMSE: 0.5148267621786815, MAE:0.3748516441217824


In [59]:
from sklearn.linear_model import Lasso

# L1-regularized regression with a fixed penalty strength.
model_lasso = Lasso(alpha=0.1)
model_lasso.fit(scaled_X_train, y_train)

# Score the fitted model on the held-out test split.
y_pred = model_lasso.predict(scaled_X_test)
lasso_rmse = root_mean_squared_error(y_test, y_pred)
lasso_mae = mean_absolute_error(y_test, y_pred)
print(lasso_rmse, lasso_mae)

# Display the fitted coefficients, one per polynomial feature.
model_lasso.coef_

0.7853962108799019 0.5735346450114959


array([ 1.89480144,  0.42062367,  0.        , -0.        ,  3.55216501,
        0.        ,  0.        ,  0.01110965,  0.        , -0.42677394,
       -0.        , -0.        ,  0.        ,  0.        , -0.        ,
        0.        ,  0.06706906,  0.        ,  0.        ])

In [None]:
# Notes on k-fold cross-validation: the training data is shuffled and divided
# into k groups; each group is used once for validation while the model is
# trained on the remaining groups, and the procedure is repeated for every
# group. It works well for small datasets and for hyperparameter tuning. The
# drawback is that it is computationally expensive, but the results are
# usually very good.
#
# NOTE(review): no `cv` argument is passed to RidgeCV below, so it runs with
# its default scheme (per the scikit-learn docs an efficient leave-one-out
# cross-validation) rather than an explicit k-fold -- confirm this is intended.

from sklearn.linear_model import RidgeCV

# Candidate penalties are compared by (negated) mean squared error.
model_ridgeCV = RidgeCV(alphas=[.0001,.001,.01,.1,.5,1,5,10], scoring="neg_mean_squared_error")

model_ridgeCV.fit(scaled_X_train,y_train)

# The penalty strength that scored best during cross-validation.
print(model_ridgeCV.alpha_)

0.1


In [68]:
from sklearn.linear_model import LassoCV

# Let cross-validation (cv=5) choose the Lasso penalty from 100 candidate
# alphas; max_iter is raised to give the solver more iterations to converge.
model_lassoCV = LassoCV(eps=0.001, n_alphas=100, max_iter=10000, cv=5)

model_lassoCV.fit(scaled_X_train, y_train)

# Score the selected model on the held-out test split.
y_pred = model_lassoCV.predict(scaled_X_test)

print(f"RMSE: {root_mean_squared_error(y_test,y_pred)}, MAE: {mean_absolute_error(y_test,y_pred)}, alpha: {model_lassoCV.alpha_}")

RMSE0.5785146895301946, MAE: 0.46291883026932745, alpha: 0.004968802520343366


In [70]:
from sklearn.linear_model import ElasticNetCV

# Candidate mixes between the L1 and L2 penalties that cross-validation
# chooses from, together with the penalty strength alpha.
l1_ratio_grid = [.1, .5, .7, .9, .95, .97, 1]
model_elastic = ElasticNetCV(
    l1_ratio=l1_ratio_grid,
    eps=0.001,
    n_alphas=100,
    max_iter=10000,
)
model_elastic.fit(scaled_X_train, y_train)

# Score the selected model on the held-out test split.
y_pred = model_elastic.predict(scaled_X_test)
elastic_rmse = root_mean_squared_error(y_test, y_pred)
elastic_mae = mean_absolute_error(y_test, y_pred)

# Display the chosen hyperparameters followed by the test errors.
(model_elastic.l1_ratio_, model_elastic.alpha_, elastic_rmse, elastic_mae)

(np.float64(1.0),
 np.float64(0.004968802520343366),
 0.5785146895301946,
 0.46291883026932745)