# Regularization

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
df = pd.read_csv('Advertising.csv')
df.head()

Unnamed: 0,TV,radio,newspaper,sales
0,230.1,37.8,69.2,22.1
1,44.5,39.3,45.1,10.4
2,17.2,45.9,69.3,9.3
3,151.5,41.3,58.5,18.5
4,180.8,10.8,58.4,12.9


In [4]:
X = df.drop('sales', axis=1)
y = df['sales']

In [11]:
from sklearn.preprocessing import PolynomialFeatures
poly_converter = PolynomialFeatures(degree=3, include_bias=False)
poly_features = poly_converter.fit_transform(X)
print(X.shape)
print(poly_features.shape)

(200, 3)
(200, 19)


In [12]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(poly_features, y, test_size=0.33, random_state=42)

In [17]:
# Normalization 
# we don't normalize y (target)
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X_train) # computing mean and std

In [14]:
X_train = scaler.transform(X_train) # normalizing based on formula
X_test = scaler.transform(X_test) # not fitting because we don't want it to cheat and know the mean and std
X_train[0]

array([ 1.66759336,  0.26512404, -1.39553662,  2.20150373,  1.27757026,
       -0.90502331, -0.02131602, -0.8758223 , -0.83990056,  2.64499976,
        1.82073249, -0.65010896,  0.65667204, -0.64142146, -0.61276841,
       -0.23015661, -0.7133113 , -0.65028766, -0.56911778])

# Regularization
1. L-1 norm (Lasso) --> cost function + sum of absolute values of coefficients
2. L-2 norm (Ridge) --> cost function + sum of square values of coefficients

In [18]:
# L-2 
# if we wanna use Ridge, data must be standardized
from sklearn.linear_model import Ridge 
ridge_model = Ridge(alpha=10)
ridge_model.fit(X_train, y_train)

In [20]:
y_pred = ridge_model.predict(X_test)

from sklearn.metrics import mean_absolute_error, mean_squared_error

MAE = mean_absolute_error(y_test, y_pred)
MSE = mean_squared_error(y_test, y_pred)
RMSE = np.sqrt(MSE)
print(MAE, MSE, RMSE)

# our model has gotten worse it seems :D

0.6329556348463365 0.7969472220887763 0.8927190051123457


In [38]:
# giving different alpha 3 values using RidgeCV model 
from sklearn.linear_model import RidgeCV

ridge_cv_model = RidgeCV(alphas=(0.1, 1.0, 10), scoring='neg_mean_absolute_error')
ridge_cv_model.fit(X_train, y_train)

In [39]:
# which alpha is better?
# it atomatically evaluate the best alpha value
ridge_cv_model.alpha_


np.float64(0.1)

In [37]:
from sklearn.metrics._scorer import _SCORERS 
# this is all evaluation techniques
# the bigger the number the better the result (that's why MAE has gotten negative)
print(_SCORERS.keys())

dict_keys(['explained_variance', 'r2', 'neg_max_error', 'matthews_corrcoef', 'neg_median_absolute_error', 'neg_mean_absolute_error', 'neg_mean_absolute_percentage_error', 'neg_mean_squared_error', 'neg_mean_squared_log_error', 'neg_root_mean_squared_error', 'neg_root_mean_squared_log_error', 'neg_mean_poisson_deviance', 'neg_mean_gamma_deviance', 'd2_absolute_error_score', 'accuracy', 'top_k_accuracy', 'roc_auc', 'roc_auc_ovr', 'roc_auc_ovo', 'roc_auc_ovr_weighted', 'roc_auc_ovo_weighted', 'balanced_accuracy', 'average_precision', 'neg_log_loss', 'neg_brier_score', 'positive_likelihood_ratio', 'neg_negative_likelihood_ratio', 'adjusted_rand_score', 'rand_score', 'homogeneity_score', 'completeness_score', 'v_measure_score', 'mutual_info_score', 'adjusted_mutual_info_score', 'normalized_mutual_info_score', 'fowlkes_mallows_score', 'precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'reca

In [41]:
y_pred2 = ridge_cv_model.predict(X_test)

from sklearn.metrics import mean_absolute_error, mean_squared_error

MAE = mean_absolute_error(y_test, y_pred2)
MSE = mean_squared_error(y_test, y_pred2)
RMSE = np.sqrt(MSE)
print(MAE, MSE, RMSE)

# our model has gotten better compare to Regression Model

0.4343075766486241 0.31763359449410056 0.5635899169556714


In [44]:
ridge_cv_model.coef_
# no 0 coef

array([ 5.84681185,  0.52142086,  0.71689997, -6.17948738,  3.75034058,
       -1.36283352, -0.08571128,  0.08322815, -0.34893776,  2.16952446,
       -0.47840838,  0.68527348,  0.63080799, -0.5950065 ,  0.61661989,
       -0.31335495,  0.36499629,  0.03328145, -0.13652471])

In [45]:
# L-1 (Lasoo)
# if we wanna use Ridge, data must be standardized
from sklearn.linear_model import Lasso
lasso_model = Lasso(alpha=0.1)
lasso_model.fit(X_train, y_train) 

In [47]:
lasso_y_pred = lasso_model.predict(X_test)

from sklearn.metrics import mean_absolute_error, mean_squared_error

MAE = mean_absolute_error(y_test, lasso_y_pred)
MSE = mean_squared_error(y_test, lasso_y_pred)
RMSE = np.sqrt(MSE)
print(MAE, MSE, RMSE)

# our model has gotten worse it seems :D

0.5735346450114956 0.6168472080645071 0.7853962108799017


In [58]:
from sklearn.linear_model import LassoCV

# here we can't give alpha manually
# eps means first alpha is 0.001, second is 0.002, third is 0.003 and so on...
# n_alphas means do the above process 100 times
# cv means k-fold = 5
lasso_cv_model = LassoCV(eps=0.001, n_alphas=100, cv=5, max_iter=1000000)
lasso_cv_model.fit(X_train, y_train)

In [59]:
lasso_cv_model.alpha_ 

np.float64(0.004968802520343365)

In [60]:
lasoo_y_pred2 = lasso_cv_model.predict(X_test)

from sklearn.metrics import mean_absolute_error, mean_squared_error

MAE = mean_absolute_error(y_test, lasoo_y_pred2)
MSE = mean_squared_error(y_test, lasoo_y_pred2)
RMSE = np.sqrt(MSE)
print(MAE, MSE, RMSE)

# our model has gotten better compare to Regression Model

0.4629188302693299 0.3346792460022216 0.5785146895301981


In [61]:
lasso_cv_model.coef_
# here we have a lot of 0 coef here (it believes some featues are not nedeed at all(simplifies the model is good but deghat might get worse))

array([ 5.19612354,  0.43037087,  0.29876351, -4.80417579,  3.46665205,
       -0.40507212,  0.        ,  0.        ,  0.        ,  1.35260206,
       -0.        ,  0.        ,  0.14879719, -0.        ,  0.        ,
        0.        ,  0.09649665,  0.        ,  0.04353956])

we see that some x's have 0 coef but the precision is the same as ridge 
</br>
 so we have a more simple model with good precision and this is better

# ElasticNetCV 
combining L-1 and L-2 

In [64]:
from sklearn.linear_model import ElasticNetCV

# l1-ratio parameter determine how much l-1 and how much l-2 we use
# if it is 1 it would solely use lasoo regression. if it's 0 it would solely use ridge regression
# if it's between 0 and 1 it would use a combination of both
# it's recommended to give numbers more toward 1 and not 0 so lasso would have a bigger impact
ic_model = ElasticNetCV(l1_ratio=[.1, .5, .7, .9, .95, .99, 1], eps=0.001, n_alphas=100, max_iter=1000000)
ic_model.fit(X_train, y_train)

In [66]:
ic_model.l1_ratio_
# it chode solely lasso for this data

np.float64(1.0)

In [67]:
ic_model.alpha_

np.float64(0.004968802520343365)

In [69]:
ic_y_pred = ic_model.predict(X_test)

from sklearn.metrics import mean_absolute_error, mean_squared_error

MAE = mean_absolute_error(y_test, ic_y_pred)
MSE = mean_squared_error(y_test, ic_y_pred)
RMSE = np.sqrt(MSE)
print(MAE, MSE, RMSE)

0.4629188302693299 0.3346792460022216 0.5785146895301981
