### OVERVIEW

In [31]:
# What we will conver in this notebook

# Cross validation in detail:
# 1. train | test split
# 2. train | validation | test split
# 3. scikit-learn cross_val_score (function)
# 4. scikit_learn cross_validate (function)

# Grid Search

### CROSS VALIDATION

In [32]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [33]:
df = pd.read_csv('Advertising.csv')

In [34]:
df.head()

Unnamed: 0,TV,radio,newspaper,sales
0,230.1,37.8,69.2,22.1
1,44.5,39.3,45.1,10.4
2,17.2,45.9,69.3,9.3
3,151.5,41.3,58.5,18.5
4,180.8,10.8,58.4,12.9


##### Train | Test Split Procedure

0. Clean and adjust data as necessary for X and y
1. Split Data in Train/Test for both X and y
2. Fit/Train Scaler on Training X Data
3. Scale X Test Data
4. Create Model
5. Fit/Train Model on X Train Data
6. Evaluate Model on X Test Data (by creating predictions and comparing to Y_test)
7. Adjust Parameters as Necessary and repeat steps 5 and 6

In [35]:
X = df.drop('sales', axis = 1)

In [36]:
y = df['sales']

In [37]:
from sklearn.model_selection import train_test_split

In [38]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.3, random_state = 101)

In [39]:
# scaling the data
from sklearn.preprocessing import StandardScaler

In [40]:
scaler = StandardScaler()

In [41]:
# you always fit to the training data never the test data
# we do that to prevent the leakage
# because we dont want to knwo anything about the test data
scaler.fit(X_train)

StandardScaler()

In [42]:
X_train = scaler.transform(X_train)

In [43]:
# we can scale the Xtest but we can not fit to the test data
X_test = scaler.transform(X_test)

In [44]:
# choosing ridge model to adjust the hyper parameters

In [45]:
from sklearn.linear_model import Ridge

In [46]:
model = Ridge(alpha = 100)

In [47]:
model.fit(X_train, y_train)

Ridge(alpha=100)

In [48]:
y_pred = model.predict(X_test)

In [49]:
from sklearn.metrics import mean_squared_error

In [50]:
mean_squared_error(y_test, y_pred)

7.341775789034129

In [51]:
model_two = Ridge(alpha=1)

In [52]:
model_two.fit(X_train, y_train)

Ridge(alpha=1)

In [53]:
y_pred_two = model_two.predict(X_test)

In [54]:
mean_squared_error(y_test, y_pred_two)

2.3190215794287514

In [55]:
# to go through a lot of alpha parameters to select the optimum one is very tedious
# we can apply this through for loop which will be a grid search at a certain point

In [56]:
# when we adjust the alpha parameter due to the prediction based on the other alpha value
# we model has a whiff of the test data and the model now has  little bit of interaction with the test data
# to solve this we keep a little bit of the test data hidden

##### Train | Validation | Test

This is often also called a "hold-out" set, since you should not adjust parameters based on the final test set, but instead use it *only* for reporting final expected performance.

0. Clean and adjust data as necessary for X and y
1. Split Data in Train/Validation/Test for both X and y
2. Fit/Train Scaler on Training X Data
3. Scale X Eval Data
4. Create Model
5. Fit/Train Model on X Train Data
6. Evaluate Model on X Evaluation Data (by creating predictions and comparing to Y_eval)
7. Adjust Parameters as Necessary and repeat steps 5 and 6
8. Get final metrics on Test set (not allowed to go back and adjust after this!)

In [57]:
# if we want a truly fair and final set of performance metrics
# we should get these metrics from a fianal test set that we dont allow to ourselves to adjust on

In [58]:
# we split the data set into three parts
# 1. train
# 2. validation
# 3. test 

In [59]:
# we are not allowed to adjust the hyperparameters after the test set

In [None]:
# This is a HOLD OUT cross validation

In [61]:
X = df.drop('sales',axis=1)

In [62]:
y = df['sales']

In [115]:
from sklearn.model_selection import train_test_split

In [116]:
X_train, X_other, y_train, y_other = train_test_split(X,y, test_size = 0.3, random_state = 101)

In [117]:
X_eval, X_test, y_eval, y_test = train_test_split(X_other ,y_other, test_size = 0.5, random_state = 101)

In [118]:
len(df)

200

In [119]:
len(X_train)

140

In [120]:
len(X_eval)

30

In [121]:
len(X_test)

30

In [122]:
from sklearn.preprocessing import StandardScaler

In [123]:
scaler = StandardScaler()

In [124]:
scaler.fit(X_train)

StandardScaler()

In [125]:
X_train = scaler.transform(X_train)

In [126]:
X_test = scaler.transform(X_test)

In [127]:
X_eval = scaler.transform(X_eval)

In [128]:
from sklearn.linear_model import Ridge

In [129]:
model_one = Ridge(alpha= 100)

In [130]:
model_one.fit(X_train, y_train)

Ridge(alpha=100)

In [134]:
y_eval_pred = model_one.predict(X_eval)

In [135]:
from sklearn.metrics import mean_squared_error

In [136]:
mean_squared_error(y_eval, y_eval_pred)

7.320101458823872

In [137]:
model_two = Ridge(alpha=1)

In [138]:
model_two.fit(X_train, y_train)

Ridge(alpha=1)

In [140]:
y_pred_two = model_two.predict(X_eval)

In [141]:
mean_squared_error(y_eval, y_pred_two)

2.3837830750569866

In [142]:
y_final_test_pred = model_two.predict(X_test)

In [145]:
# this is the finalstage
# after this we cannot adjust the hyperparameters
mean_squared_error(y_test, y_final_test_pred)

2.254260083800517

##### Cross Validation - cross_val_score

In [146]:
# K-fold cross validation

In [147]:
# we split the data into training and test
# the test part is very small and is seperated for the set
# the training is then split into K parts
# the larger the K means more computation
# largest (K = # of rows)
# when you split the training uusing k=5
# this means 4 parts are training and the 1 left is the validation part
# this can be done 5 times
# and there will be 5 errors

----

<img src="grid_search_cross_validation.png">

----

In [148]:
# k-fold cross validation is easy to do with cross_val_score function

In [149]:
df = pd.read_csv('Advertising.csv')

In [150]:
X = df.drop('sales', axis = 1)

In [151]:
y = df['sales']

In [152]:
from sklearn.model_selection import train_test_split

In [173]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)

In [174]:
from sklearn.preprocessing import StandardScaler

In [175]:
scaler.fit(X_train)

StandardScaler()

In [176]:
X_train = scaler.transform(X_train)

In [177]:
X_test = scaler.transform(X_test)

In [185]:
model = Ridge(alpha=100)

In [179]:
from sklearn.model_selection import cross_val_score

In [180]:
scores = cross_val_score(model, X_train, y_train, scoring='neg_mean_squared_error', cv = 5)
# cv is K-fold value

In [181]:
scores

array([ -9.32552967,  -4.9449624 , -11.39665242,  -7.0242106 ,
        -8.38562723])

In [182]:
abs(scores.mean())

8.215396464543607

In [None]:
# another model

In [196]:
model_two = Ridge(alpha=1)

In [197]:
scores = cross_val_score(model_two, X_train, y_train, scoring='neg_mean_squared_error', cv = 5)
# cv is K-fold value

In [198]:
scores

array([-3.15513238, -1.58086982, -5.40455562, -2.21654481, -4.36709384])

In [199]:
abs(scores.mean())

3.344839296530695

In [203]:
# we have to fit the model on the training data once again before predicting
model_two.fit(X_train, y_train)

Ridge(alpha=1)

In [201]:
y_final_test_pred = model.predict(X_test)

In [202]:
mean_squared_error(y_test, y_final_test_pred)

7.341775789034129

##### Cross Validate - cross_validate

In [227]:
# the cross_validate function allows us to view multiple performance metrics from cross validation on a me=odel and explore how much time fitting and testing took

In [205]:
## CREATE X and y
X = df.drop('sales',axis=1)
y = df['sales']

# TRAIN TEST SPLIT
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)

# SCALE DATA
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [206]:
from sklearn.model_selection import cross_validate

In [207]:
model = Ridge(alpha=100)

In [209]:
# SCORING OPTIONS:
# https://scikit-learn.org/stable/modules/model_evaluation.html
scores = cross_validate(model,X_train,y_train,
                         scoring=['neg_mean_absolute_error','neg_mean_squared_error','max_error'],cv=10)

In [212]:
scores = pd.DataFrame(scores)

In [213]:
scores

Unnamed: 0,fit_time,score_time,test_neg_mean_absolute_error,test_neg_mean_squared_error,test_max_error
0,0.001995,0.001994,-1.810212,-6.060671,-5.893055
1,0.001999,0.000992,-2.541958,-10.627031,-6.065249
2,0.000998,0.000994,-1.469594,-3.993426,-5.264242
3,0.001995,0.002995,-1.862769,-5.009494,-4.298812
4,0.001992,0.001998,-2.520697,-9.1418,-5.61586
5,0.001992,0.001998,-2.459995,-13.086256,-10.602074
6,0.001021,0.000977,-1.451971,-3.839405,-4.736134
7,0.000992,0.001992,-2.377395,-9.058786,-6.520936
8,0.001995,0.000997,-2.443344,-9.055457,-7.370495
9,0.000997,0.000997,-1.899797,-5.778882,-5.419462


In [215]:
scores.mean()

fit_time                        0.001598
score_time                      0.001593
test_neg_mean_absolute_error   -2.083773
test_neg_mean_squared_error    -7.565121
test_max_error                 -6.178632
dtype: float64

In [216]:
model = Ridge(alpha=1)

In [217]:
scores = cross_validate(model,X_train,y_train,
                         scoring=['neg_mean_absolute_error','neg_mean_squared_error','max_error'],cv=10)

In [218]:
scores = pd.DataFrame(scores)

In [221]:
scores.mean()

fit_time                        0.001696
score_time                      0.001295
test_neg_mean_absolute_error   -1.308467
test_neg_mean_squared_error    -3.323018
test_max_error                 -4.127257
dtype: float64

In [222]:
model.fit(X_train, y_train)

Ridge(alpha=1)

In [225]:
y_final_pred = model.predict(X_test)

In [228]:
mean_squared_error(y_test, y_final_pred)

2.3190215794287514

### GRID SEARCH

In [229]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [230]:
df = pd.read_csv('Advertising.csv')

In [231]:
df.head()

Unnamed: 0,TV,radio,newspaper,sales
0,230.1,37.8,69.2,22.1
1,44.5,39.3,45.1,10.4
2,17.2,45.9,69.3,9.3
3,151.5,41.3,58.5,18.5
4,180.8,10.8,58.4,12.9


In [232]:
## CREATE X and y
X = df.drop('sales',axis=1)
y = df['sales']

# TRAIN TEST SPLIT
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)

# SCALE DATA
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [234]:
from sklearn.linear_model import ElasticNet

In [235]:
# to use the elastic net we need to have alpha and L1 ration values
# to find the optimal value for these param we do not need to apply for loop
# the for loop part will be done through grid search

In [236]:
base_elastic_net_model = ElasticNet()

In [237]:
# this is like a dictionary
param_grid = {'alpha':[0.1,1,5,10,50,100], 'l1_ratio':[.1,.5,.7,.95,.99,1]}

In [238]:
from sklearn.model_selection import GridSearchCV

In [249]:
# this is running the grid search and cross vaidation
grid_model = GridSearchCV(estimator = base_elastic_net_model,
                         param_grid = param_grid,
                         scoring = 'neg_mean_squared_error',
                         verbose=2, cv=5)

In [242]:
grid_model.fit(X_train, y_train)

Fitting 5 folds for each of 36 candidates, totalling 180 fits
[CV] END ............................alpha=0.1, l1_ratio=0.1; total time=   0.0s
[CV] END ............................alpha=0.1, l1_ratio=0.1; total time=   0.0s
[CV] END ............................alpha=0.1, l1_ratio=0.1; total time=   0.0s
[CV] END ............................alpha=0.1, l1_ratio=0.1; total time=   0.0s
[CV] END ............................alpha=0.1, l1_ratio=0.1; total time=   0.0s
[CV] END ............................alpha=0.1, l1_ratio=0.5; total time=   0.0s
[CV] END ............................alpha=0.1, l1_ratio=0.5; total time=   0.0s
[CV] END ............................alpha=0.1, l1_ratio=0.5; total time=   0.0s
[CV] END ............................alpha=0.1, l1_ratio=0.5; total time=   0.0s
[CV] END ............................alpha=0.1, l1_ratio=0.5; total time=   0.0s
[CV] END ............................alpha=0.1, l1_ratio=0.7; total time=   0.0s
[CV] END ............................alpha=0.1,

[CV] END ...............................alpha=10, l1_ratio=1; total time=   0.0s
[CV] END ...............................alpha=10, l1_ratio=1; total time=   0.0s
[CV] END ...............................alpha=10, l1_ratio=1; total time=   0.0s
[CV] END ...............................alpha=10, l1_ratio=1; total time=   0.0s
[CV] END .............................alpha=50, l1_ratio=0.1; total time=   0.0s
[CV] END .............................alpha=50, l1_ratio=0.1; total time=   0.0s
[CV] END .............................alpha=50, l1_ratio=0.1; total time=   0.0s
[CV] END .............................alpha=50, l1_ratio=0.1; total time=   0.0s
[CV] END .............................alpha=50, l1_ratio=0.1; total time=   0.0s
[CV] END .............................alpha=50, l1_ratio=0.5; total time=   0.0s
[CV] END .............................alpha=50, l1_ratio=0.5; total time=   0.0s
[CV] END .............................alpha=50, l1_ratio=0.5; total time=   0.0s
[CV] END ...................

GridSearchCV(cv=5, estimator=ElasticNet(),
             param_grid={'alpha': [0.1, 1, 5, 10, 50, 100],
                         'l1_ratio': [0.1, 0.5, 0.7, 0.95, 0.99, 1]},
             scoring='neg_mean_squared_error', verbose=2)

In [243]:
grid_model.best_estimator_

ElasticNet(alpha=0.1, l1_ratio=1)

In [245]:
grid_model.best_params_

{'alpha': 0.1, 'l1_ratio': 1}

In [246]:
pd.DataFrame(grid_model.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_alpha,param_l1_ratio,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.004916,0.003855,0.00056,0.000462,0.1,0.1,"{'alpha': 0.1, 'l1_ratio': 0.1}",-3.453021,-1.40519,-5.789125,-2.187302,-4.645576,-3.496043,1.591601,6
1,0.002432,0.000831,0.000623,0.000511,0.1,0.5,"{'alpha': 0.1, 'l1_ratio': 0.5}",-3.32544,-1.427522,-5.59561,-2.163089,-4.451679,-3.392668,1.506827,5
2,0.003559,0.000802,0.000506,0.000486,0.1,0.7,"{'alpha': 0.1, 'l1_ratio': 0.7}",-3.26988,-1.442432,-5.502437,-2.16395,-4.356738,-3.347088,1.462765,4
3,0.002192,0.000404,0.000699,0.000397,0.1,0.95,"{'alpha': 0.1, 'l1_ratio': 0.95}",-3.213052,-1.472417,-5.396258,-2.177452,-4.24108,-3.300052,1.406248,3
4,0.002219,0.000806,0.001139,0.00129,0.1,0.99,"{'alpha': 0.1, 'l1_ratio': 0.99}",-3.208124,-1.478489,-5.380242,-2.181097,-4.222968,-3.294184,1.396953,2
5,0.003764,0.001711,0.001271,0.000882,0.1,1.0,"{'alpha': 0.1, 'l1_ratio': 1}",-3.206943,-1.480065,-5.376257,-2.182076,-4.21846,-3.29276,1.394613,1
6,0.00333,0.00085,0.001057,0.000131,1.0,0.1,"{'alpha': 1, 'l1_ratio': 0.1}",-9.827475,-5.261525,-11.875347,-7.449195,-8.542329,-8.591174,2.222939,12
7,0.002225,0.000347,0.000957,0.000214,1.0,0.5,"{'alpha': 1, 'l1_ratio': 0.5}",-8.707071,-4.214228,-10.879261,-6.204545,-7.173031,-7.435627,2.255532,11
8,0.002481,0.000516,0.000391,0.000479,1.0,0.7,"{'alpha': 1, 'l1_ratio': 0.7}",-7.92087,-3.549562,-10.024877,-5.379553,-6.324836,-6.63994,2.206213,10
9,0.002899,0.000883,0.000717,0.000183,1.0,0.95,"{'alpha': 1, 'l1_ratio': 0.95}",-6.729435,-2.591285,-8.709842,-4.156317,-5.329916,-5.503359,2.102835,9


In [247]:
# we can call predict directly to the grid model
# it takes the best params from teh grid model and predicts using them
y_pred = grid_model.predict(X_test) 

In [248]:
from sklearn.metrics import mean_squared_error

In [250]:
mean_squared_error(y_test, y_pred)

2.3873426420874737