In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the advertising data; the CSV is read via a relative path, so it must sit in the notebook's working directory
df = pd.read_csv('Advertising.csv')

In [3]:
df.head()

Unnamed: 0,TV,radio,newspaper,sales
0,230.1,37.8,69.2,22.1
1,44.5,39.3,45.1,10.4
2,17.2,45.9,69.3,9.3
3,151.5,41.3,58.5,18.5
4,180.8,10.8,58.4,12.9


Train/Test Split:

In [4]:
#Train/Test Split Procedure:
#1. Clean and adjust data as necessary for X and y
#2. Split Data in Train/Test for both X and y
#3. Fit/Train Scaler on Training X Data
#4. Scale X Train and X Test Data with the fitted scaler
#5. Create Model
#6. Fit/Train Model on X Train Data
#7. Evaluate Model on X Test Data (by creating predictions and comparing to y_test)
#8. Adjust Parameters as Necessary and repeat steps 6 and 7

In [5]:
X = df.drop('sales', axis = 1) #Step 2 (Step 1 is assumed to already be done)

In [6]:
y = df['sales']

In [7]:
from sklearn.model_selection import train_test_split

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 101) #Step 2 - hold out 30% of the rows as the test set

In [9]:
from sklearn.preprocessing import StandardScaler

In [10]:
scaler = StandardScaler()

In [11]:
scaler.fit(X_train) #Step 3

In [12]:
X_train = scaler.transform(X_train)

In [13]:
X_test = scaler.transform(X_test) #Step 4

In [14]:
from sklearn.linear_model import Ridge #Throughout this notebook, we will slowly adjust cross-validation methods

In [15]:
model = Ridge(alpha = 100) #Step 5

In [16]:
model.fit(X_train, y_train) #Step 6

In [17]:
y_pred = model.predict(X_test)

In [18]:
from sklearn.metrics import mean_squared_error

In [19]:
mean_squared_error(y_test, y_pred) #Step 7

7.341775789034128

In [20]:
model_two = Ridge(alpha = 1) #Step 8

In [21]:
model_two.fit(X_train, y_train) #Step 6 repeated

In [22]:
y_pred_two = model_two.predict(X_test)

In [23]:
mean_squared_error(y_test, y_pred_two) #Step 7 repeated

2.3190215794287514

In [24]:
#Advantage:
#Very simple to understand what's going on

#Disadvantages:
#Getting the optimal hyperparameters can be tedious via brute-force
#The test-set mean squared error isn't a fully fair evaluation/performance report,
#because the hyperparameters (alpha) were adjusted based on results on that same test set

Train/Validation/Test Split:

In [25]:
#Train/Validation/Test Split Procedure:
#1. Clean and adjust data as necessary for X and y
#2. Split Data in Train/Validation/Test for both X and y by splitting twice
#3. Fit/Train Scaler on Training X Data
#4. Scale X Train, X Eval and X Test Data with the fitted scaler
#5. Create Model
#6. Fit/Train Model on X Train Data
#7. Evaluate Model on X Evaluation Data (by creating predictions and comparing to y_eval)
#8. Adjust Parameters as Necessary and repeat steps 6 and 7
#9. Get final metrics on Test set (not allowed to go back and adjust after this!)

In [26]:
X = df.drop('sales', axis = 1)

In [27]:
y = df['sales']

In [28]:
from sklearn.model_selection import train_test_split

In [29]:
X_train, X_other, y_train, y_other = train_test_split(X, y, test_size = 0.3, random_state = 101) #Step 2 - First Split

In [30]:
# test_size = 0.5 splits the held-out 30% in half ---> eval = 15% and test = 15% of all data
X_eval, X_test, y_eval, y_test = train_test_split(X_other, y_other, test_size = 0.5, random_state = 101) #Second split

In [31]:
len(df)

200

In [32]:
len(X_train)

140

In [33]:
len(X_eval)

30

In [34]:
len(X_test)

30

In [35]:
from sklearn.preprocessing import StandardScaler

In [36]:
scaler = StandardScaler()

In [37]:
scaler.fit(X_train) #Step 3

In [38]:
X_train = scaler.transform(X_train) #Step 4 - Must be applied to all 3 X subsets

In [39]:
X_test = scaler.transform(X_test)

In [40]:
X_eval = scaler.transform(X_eval)

In [41]:
from sklearn.linear_model import Ridge

In [42]:
model_one = Ridge(alpha = 100) #Step 5

In [43]:
model_one.fit(X_train, y_train) #Step 6

In [44]:
# Predict with model_one, the alpha = 100 model created and fitted in this section,
# rather than relying on a `model` variable left over from the earlier Train/Test section.
y_eval_predictions = model_one.predict(X_eval)

In [45]:
from sklearn.metrics import mean_squared_error

In [46]:
mean_squared_error(y_eval, y_eval_predictions) #Step 7

7.320101458823869

In [47]:
model_two = Ridge(alpha = 1) #Step 8

In [48]:
model_two.fit(X_train, y_train) #Step 6 repeated

In [49]:
new_pred_eval = model_two.predict(X_eval)

In [50]:
mean_squared_error(y_eval, new_pred_eval) #Step 7 repeated
#Is this value a fair performance metric? Yes and no:
#It's fair in the sense that the model was never fit to the evaluation set
#However, the hyperparameter (alpha) was chosen based on previous performance on that same evaluation set
#This is where the final test set comes in, as it gives the fairest performance metric

2.3837830750569853

In [51]:
y_final_test_pred = model_two.predict(X_test) #Step 9 - Note: You cannot go back once you introduce the model to the final test set

In [52]:
mean_squared_error(y_test, y_final_test_pred)

2.254260083800517

Cross Validation With cross_val_score:

<img src="grid_search_cross_validation.png">

In [53]:
X = df.drop('sales', axis = 1)

In [54]:
y = df['sales']

In [55]:
from sklearn.model_selection import train_test_split

In [56]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 101) #The test set is the final holdout test set

In [57]:
from sklearn.preprocessing import StandardScaler

In [58]:
scaler = StandardScaler()

In [59]:
scaler.fit(X_train)

In [60]:
X_train = scaler.transform(X_train)

In [61]:
X_test = scaler.transform(X_test)

In [62]:
model = Ridge(alpha = 100) #This process is able to be used for any ML model, even if it doesn't come with a CV version

In [63]:
from sklearn.model_selection import cross_val_score

In [64]:
scores = cross_val_score(model, X_train, y_train, scoring = 'neg_mean_squared_error', cv = 5)
#The scoring parameter selects the metric used to evaluate performance; scorers are defined so higher is always better (hence the negated MSE)
#The cv parameter determines the number of folds (K) of the K-fold cross-validation process
#NOTE(review): X_train was already scaled with a scaler fit on all of X_train, so every fold's held-out rows contributed to the scaling statistics;
#wrapping the scaler and model in a Pipeline would keep the scaling inside each fold

In [65]:
scores

array([ -9.32552967,  -4.9449624 , -11.39665242,  -7.0242106 ,
        -8.38562723])

In [66]:
abs(scores.mean())

np.float64(8.215396464543607)

In [67]:
model = Ridge(alpha = 1)

In [68]:
scores = cross_val_score(model, X_train, y_train, scoring = 'neg_mean_squared_error', cv = 5)
#Note: this call only yields the per-fold scores; the model still has to be fit on the full training set (done in the next steps) before it is used on the final test set

In [69]:
abs(scores.mean())

np.float64(3.344839296530695)

In [70]:
model.fit(X_train, y_train)

In [71]:
y_final_test_pred = model.predict(X_test)

In [72]:
mean_squared_error(y_test, y_final_test_pred)

2.3190215794287514

Cross Validation With cross_validate:

https://scikit-learn.org/stable/modules/model_evaluation.html

In [73]:
## BUILD THE TARGET (y) AND THE FEATURE MATRIX (X)
y = df['sales']
X = df.drop('sales', axis = 1)

# HOLD OUT 30% OF THE ROWS AS THE FINAL TEST SET
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 101, test_size = 0.3)

# STANDARDIZE THE FEATURES USING STATISTICS LEARNED FROM THE TRAINING ROWS ONLY
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(X_train)  # fit() returns the fitted scaler itself
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [74]:
from sklearn.model_selection import cross_validate

In [75]:
model = Ridge(alpha = 100)

In [77]:
scores = cross_validate(model, X_train, y_train, scoring = ['neg_mean_squared_error', 'neg_mean_absolute_error'], cv = 10)
#Unlike cross_val_score, cross_validate accepts several scoring metrics at once (passed as a list)
#and returns a dict holding one array of per-fold results for each metric

In [None]:
scores #It also collects stats about the time it took to fit and score each fold of the cross-validation process

{'fit_time': array([0.00262427, 0.00300217, 0.00399065, 0.00232673, 0.00099707,
        0.00099707, 0.00300193, 0.00299454, 0.00294924, 0.00199103]),
 'score_time': array([0.00350356, 0.00398588, 0.00397491, 0.00098586, 0.00099754,
        0.00099921, 0.00996375, 0.00250888, 0.00498605, 0.00199461]),
 'test_neg_mean_squared_error': array([ -6.06067062, -10.62703078,  -3.99342608,  -5.00949402,
         -9.14179955, -13.08625636,  -3.83940454,  -9.05878567,
         -9.05545685,  -5.77888211]),
 'test_neg_mean_absolute_error': array([-1.8102116 , -2.54195751, -1.46959386, -1.86276886, -2.52069737,
        -2.45999491, -1.45197069, -2.37739501, -2.44334397, -1.89979708])}

In [79]:
scores = pd.DataFrame(scores)

In [80]:
scores

Unnamed: 0,fit_time,score_time,test_neg_mean_squared_error,test_neg_mean_absolute_error
0,0.002624,0.003504,-6.060671,-1.810212
1,0.003002,0.003986,-10.627031,-2.541958
2,0.003991,0.003975,-3.993426,-1.469594
3,0.002327,0.000986,-5.009494,-1.862769
4,0.000997,0.000998,-9.1418,-2.520697
5,0.000997,0.000999,-13.086256,-2.459995
6,0.003002,0.009964,-3.839405,-1.451971
7,0.002995,0.002509,-9.058786,-2.377395
8,0.002949,0.004986,-9.055457,-2.443344
9,0.001991,0.001995,-5.778882,-1.899797


In [81]:
scores.mean()

fit_time                        0.002487
score_time                      0.003390
test_neg_mean_squared_error    -7.565121
test_neg_mean_absolute_error   -2.083773
dtype: float64

In [82]:
model = Ridge(alpha = 1)

In [83]:
scores = cross_validate(model, X_train, y_train, scoring = ['neg_mean_squared_error', 'neg_mean_absolute_error'], cv = 10)

In [84]:
scores = pd.DataFrame(scores)

In [85]:
scores

Unnamed: 0,fit_time,score_time,test_neg_mean_squared_error,test_neg_mean_absolute_error
0,0.000997,0.000997,-2.962508,-1.457174
1,0.002217,0.001122,-3.057378,-1.555308
2,0.002153,0.000996,-2.17374,-1.23877
3,0.000999,0.000997,-0.833034,-0.768938
4,0.000997,0.001995,-3.464018,-1.434489
5,0.000997,0.000998,-8.232647,-1.494316
6,0.001001,0.000999,-1.905864,-1.081362
7,0.00199,0.002003,-2.765048,-1.250011
8,0.000989,0.000997,-4.989505,-1.580971
9,0.001114,0.000913,-2.846438,-1.223326


In [86]:
scores.mean()

fit_time                        0.001345
score_time                      0.001202
test_neg_mean_squared_error    -3.323018
test_neg_mean_absolute_error   -1.308467
dtype: float64

In [87]:
model.fit(X_train, y_train)

In [88]:
y_final_pred = model.predict(X_test)

In [89]:
mean_squared_error(y_test, y_final_pred)

2.3190215794287514