In [None]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)

import numpy as np
import pandas as pd
from scipy import stats
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix

from sklearn.linear_model import LinearRegression
from sklearn.svm import SVC

#### Imports for Cross-Validation

In [None]:
from sklearn.model_selection import cross_val_score #k-Fold Cross Validation
from sklearn.model_selection import GridSearchCV

    
## Model Validation Methods

* Holdout
    - Holdout Validation
* K-fold Cross-Validation
    - Leave-One-Out Cross-Validation
 

### Holdout 

* Different definitions of validation set

#### Validation set as test set (what we have been doing)

* Randomly divide training set into two parts
    - Training set
    - Test set (also called the validation or hold-out set)
* train_test_split function in sklearn

#### Holdout Validation

* Separate validation set to determine hyperparameters

* Split data into three sets: train, validation, test
    - Training data: learn model parameters
    - Validation set: Determine optimum hyperparameters
    - Test set: prediction of new data
* Trains model on less data
    - Generally better to train on as much data as possible
    - Often don't have enough data

### Hyperparameter Tuning using validation set

In [None]:
# Load the iris data set bundled with seaborn.
# Columns 0-3 are used as the feature matrix, column 4 as the class label.
iris = sns.load_dataset('iris')
feature_cols = slice(0, 4)
label_col = 4
X = iris.iloc[:, feature_cols].values
y = iris.iloc[:, label_col].values
X.shape, y.shape

In [None]:
# First split: hold out 25% of all observations as the final test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.25,
    stratify=y,
    random_state=4,
)
# Second split: carve a validation set (25% of the training portion) out of
# the training data; it is used only for choosing hyperparameters.
X_train2, X_val, y_train2, y_val = train_test_split(
    X_train, y_train,
    test_size=.25,
    stratify=y_train,
    random_state=4,
)

# Scale features: the scaler is fit on the reduced training set only and then
# applied unchanged to the validation set.
sc = StandardScaler()
X_train2 = sc.fit_transform(X_train2)
X_val = sc.transform(X_val)
X_train2.shape, X_val.shape, y_train2.shape, y_val.shape

In [None]:
# Candidate values for the SVM regularisation parameter C.
parameters = (.1, 1.0, 10.0, 100.0)

# Fit one linear SVC per candidate on the reduced training set and record
# its accuracy on the validation set.
val_accuracies = []
for c_value in parameters:
    svc_model = SVC(C=c_value, kernel='linear', random_state=1234)
    svc_model.fit(X_train2, y_train2)
    val_accuracies.append(svc_model.score(X_val, y_val))
scores = np.array(val_accuracies)
print(scores)

# np.argmax returns the first maximum, so ties go to the smallest C.
best_C = parameters[np.argmax(scores)]
print(f'Best parameter is {best_C}')

#### Refit  and test model with best hyperparameter

In [None]:
# Scale features: re-fit the scaler on the FULL training set (train + validation)
# and apply the same transformation to the test set.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

# Refit with the selected C on the full training set. Fitting on
# X_train2/y_train2 here would use less data and features scaled by a
# different scaler than the one applied to X_test.
final_model = SVC(C = best_C, kernel = 'linear', random_state = 1234)
final_model.fit(X_train, y_train)
final_model.score(X_test, y_test)

###  Potential Drawbacks of Holdout Approach

* Highly variable
    - Depends on which observations are included in the test set  
    
* Overestimates test error rate for entire data set 
    - Since trained on fewer observations
    - Increased bias

#### Variability using single sample

In [None]:
# Read the Auto data from the working directory and preview its last five rows.
auto_csv = "Auto.csv"
df = pd.read_csv(auto_csv)
df.tail(5)

In [None]:
# Build the design matrix: column j holds horsepower**(j+1), for powers 1..5.
# NOTE(review): assumes the horsepower column was parsed as numeric - TODO confirm.
x = df.loc[:, "horsepower"].values
powers = range(1, 6)
X = np.array([x**p for p in powers]).T
y = df.loc[:, "mpg"].values
X[0, :]

In [None]:
def one_epoch(X, y, degree, random_state=None):
    """Fit regressions on a growing number of columns for one random split.

    The data is split 80/20 into train and test sets. For each d in
    1..degree a LinearRegression is fit on the first d columns of the
    training features and its mean squared error on the test split is stored.

    Parameters
    ----------
    X : ndarray, 2-D
        Feature matrix with at least ``degree`` columns (in this notebook
        column j holds horsepower**(j+1)).
    y : ndarray
        Target values, one per row of ``X``.
    degree : int
        Number of nested models to fit (first 1, 2, ..., ``degree`` columns).
    random_state : int or None, optional
        Passed to ``train_test_split``. The default ``None`` keeps the
        original behaviour: a different random split on every call.

    Returns
    -------
    ndarray of shape (degree,)
        ``mses[d - 1]`` is the test MSE of the model using the first d columns.

    Raises
    ------
    ValueError
        If ``degree`` exceeds the number of columns in ``X``; slicing would
        otherwise silently refit the same full model several times.
    """
    if degree > X.shape[1]:
        raise ValueError(
            f"degree={degree} exceeds the {X.shape[1]} feature columns in X"
        )
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.20, random_state=random_state
    )
    mses = np.zeros(degree)
    for n_cols in range(1, degree + 1):
        model = LinearRegression()
        model.fit(X_train[:, :n_cols], y_train)
        preds = model.predict(X_test[:, :n_cols])
        mses[n_cols - 1] = np.mean((y_test - preds) ** 2)
    return mses

In [None]:
num_epochs = 4    # number of independent random train/test splits
num_degrees = 5   # highest polynomial degree to fit

# One array of per-degree test MSEs for every random split.
mses = [one_epoch(X, y, num_degrees) for _ in range(num_epochs)]

# argmin is a zero-based index while degrees start at 1, so shift by one
# before reporting the best degree of each split.
best_degrees = [int(np.argmin(epoch_mses)) + 1 for epoch_mses in mses]
print(f'Degree with lowest MSE {best_degrees}')

# Transposed view: row d holds the MSE of degree d+1 across all splits.
msesT = np.array(mses).T

In [None]:
# One line per polynomial degree, showing how its test MSE moves from one
# random split (epoch) to the next.
fig,ax = plt.subplots(figsize = (12,6))
epochs = range(1, num_epochs + 1)   # derived from num_epochs, not hard-coded
degree_labels = []
for row in range(num_degrees):
    ax.plot(epochs, msesT[row], 'o-')
    # Row 0 of msesT is the degree-1 model, so labels start at 1.
    degree_labels.append(f'degree {row + 1}')
plt.xlabel("Epoch")
plt.ylabel("MSE")
plt.legend(degree_labels)
plt.title(f"Variablity due to random sampling - {num_degrees} degrees");

In [None]:
# One line per random split, showing test MSE as a function of polynomial degree.
fig,ax = plt.subplots(figsize = (12,6))
degrees = range(1, num_degrees + 1)   # derived from num_degrees, not hard-coded
iteration_labels = []
for run in range(num_epochs):
    ax.plot(degrees, mses[run], 'o-')
    # Number iterations from 1, matching the epoch axis of the previous figure.
    iteration_labels.append(f'Iteration {run + 1}')
plt.xlabel("Degree")
plt.ylabel("MSE")
plt.legend(iteration_labels)
plt.title(f"Variablity due to random sampling - {num_epochs} iterations");


## Cross-Validation

* Used for evaluating the model's test error to 
    - Evaluate the model's performance (model assessment)
    - Select the appropriate level of flexibility (model selection)
    
![](k-fold.png)
$$\text{Figure 2. 5-fold Cross Validation}$$    
    
#### Cross-Validation is a resampling method

* Given a training set, repeatedly draw samples from that set and refit the model on each sample to gain additional information about the fitted model
* Can be computationally expensive, but with today's computing power resampling methods are tractable.
* Cross-Validation and Bootstrap (covered in Ensemble methods)

#### Model Assessment ( Accuracy/Test Error Rate)
 
* Accuracy will vary depending on the particular sample. 
* More complex models will vary more from sample to sample
* Cross-Validation improves the accuracy estimate (not the model itself) by averaging over several train/validation splits, which reduces its bias
 
#### Used in Model Selection to determine the best hyperparameters
 
* Two types of parameters:
    - Learned by model
    - Set by modeler (hyperparameters)
 
* Determining the optimum value for a hyperparameter
    - K in KNN
    - C,$\gamma$ in SVM

### k-fold Cross-Validation 

![](grid_search.png)
$$\text{Figure 1. Cross-Validation Workflow}$$


#### Algorithm

1. Obtain a dataset
    - Shuffle the dataset (optional) 
2. Randomly divide the data into k groups or folds of approximately equal size  
3. For each of the k folds  
    a. Make it the validation set  
    b. Fit learning method on k-1 remaining folds  
    c. Predict the validation set  
    d. Calculate Cross Validation score on the MSE of validation set  
4. Average the Cross Validation scores for a final performance measure 

<div style="font-size: 115%;">
$$CV_{(k)} = \frac{1}{k}\sum_{i=1}^{k}MSE_i$$
</div>

* Each observation in the original sample is used once in the validation set and k-1 times in the training set
* Lower bias than a simple train/test split.
    - Less prone to overestimate the test error
* Any scaling or tuning of hyperparameters must be done within step three to prevent data leakage
* k = 5 or k = 10 have been found to produce the best results (i.e. best bias-variance tradeoff)
    - The simple train/test split that we have been doing corresponds to evaluating only a single fold (k-fold itself requires k >= 2)
    - k = n is called Leave-One-Out Cross Validation (see below)
* Stratified Cross Validation is used in Classification.
    - Each fold has the same proportion of observations with a given categorical value.
* Repeated Cross Validation: Repeat the Cross Validation procedure N times

#### k-fold Cross Validation in Classification

* Use number of misclassifications to quantify test error

<div style="font-size: 115%;">
$$CV_{(k)} = \frac{1}{k}\sum_{i=1}^{k}Err_i, \qquad Err_i = \frac{1}{n_i}\sum_{j \in \text{fold } i} I(y_j\ne\hat{y}_j)$$
</div>

#### Leave-One-Out Cross Validation
 
* Given $(x_1,y_1)$,$(x_2,y_2)$,...,$(x_n,y_n)$
* Let: $(x_1,y_1)$ be the validation set and $(x_2,y_2)$,...,$(x_n,y_n)$ be the training set
* Fit the model on the training set and calculate $MSE_1$
    - Predict $\hat{y}_1$ based on $x_1$
        - $MSE_1 = (y_1 - \hat{y}_1)^2$
    - Repeat for 2,...,n
    
<div style="font-size: 115%;">
$$CV_{(n)} = \frac{1}{n}\sum_{i=1}^{n}MSE_i$$
</div>

* Less bias since uses almost all the data in the training set
* No randomness in training/test set splits
* But can be computationally expensive and has high variance because:
    - Averaging the outputs of n fitted models trained on datasets that are highly correlated. 
    - The variance of a sum of positively correlated variables is higher than that of uncorrelated variables

$$Var(X + Y) = Var(X) + Var(Y) + 2\cdot{Cov(X,Y)}$$

### Sklearn Cross-Validation

http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_val_score.html

In [None]:
# Load the purchase data; columns 1 and 2 are used as features and
# column 3 as the class label.
dataset = pd.read_csv('PurchaseData.csv')
feature_cols = [1, 2]
label_col = 3
X = dataset.iloc[:, feature_cols].values
y = dataset.iloc[:, label_col].values
X.shape

In [None]:
# Stratified 75/25 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.25,
    random_state=12,
    stratify=y,
)

# Scale features: fit the scaler on the training set only, then apply the
# same transformation to the test set.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [None]:
# Fit an RBF-kernel SVM on the scaled training set.
svc_model = SVC(kernel='rbf', random_state=1234)
svc_model.fit(X_train, y_train)

# Predict the held-out test set.
y_pred = svc_model.predict(X_test)

# Confusion matrix; correct predictions lie on the diagonal, so
# accuracy = trace / total count.
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy = cm.trace() / cm.sum()
accuracy

In [None]:
# Score the fitted model on the data it was trained on, for comparison
# with the test accuracy computed from the confusion matrix.
svc_model.score(X_train,y_train)

In [None]:
# k-fold cross-validation of a default SVC on the training set.
# NOTE(review): X_train was already scaled with statistics from the whole
# training set, so each validation fold influenced the scaler; wrapping the
# scaler and model in a Pipeline would avoid that leakage.
n_folds = 10

# No `scoring` argument is given, so the estimator's own score method is used.
scores = cross_val_score(SVC(), X_train, y_train, cv=n_folds)
scores

In [None]:
# Plot the per-fold accuracy. The x-axis is derived from the number of scores,
# so the cell keeps working if the number of folds is changed.
fold_ids = range(1, len(scores) + 1)
plt.plot(fold_ids, scores, "o-")
plt.xlabel("Fold")
plt.ylabel("Accuracy")
print("Mean accuracy: ",scores.mean())
print("Stanard Deviation of accuracies: ",scores.std())
# Rough interval: mean +/- two standard deviations of the fold scores.
print(f'95% Confidence Interval: {round(scores.mean(),3)} +/- {round(2*scores.std(),3)}')


### Grid search cross validation

http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html


* Search for the optimal hyperparameter.

* Optimal is the one with the best Cross Validation score

* Parameters
    - param_grid: dictionary of hyperparameters and values to search over
    - estimator: the model object
    - cv: the number of cv folds
    - scoring: "accuracy" for classification, "r2" for regression
        - calls accuracy_score or r2_score
* Result attributes
    - best_score_
    - best_params_

In [None]:
# Grid Search Python
# Applying Grid Search to find the best model and the best parameters

# Two sub-grids: a linear kernel searched over C only, and an RBF kernel
# searched over every combination of C and gamma.
parameters = [{'C': [1, 10, 100, 1000], 'kernel': ['linear']},
              {'C': [1, 10, 100, 1000], 'kernel': ['rbf'], 'gamma': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]}]
# `iid` is intentionally not passed: the original value was the truthy string
# 'False', and the parameter no longer exists in current scikit-learn, where
# passing it raises a TypeError.
grid_search = GridSearchCV(estimator = SVC(),
                           param_grid = parameters,
                           scoring = 'accuracy',
                           cv = 10,
                           n_jobs = -1)
grid_search = grid_search.fit(X_train, y_train)
best_accuracy = grid_search.best_score_       # mean CV accuracy of the best setting
best_parameters = grid_search.best_params_    # dict of the winning hyperparameters
print("Best accuracy: ", best_accuracy)
print("Best parameters: ", best_parameters )

In [None]:
# Refit on the full training set with the hyperparameters chosen by the grid
# search, rather than values copied by hand from an earlier run.
svc_model = SVC(**best_parameters)
svc_model.fit(X_train, y_train)

# Predict test data
y_pred = svc_model.predict(X_test)

# Make the Confusion Matrix; accuracy = correct predictions (diagonal) / total.
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy = np.trace(cm)/np.sum(cm)
accuracy

#### Figures

Figure 1: Sklearn Cross-Validation Users Guide

Figure 2: "An Introduction to Statistical Learning, with applications in R" (Springer, 2013) with permission from the authors: G. James, D. Witten, T. Hastie and R. Tibshirani 