# SLU11 - Advanced Validation: Exercises notebook

In [None]:
import pandas as pd
import numpy as np

## 1 Bias-variance trade-off

### Exercise 1: Detecting bias and variance in a simple model (not graded)

Imagine you are measuring voting intentions, namely the percentage of people that will vote for a given political party A, as opposed to political party B.

A way to build this model would be to randomly choose 50 numbers from the phone book, call each one, and ask the respondent who they plan to vote for.

Now, consider we got the following results:

| Party A | Party B | Non-Respondents | Total |
|---------|---------|-----------------|-------|
| 13      | 16      | 21              | 50    |

From the data, we estimate the probability of voting A as:

In [None]:
13 / (13 + 16)

Using our (flawed, as we will see) model, we predict a victory for party B. But can we expect our model to generalize come election day?

In order to understand that, we need to identify sources of bias and variance.

Below you will find a list of issues undermining the model. You need to identify which ones are sources of bias and which ones are sources of variance:

1. Only sampling people from the phone book (bias/~~variance~~)
2. Not following-up with non-respondents (bias/variance)
3. Not weighting responses by likelihood to vote (bias/variance)
4. Small sample size (bias/variance)

### Exercise 2: Detecting bias and variance in the real world (not graded)

For each of the following, identify if they are more likely to be sources of bias or variance:

1. Using very flexible models (e.g., non-parametric, non-linear), such as K-nearest neighbors or decision trees (bias/variance)
2. Using models with simplistic assumptions, such as linear or logistic regressions (bias/variance)
3. Increasing the polynomial degree of our hypothesis function (bias/variance)
4. Ignoring important features (bias/variance)

## 2 Train-test split

### Exercise 3: Create training and test datasets (graded)

In [None]:
from sklearn.model_selection import train_test_split


def implement_hold_out_method(X, y, test_size=.4, random_state=0):
    """ 
    Implementing the holdout method, using sklearn.

    This is an exercise stub: until the placeholder below is replaced with
    a call to train_test_split, calling the function raises
    NotImplementedError and the final return statement is never reached.
    
    Args:
        X (pd.DataFrame): a pandas dataframe containing the features
        y (pd.Series): a pandas series containing the target variable
        test_size (float): proportion of the dataset to include in the test set
        random_state (int): the seed used by the random number generator

    Returns:
        X_train (pd.DataFrame): the features for the training examples
        X_test (pd.DataFrame): the features for the test examples
        y_train (pd.Series): target for the training set 
        y_test (pd.Series): target for the test set

    Raises:
        NotImplementedError: while the exercise placeholder is still in place

    """
    # use train_test_split to create the training and test datasets
    # X_train, X_test, y_train, y_test = ...
    # YOUR CODE HERE
    raise NotImplementedError()
    
    # the four names returned here must be assigned by the code written above
    return X_train, X_test, y_train, y_test

In [None]:
"""Check that the solution is correct."""
from random import randint

def generate_test_data(m, n):
    """
    Generate a random integer dataset and split it into features and target.

    Args:
        m (int): number of rows; also the exclusive upper bound of the
            random integers drawn (values fall in [0, m))
        n (int): total number of columns drawn, target column included

    Returns:
        X (pd.DataFrame): every column except column 0
        y (pd.Series): column 0, used as the target variable
    """
    # single draw of an (m, n) grid of integers in [0, m)
    frame = pd.DataFrame(np.random.randint(0, m, size=(m, n)))
    # column 0 is the target; everything else is a feature
    features = frame.drop(columns=0)
    target = frame[0]
    return features, target

# Build a 100-row random dataset (4 columns drawn, column 0 becomes the target)
# and split it with the function's default test_size and random_state.
X, y = generate_test_data(m=100, n=4)
X_train, X_test, y_train, y_test = implement_hold_out_method(X, y)

### Exercise 4: Creating a validation dataset (graded)

In [None]:
def implement_validation_dataset(X, y, test_size=.25, val_size=.25, random_state=0):
    """ 
    Implementing the holdout method with validation, using sklearn.

    This is an exercise stub: each placeholder below raises
    NotImplementedError until it is replaced, so the function cannot return
    before all three steps are implemented.
    
    Args:
        X (pd.DataFrame): a pandas dataframe containing the features
        y (pd.Series): a pandas series containing the target variable
        test_size (float): proportion of the dataset to include in the test set
        val_size (float): proportion of the dataset to include in the validation set
        random_state (int): the seed used by the random number generator

    Returns:
        X_train (pd.DataFrame): the features for the training examples
        X_test (pd.DataFrame): the features for the test examples
        X_val (pd.DataFrame): the features of the validation examples
        y_train (pd.Series): target for the training set 
        y_test (pd.Series): target for the test set
        y_val (pd.Series): target for the validation set

    Raises:
        NotImplementedError: while any exercise placeholder is still in place

    """
    # use train_test_split to create the test dataset
    # X_temp, X_test, y_temp, y_test = ... (1 line)
    # YOUR CODE HERE
    raise NotImplementedError()
    
    # compute the size of the validation dataset relative to the temp dataset
    # so that the final validation dataset corresponds to val_size
    # val_on_temp_size = ... (1 line)
    # YOUR CODE HERE
    raise NotImplementedError()
    
    # use train_test_split to create the train and validation datasets
    # X_train, X_val, y_train, y_val = ... (1 line)
    # YOUR CODE HERE
    raise NotImplementedError()
    
    # note the return order: features first (train, test, val), then targets
    return X_train, X_test, X_val, y_train, y_test, y_val

In [None]:
"""Check that the solution is correct."""
# Build a 1000-row random dataset (5 columns drawn, column 0 becomes the target)
# and split it into train/test/validation with the function's default sizes.
X, y = generate_test_data(m=1000, n=5)
X_train, X_test, X_val, y_train, y_test, y_val = implement_validation_dataset(X, y)

## 3 Cross-validation

### Exercise 5: Implementing K-fold cross-validation (graded)

In [None]:
from sklearn.model_selection import KFold

def implement_cross_validation(X, y, n_splits, random_state=0):
    """ 
    Implementing the cross-validation split, to create multiple train
    and test set splits.

    This is an exercise stub: each placeholder below raises
    NotImplementedError until it is replaced.
    
    Args:
        X (pd.DataFrame): a pandas dataframe containing the features
        y (pd.Series): a pandas series containing the target variable
        n_splits (int): number of folds, must be at least 2
        random_state (int): the seed used by the random number generator

    Returns:
        folds (dict): dictionary containing the multiple train, test splits,
            keyed by fold number; each value is a dict with the keys
            'X_train', 'X_test', 'y_train' and 'y_test'

    Raises:
        NotImplementedError: while any exercise placeholder is still in place
    """
    # initialize the KFold cross-validator from sklearn, using n_splits and
    # random_state
    # NOTE(review): newer scikit-learn versions appear to reject a
    # random_state unless shuffle=True is also passed - confirm against the
    # installed version
    # kf = ... (1 line)
    # YOUR CODE HERE
    raise NotImplementedError()
    
    # initialize empty dictionary 'folds'
    # folds = ... (1 line)
    # YOUR CODE HERE
    raise NotImplementedError()
    
    for train_index, test_index in kf.split(X):
        # use train_index and test_index to create X_train, X_test, y_train,
        # and y_test (for reference, check the sklearn documentation for kf,
        # and remember that X is a dataframe and y is a series)
        # X_train, X_test = ...
        # y_train, y_test = ...
        # YOUR CODE HERE
        raise NotImplementedError()
        
        # create a 'fold' dictionary with keys 'X_train', 'X_test', 'y_train',
        # and 'y_test' and use the respective datasets as values (please make
        # sure you use the correct keys)
        # YOUR CODE HERE
        raise NotImplementedError()
        
        # create a variable k (int) with the number of the fold (each of the
        # iterations of the loop), to be used as key in the dict 'folds'
        # k = ...
        # YOUR CODE HERE
        raise NotImplementedError()
        
        # add the fold to the folds dictionary, using k as key and fold as 
        # value (hint: check dict.update())
        # YOUR CODE HERE
        raise NotImplementedError()
    
    return folds

In [None]:
"""Check that the solution is correct."""
# Build a 500-row random dataset (5 columns drawn, column 0 becomes the target)
# and request 5 cross-validation folds.
X, y = generate_test_data(m=500, n=5)
folds = implement_cross_validation(X, y, 5)