# Visualising our approach to tuning Ridge Regression

This notebook presents our key findings from tuning the parameters of our ridge regression and shows how we searched for them: the lambda used for regularization and the degrees to which we augment our data. We also illustrate why we prioritised a lower misclassification rate over a lower mean squared error when selecting these parameters, reinforcing what we were taught: the mean squared error can mislead if blindly trusted in classification problems.

In [2]:
import numpy as np
from helpers import build_poly, build_k_indices, fill_nan_closure, minmax_normalize_closure,\
standardize_closure, predict_without_classifying, batch_iter
from proj1_helpers import load_csv_data, predict_labels
from implementations import compute_mse, ridge_regression 
import pickle # Use of pickle to bypass long computation which is for vizualisation purposes

# Paths and seed shared by the rest of the notebook
DATA_PATH = '../data/'
seed = 12  # handed to build_k_indices when the cross-validation folds are built

# We work with the training data in this notebook
y, x, ids = load_csv_data(DATA_PATH+'train.csv')

# Fill all NA values (-999 in this case) with the median of each column - will compare later median vs mean but not now
# The second value returned by fill_nan_closure is not needed here and is discarded
x, _ = fill_nan_closure(x, np.nanmedian)

# Minmax normalization of x matrix - will compare minmax vs standardization later in this notebook but not now
# The per-column (axis=0) minima and maxima of x are captured in the closure before x is overwritten
minmax_normalize = minmax_normalize_closure(np.min(x, axis=0), np.max(x, axis=0))
x = minmax_normalize(x)

In [3]:
# Hyper-parameter grid explored by the cross-validation in this cell
lambdas = np.array([1e-5]) # NOTE(review): presumably narrowed from np.logspace(-10,5,5) for a quick run - confirm
degrees = (1,2) # polynomial degrees passed to build_poly
k_fold = 7 # number of cross-validation folds

def cross_validation(y, x, k_indices, k, lambda_, degree):
    """Fit ridge regression on every fold except the k-th and score both partitions.

    Parameters
    ----------
    y : array
        Labels, indexed with the row indices stored in ``k_indices``.
    x : array
        Feature matrix, indexed row-wise with the same indices.
    k_indices : array
        Row indices whose first axis enumerates the folds (``k_indices[k]``
        is the held-out fold).
    k : int
        Index of the fold used as the test partition.
    lambda_ : float
        Regularisation strength forwarded to ``ridge_regression``.
    degree : int
        Polynomial degree forwarded to ``build_poly``.

    Returns
    -------
    tuple
        ``(mse_tr, mse_te, misclass_tr, misclass_te, y_te_pred, w)``: the MSE
        on the training and test partitions, the fraction of mislabelled
        samples on each partition, the labels predicted on the test
        partition, and the fitted weights.
    """
    # Current fold test indices are our k indices, all non-k indices (flattened) are our train indices
    is_test_fold = np.arange(k_indices.shape[0]) == k
    te_indice = k_indices[k]
    tr_indice = k_indices[~is_test_fold].reshape(-1)

    # Split data according to determined indices
    y_te, y_tr = y[te_indice], y[tr_indice]
    x_te, x_tr = x[te_indice], x[tr_indice]

    # Build tildeX (including offset term) and augment to provided degree
    tx_tr = build_poly(x_tr, degree)
    tx_te = build_poly(x_te, degree)

    # Compute optimal weights using normal form solution
    w = ridge_regression(y_tr, tx_tr, lambda_)

    # Calculate fold MSE for training and test partitions
    mse_tr = compute_mse(y_tr, tx_tr, w)
    mse_te = compute_mse(y_te, tx_te, w)

    # Predicted labels; the test predictions are returned so callers can inspect
    # what may drive the MSE losses in each degree
    y_tr_pred = predict_labels(w, tx_tr)
    y_te_pred = predict_labels(w, tx_te)

    # Misclassification ratio = mean of the element-wise "prediction differs" mask.
    # np.mean stays vectorised, unlike the built-in sum which iterates in Python.
    misclass_tr = np.mean(y_tr_pred != y_tr)
    misclass_te = np.mean(y_te_pred != y_te)

    return mse_tr, mse_te, misclass_tr, misclass_te, y_te_pred, w

def compare_mse_misclassification_tuning():
    """Cross-validate ridge regression per degree and contrast MSE with misclassification.

    Reads the notebook-level globals ``y``, ``x``, ``k_fold``, ``seed``,
    ``lambdas`` and ``degrees``. For each degree, every lambda is scored with
    k-fold cross-validation and the lambda with the lowest mean test MSE is
    kept as the "best" one (deliberately chosen by MSE, which is what this
    notebook sets out to show is problematic for classification). After each
    degree the results gathered so far are pickled to disk, and at the end the
    degree preferred by each criterion is printed.

    Returns
    -------
    tuple of lists, one entry per degree
        ``(best_lambdas, best_mses_te, best_mses_tr, best_acc_te, best_acc_tr)``.
        Despite their names, the ``*_acc_*`` lists hold misclassification
        ratios (lower is better), not accuracies.
    """
    # build fold indices to feed into cross validation
    k_indices = build_k_indices(y, k_fold, seed)

    # per-degree results for the lambda with the lowest average test MSE
    best_lambdas = []
    best_mses_te, best_mses_tr = [], []
    best_acc_te, best_acc_tr = [], []

    for deg in degrees:
        print(f'Computing for Degree {deg}')

        # fold-averaged mses and misclassification % for all lambdas in the given degree
        mse_te, mse_tr = [], []
        acc_te, acc_tr = [], []

        # main reason we decide to loop over some lambdas is to optimise towards the mse and give it its "best shot"
        for lambda_ in lambdas:

            mse_te_tmp, mse_tr_tmp = [], []
            acc_te_tmp, acc_tr_tmp = [], []

            for k in range(k_fold):
                # fold predictions and weights are not needed by this summary
                fold_mse_tr, fold_mse_te, fold_acc_tr, fold_acc_te, _, _ = cross_validation(y, x, k_indices, k, lambda_, deg)

                mse_te_tmp.append(fold_mse_te)
                mse_tr_tmp.append(fold_mse_tr)
                acc_te_tmp.append(fold_acc_te)
                acc_tr_tmp.append(fold_acc_tr)

            # average the folds and store for logging
            mse_te.append(np.mean(mse_te_tmp, axis=0))
            mse_tr.append(np.mean(mse_tr_tmp, axis=0))
            acc_te.append(np.mean(acc_te_tmp, axis=0))
            acc_tr.append(np.mean(acc_tr_tmp, axis=0))

        # append only the mse and misclassification % for the lowest mse - this is our "best" lambda
        # we will use this to show why perhaps we want to move to minimizing the misclassification to select our best model
        ind_lambda_opt = np.argmin(mse_te)

        best_lambdas.append(lambdas[ind_lambda_opt])
        best_mses_te.append(mse_te[ind_lambda_opt])
        best_mses_tr.append(mse_tr[ind_lambda_opt])
        best_acc_te.append(acc_te[ind_lambda_opt])
        best_acc_tr.append(acc_tr[ind_lambda_opt])

        # save progressively the values to later avoid re-computing
        with open('../../comparing mse and misclassification tuning.pkl', 'wb') as f:
            pickle.dump([best_lambdas, best_mses_te, best_mses_tr, best_acc_te, best_acc_tr], f)

    # Both rankings must run over the per-degree lists; ranking the per-lambda
    # list of the last degree (acc_te) would index `degrees` with a lambda position.
    best_d_mse = degrees[np.argmin(best_mses_te)]
    best_d_misclass = degrees[np.argmin(best_acc_te)]

    # We recognize we selected the "best" values for each degree according to MSE (instead of misclassification %) but it still shows why MSE is flawed
    print(f'According to the lowest MSE the best degree was {best_d_mse}, while according to the lowest misclassifications it was {best_d_misclass}')

    return best_lambdas, best_mses_te, best_mses_tr, best_acc_te, best_acc_tr

# Run the sweep: prints the degree preferred by each criterion and returns the per-degree results
best_lambdas, best_mses_te, best_mses_tr, best_acc_te, best_acc_tr = compare_mse_misclassification_tuning()

Computing for Degree 1


TypeError: data type not understood

In [None]:
# Hyper-parameter grid explored by the cross-validation (same values as the earlier configuration cell)
lambdas = np.array([1e-5]) # NOTE(review): presumably narrowed from np.logspace(-10,5,5) for a quick run - confirm
degrees = (1,2) # polynomial degrees passed to build_poly
k_fold = 7 # number of cross-validation folds

def cross_validation(y, x, k_indices, k, lambda_, degree):
    """Fit ridge regression on every fold except the k-th and score both partitions.

    Parameters
    ----------
    y : array
        Labels, indexed with the row indices stored in ``k_indices``.
    x : array
        Feature matrix, indexed row-wise with the same indices.
    k_indices : array
        Row indices whose first axis enumerates the folds (``k_indices[k]``
        is the held-out fold).
    k : int
        Index of the fold used as the test partition.
    lambda_ : float
        Regularisation strength forwarded to ``ridge_regression``.
    degree : int
        Polynomial degree forwarded to ``build_poly``.

    Returns
    -------
    tuple
        ``(mse_tr, mse_te, misclass_tr, misclass_te, y_te_pred, w)``: the MSE
        on the training and test partitions, the fraction of mislabelled
        samples on each partition, the labels predicted on the test
        partition, and the fitted weights.
    """
    # Current fold test indices are our k indices, all non-k indices (flattened) are our train indices
    is_test_fold = np.arange(k_indices.shape[0]) == k
    te_indice = k_indices[k]
    tr_indice = k_indices[~is_test_fold].reshape(-1)

    # Split data according to determined indices
    y_te, y_tr = y[te_indice], y[tr_indice]
    x_te, x_tr = x[te_indice], x[tr_indice]

    # Build tildeX (including offset term) and augment to provided degree
    tx_tr = build_poly(x_tr, degree)
    tx_te = build_poly(x_te, degree)

    # Compute optimal weights using normal form solution
    w = ridge_regression(y_tr, tx_tr, lambda_)

    # Calculate fold MSE for training and test partitions
    mse_tr = compute_mse(y_tr, tx_tr, w)
    mse_te = compute_mse(y_te, tx_te, w)

    # Predicted labels; the test predictions are returned so callers can inspect
    # what may drive the MSE losses in each degree
    y_tr_pred = predict_labels(w, tx_tr)
    y_te_pred = predict_labels(w, tx_te)

    # Misclassification ratio = mean of the element-wise "prediction differs" mask.
    # np.mean stays vectorised, unlike the built-in sum which iterates in Python.
    misclass_tr = np.mean(y_tr_pred != y_tr)
    misclass_te = np.mean(y_te_pred != y_te)

    return mse_tr, mse_te, misclass_tr, misclass_te, y_te_pred, w



In [5]:

# Sweep every degree and lambda with k-fold cross-validation, keep for each degree
# the lambda with the lowest average test MSE, and report which degree each
# criterion (MSE vs misclassification) would select.

# build fold indices to feed into cross validation
k_indices = build_k_indices(y, k_fold, seed)

# initiate empty lists to store "best" lambdas and their corresponding average (over the folds)
# mse on training and test and average misclassification on training on test
# Note that we say "best" which here we determine by the lowest mse, which is what we want to show is
# problematic in the case of classification
# (the *_acc_* lists hold misclassification ratios, so lower is better)

best_lambdas = []
best_mses_te, best_mses_tr = [], []
best_acc_te, best_acc_tr = [], []
best_y_pred_te = []

for deg in degrees:
    print(f'Computing for Degree {deg}')

    # store mses and misclassification % for all lambdas in the given degree
    mse_te, mse_tr = [], []
    acc_te, acc_tr = [], []
    y_pred_te = []

    # main reason we decide to loop over some lambdas is to optimise towards the mse and give it its "best shot"
    for lambda_ in lambdas:

        mse_te_tmp, mse_tr_tmp = [], []
        acc_te_tmp, acc_tr_tmp = [], []
        y_pred_te_tmp = np.array([])

        for k in range(k_fold):

            fold_mse_tr, fold_mse_te, fold_acc_tr, fold_acc_te, y_te_fold_pred, w = cross_validation(y, x, k_indices, k, lambda_, deg)

            # store mse, misclassification ratio and predictions made of fold for logging
            mse_te_tmp.append(fold_mse_te)
            mse_tr_tmp.append(fold_mse_tr)
            acc_te_tmp.append(fold_acc_te)
            acc_tr_tmp.append(fold_acc_tr)
            # test predictions of all folds are concatenated in fold order
            y_pred_te_tmp = np.hstack((y_pred_te_tmp, y_te_fold_pred))

        # average the folds and store for logging
        mse_te.append(np.mean(mse_te_tmp, axis=0))
        mse_tr.append(np.mean(mse_tr_tmp, axis=0))
        acc_te.append(np.mean(acc_te_tmp, axis=0))
        acc_tr.append(np.mean(acc_tr_tmp, axis=0))
        y_pred_te.append(y_pred_te_tmp)

    # append only the mse and misclassification % for the lowest mse - this is our "best" lambda
    # we will use this to show why perhaps we want to move to minimizing the misclassification to select our best model
    ind_lambda_opt = np.argmin(mse_te)

    best_lambdas.append(lambdas[ind_lambda_opt])
    best_mses_te.append(mse_te[ind_lambda_opt])
    best_mses_tr.append(mse_tr[ind_lambda_opt])
    best_acc_te.append(acc_te[ind_lambda_opt])
    best_acc_tr.append(acc_tr[ind_lambda_opt])
    best_y_pred_te.append(y_pred_te[ind_lambda_opt])

    # save progressively the values to later avoid re-computing
    with open('../../comparing mse and misclassification tuning.pkl', 'wb') as f:
        pickle.dump([best_lambdas, best_mses_te, best_mses_tr, best_acc_te, best_acc_tr], f)

# Both rankings must run over the per-degree lists; ranking the per-lambda list
# of the last degree (acc_te) would index `degrees` with a lambda position.
best_d_mse = degrees[np.argmin(best_mses_te)]
best_d_misclass = degrees[np.argmin(best_acc_te)]

# We recognize we selected the "best" values for each degree according to MSE (instead of misclassification %) but it still shows why MSE is flawed
print(f'According to the lowest MSE the best degree was {best_d_mse}, while according to the lowest misclassifications it was {best_d_misclass}')


Computing for Degree 1


TypeError: data type not understood