# Parameter tuning 

In [1]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import time
%load_ext autoreload
%autoreload 2
from proj1_helpers import *
from implementations import *

# Load the training set from 'data/train.csv' with the project helper.
# Presumably y holds the labels, tX the samples (rows) x features (columns)
# and ids the sample identifiers -- confirm against proj1_helpers.load_csv_data.
y, tX, ids = load_csv_data('data/train.csv')

## Scoring

In [None]:
def f1_score(y_valid, tX_valid, weights):
    """
    Return the F1-score achieved with the predictions of a validation set

    Labels are compared against 1 (positive class) and -1 (negative class).

    Parameters
    ----------
    y_valid : np.ndarray
        Vector with the validation labels.
    tX_valid : np.ndarray
        Array with the validation samples as rows and the features as columns.
    weights : np.ndarray
        Vector containing the weights.

    Returns
    -------
    f1 : float
        F1-score for this configuration, rounded to 4 decimals (the closer to
        1 the better). 0.0 is returned when there is no true positive, i.e.
        when precision and/or recall would otherwise be undefined (0/0).

    References
    ----------
    [1] Wikipedia entry for 'Precision and recall'
        https://en.wikipedia.org/wiki/Precision_and_recall
    [2] Wikipedia entry for 'F-score'
        https://en.wikipedia.org/wiki/F-score

    """

    y_pred = predict_labels(weights, tX_valid)  # Obtaining the predictions

    # Calculating number of true positives, false positives, and false negatives
    num_tp = np.sum((y_valid == 1) & (y_pred == 1))
    num_fp = np.sum((y_valid == -1) & (y_pred == 1))
    num_fn = np.sum((y_valid == 1) & (y_pred == -1))

    # Without a single true positive at least one of precision, recall or the
    # final ratio is 0/0. Report the worst score instead of NaN so that one
    # degenerate fold does not turn a whole cross-validation mean into NaN.
    if num_tp == 0:
        return 0.0

    precision = num_tp / (num_tp + num_fp)
    recall = num_tp / (num_tp + num_fn)
    f1 = np.round(2 * precision * recall / (precision + recall), 4)
    return f1

In [None]:
def accuracy(y_valid, tX_valid, weights):
    """
    Compute the share of validation samples that are predicted correctly

    Parameters
    ----------
    y_valid : np.ndarray
        Vector with the validation labels.
    tX_valid : np.ndarray
        Array with the validation samples as rows and the features as columns.
    weights : np.ndarray
        Vector containing the weights.

    Returns
    -------
    acc : float
        Accuracy for this configuration, rounded to 4 decimals
        (the closer to 1 the better)

    References
    ----------
    [3] Wikipedia entry for 'Accuracy and precision'
        https://en.wikipedia.org/wiki/Accuracy_and_precision

    """

    predictions = predict_labels(weights, tX_valid)
    # A prediction is a hit when it equals the label (true positive or true negative)
    hits = y_valid == predictions
    return np.round(np.sum(hits) / len(y_valid), 4)

## Cross validation

In [None]:
def cross_validate(regressor, y, tX, param, k_fold, score='f1'):
    """
    'k_fold' cross validate a regressor with parameters 'param', data '(y, tX)'
    and score the predictions with 'score'

    The samples are shuffled once and cut into 'k_fold' folds of
    int(len(y) / k_fold) samples. Each fold is held out once for validation
    while the regressor is fitted on all remaining samples.

    Parameters
    ----------
    regressor : str or callable
        Fitting function, or a string that evaluates to it. It is called as
        regressor(y_train, tX_train, **param) and must return two values, the
        first one being the weights.
    y : np.ndarray
        Vector with the labels.
    tX : np.ndarray
        Array with the samples as rows and the features as columns.
    param : dict
        Keyword arguments unpacked into the regressor call.
    k_fold : int
        Integer that defines how many folds to make (2 <= k_fold <= len(y)).
    score : str or callable
        'f1' scores with f1_score, any other string scores with accuracy.
        A callable is used directly as score(y_valid, tX_valid, weights).

    Returns
    -------
    mean : float
        Mean of the per-fold scores.
    std : float
        Sample standard deviation (ddof=1) of the per-fold scores.

    Raises
    ------
    ValueError
        If k_fold is smaller than 2 or larger than the number of samples.

    """

    if k_fold < 2 or k_fold > len(y):
        raise ValueError("k_fold must be between 2 and the number of samples")

    # NOTE: eval() executes arbitrary code; only pass trusted regressor
    # strings (or pass the function object itself to avoid eval altogether).
    fit = regressor if callable(regressor) else eval(regressor)

    # Selecting the scoring criterion once, outside of the fold loop
    if callable(score):
        scorer = score
    elif score == 'f1':
        scorer = f1_score
    else:
        scorer = accuracy

    fold_size = int(len(y) / k_fold)
    scores = []
    rand_ind = np.random.permutation(np.arange(len(y)))
    for k in range(k_fold):
        # Splitting data set into validation and training set (for current fold):
        # the current fold is held out, everything else is used for fitting
        valid_ind = rand_ind[k*fold_size:(k+1)*fold_size]
        tX_valid, y_valid = tX[valid_ind], y[valid_ind]
        tX_train, y_train = np.delete(tX, valid_ind, axis=0), np.delete(y, valid_ind, axis=0)
        # Fitting with the training set, scoring with the validation set
        weights, _ = fit(y_train, tX_train, **param)
        scores.append(scorer(y_valid, tX_valid, weights))
    return np.mean(scores), np.std(scores, ddof=1)

In [None]:
def best_parameters(regressor, y, tX, params, k_fold, score):
    """
    Find best parameters from a list of parameters with k_fold cross validation.

    Every entry of 'params' is cross validated with cross_validate and the one
    with the highest mean score is printed and returned. Ties keep the entry
    that was seen first; entries whose mean score is NaN are ignored.

    Parameters
    ----------
    regressor : String
        String that defines which regressor to use
    y : np.ndarray
        Vector with the labels.
    tX : np.ndarray
        Array with the samples as rows and the features as columns.
    params : iterable of dict
        Candidate parameter sets; each one is forwarded to cross_validate.
    k_fold : Integer
        Integer that defines how many folds to make
    score : String
        Scoring criterion forwarded to cross_validate (e.g. 'f1')

    Returns
    -------
    max_param : dict
        Best parameter set ({} if no candidate produced a valid score).
    max_mean_score : float
        Mean cross validation score of the best parameter set.
    max_std_score : float
        Standard deviation of the scores of the best parameter set.

    """

    max_std_score = 0.0
    max_mean_score = 0.0
    max_param = {}
    found = False
    for param in params:
        mean_score, std_score = cross_validate(regressor, y, tX, param, k_fold, score)
        if np.isnan(mean_score):  # An undefined score can never be the best one
            continue
        # The first valid candidate is always accepted, so that a sweep in
        # which every score is 0 still reports a real parameter set
        if not found or mean_score > max_mean_score:
            max_mean_score = mean_score
            max_std_score = std_score
            max_param = param
            found = True
    print("Best parameter set for '", regressor, "' was:")
    print("param = ", max_param)
    print("Mean ", score, " score: ", max_mean_score, " +/- ", max_std_score)
    return max_param, max_mean_score, max_std_score

## Configurations