# Project 1 
#### by Fabio, Ivan and Olivier

## Data Import and Preprocessing

In [41]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2

# Loading and standardizing the training data
from proj1_helpers import *
from implementations import *
y, tX, ids = load_csv_data('data/train.csv')
tX = standardize(tX)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [42]:
# Splitting dataset into train and validation sets
ratio = 0.8  # Ratio of samples to use in new train set
cut_off_point = int(ratio*len(y))
np.random.seed(0)  # Fixing a seed for reproducibility
rand_ind = np.random.permutation(np.arange(len(y)))
y_train, y_valid = y[cut_off_point:], y[:cut_off_point]
tX_train, tX_valid = tX[cut_off_point:], tX[:cut_off_point]

## Calculating the F1-score for a fit

In [43]:
def f1_score(y_valid, tX_valid, weights):
    """
    Return the F1-score achieved with the predictions of a validation set
    
    Parameters
    ----------
    y_valid : np.ndarray
        Vector with the validation labels.
    tX_valid : np.ndarray
        Array with the validation samples as rows and the features as columns.
    weights : np.ndarray
        Vector containing the weights.

    Returns
    -------
    f1 : float
        F1-score for this configuration (the closer to 1 the better)
    """

    # Obtaining the predictions
    y_pred = predict_labels(weights, tX_valid)

    # Calculating number of true positives, false positives, and false negatives
    num_tp = np.sum(y_pred == y_valid)
    num_fp = np.sum((y_pred == 1) & (y_valid == -1))
    num_fn = np.sum((y_pred == -1) & (y_valid == 1))

    precision = num_tp / (num_tp + num_fp)
    recall = num_tp / (num_tp + num_fn)

    f1 = 2 * precision * recall / (precision + recall)
    
    return f1

# Regressors

## Least squares gradient descent

In [44]:
# Hyperparameters
initial_w = np.ones(tX_train.shape[1], dtype=float)
max_iters = 100
gamma = 0.1

# Fitting
weights, loss = least_squares_GD(y_train, tX_train, initial_w, max_iters, gamma)

# Scoring
f1 = f1_score(y_valid, tX_valid, weights)
print("F1-score achieved with 'least_squares_GD': F1 = ", f1)

F1-score achieved with 'least_squares_GD': F1 =  0.8273680569195546


 ## Least squares stochastic gradient descent

In [45]:
# Hyperparameters
initial_w = np.ones(tX_train.shape[1], dtype=float)
max_iters = 1000
gamma = 0.01

# Fitting
weights, loss = least_squares_SGD(y_train, tX_train, initial_w, max_iters, gamma)

# Scoring
f1 = f1_score(y_valid, tX_valid, weights)
print("F1-score achieved with 'least_squares_SGD': F1 = ", f1)

F1-score achieved with 'least_squares_SGD': F1 =  0.7870995912525623


## Least squares (normal equation)

In [46]:
# Fitting
weights, loss = least_squares(y_train, tX_train)

# Scoring
f1 = f1_score(y_valid, tX_valid, weights)
print("F1-score achieved with 'least_squares': F1 = ", f1)

F1-score achieved with 'least_squares': F1 =  0.8362316015489758


## Ridge regression (for least squares normal equation)

In [47]:
# Hyperparameters
lambda_ = 10

# Fitting
weights, loss = ridge_regression(y_valid, tX_valid, lambda_)

# Scoring
f1 = f1_score(y_valid, tX_valid, weights)
print("F1-score achieved with 'ridge_regression': F1 = ", f1)

F1-score achieved with 'ridge_regression': F1 =  0.7814982697275432


## Logistic regression

In [48]:
# Hyperparameters
initial_w = np.ones(tX_train.shape[1], dtype=float)
max_iters = 100
gamma = 0.1

# Fitting
# weights, loss = logistic_regression(y, tX, initial_w, max_iters, gamma)

# Scoring
f1 = f1_score(y_valid, tX_valid, weights)
print("F1-score achieved with 'logistic_regression': F1 = ", f1)

F1-score achieved with 'logistic_regression': F1 =  0.7814982697275432


## Regularized logistic regression

In [49]:
# Hyperparameters
lambda_ = 0.1
initial_w = np.ones(tX_train.shape[1], dtype=float)
max_iters = 100
gamma = 0.1

# Fitting
# weights, loss = reg_logistic_regression(y, tX, lambda_, initial_w, max_iters, gamma)

# Scoring
f1 = f1_score(y_valid, tX_valid, weights)
print("F1-score achieved with 'reg_logistic_regression': F1 = ", f1)

F1-score achieved with 'reg_logistic_regression': F1 =  0.7814982697275432


## Generate predictions and save ouput in csv format for submission

In [51]:
DATA_TEST_PATH = '' # TODO: download train data and supply path here 
_, tX_test, ids_test = load_csv_data(DATA_TEST_PATH)

In [None]:
OUTPUT_PATH = '' # TODO: fill in desired name of output file for submission
y_pred = predict_labels(weights, tX_test)
create_csv_submission(ids_test, y_pred, OUTPUT_PATH)