In [None]:
# Useful starting lines
import numpy as np
from proj1_helpers import load_csv_data, predict_labels, compute_accuracy, create_csv_submission
from cross_validation import build_k_indices
from helpers import process_data, add_constant_column, build_poly
from implementations import least_squares_gd, least_squares_sgd, least_squares, ridge_regression, logistic_regression, reg_logistic_regression

%load_ext autoreload
%autoreload 2

# Seed passed to build_k_indices when the data is split into folds below
seed = 10

## Load the training data into feature matrix, class labels, and event ids:

In [None]:
DATA_TRAIN_PATH = 'data/train.csv' # TODO: download train data and supply path here
DATA_TEST_PATH = 'data/test.csv' # TODO: download test data and supply path here

We load the training data into `y` (class labels), `tX` (feature matrix) and `ids` (event ids).

In [None]:
# Loaded once here; every cross-validation cell below reuses y and tX
y, tX, ids = load_csv_data(DATA_TRAIN_PATH)

## Gradient descent

In [None]:
def cross_validation_gradient_descent(y, x, k_indices, k, gamma, max_iters):
    """Run one cross-validation fold of least squares fitted by gradient descent.

    The rows listed in ``k_indices[k]`` form the test split; the rows of all
    remaining groups form the training split. Both splits are passed through
    ``process_data`` before the model is fitted on the training part.

    Args:
        y: labels, indexed with the row indices stored in ``k_indices``.
        x: feature matrix whose rows are aligned with ``y``.
        k_indices: groups of row indices, one group per fold (this notebook
            builds them with ``build_k_indices``).
        k: position of the group held out for testing.
        gamma: forwarded to ``least_squares_gd`` (presumably the step size).
        max_iters: forwarded to ``least_squares_gd``.

    Returns:
        Tuple ``(train accuracy, test accuracy)`` from ``compute_accuracy``.
    """
    # Fold k is held out; every other fold is flattened into the training indices
    held_out = k_indices[k]
    remaining = np.delete(k_indices, (k), axis=0).ravel()

    features_tr, labels_tr = x[remaining, :], y[remaining]
    features_te, labels_te = x[held_out, :], y[held_out]

    features_tr, features_te = process_data(features_tr, features_te)

    # Start from the zero vector, one weight per processed feature column
    w0 = np.zeros(features_tr.shape[1])
    weights, _ = least_squares_gd(labels_tr, features_tr, w0, max_iters, gamma)

    # Accuracy of the fitted weights on each split
    train_accuracy = compute_accuracy(predict_labels(weights, features_tr), labels_tr)
    test_accuracy = compute_accuracy(predict_labels(weights, features_te), labels_te)
    return train_accuracy, test_accuracy

In [None]:
# Settings for the gradient-descent cross-validation
k_fold = 10
gamma = 0.01
max_iters = 500

# Partition the row indices into k_fold groups
k_indices = build_k_indices(y, k_fold, seed)

# One (train accuracy, test accuracy) pair per fold
fold_scores = [
    cross_validation_gradient_descent(y, tX, k_indices, fold, gamma, max_iters)
    for fold in range(k_fold)
]
accs_train = [train_acc for train_acc, _ in fold_scores]
accs_test = [test_acc for _, test_acc in fold_scores]

for fold, (train_acc, test_acc) in enumerate(zip(accs_train, accs_test)):
    print("%d - Training accuracy: %f / Test accuracy : %f" % (fold, train_acc, test_acc))

# Summary of the test accuracies across folds
print("\nAverage test accuracy: %f" % np.mean(accs_test))
print("Variance test accuracy: %f" % np.var(accs_test))
print("Min test accuracy: %f" % np.min(accs_test))
print("Max test accuracy: %f" % np.max(accs_test))

## Stochastic gradient descent

In [None]:
def cross_validation_stochastic_gradient_descent(y, x, k_indices, k, gamma, max_iters):
    """Run one cross-validation fold of least squares fitted by stochastic gradient descent.

    Group ``k`` of ``k_indices`` is used as the test split and all other
    groups as the training split. Both splits go through ``process_data``
    before ``least_squares_sgd`` is fitted on the training part.

    Args:
        y: labels, indexed with the row indices stored in ``k_indices``.
        x: feature matrix whose rows are aligned with ``y``.
        k_indices: groups of row indices, one group per fold.
        k: position of the group held out for testing.
        gamma: forwarded to ``least_squares_sgd`` (presumably the step size).
        max_iters: forwarded to ``least_squares_sgd``.

    Returns:
        Tuple ``(train accuracy, test accuracy)`` from ``compute_accuracy``.
    """
    # Row indices of the held-out fold and of all the other folds combined
    test_rows = k_indices[k]
    train_rows = np.delete(k_indices, (k), axis=0).ravel()

    train_x, train_y = x[train_rows, :], y[train_rows]
    test_x, test_y = x[test_rows, :], y[test_rows]

    train_x, test_x = process_data(train_x, test_x)

    # Zero initialisation, one weight per processed feature column
    start_w = np.zeros(train_x.shape[1])
    weights, _ = least_squares_sgd(train_y, train_x, start_w, max_iters, gamma)

    # Score the fitted weights on both splits
    train_pred = predict_labels(weights, train_x)
    test_pred = predict_labels(weights, test_x)
    return compute_accuracy(train_pred, train_y), compute_accuracy(test_pred, test_y)

In [None]:
# Settings for the stochastic-gradient-descent cross-validation
k_fold = 10
gamma = 0.01
max_iters = 100

# Partition the row indices into k_fold groups
k_indices = build_k_indices(y, k_fold, seed)

# One (train accuracy, test accuracy) pair per fold
fold_scores = [
    cross_validation_stochastic_gradient_descent(y, tX, k_indices, fold, gamma, max_iters)
    for fold in range(k_fold)
]
accs_train = [train_acc for train_acc, _ in fold_scores]
accs_test = [test_acc for _, test_acc in fold_scores]

for fold, (train_acc, test_acc) in enumerate(zip(accs_train, accs_test)):
    print("%d - Training accuracy: %f / Test accuracy : %f" % (fold, train_acc, test_acc))

# Summary of the test accuracies across folds
print("\nAverage test accuracy: %f" % np.mean(accs_test))
print("Variance test accuracy: %f" % np.var(accs_test))
print("Min test accuracy: %f" % np.min(accs_test))
print("Max test accuracy: %f" % np.max(accs_test))

## Least squares

In [None]:
def cross_validation_least_squares(y, x, k_indices, k):
    """Run one cross-validation fold of the ``least_squares`` solver.

    Group ``k`` of ``k_indices`` is used as the test split and all other
    groups as the training split. Both splits go through ``process_data``
    before the model is fitted on the training part.

    Args:
        y: labels, indexed with the row indices stored in ``k_indices``.
        x: feature matrix whose rows are aligned with ``y``.
        k_indices: groups of row indices, one group per fold.
        k: position of the group held out for testing.

    Returns:
        Tuple ``(train accuracy, test accuracy)`` from ``compute_accuracy``.
    """
    # Held-out fold versus all remaining folds flattened together
    held_out = k_indices[k]
    remaining = np.delete(k_indices, (k), axis=0).ravel()

    fit_x, fit_y = x[remaining, :], y[remaining]
    eval_x, eval_y = x[held_out, :], y[held_out]

    fit_x, eval_x = process_data(fit_x, eval_x)

    # Only the weights are needed; the loss returned alongside is discarded
    weights, _ = least_squares(fit_y, fit_x)

    # Accuracy of the fitted weights on each split
    train_accuracy = compute_accuracy(predict_labels(weights, fit_x), fit_y)
    test_accuracy = compute_accuracy(predict_labels(weights, eval_x), eval_y)
    return train_accuracy, test_accuracy

In [None]:
# Number of folds for the least-squares cross-validation
k_fold = 10

# Partition the row indices into k_fold groups
k_indices = build_k_indices(y, k_fold, seed)

# One (train accuracy, test accuracy) pair per fold
fold_scores = [
    cross_validation_least_squares(y, tX, k_indices, fold)
    for fold in range(k_fold)
]
accs_train = [train_acc for train_acc, _ in fold_scores]
accs_test = [test_acc for _, test_acc in fold_scores]

for fold, (train_acc, test_acc) in enumerate(zip(accs_train, accs_test)):
    print("%d - Training accuracy: %f / Test accuracy : %f" % (fold, train_acc, test_acc))

# Summary of the test accuracies across folds
print("\nAverage test accuracy: %f" % np.mean(accs_test))
print("Variance test accuracy: %f" % np.var(accs_test))
print("Min test accuracy: %f" % np.min(accs_test))
print("Max test accuracy: %f" % np.max(accs_test))

## Ridge regression

In [None]:
def cross_validation_ridge_regression(y, x, k_indices, k, lambda_, degree):
    """Run one cross-validation fold of ridge regression on polynomial features.

    Group ``k`` of ``k_indices`` is used as the test split and all other
    groups as the training split. Each split is processed with
    ``process_data``, expanded with ``build_poly`` and extended with
    ``add_constant_column`` before ``ridge_regression`` is fitted on the
    training part.

    Args:
        y: labels, indexed with the row indices stored in ``k_indices``.
        x: feature matrix whose rows are aligned with ``y``.
        k_indices: groups of row indices, one group per fold.
        k: position of the group held out for testing.
        lambda_: forwarded to ``ridge_regression`` (presumably the
            regularisation strength).
        degree: forwarded to ``build_poly``.

    Returns:
        Tuple ``(train accuracy, test accuracy)`` from ``compute_accuracy``.
    """
    # Held-out fold versus all remaining folds flattened together
    test_rows = k_indices[k]
    train_rows = np.delete(k_indices, (k), axis=0).ravel()

    train_x, train_y = x[train_rows, :], y[train_rows]
    test_x, test_y = x[test_rows, :], y[test_rows]

    # NOTE(review): the third argument False presumably stops process_data from
    # adding its own constant column, since one is added after the expansion
    # below -- confirm in helpers.
    train_x, test_x = process_data(train_x, test_x, False)

    # Polynomial expansion first, constant column afterwards
    design_train = add_constant_column(build_poly(train_x, degree))
    design_test = add_constant_column(build_poly(test_x, degree))

    # Only the weights are needed; the loss returned alongside is discarded
    weights, _ = ridge_regression(train_y, design_train, lambda_)

    # Accuracy of the fitted weights on each split
    score_train = compute_accuracy(predict_labels(weights, design_train), train_y)
    score_test = compute_accuracy(predict_labels(weights, design_test), test_y)
    return score_train, score_test

In [None]:
# Settings for the ridge regression cross-validation
k_fold = 10
lambda_ = 0.01
degree = 7

# Split data in k-fold
k_indices = build_k_indices(y, k_fold, seed)

# Per-fold accuracies, named as in the other cross-validation cells
accs_train = []
accs_test = []

for k in range(k_fold):
    # cross_validation_ridge_regression returns accuracies, not losses
    acc_train, acc_test = cross_validation_ridge_regression(y, tX, k_indices, k, lambda_, degree)
    accs_train.append(acc_train)
    accs_test.append(acc_test)

for i in range(len(accs_train)):
    print("%d - Training accuracy: %f / Test accuracy : %f" % (i, accs_train[i], accs_test[i]))

# Summary of the test accuracies across folds
print("\nAverage test accuracy: %f" % np.mean(accs_test))
print("Variance test accuracy: %f" % np.var(accs_test))
print("Min test accuracy: %f" % np.min(accs_test))
print("Max test accuracy: %f" % np.max(accs_test))

## Logistic regression

In [None]:
def cross_validation_logistic_regression(y, x, k_indices, k, max_iters, gamma):
    """Run one cross-validation fold of logistic regression.

    Group ``k`` of ``k_indices`` is used as the test split and all other
    groups as the training split. Both splits go through ``process_data``
    before ``logistic_regression`` is fitted on the training part.

    Args:
        y: labels, indexed with the row indices stored in ``k_indices``.
        x: feature matrix whose rows are aligned with ``y``.
        k_indices: groups of row indices, one group per fold.
        k: position of the group held out for testing.
        max_iters: forwarded to ``logistic_regression``.
        gamma: forwarded to ``logistic_regression`` (presumably the step size).

    Returns:
        Tuple ``(train accuracy, test accuracy)`` from ``compute_accuracy``.
    """
    # Held-out fold versus all remaining folds flattened together
    held_out = k_indices[k]
    remaining = np.delete(k_indices, (k), axis=0).ravel()

    fit_x, fit_y = x[remaining, :], y[remaining]
    eval_x, eval_y = x[held_out, :], y[held_out]

    fit_x, eval_x = process_data(fit_x, eval_x)

    # Zero initialisation, one weight per processed feature column
    w0 = np.zeros(fit_x.shape[1])
    weights, _ = logistic_regression(fit_y, fit_x, w0, max_iters, gamma)

    # Score the fitted weights on both splits
    fit_pred = predict_labels(weights, fit_x)
    eval_pred = predict_labels(weights, eval_x)
    return compute_accuracy(fit_pred, fit_y), compute_accuracy(eval_pred, eval_y)

In [None]:
# Settings for the logistic regression cross-validation
k_fold = 10
gamma = 0.6
max_iters = 100

# Partition the row indices into k_fold groups
k_indices = build_k_indices(y, k_fold, seed)

# One (train accuracy, test accuracy) pair per fold
fold_scores = [
    cross_validation_logistic_regression(y, tX, k_indices, fold, max_iters, gamma)
    for fold in range(k_fold)
]
accs_train = [train_acc for train_acc, _ in fold_scores]
accs_test = [test_acc for _, test_acc in fold_scores]

for fold, (train_acc, test_acc) in enumerate(zip(accs_train, accs_test)):
    print("%d - Training accuracy: %f / Test accuracy : %f" % (fold, train_acc, test_acc))

# Summary of the test accuracies across folds
print("\nAverage test accuracy: %f" % np.mean(accs_test))
print("Variance test accuracy: %f" % np.var(accs_test))
print("Min test accuracy: %f" % np.min(accs_test))
print("Max test accuracy: %f" % np.max(accs_test))

## Regularized logistic regression

In [None]:
def cross_validation_reg_logistic_regression(y, x, k_indices, k, max_iters, lambda_, gamma):
    """Run one cross-validation fold of regularized logistic regression.

    Group ``k`` of ``k_indices`` is used as the test split and all other
    groups as the training split. Both splits go through ``process_data``
    before ``reg_logistic_regression`` is fitted on the training part.

    Args:
        y: labels, indexed with the row indices stored in ``k_indices``.
        x: feature matrix whose rows are aligned with ``y``.
        k_indices: groups of row indices, one group per fold.
        k: position of the group held out for testing.
        max_iters: forwarded to ``reg_logistic_regression``.
        lambda_: forwarded to ``reg_logistic_regression`` (presumably the
            regularisation strength).
        gamma: forwarded to ``reg_logistic_regression`` (presumably the
            step size).

    Returns:
        Tuple ``(train accuracy, test accuracy)`` from ``compute_accuracy``.
    """
    # Held-out fold versus all remaining folds flattened together
    test_rows = k_indices[k]
    train_rows = np.delete(k_indices, (k), axis=0).ravel()

    train_x, train_y = x[train_rows, :], y[train_rows]
    test_x, test_y = x[test_rows, :], y[test_rows]

    train_x, test_x = process_data(train_x, test_x)

    # Zero initialisation, one weight per processed feature column
    start_w = np.zeros(train_x.shape[1])
    weights, _ = reg_logistic_regression(train_y, train_x, lambda_, start_w, max_iters, gamma)

    # Accuracy of the fitted weights on each split
    train_accuracy = compute_accuracy(predict_labels(weights, train_x), train_y)
    test_accuracy = compute_accuracy(predict_labels(weights, test_x), test_y)
    return train_accuracy, test_accuracy

In [None]:
# Settings for the regularized logistic regression cross-validation
k_fold = 10
gamma = 0.6
lambda_ = 0.04
max_iters = 100

# Partition the row indices into k_fold groups
k_indices = build_k_indices(y, k_fold, seed)

# One (train accuracy, test accuracy) pair per fold
fold_scores = [
    cross_validation_reg_logistic_regression(y, tX, k_indices, fold, max_iters, lambda_, gamma)
    for fold in range(k_fold)
]
accs_train = [train_acc for train_acc, _ in fold_scores]
accs_test = [test_acc for _, test_acc in fold_scores]

for fold, (train_acc, test_acc) in enumerate(zip(accs_train, accs_test)):
    print("%d - Training accuracy: %f / Test accuracy : %f" % (fold, train_acc, test_acc))

# Summary of the test accuracies across folds
print("\nAverage test accuracy: %f" % np.mean(accs_test))
print("Variance test accuracy: %f" % np.var(accs_test))
print("Min test accuracy: %f" % np.min(accs_test))
print("Max test accuracy: %f" % np.max(accs_test))

## Prepare submission

In [None]:
# Load the training set again under dedicated names for the final fit
y_train, tX_train, ids_train = load_csv_data(DATA_TRAIN_PATH)
# The first value returned for the test file is discarded; only features and ids are kept
_, tX_test, ids_test = load_csv_data(DATA_TEST_PATH)

In [None]:
# Settings for the final model (same values as the ridge cross-validation run)
degree = 7
lambda_ = 0.01

# Bind the processed matrices to new names so that re-running this cell starts
# again from the loaded tX_train / tX_test instead of processing them twice.
# NOTE(review): assumes process_data does not modify its inputs in place -- confirm in helpers.
tX_train_proc, tX_test_proc = process_data(tX_train, tX_test, False)

# Polynomial expansion of both sets with the same degree
phi_train = build_poly(tX_train_proc, degree)
phi_test = build_poly(tX_test_proc, degree)

# Constant column added after the expansion, as in cross_validation_ridge_regression
phi_train = add_constant_column(phi_train)
phi_test = add_constant_column(phi_test)

# compute weights using ridge regression on the full training set
weights, loss = ridge_regression(y_train, phi_train, lambda_)

## Generate predictions and save output in CSV format for submission:

In [None]:
# Destination file for the submission
OUTPUT_PATH = 'data/output_ridge_regression.csv'

# Predict on the expanded test features with the ridge weights fitted above,
# then write the (id, prediction) pairs to OUTPUT_PATH
y_pred = predict_labels(weights, phi_test)
create_csv_submission(ids_test, y_pred, OUTPUT_PATH)