In [1]:
# Useful starting lines
import numpy as np
from proj1_helpers import load_csv_data, predict_labels, compute_accuracy, create_csv_submission
from cross_validation import build_k_indices
from helpers import process_data, add_constant_column, build_poly
from implementations import least_squares_gd, least_squares_sgd, least_squares, ridge_regression, logistic_regression, reg_logistic_regression

%load_ext autoreload
%autoreload 2

# Fixed seed handed to build_k_indices in every cross-validation cell below,
# so each model is evaluated with the same seed for the k-fold split
seed = 10

## Load the training data into feature matrix, class labels, and event ids:

In [2]:
DATA_TRAIN_PATH = 'data/train.csv' # TODO: download the train data and supply its path here
DATA_TEST_PATH = 'data/test.csv' # TODO: download the test data and supply its path here

We load the training data into `y` (class labels), `tX` (feature matrix) and `ids` (event ids).

In [3]:
# load_csv_data returns three values for the training file; the cross-validation
# cells below use y and tX (ids is not used by them)
y, tX, ids = load_csv_data(DATA_TRAIN_PATH)

## Gradient descent

In [4]:
def cross_validation_gradient_descent(y, x, k_indices, k, gamma, max_iters):
    """Train and score least-squares gradient descent on one cross-validation fold.

    The rows listed in ``k_indices[k]`` are held out for testing; the rows
    listed in all other rows of ``k_indices`` are used for training.

    Args:
        y: label array, indexed by the entries of ``k_indices``.
        x: 2-D feature matrix whose rows are indexed by the entries of ``k_indices``.
        k_indices: 2-D array of row indices, one row per fold.
        k: index of the fold to hold out.
        gamma: step size forwarded to ``least_squares_gd``.
        max_iters: number of iterations forwarded to ``least_squares_gd``.

    Returns:
        Tuple ``(acc_train, acc_test)``: the values of ``compute_accuracy`` on
        the training rows and on the held-out rows (not a loss).
    """
    # fold k is held out; every other fold is flattened into the training index set
    msk_test = k_indices[k]
    msk_train = np.delete(k_indices, k, axis=0).ravel()

    x_train, y_train = x[msk_train, :], y[msk_train]
    x_test, y_test = x[msk_test, :], y[msk_test]

    # both splits are handed to the project helper process_data together
    x_train, x_test = process_data(x_train, x_test)

    # start from an all-zero weight vector, one entry per processed feature column
    initial_w = np.zeros(x_train.shape[1])
    # the loss returned next to the weights is not needed for accuracy scoring
    weights, _ = least_squares_gd(y_train, x_train, initial_w, max_iters, gamma)

    # score the fitted weights on the training rows, then on the held-out rows
    acc_train = compute_accuracy(predict_labels(weights, x_train), y_train)
    acc_test = compute_accuracy(predict_labels(weights, x_test), y_test)

    return acc_train, acc_test

In [5]:
# Cross-validation settings for least-squares gradient descent
k_fold = 10
gamma = 0.01
max_iters = 500

# Fold assignment: one row of sample indices per fold
k_indices = build_k_indices(y, k_fold, seed)

# Each entry is the (train accuracy, test accuracy) pair of one fold
fold_scores = [
    cross_validation_gradient_descent(y, tX, k_indices, fold, gamma, max_iters)
    for fold in range(k_fold)
]
accs_train = [scores[0] for scores in fold_scores]
accs_test = [scores[1] for scores in fold_scores]

# Per-fold report
for i, (train_acc, test_acc) in enumerate(zip(accs_train, accs_test)):
    print("%d - Training accuracy: %f / Test accuracy : %f" % (i, train_acc, test_acc))

# Summary statistics of the held-out accuracy across folds
print("\nAverage test accuracy: %f" % np.mean(accs_test))
print("Variance test accuracy: %f" % np.var(accs_test))
print("Min test accuracy: %f" % np.min(accs_test))
print("Max test accuracy: %f" % np.max(accs_test))

0 - Training accuracy: 0.762916 / Test accuracy : 0.764600
1 - Training accuracy: 0.764182 / Test accuracy : 0.758680
2 - Training accuracy: 0.763200 / Test accuracy : 0.767200
3 - Training accuracy: 0.763733 / Test accuracy : 0.761080
4 - Training accuracy: 0.763133 / Test accuracy : 0.764400
5 - Training accuracy: 0.763449 / Test accuracy : 0.761720
6 - Training accuracy: 0.763062 / Test accuracy : 0.760840
7 - Training accuracy: 0.763262 / Test accuracy : 0.765520
8 - Training accuracy: 0.762427 / Test accuracy : 0.765280
9 - Training accuracy: 0.763173 / Test accuracy : 0.762360

Average test accuracy: 0.763168
Variance test accuracy: 0.000006
Min test accuracy: 0.758680
Max test accuracy: 0.767200


## Stochastic gradient descent

In [6]:
def cross_validation_stochastic_gradient_descent(y, x, k_indices, k, gamma, max_iters):
    """Train and score least-squares stochastic gradient descent on one fold.

    The rows listed in ``k_indices[k]`` are held out for testing; the rows
    listed in all other rows of ``k_indices`` are used for training.

    Args:
        y: label array, indexed by the entries of ``k_indices``.
        x: 2-D feature matrix whose rows are indexed by the entries of ``k_indices``.
        k_indices: 2-D array of row indices, one row per fold.
        k: index of the fold to hold out.
        gamma: step size forwarded to ``least_squares_sgd``.
        max_iters: number of iterations forwarded to ``least_squares_sgd``.

    Returns:
        Tuple ``(acc_train, acc_test)``: the values of ``compute_accuracy`` on
        the training rows and on the held-out rows (not a loss).
    """
    # fold k is held out; every other fold is flattened into the training index set
    msk_test = k_indices[k]
    msk_train = np.delete(k_indices, k, axis=0).ravel()

    x_train, y_train = x[msk_train, :], y[msk_train]
    x_test, y_test = x[msk_test, :], y[msk_test]

    # both splits are handed to the project helper process_data together
    x_train, x_test = process_data(x_train, x_test)

    # start from an all-zero weight vector, one entry per processed feature column
    initial_w = np.zeros(x_train.shape[1])
    # the loss returned next to the weights is not needed for accuracy scoring
    weights, _ = least_squares_sgd(y_train, x_train, initial_w, max_iters, gamma)

    # score the fitted weights on the training rows, then on the held-out rows
    acc_train = compute_accuracy(predict_labels(weights, x_train), y_train)
    acc_test = compute_accuracy(predict_labels(weights, x_test), y_test)

    return acc_train, acc_test

In [7]:
# Cross-validation settings for least-squares stochastic gradient descent
k_fold = 10
gamma = 0.01
max_iters = 100

# Fold assignment: one row of sample indices per fold
k_indices = build_k_indices(y, k_fold, seed)

# Each entry is the (train accuracy, test accuracy) pair of one fold
fold_scores = [
    cross_validation_stochastic_gradient_descent(y, tX, k_indices, fold, gamma, max_iters)
    for fold in range(k_fold)
]
accs_train = [scores[0] for scores in fold_scores]
accs_test = [scores[1] for scores in fold_scores]

# Per-fold report
for i, (train_acc, test_acc) in enumerate(zip(accs_train, accs_test)):
    print("%d - Training accuracy: %f / Test accuracy : %f" % (i, train_acc, test_acc))

# Summary statistics of the held-out accuracy across folds
print("\nAverage test accuracy: %f" % np.mean(accs_test))
print("Variance test accuracy: %f" % np.var(accs_test))
print("Min test accuracy: %f" % np.min(accs_test))
print("Max test accuracy: %f" % np.max(accs_test))

0 - Training accuracy: 0.688742 / Test accuracy : 0.689160
1 - Training accuracy: 0.676387 / Test accuracy : 0.677600
2 - Training accuracy: 0.633613 / Test accuracy : 0.634400
3 - Training accuracy: 0.674324 / Test accuracy : 0.671400
4 - Training accuracy: 0.688796 / Test accuracy : 0.693120
5 - Training accuracy: 0.687262 / Test accuracy : 0.684240
6 - Training accuracy: 0.696013 / Test accuracy : 0.694320
7 - Training accuracy: 0.728467 / Test accuracy : 0.732600
8 - Training accuracy: 0.702493 / Test accuracy : 0.704560
9 - Training accuracy: 0.679089 / Test accuracy : 0.681080

Average test accuracy: 0.686248
Variance test accuracy: 0.000562
Min test accuracy: 0.634400
Max test accuracy: 0.732600


## Least squares

In [8]:
def cross_validation_least_squares(y, x, k_indices, k):
    """Train and score the closed-form least-squares model on one fold.

    The rows listed in ``k_indices[k]`` are held out for testing; the rows
    listed in all other rows of ``k_indices`` are used for training.

    Args:
        y: label array, indexed by the entries of ``k_indices``.
        x: 2-D feature matrix whose rows are indexed by the entries of ``k_indices``.
        k_indices: 2-D array of row indices, one row per fold.
        k: index of the fold to hold out.

    Returns:
        Tuple ``(acc_train, acc_test)``: the values of ``compute_accuracy`` on
        the training rows and on the held-out rows (not a loss).
    """
    # fold k is held out; every other fold is flattened into the training index set
    msk_test = k_indices[k]
    msk_train = np.delete(k_indices, k, axis=0).ravel()

    x_train, y_train = x[msk_train, :], y[msk_train]
    x_test, y_test = x[msk_test, :], y[msk_test]

    # both splits are handed to the project helper process_data together
    x_train, x_test = process_data(x_train, x_test)

    # the loss returned next to the weights is not needed for accuracy scoring
    weights, _ = least_squares(y_train, x_train)

    # score the fitted weights on the training rows, then on the held-out rows
    acc_train = compute_accuracy(predict_labels(weights, x_train), y_train)
    acc_test = compute_accuracy(predict_labels(weights, x_test), y_test)

    return acc_train, acc_test

In [9]:
# Cross-validation settings for closed-form least squares
k_fold = 10

# Fold assignment: one row of sample indices per fold
k_indices = build_k_indices(y, k_fold, seed)

# Each entry is the (train accuracy, test accuracy) pair of one fold
fold_scores = [
    cross_validation_least_squares(y, tX, k_indices, fold)
    for fold in range(k_fold)
]
accs_train = [scores[0] for scores in fold_scores]
accs_test = [scores[1] for scores in fold_scores]

# Per-fold report
for i, (train_acc, test_acc) in enumerate(zip(accs_train, accs_test)):
    print("%d - Training accuracy: %f / Test accuracy : %f" % (i, train_acc, test_acc))

# Summary statistics of the held-out accuracy across folds
print("\nAverage test accuracy: %f" % np.mean(accs_test))
print("Variance test accuracy: %f" % np.var(accs_test))
print("Min test accuracy: %f" % np.min(accs_test))
print("Max test accuracy: %f" % np.max(accs_test))

0 - Training accuracy: 0.776400 / Test accuracy : 0.775120
1 - Training accuracy: 0.777689 / Test accuracy : 0.772280
2 - Training accuracy: 0.777324 / Test accuracy : 0.779400
3 - Training accuracy: 0.777453 / Test accuracy : 0.774560
4 - Training accuracy: 0.776422 / Test accuracy : 0.778600
5 - Training accuracy: 0.776787 / Test accuracy : 0.776040
6 - Training accuracy: 0.776698 / Test accuracy : 0.773560
7 - Training accuracy: 0.776773 / Test accuracy : 0.780200
8 - Training accuracy: 0.775524 / Test accuracy : 0.777160
9 - Training accuracy: 0.776147 / Test accuracy : 0.779080

Average test accuracy: 0.776600
Variance test accuracy: 0.000007
Min test accuracy: 0.772280
Max test accuracy: 0.780200


## Ridge regression

In [10]:
def cross_validation_ridge_regression(y, x, k_indices, k, lambda_, degree):
    """Train and score polynomial ridge regression on one cross-validation fold.

    The rows listed in ``k_indices[k]`` are held out for testing; the rows
    listed in all other rows of ``k_indices`` are used for training.

    Args:
        y: label array, indexed by the entries of ``k_indices``.
        x: 2-D feature matrix whose rows are indexed by the entries of ``k_indices``.
        k_indices: 2-D array of row indices, one row per fold.
        k: index of the fold to hold out.
        lambda_: regularization strength forwarded to ``ridge_regression``.
        degree: degree forwarded to ``build_poly``.

    Returns:
        Tuple ``(accuracy_train, accuracy_test)``: the values of
        ``compute_accuracy`` on the training rows and on the held-out rows
        (not a loss).
    """
    # fold k is held out; every other fold is flattened into the training index set
    msk_test = k_indices[k]
    msk_train = np.delete(k_indices, k, axis=0).ravel()

    x_train, y_train = x[msk_train, :], y[msk_train]
    x_test, y_test = x[msk_test, :], y[msk_test]

    # third argument False: presumably disables process_data's own constant
    # column, since one is appended after the expansion below (confirm in helpers.py)
    x_train, x_test = process_data(x_train, x_test, False)

    # polynomial expansion first, constant column second, identically for both splits
    phi_train = add_constant_column(build_poly(x_train, degree))
    phi_test = add_constant_column(build_poly(x_test, degree))

    # the loss returned next to the weights is not needed for accuracy scoring
    weights, _ = ridge_regression(y_train, phi_train, lambda_)

    # score the fitted weights on the training rows, then on the held-out rows
    accuracy_train = compute_accuracy(predict_labels(weights, phi_train), y_train)
    accuracy_test = compute_accuracy(predict_labels(weights, phi_test), y_test)

    return accuracy_train, accuracy_test

In [11]:
# Cross-validation settings for polynomial ridge regression
k_fold = 10
lambda_ = 0.001
degree = 7

# Split data in k-fold
k_indices = build_k_indices(y, k_fold, seed)

# Per-fold accuracies. The names match the other cross-validation cells:
# accs_* are the lists, acc_* the per-fold scalars (the function returns
# accuracies, not losses).
accs_train = []
accs_test = []

for k in range(k_fold):
    acc_train, acc_test = cross_validation_ridge_regression(y, tX, k_indices, k, lambda_, degree)
    accs_train.append(acc_train)
    accs_test.append(acc_test)

for i in range(len(accs_train)):
    print("%d - Training accuracy: %f / Test accuracy : %f" % (i, accs_train[i], accs_test[i]))

# Summary statistics of the held-out accuracy across folds
print("\nAverage test accuracy: %f" % np.mean(accs_test))
print("Variance test accuracy: %f" % np.var(accs_test))
print("Min test accuracy: %f" % np.min(accs_test))
print("Max test accuracy: %f" % np.max(accs_test))

0 - Training accuracy: 0.819258 / Test accuracy : 0.819360
1 - Training accuracy: 0.820716 / Test accuracy : 0.815560
2 - Training accuracy: 0.818476 / Test accuracy : 0.818920
3 - Training accuracy: 0.819236 / Test accuracy : 0.820240
4 - Training accuracy: 0.819907 / Test accuracy : 0.821760
5 - Training accuracy: 0.820338 / Test accuracy : 0.817000
6 - Training accuracy: 0.819004 / Test accuracy : 0.818640
7 - Training accuracy: 0.820236 / Test accuracy : 0.822560
8 - Training accuracy: 0.818480 / Test accuracy : 0.820000
9 - Training accuracy: 0.819378 / Test accuracy : 0.819800

Average test accuracy: 0.819384
Variance test accuracy: 0.000004
Min test accuracy: 0.815560
Max test accuracy: 0.822560


## Logistic regression

In [12]:
def cross_validation_logistic_regression(y, x, k_indices, k, max_iters, gamma):
    """Train and score logistic regression on one cross-validation fold.

    The rows listed in ``k_indices[k]`` are held out for testing; the rows
    listed in all other rows of ``k_indices`` are used for training.

    Args:
        y: label array, indexed by the entries of ``k_indices``.
        x: 2-D feature matrix whose rows are indexed by the entries of ``k_indices``.
        k_indices: 2-D array of row indices, one row per fold.
        k: index of the fold to hold out.
        max_iters: number of iterations forwarded to ``logistic_regression``.
        gamma: step size forwarded to ``logistic_regression``.

    Returns:
        Tuple ``(acc_train, acc_test)``: the values of ``compute_accuracy`` on
        the training rows and on the held-out rows (not a loss).
    """
    # fold k is held out; every other fold is flattened into the training index set
    msk_test = k_indices[k]
    msk_train = np.delete(k_indices, k, axis=0).ravel()

    x_train, y_train = x[msk_train, :], y[msk_train]
    x_test, y_test = x[msk_test, :], y[msk_test]

    # both splits are handed to the project helper process_data together
    x_train, x_test = process_data(x_train, x_test)

    # start from an all-zero weight vector, one entry per processed feature column
    initial_w = np.zeros(x_train.shape[1])
    # NOTE(review): y is passed through unchanged; confirm that logistic_regression
    # and predict_labels agree on the label encoding used by the data loader.
    # the loss returned next to the weights is not needed for accuracy scoring
    weights, _ = logistic_regression(y_train, x_train, initial_w, max_iters, gamma)

    # score the fitted weights on the training rows, then on the held-out rows
    acc_train = compute_accuracy(predict_labels(weights, x_train), y_train)
    acc_test = compute_accuracy(predict_labels(weights, x_test), y_test)

    return acc_train, acc_test

In [13]:
# Cross-validation settings for logistic regression
# NOTE(review): the saved output of this cell contains warning lines pointing at
# the sigmoid / log computations - presumably numerical overflow at this step
# size; confirm against implementations.py.
k_fold = 10
gamma = 0.6
max_iters = 100

# Fold assignment: one row of sample indices per fold
k_indices = build_k_indices(y, k_fold, seed)

# Each entry is the (train accuracy, test accuracy) pair of one fold
fold_scores = [
    cross_validation_logistic_regression(y, tX, k_indices, fold, max_iters, gamma)
    for fold in range(k_fold)
]
accs_train = [scores[0] for scores in fold_scores]
accs_test = [scores[1] for scores in fold_scores]

# Per-fold report
for i, (train_acc, test_acc) in enumerate(zip(accs_train, accs_test)):
    print("%d - Training accuracy: %f / Test accuracy : %f" % (i, train_acc, test_acc))

# Summary statistics of the held-out accuracy across folds
print("\nAverage test accuracy: %f" % np.mean(accs_test))
print("Variance test accuracy: %f" % np.var(accs_test))
print("Min test accuracy: %f" % np.min(accs_test))
print("Max test accuracy: %f" % np.max(accs_test))

  return 1 / (1 + np.exp(-t))
  y * np.log(sigmoid(tx.dot(w))) + (1 - y) * np.log(1 - sigmoid(tx.dot(w)))
  y * np.log(sigmoid(tx.dot(w))) + (1 - y) * np.log(1 - sigmoid(tx.dot(w)))


0 - Training accuracy: 0.721089 / Test accuracy : 0.725800
1 - Training accuracy: 0.721960 / Test accuracy : 0.720280
2 - Training accuracy: 0.721049 / Test accuracy : 0.726040
3 - Training accuracy: 0.721822 / Test accuracy : 0.717360
4 - Training accuracy: 0.721551 / Test accuracy : 0.721560
5 - Training accuracy: 0.721742 / Test accuracy : 0.719680
6 - Training accuracy: 0.721671 / Test accuracy : 0.716240
7 - Training accuracy: 0.721289 / Test accuracy : 0.724640
8 - Training accuracy: 0.720542 / Test accuracy : 0.725680
9 - Training accuracy: 0.722467 / Test accuracy : 0.717320

Average test accuracy: 0.721460
Variance test accuracy: 0.000013
Min test accuracy: 0.716240
Max test accuracy: 0.726040


## Regularized logistic regression

In [14]:
def cross_validation_reg_logistic_regression(y, x, k_indices, k, max_iters, lambda_, gamma):
    """Train and score regularized logistic regression on one cross-validation fold.

    The rows listed in ``k_indices[k]`` are held out for testing; the rows
    listed in all other rows of ``k_indices`` are used for training.

    Args:
        y: label array, indexed by the entries of ``k_indices``.
        x: 2-D feature matrix whose rows are indexed by the entries of ``k_indices``.
        k_indices: 2-D array of row indices, one row per fold.
        k: index of the fold to hold out.
        max_iters: number of iterations forwarded to ``reg_logistic_regression``.
        lambda_: regularization strength forwarded to ``reg_logistic_regression``.
        gamma: step size forwarded to ``reg_logistic_regression``.

    Returns:
        Tuple ``(acc_train, acc_test)``: the values of ``compute_accuracy`` on
        the training rows and on the held-out rows (not a loss).
    """
    # fold k is held out; every other fold is flattened into the training index set
    msk_test = k_indices[k]
    msk_train = np.delete(k_indices, k, axis=0).ravel()

    x_train, y_train = x[msk_train, :], y[msk_train]
    x_test, y_test = x[msk_test, :], y[msk_test]

    # both splits are handed to the project helper process_data together
    x_train, x_test = process_data(x_train, x_test)

    # start from an all-zero weight vector, one entry per processed feature column
    initial_w = np.zeros(x_train.shape[1])
    # note the helper's argument order: lambda_ comes before initial_w;
    # the loss returned next to the weights is not needed for accuracy scoring
    weights, _ = reg_logistic_regression(y_train, x_train, lambda_, initial_w, max_iters, gamma)

    # score the fitted weights on the training rows, then on the held-out rows
    acc_train = compute_accuracy(predict_labels(weights, x_train), y_train)
    acc_test = compute_accuracy(predict_labels(weights, x_test), y_test)

    return acc_train, acc_test

In [15]:
# Cross-validation settings for regularized logistic regression
k_fold = 10
gamma = 0.6
lambda_ = 0.04
max_iters = 100

# Fold assignment: one row of sample indices per fold
k_indices = build_k_indices(y, k_fold, seed)

# Each entry is the (train accuracy, test accuracy) pair of one fold
fold_scores = [
    cross_validation_reg_logistic_regression(y, tX, k_indices, fold, max_iters, lambda_, gamma)
    for fold in range(k_fold)
]
accs_train = [scores[0] for scores in fold_scores]
accs_test = [scores[1] for scores in fold_scores]

# Per-fold report
for i, (train_acc, test_acc) in enumerate(zip(accs_train, accs_test)):
    print("%d - Training accuracy: %f / Test accuracy : %f" % (i, train_acc, test_acc))

# Summary statistics of the held-out accuracy across folds
print("\nAverage test accuracy: %f" % np.mean(accs_test))
print("Variance test accuracy: %f" % np.var(accs_test))
print("Min test accuracy: %f" % np.min(accs_test))
print("Max test accuracy: %f" % np.max(accs_test))

  return 1 / (1 + np.exp(-t))
  y * np.log(sigmoid(tx.dot(w))) + (1 - y) * np.log(1 - sigmoid(tx.dot(w)))
  y * np.log(sigmoid(tx.dot(w))) + (1 - y) * np.log(1 - sigmoid(tx.dot(w)))


0 - Training accuracy: 0.721089 / Test accuracy : 0.725800
1 - Training accuracy: 0.721956 / Test accuracy : 0.720280
2 - Training accuracy: 0.721058 / Test accuracy : 0.726080
3 - Training accuracy: 0.721818 / Test accuracy : 0.717320
4 - Training accuracy: 0.721551 / Test accuracy : 0.721600
5 - Training accuracy: 0.721738 / Test accuracy : 0.719680
6 - Training accuracy: 0.721671 / Test accuracy : 0.716240
7 - Training accuracy: 0.721293 / Test accuracy : 0.724640
8 - Training accuracy: 0.720529 / Test accuracy : 0.725600
9 - Training accuracy: 0.722480 / Test accuracy : 0.717320

Average test accuracy: 0.721456
Variance test accuracy: 0.000013
Min test accuracy: 0.716240
Max test accuracy: 0.726080


## Prepare submission

In [16]:
# Load the training file under *_train names (the same call that produced y, tX, ids earlier)
# and the test file; the first value returned for the test file is discarded by binding it to `_`.
y_train, tX_train, ids_train = load_csv_data(DATA_TRAIN_PATH)
_, tX_test, ids_test = load_csv_data(DATA_TEST_PATH)

In [17]:
# Hyper-parameters of the final ridge model
degree = 7
# NOTE(review): the ridge cross-validation cell evaluated lambda_ = 0.001, not
# 0.01 - confirm which value is intended for the submission.
lambda_ = 0.01

# Keep the loaded matrices untouched and bind the processed ones to new names,
# so re-running this cell never feeds already-processed data back into
# process_data (assumes process_data returns new arrays - confirm in helpers.py).
tX_train_clean, tX_test_clean = process_data(tX_train, tX_test, False)

# polynomial expansion first, constant column second, identically for both sets
phi_train = add_constant_column(build_poly(tX_train_clean, degree))
phi_test = add_constant_column(build_poly(tX_test_clean, degree))

# compute weights using ridge regression on the full training set
weights, loss = ridge_regression(y_train, phi_train, lambda_)

## Generate predictions and save output in CSV format for submission:

In [18]:
# Destination file for the submission
OUTPUT_PATH = 'data/output_ridge_regression.csv'

# Predict labels for the expanded test features with the fitted ridge weights,
# then hand ids, predictions and the output path to create_csv_submission
y_pred = predict_labels(weights, phi_test)
create_csv_submission(ids_test, y_pred, OUTPUT_PATH)