In [4]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from proj1_helpers import *
from helpers import *
from implementations import *
%load_ext autoreload
%autoreload 2

#### Load the training data into feature matrix, class labels, and event ids:

In [5]:
# Unpack the labels and the feature matrix of the training set; the third
# returned value (the event ids, per the heading above) is not needed for
# training and is discarded.
y_train, x_train, _ = load_csv_data('data/train.csv')

#### Sanitize the missing values, standardize the features, and split the rows by the value of feature 22

In [28]:
# `sanitize` is a project helper (per the section heading it deals with the
# missing values and standardizes the features — confirm in its definition).
sanitized_x = sanitize(x_train)
# Labels as an (N, 1) column so they can be row-sliced like the feature matrix.
y = y_train.reshape((len(y_train), 1))

# Column 22 is pulled out of the feature matrix and used only to partition
# the rows into one subset per distinct value.
feature_22 = sanitized_x[:, 22]
keep_columns = np.arange(sanitized_x.shape[1]) != 22
x_minus_22 = sanitized_x[:, keep_columns]

# Distinct values of column 22, in the order used to build the subsets below.
categories_22 = np.unique(feature_22)
x_sep, y_sep = [], []

for category in categories_22:
    print(category)
    in_category = feature_22 == category
    x_sep.append(x_minus_22[in_category, :])
    y_sep.append(y[in_category, :])


0.0
1.0
2.0
3.0


Extend the features with sqrt, sin and cos

In [29]:
# Apply the project's `add_sqrt` feature expansion to every per-category
# subset, keeping the subsets in their original order.
extended_x_sep = list(map(add_sqrt, x_sep))
# Show the expanded features of the first subset as a sanity check.
extended_x_sep[0]

array([[ 0.38476846,  0.9103791 , -0.00585329, ..., -0.86338833,
         0.73478992, -0.67829475],
       [ 0.94253641, -0.91455619,  1.31336873, ..., -0.86338833,
         0.73478992, -0.67829475],
       [-0.28385846,  0.03732318,  0.48512597, ..., -0.86338833,
         0.73478992, -0.67829475],
       ...,
       [ 0.        ,  0.2529135 , -0.32082851, ..., -0.86338833,
         0.73478992, -0.67829475],
       [-0.46960659, -0.84532397, -0.30297338, ..., -0.86338833,
         0.73478992, -0.67829475],
       [ 0.        ,  0.66533608, -0.25352276, ..., -0.86338833,
         0.73478992, -0.67829475]])

In [30]:
# Fit one ridge-regression model per feature-22 subset and record, for each
# subset, its weights, its predicted labels and its accuracy.
# The accuracy is measured on the same rows the model was fitted on, so it is
# a training accuracy, not a held-out estimate.
accuracies = []
weights = []
ys = []

for x_chunk, y_chunk in zip(extended_x_sep, y_sep):
    # No random initial weights are drawn here: the ridge_regression call
    # takes none. The third argument is presumably the regularisation
    # strength — confirm against ridge_regression's signature.
    w, loss = ridge_regression(y_chunk, x_chunk, 0.0001)
    y_pred = predict_labels(w, x_chunk)
    accuracies.append(compute_accuracy_linear_reg(y_chunk, y_pred))
    weights.append(w)
    ys.append(y_pred)

accuracies

[0.8418924464283927,
 0.7951227690085629,
 0.8215724805970742,
 0.8170907778379354]

In [31]:
# Rebuilding final y for submission, in progress
N = x_train.shape[0]
predictions = np.zeros((N,1))

for i, value in enumerate(categories_22):
    ind = np.arange(N)[feature_22 == value]
    predictions[ind] = ys[i]

predictions

array([[ 1.],
       [-1.],
       [-1.],
       ...,
       [-1.],
       [ 1.],
       [-1.]])

In [32]:
# Overall accuracy of the reassembled per-subset predictions against the
# training labels `y` (the same data the models were fitted on).
compute_accuracy_linear_reg(y, predictions)

0.821092

In [None]:
# Old but kept because can be useful as basis
# NOTE(review): `sanitized_y` is not defined in any cell of this notebook; it
# has to be created from the training labels before this cell can run.
# NOTE(review): this cell rebinds x_train / y_train / x_test, so the cells
# above must be re-run from the data-loading cell after executing it.
N = sanitized_x.shape[0]
# Draw 100000 distinct row indices at random and keep a hand-picked subset of
# the feature columns for those rows.
inds = np.random.choice(range(N), 100000, replace=False)
sanitized_x_sub = sanitized_x[inds, :]
sanitized_x_sub = sanitized_x_sub[:, [0,1,2,3,7,8,9,10,11,13,14,17,19,21,22,29]]
sanitized_y_sub = sanitized_y[inds]
# Split the sub-sample built above (0.8 is presumably the training fraction —
# confirm against split_data).
x_train, x_test, y_train, y_test = split_data(sanitized_x_sub, sanitized_y_sub, 0.8)
# Number of random weight initialisations tried per (degree, gamma) pair.
number_of_w = 1

# Grid over polynomial degree and gamma; each run reports the accuracy on the
# training part and on the held-out part of the split.
for degree in range(1, 5):
    tx_train = build_poly(x_train, degree)
    tx_test = build_poly(x_test, degree)
    for gamma in np.array([0.01, 0.05, 0.1, 0.5]):
        print("Running logistic regression with polynomial feature expansion with degree", degree, 
                 "and gamma", gamma,", with", number_of_w, "different initial weights.")
        for i in  range(number_of_w):
            w_init = np.random.rand(tx_train.shape[1], 1)
            w, loss = logistic_regression(y_train, tx_train, w_init, 1000, gamma, verbose=True)
            print(i,":")
            print("Accuracy on training set", compute_accuracy(y_train, tx_train, w))
            print("Accuracy on test set:, ", compute_accuracy(y_test, tx_test, w))

## Generate predictions and save output in CSV format for submission:


In [34]:
# Load the test set. The first returned value (the labels slot) is discarded;
# only the feature matrix and the event ids are kept for the submission.
_, x_test, ids_test = load_csv_data('data/test.csv')

In [35]:
# Same preparation as for the training set: run the project's `sanitize`
# helper, pull out column 22 and partition the rows by its distinct values.
sanitized_x_t = sanitize(x_test)

feature_22_t = sanitized_x_t[:, 22]
keep_columns_t = np.arange(sanitized_x_t.shape[1]) != 22
x_minus_22_t = sanitized_x_t[:, keep_columns_t]

# Distinct values of column 22 in the test set, in the order used to build
# the subsets below.
categories_22_t = np.unique(feature_22_t)
x_sep_t = []

for category in categories_22_t:
    print(category)
    in_category = feature_22_t == category
    x_sep_t.append(x_minus_22_t[in_category, :])


0.0
1.0
2.0
3.0


In [39]:
# Apply the project's `add_sqrt` feature expansion to every test subset,
# keeping the subsets in their original order.
extended_x_sep_t = list(map(add_sqrt, x_sep_t))
# Show the expanded features of the first test subset as a sanity check.
extended_x_sep_t[0]

array([[ 0.        ,  0.85695592, -1.41340956, ..., -0.86258746,
         0.73572682, -0.67727841],
       [-0.07180953,  0.19686173,  0.37643086, ..., -0.86258746,
         0.73572682, -0.67727841],
       [ 0.24635352, -0.52705785,  0.39940857, ..., -0.86258746,
         0.73572682, -0.67727841],
       ...,
       [-0.05036034,  0.75185168,  0.06502106, ..., -0.86258746,
         0.73572682, -0.67727841],
       [-0.23553134, -1.1138051 , -0.39465676, ..., -0.86258746,
         0.73572682, -0.67727841],
       [-0.51947781,  0.87164791, -0.08655773, ..., -0.86258746,
         0.73572682, -0.67727841]])

In [40]:
# Predict labels for each test subset with the weights fitted on the training
# subset at the same position.
# NOTE(review): this pairs subsets by position, so it assumes the test set has
# the same distinct feature-22 values as the training set — confirm.
y_sep_t = [
    predict_labels(w, extended_x_sep_t[i])
    for i, w in enumerate(weights)
]

In [42]:
# Rebuilding final y for submission, in progress
N = x_test.shape[0]
predictions_t = np.zeros((N,1))

for i, value in enumerate(categories_22_t):
    ind = np.arange(N)[feature_22_t == value]
    predictions_t[ind] = y_sep_t[i]

predictions_t

array([[-1.],
       [-1.],
       [-1.],
       ...,
       [ 1.],
       [-1.],
       [-1.]])

In [45]:
# Destination of the submission file, relative to the notebook's working
# directory.
OUTPUT_PATH = 'predictions.csv'
# Hand the test event ids and the reassembled predictions to the project
# helper, which presumably writes them to OUTPUT_PATH as CSV — confirm in
# create_csv_submission.
create_csv_submission(ids_test, predictions_t, OUTPUT_PATH)