In [None]:
import numpy as np
from helpers_given import load_csv_data
from implementations import *
from helpers import *
from preprocess import *
from cross_validation import *

# Single source of truth for randomness: the same value seeds NumPy's global
# RNG here and is passed to build_k_indices in the cross-validation cells.
seed = 1
np.random.seed(seed)

In [None]:
# Load the arrays returned by load_csv_data from the "dataset/" directory,
# with sub-sampling disabled.
(x_train, x_test, y_train, train_ids, test_ids) = load_csv_data(
    "dataset/",
    sub_sample=False,
)

In [None]:
# Re-encode the training labels with convert_minus1_to_0 (going by its name,
# -1 becomes 0, giving labels in {0, 1} -- confirm against the helper) so that
# one shared prediction helper can be used for every method below.
y_train_01 = convert_minus1_to_0(y_train)

# Without dropping any columns

We simply replace every NaN with the mean of its column, computed over the non-missing values of that column.

In [None]:
# Mean imputation: the per-column mean is taken over the non-NaN entries only
# (the NaNs are masked out), then written into the NaN positions.
nan_mask = np.isnan(x_train)
column_means = np.ma.array(x_train, mask=nan_mask).mean(axis=0)
tx_clean = np.where(nan_mask, column_means, x_train)

In [None]:
def prediction(tx, w):
    """Predict labels for the rows of `tx` with the linear model `w`.

    The raw scores `tx @ w` are handed to `convert_predict`, which turns them
    into the returned predictions (its exact rule lives in the helper module).
    """
    scores = tx @ w
    return convert_predict(scores)

### For mean_squared_error_gd

In [None]:
# Fit mean_squared_error_gd once on the whole training set, starting from a
# zero weight vector, and measure the accuracy on that same training data.
w, loss = mean_squared_error_gd(
    y_train_01,
    tx_clean,
    np.zeros(tx_clean.shape[1]),
    max_iters=100,
    gamma=0.01,
)
a = compute_accuracy(y_train_01, prediction(tx_clean, w))
# Show the result: `a` is reused by the cross-validation cells below, so
# without this line the training accuracy would never be visible.
print("training accuracy: ", a)

In [None]:
# k-fold cross-validation of mean_squared_error_gd over the candidate step sizes.
k_fold, max_iter = 4, 100
gammas = [1e-5]
k_indices = build_k_indices(y_train_01, k_fold, seed)

for gamma_ in gammas:
    accuracy = []  # held-out accuracy of every fold for this gamma
    for k in range(k_fold):
        # Row k of k_indices is the held-out fold; all other rows, flattened
        # into a single index vector, make up the training split.
        te_indice = k_indices[k]
        tr_indice = k_indices[np.arange(k_indices.shape[0]) != k].ravel()
        tx_te, y_te = tx_clean[te_indice], y_train_01[te_indice]
        tx_tr, y_tr = tx_clean[tr_indice], y_train_01[tr_indice]
        # Train from a zero weight vector, then score on the held-out fold.
        w, loss_tr = mean_squared_error_gd(
            y_tr, tx_tr, np.zeros(tx_tr.shape[1]), max_iter, gamma_
        )
        a = compute_accuracy(y_te, prediction(tx_te, w))
        accuracy.append(a)
    # Mean and standard deviation of the accuracy across folds.
    print("for gamma= ", gamma_)
    print(np.mean(accuracy), np.std(accuracy))

### For mean_squared_error_sgd

In [None]:
# k-fold cross-validation of mean_squared_error_sgd over the candidate step sizes.
k_fold, max_iter = 4, 100
gammas = [1e-7]
k_indices = build_k_indices(y_train_01, k_fold, seed)

for gamma_ in gammas:
    accuracy = []  # held-out accuracy of every fold for this gamma
    for k in range(k_fold):
        # Row k of k_indices is the held-out fold; all other rows, flattened
        # into a single index vector, make up the training split.
        te_indice = k_indices[k]
        tr_indice = k_indices[np.arange(k_indices.shape[0]) != k].ravel()
        tx_te, y_te = tx_clean[te_indice], y_train_01[te_indice]
        tx_tr, y_tr = tx_clean[tr_indice], y_train_01[tr_indice]
        # Train from a zero weight vector, then score on the held-out fold.
        w, loss_tr = mean_squared_error_sgd(
            y_tr, tx_tr, np.zeros(tx_tr.shape[1]), max_iter, gamma_
        )
        a = compute_accuracy(y_te, prediction(tx_te, w))
        accuracy.append(a)
    # Mean and standard deviation of the accuracy across folds.
    print("for gamma= ", gamma_)
    print(np.mean(accuracy), np.std(accuracy))

### For ridge_regression

In [None]:
# k-fold cross-validation of ridge_regression over the candidate penalties.
k_fold = 4
lambdas = [0.05]
k_indices = build_k_indices(y_train_01, k_fold, seed)

for lambda_ in lambdas:
    accuracy = []  # held-out accuracy of every fold for this lambda
    for k in range(k_fold):
        # Row k of k_indices is the held-out fold; all other rows, flattened
        # into a single index vector, make up the training split.
        te_indice = k_indices[k]
        tr_indice = k_indices[np.arange(k_indices.shape[0]) != k].ravel()
        tx_te, y_te = tx_clean[te_indice], y_train_01[te_indice]
        tx_tr, y_tr = tx_clean[tr_indice], y_train_01[tr_indice]
        # Fit on the training split, then score on the held-out fold.
        w, loss_tr = ridge_regression(y_tr, tx_tr, lambda_)
        a = compute_accuracy(y_te, prediction(tx_te, w))
        accuracy.append(a)
    # Mean and standard deviation of the accuracy across folds.
    print("for lambda= ", lambda_)
    print(np.mean(accuracy), np.std(accuracy))

### For least_squares

In [None]:
# k-fold cross-validation of least_squares (no hyper-parameter to sweep).
k_fold = 4
k_indices = build_k_indices(y_train_01, k_fold, seed)

accuracy = []  # held-out accuracy of every fold
for k in range(k_fold):
    # Row k of k_indices is the held-out fold; all other rows, flattened
    # into a single index vector, make up the training split.
    te_indice = k_indices[k]
    tr_indice = k_indices[np.arange(k_indices.shape[0]) != k].ravel()
    tx_te, y_te = tx_clean[te_indice], y_train_01[te_indice]
    tx_tr, y_tr = tx_clean[tr_indice], y_train_01[tr_indice]
    # Fit on the training split, then score on the held-out fold.
    w, loss_tr = least_squares(y_tr, tx_tr)
    a = compute_accuracy(y_te, prediction(tx_te, w))
    accuracy.append(a)
# Mean and standard deviation of the accuracy across folds.
print(np.mean(accuracy), np.std(accuracy))

### For logistic_regression

In [None]:
# k-fold cross-validation of logistic_regression over the candidate step sizes.
# NOTE(review): prediction() feeds the raw score tx @ w to convert_predict
# (no sigmoid is applied here) -- confirm its decision rule suits logistic models.
k_fold, max_iter = 4, 100
gammas = [0.5]
k_indices = build_k_indices(y_train_01, k_fold, seed)

for gamma_ in gammas:
    accuracy = []  # held-out accuracy of every fold for this gamma
    for k in range(k_fold):
        # Row k of k_indices is the held-out fold; all other rows, flattened
        # into a single index vector, make up the training split.
        te_indice = k_indices[k]
        tr_indice = k_indices[np.arange(k_indices.shape[0]) != k].ravel()
        tx_te, y_te = tx_clean[te_indice], y_train_01[te_indice]
        tx_tr, y_tr = tx_clean[tr_indice], y_train_01[tr_indice]
        # Train from a zero weight vector, then score on the held-out fold.
        w, loss_tr = logistic_regression(
            y_tr, tx_tr, np.zeros(tx_tr.shape[1]), max_iter, gamma_
        )
        a = compute_accuracy(y_te, prediction(tx_te, w))
        accuracy.append(a)
    # Mean and standard deviation of the accuracy across folds.
    print("for gamma= ", gamma_)
    print(np.mean(accuracy), np.std(accuracy))

### For reg_logistic_regression

In [None]:
# k-fold cross-validation of reg_logistic_regression over the candidate
# penalties, with the step size gamma held fixed.
# NOTE(review): prediction() feeds the raw score tx @ w to convert_predict
# (no sigmoid is applied here) -- confirm its decision rule suits logistic models.
k_fold, max_iter = 4, 100
gamma = 0.5
lambdas = [0.1]
k_indices = build_k_indices(y_train_01, k_fold, seed)

for lambda_ in lambdas:
    accuracy = []  # held-out accuracy of every fold for this lambda
    for k in range(k_fold):
        # Row k of k_indices is the held-out fold; all other rows, flattened
        # into a single index vector, make up the training split.
        te_indice = k_indices[k]
        tr_indice = k_indices[np.arange(k_indices.shape[0]) != k].ravel()
        tx_te, y_te = tx_clean[te_indice], y_train_01[te_indice]
        tx_tr, y_tr = tx_clean[tr_indice], y_train_01[tr_indice]
        # Train from a zero weight vector, then score on the held-out fold.
        w, loss_tr = reg_logistic_regression(
            y_tr, tx_tr, lambda_, np.zeros(tx_tr.shape[1]), max_iter, gamma
        )
        a = compute_accuracy(y_te, prediction(tx_te, w))
        accuracy.append(a)
    # Mean and standard deviation of the accuracy across folds.
    print("for lambda= ", lambda_)
    print(np.mean(accuracy), np.std(accuracy))