In [None]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2

## Load the training data into feature matrix, class labels, and event ids:

In [None]:
# Star import: load_csv_data and the other helper functions used in later cells
# are presumably provided by this module — verify against proj1_helpers.
from proj1_helpers import *
DATA_TRAIN_PATH = '../data/train.csv'
# Returns, in this order: class labels, feature matrix, event ids.
y, tX, ids = load_csv_data(DATA_TRAIN_PATH)

## Do your crazy machine learning thing here :) ...

In [None]:
from YT_implementations import *
import warnings

In [None]:
# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides numerical RuntimeWarnings (overflow, divide by zero)
# that could be informative while tuning — consider a narrower filter.
warnings.filterwarnings('ignore')

In [None]:
# Handle undefined measurements in the training features.

# Column indices that, according to the dataset documentation, may be undefined.
undefined_cols = [0, 4, 5, 6, 12, 23, 24, 25, 26, 27, 28]

# An undefined entry is stored as -999; overwrite it with 0 directly in tX.
for c in undefined_cols:
    undefined_lines = tX[:, c] == -999
    tX[:, c] = np.where(undefined_lines, 0, tX[:, c])

In [None]:
# Standardise the training features.
# The per-column statistics stay available as globals because the test-set cell
# further down applies the same `standardization(..., mean, std)` call.
mean, std = tX.mean(axis=0), tX.std(axis=0)

tX_train = standardization(tX, mean, std)

In [None]:
from sklearn.decomposition import PCA

# Fit a PCA that keeps every component on the standardised training features.
pca = PCA()
pca.fit(tX_train)
# Training data projected on the principal components.
# NOTE(review): despite its name this is not test data, and no visible cell reads it.
x_test = pca.transform(tX_train)

# Singular value of each component. The x positions are derived from the fit
# instead of a hard-coded range(30), so the cell still runs when the number of
# columns in tX_train differs from 30.
plt.scatter(np.arange(len(pca.singular_values_)), pca.singular_values_)
plt.title("PCA singular values")
plt.xlabel("principal component index")
plt.ylabel("singular value")

# For each component, the index of the input feature with the largest absolute loading.
# NOTE(review): one feature can be the argmax of several components, so this array
# (and therefore features_top) may contain duplicates — confirm this is intended.
features_ordered = np.abs(pca.components_).argmax(axis=1)
features_top = features_ordered[:20]

print(features_ordered)

In [None]:
# Keep only the columns listed in features_top.
# The matrix is rebuilt from tX instead of slicing tX_train in place, so running this
# cell a second time does not index into an already reduced matrix.
# NOTE(review): assumes standardization() returns a new array and leaves tX untouched — confirm.
tX_train = standardization(tX, mean, std)[:, features_top]

In [None]:
# Settings shared by every tuning / test cell below.
# max_iters is forwarded to hyperparameter_tuning and test.
max_iters=500
# seed and k_fold are handed to build_k_indices; k_fold is also passed to the tuning / test calls.
seed = 7
k_fold = 5
# Fold indices built once from the labels and reused by all models.
k_indices = build_k_indices(y, k_fold, seed)

### Least Squares GD

In [None]:
# Cross-validated search for the gamma of Least Squares GD.

# 15 log-spaced candidates between 1e-4 and 1.
gammas = np.logspace(-4, 0, 15)
(
    gamma_opt, acc_opt,
    ls_tr, ls_te,
    accs_tr, accs_te,
) = hyperparameter_tuning(
    y, tX_train, least_squares_GD,
    max_iters, k_fold, k_indices,
    gammas, None,
)


In [None]:
# Report the gamma selected for Least Squares GD and the accuracy returned with it.
print("optimal gamma : {:f} / accuracy : {:f}".format(gamma_opt, acc_opt))

In [None]:
# Accuracy per candidate gamma for Least Squares GD; the selected gamma is the red dot.
plt.title("Gamma tuning for Least Squares GD")
plt.xlabel("gamma")
plt.ylabel("accuracy")
# The candidates are log-spaced, so a log axis keeps the small gammas distinguishable.
plt.xscale("log")
# Same two curves (and colours) as plotting the zipped pairs, but labelled for the legend.
plt.plot(gammas, accs_tr, label="train")
plt.plot(gammas, accs_te, label="test")
plt.plot(gamma_opt, acc_opt, 'ro', label="selected")
plt.legend()

In [None]:
# Run `test` for Least Squares GD with the gamma selected by the tuning cell above.
test(y, tX_train, least_squares_GD, max_iters, k_fold, k_indices, gamma_opt, None)

### Least Squares SGD

In [None]:
# Cross-validated search for the gamma of Least Squares SGD.

# 15 log-spaced candidates between 1e-4 and 1.
gammas = np.logspace(-4, 0, 15)
(
    gamma_opt, acc_opt,
    ls_tr, ls_te,
    accs_tr, accs_te,
) = hyperparameter_tuning(
    y, tX_train, least_squares_SGD,
    max_iters, k_fold, k_indices,
    gammas, None,
)


In [None]:
# Report the gamma selected for Least Squares SGD and the accuracy returned with it.
print("optimal gamma : {:f} / accuracy : {:f}".format(gamma_opt, acc_opt))

In [None]:
# Accuracy per candidate gamma for Least Squares SGD; the selected gamma is the red dot.
plt.title("Gamma tuning for Least Squares SGD")
plt.xlabel("gamma")
plt.ylabel("accuracy")
# The candidates are log-spaced, so a log axis keeps the small gammas distinguishable.
plt.xscale("log")
# Same two curves (and colours) as plotting the zipped pairs, but labelled for the legend.
plt.plot(gammas, accs_tr, label="train")
plt.plot(gammas, accs_te, label="test")
plt.plot(gamma_opt, acc_opt, 'ro', label="selected")
plt.legend()

In [None]:
# Performance Test for Least Squares SGD
# Evaluates least_squares_SGD, i.e. the model gamma_opt was tuned for in this section
# (the cell previously re-ran least_squares_GD by mistake).
test(y, tX_train, least_squares_SGD, max_iters, k_fold, k_indices, gamma_opt, None)

### Least Squares

In [None]:
# Cross-validated search for the degree used with Least Squares.

# Candidate degrees 1 through 9.
degrees = list(range(1, 10))
(
    degree_opt, acc_opt,
    ls_tr, ls_te,
    accs_tr, accs_te,
) = hyperparameter_tuning(
    y, tX_train, least_squares,
    max_iters, k_fold, k_indices,
    degrees, None,
)


In [None]:
# Report the degree selected for Least Squares and the accuracy returned with it.
print("optimal degree : {:d} / accuracy : {:f}".format(degree_opt, acc_opt))

In [None]:
# Accuracy per candidate degree for Least Squares; the selected degree is the red dot.
plt.title("Degree tuning for Least Squares")
plt.xlabel("degree")
plt.ylabel("accuracy")
# Same two curves (and colours) as plotting the zipped pairs, but labelled for the legend.
plt.plot(degrees, accs_tr, label="train")
plt.plot(degrees, accs_te, label="test")
plt.plot(degree_opt, acc_opt, 'ro', label="selected")
plt.legend()

In [None]:
# Performance Test for Least Squares
# Runs `test` with the degree selected by the tuning cell above.
test(y, tX_train, least_squares, max_iters, k_fold, k_indices, degree_opt, None)

### Ridge Regression

In [None]:
# Cross-validated search over lambda and degree for Ridge Regression.

# Candidate degrees 1 through 9, and 15 log-spaced lambdas between 1e-4 and 1.
degrees = list(range(1, 10))
lambdas = np.logspace(-4, 0, 15)
(
    lambda_opt, degree_opt, acc_opt,
    ls_tr, ls_te,
    accs_tr, accs_te,
) = hyperparameter_tuning(
    y, tX_train, ridge_regression,
    max_iters, k_fold, k_indices,
    lambdas, degrees,
)


In [None]:
# Report the degree and lambda selected for Ridge Regression and the accuracy returned with them.
print("optimal degree : {:d} and lambda : {:f} / accuracy : {:f}".format(degree_opt, lambda_opt, acc_opt))

In [None]:
# Accuracy per candidate degree for Ridge Regression; the selected degree is the red dot.
plt.title("Degree tuning for Ridge Regression")
plt.xlabel("degree")
# The plotted series are accs_tr / accs_te and acc_opt, i.e. accuracies, not losses.
plt.ylabel("accuracy")
plt.plot(degrees, list(zip(accs_tr, accs_te)))
plt.plot(degree_opt, acc_opt, 'ro')

In [None]:
# Performance Test for Ridge Regression
# NOTE(review): the tuning call passes (lambdas, degrees) while this call passes
# (degree_opt, lambda_opt) — confirm the order against the signature of `test`.
test(y, tX_train, ridge_regression, max_iters, k_fold, k_indices, degree_opt, lambda_opt)

### Logistic Regression

In [None]:
# Labels for the logistic-regression cells: a copy of y in which every -1 becomes 0
# (all other values are kept as they are); y itself is left unchanged.
y_logistic = np.where(y == -1, 0, y)

In [None]:
# Cross-validated search for the gamma of Logistic Regression (uses the 0/1 labels).

# 15 log-spaced candidates between 1e-4 and 1.
gammas = np.logspace(-4, 0, 15)
(
    gamma_opt, acc_opt,
    ls_tr, ls_te,
    accs_tr, accs_te,
) = hyperparameter_tuning(
    y_logistic, tX_train, logistic_regression,
    max_iters, k_fold, k_indices,
    gammas, None,
)


In [None]:
# Report the gamma selected for Logistic Regression and the accuracy returned with it.
print("optimal gamma : {:f} / accuracy : {:f}".format(gamma_opt, acc_opt))

In [None]:
# Accuracy per candidate gamma for Logistic Regression; the selected gamma is the red dot.
plt.title("Gamma tuning for Logistic Regression")
plt.xlabel("gamma")
plt.ylabel("accuracy")
# The candidates are log-spaced, so a log axis keeps the small gammas distinguishable.
plt.xscale("log")
# Same two curves (and colours) as plotting the zipped pairs, but labelled for the legend.
plt.plot(gammas, accs_tr, label="train")
plt.plot(gammas, accs_te, label="test")
plt.plot(gamma_opt, acc_opt, 'ro', label="selected")
plt.legend()

In [None]:
# Performance Test for Logistic Regression
# Runs `test` on the 0/1 labels with the gamma selected by the tuning cell above.

test(y_logistic, tX_train, logistic_regression, max_iters, k_fold, k_indices, gamma_opt, None)

### Regularized Logistic Regression

In [None]:
# Hyperparameter Tuning for Regularized Logistic Regression using Cross-Validation

# 15 log-spaced candidates between 1e-4 and 1 for each of gamma and lambda.
gammas = np.logspace(-4, 0, 15)
lambdas = np.logspace(-4, 0, 15)
# Tune on the 0/1 labels (y_logistic), like every other logistic-regression cell in this
# notebook; the raw y uses -1/1 and would make tuning disagree with the later test call.
lambda_opt, gamma_opt, acc_opt, ls_tr, ls_te, accs_tr, accs_te = hyperparameter_tuning(y_logistic, tX_train, reg_logistic_regression, max_iters, k_fold, k_indices, lambdas, gammas)


In [None]:
# Report the gamma and lambda selected for Regularized Logistic Regression and the accuracy returned with them.
print("optimal gamma : {:f} and lambda : {:f} / accuracy : {:f}".format(gamma_opt, lambda_opt, acc_opt))

In [None]:
# Accuracy against gamma for Regularized Logistic Regression; the selected point is the red dot.
# NOTE(review): the tuning call searched over both lambdas and gammas; this plot assumes
# accs_tr / accs_te hold one value per gamma — confirm against hyperparameter_tuning.
plt.title("Gamma tuning for Regularized Logistic Regression")
plt.xlabel("gamma")
plt.ylabel("accuracy")
plt.plot(gammas, list(zip(accs_tr, accs_te)))
plt.plot(gamma_opt, acc_opt, 'ro')

In [None]:
# Run `test` for Regularized Logistic Regression on the 0/1 labels with the selected gamma and lambda.
test(y_logistic, tX_train, reg_logistic_regression, max_iters, k_fold, k_indices, gamma_opt, lambda_opt)

## Generate predictions and save output in csv format for submission:

In [None]:
DATA_TEST_PATH = '../data/test.csv'
# Same loader as for the training file; returns labels, feature matrix and event ids.
y_test, tX_test, ids_test = load_csv_data(DATA_TEST_PATH)

In [None]:
# Give the test features the same preprocessing as the training features:
# undefined-value replacement, standardisation with the TRAINING mean/std, and the
# same column selection, so they line up with the columns of tX_train.
# NOTE: this rebinds tX_test; re-run the loading cell above before re-running this one.
for c in undefined_cols:
    tX_test[tX_test[:, c] == -999, c] = 0
tX_test = standardization(tX_test, mean, std)[:, features_top]

# Fit the final model on the training set (0/1 labels). Fitting on tX_test / y_test,
# as this cell previously did, would train on the very data we want to predict.
y_logistic = y.copy()
y_logistic[y_logistic == -1] = 0

initial_w = np.zeros(tX_train.shape[1])
weights, _ = reg_logistic_regression(y_logistic, tX_train, lambda_opt, initial_w, max_iters, gamma_opt)

In [None]:
OUTPUT_PATH = 'output.csv'
# Predict labels for the test features with the fitted weights and write them,
# together with the test event ids, to OUTPUT_PATH.
y_pred = predict_labels(weights, tX_test)
create_csv_submission(ids_test, y_pred, OUTPUT_PATH)

In [None]:
# pandas is not imported anywhere else in this notebook; import it here so the cell
# does not depend on `pd` leaking in through one of the star imports.
import pandas as pd

# Read the written predictions back and compare them with the labels loaded from the test file.
# NOTE(review): this is only meaningful if the test file carries real labels — confirm.
compute_accuracy(pd.read_csv(OUTPUT_PATH).Prediction.values.tolist(), y_test)