In [None]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
%load_ext autoreload
%autoreload 2

## Load the training data into feature matrix, class labels, and event ids:

In [None]:
from proj1_helpers import *
# Location of the training CSV, relative to the notebook's working directory.
DATA_TRAIN_PATH = '../data/train.csv'
# Class labels, feature matrix and event ids of the training set.
y, tX, ids = load_csv_data(DATA_TRAIN_PATH)

## Do your crazy machine learning thing here :) ...

In [None]:
from YT_implementations import *
import warnings

In [None]:
# Suppress every Python warning for the rest of the session.
# NOTE(review): this also hides numerical warnings (overflow, invalid value, ...)
# raised while training — confirm that silencing all categories is intended.
warnings.filterwarnings('ignore')

In [None]:
# Data preprocessing

# Per-feature (column-wise) statistics of the training matrix; they are kept
# as globals so the same transformation can be applied to the test set later.
mean, std = np.mean(tX, axis=0), np.std(tX, axis=0)

# Standardized copy of the training features used by every model below.
tX_train = standardization(tX, mean, std)

In [None]:
# Shared experiment settings: iteration budget for the iterative solvers,
# seed for the fold split, and number of cross-validation folds.
max_iters, seed, k_fold = 500, 7, 5

# Fold membership, computed once so every model is compared on the same split.
k_indices = build_k_indices(y, k_fold, seed)

In [None]:
def hyperparameter_tuning(y, tX, regression_method, max_iters, k_fold, k_indices, params1, params2):
    """Select hyperparameters for `regression_method` by k-fold cross-validation.

    Every candidate is scored with `cross_validation` on each of the `k_fold`
    folds and the candidate with the highest mean held-out accuracy is kept.

    Args:
        y: labels. For the logistic methods every -1 label is mapped to 0 on a
            copy before fitting, so {-1, 1} and {0, 1} encodings both work.
        tX: feature matrix, one row per sample.
        regression_method: fitting function; the dispatch is on its name.
        max_iters: iteration budget forwarded to the iterative methods.
        k_fold: number of folds (at least 1).
        k_indices: fold indices forwarded to `cross_validation`.
        params1: candidates of the first hyperparameter. With `params2=None`
            this is the degree for `least_squares` and gamma otherwise; with
            two hyperparameters it is `lambda_`.
        params2: `None` for single-parameter tuning; otherwise the candidates
            of the second hyperparameter (degree for `ridge_regression`,
            gamma for `reg_logistic_regression`).

    Returns:
        Single-parameter tuning:
            (param_opt, accuracy_opt, losses_tr, losses_te, accuracies_tr,
            accuracies_te), the lists holding one fold-averaged value per
            entry of `params1`.
        Two-parameter tuning:
            (param1_opt, param2_opt, accuracy_opt, losses_tr, losses_te,
            accuracies_tr, accuracies_te), the lists holding, for each entry
            of `params2`, the scores of the best `params1` candidate.

    Raises:
        ValueError: if `k_fold` is smaller than 1, or if two hyperparameters
            are requested for a method other than `ridge_regression` /
            `reg_logistic_regression`.
    """
    method = getattr(regression_method, "__name__", None) or str(regression_method).split()[1]

    if k_fold < 1:
        raise ValueError("k_fold must be at least 1")
    if params2 is not None and method not in ("ridge_regression", "reg_logistic_regression"):
        raise ValueError("two-parameter tuning is not supported for '{}'".format(method))

    # Labels for the logistic models are derived from the `y` argument rather
    # than from a notebook global, so the result does not depend on cell order.
    y_binary = np.where(np.asarray(y) == -1, 0, y)

    def run_fold(k, param1, param2):
        """Fit and score one fold; returns (w, loss_tr, loss_te, acc_tr, acc_te)."""
        if params2 is None:
            if method == "least_squares":
                return cross_validation(y, tX, k_indices, k, param1, regression_method)
            if method == "logistic_regression":
                return cross_validation(y_binary, tX, k_indices, k, None, regression_method, initial_w=None, max_iters=max_iters, gamma=param1)
            # least_squares_GD / least_squares_SGD: fresh zero start per fold.
            initial_w = np.zeros(tX.shape[1])
            return cross_validation(y, tX, k_indices, k, None, regression_method, initial_w=initial_w, max_iters=max_iters, gamma=param1)
        if method == "ridge_regression":
            return cross_validation(y, tX, k_indices, k, param2, regression_method, lambda_=param1)
        return cross_validation(y_binary, tX, k_indices, k, None, regression_method, initial_w=None, max_iters=max_iters, gamma=param2, lambda_=param1)

    def mean_scores(param1, param2):
        """Average (loss_tr, loss_te, acc_tr, acc_te) over all folds."""
        per_fold = [run_fold(k, param1, param2)[1:] for k in range(k_fold)]
        return [np.mean(metric) for metric in zip(*per_fold)]

    losses_tr, losses_te, accuracies_tr, accuracies_te = [], [], [], []

    # 1-param tuning
    if params2 is None:
        for param in params1:
            loss_tr, loss_te, acc_tr, acc_te = mean_scores(param, None)
            losses_tr.append(loss_tr)
            losses_te.append(loss_te)
            accuracies_tr.append(acc_tr)
            accuracies_te.append(acc_te)

        idx_opt = np.argmax(accuracies_te)
        return params1[idx_opt], accuracies_te[idx_opt], losses_tr, losses_te, accuracies_tr, accuracies_te

    # 2-params tuning
    opt_params1 = []
    for param2 in params2:
        scores = [mean_scores(param1, param2) for param1 in params1]

        # Pick the best params1 candidate from the fold-averaged accuracies
        # (one value per candidate), not from the per-fold values of the last one.
        idx_opt_param1 = np.argmax([score[3] for score in scores])
        loss_tr, loss_te, acc_tr, acc_te = scores[idx_opt_param1]

        opt_params1.append(params1[idx_opt_param1])
        losses_tr.append(loss_tr)
        losses_te.append(loss_te)
        accuracies_tr.append(acc_tr)
        accuracies_te.append(acc_te)

    idx_opt = np.argmax(accuracies_te)
    param1_opt = opt_params1[idx_opt]
    param2_opt = params2[idx_opt]
    accuracy_opt = accuracies_te[idx_opt]

    return param1_opt, param2_opt, accuracy_opt, losses_tr, losses_te, accuracies_tr, accuracies_te


In [None]:
def test(y, tX, regression_method, max_iters, k_fold, k_indices, param1, param2):
    method = str(regression_method).split()[1]

    losses_tr = []
    losses_te = []
    accuracies_tr = []
    accuracies_te = []

    for k in range(k_fold):
        initial_w = np.zeros(tX_train.shape[1])
        
        if method == "least_squares_GD":
            w, loss_tr, loss_te, acc_tr, acc_te = cross_validation(y, tX, k_indices, k, None, regression_method, initial_w=initial_w, max_iters=max_iters, gamma=param1)
        elif method == "least_sqaures_SGD":
            w, loss_tr, loss_te, acc_tr, acc_te = cross_validation(y, tX, k_indices, k, None, least_squares_SGD, initial_w=initial_w, max_iters=max_iters, gamma=param1)
        elif method == "least_squares":
            w, loss_tr, loss_te, acc_tr, acc_te = cross_validation(y, tX, k_indices, k, param1, regression_method)
        elif method == "ridge_regression":
            w, loss_tr, loss_te, acc_tr, acc_te = cross_validation(y, tX, k_indices, k, param1, regression_method, lambda_=param2)
        elif method == "logistic_regression":
            w, loss_tr, loss_te, acc_tr, acc_te = cross_validation(y, tX, k_indices, k, None, regression_method, max_iters=max_iters, gamma=param1)
        elif method == "reg_logistic_regression":
            w, loss_tr, loss_te, acc_tr, acc_te = cross_validation(y, tX, k_indices, k, None, regression_method, max_iters=max_iters, gamma=param1, lambda_=param2)
            
        losses_tr.append(loss_tr)
        losses_te.append(loss_te)
        accuracies_tr.append(acc_te)
        accuracies_te.append(acc_te)

    l_tr = np.mean(losses_tr)
    l_te = np.mean(losses_te)
    acc_tr = np.mean(accuracies_tr)
    acc_te = np.mean(accuracies_te)
    
    print("Train Loss     : {:f} / Test Loss     : {:f}".format(l_tr, l_te))
    print("Train Accuracy : {:f} / Test Accuracy : {:f}".format(acc_tr, acc_te))
    
    
    return

### Least Squares GD

In [None]:
# Cross-validated search for the step size (gamma) of Least Squares GD

# 20 evenly spaced candidate step sizes in [0, 1].
gammas = np.linspace(0, 1, 20)
(gamma_opt, acc_opt, ls_tr, ls_te, accs_tr, accs_te) = hyperparameter_tuning(
    y, tX_train, least_squares_GD, max_iters, k_fold, k_indices,
    params1=gammas, params2=None,
)


In [None]:
# Report the step size selected for Least Squares GD and its mean held-out accuracy.
print("optimal gamma : {:f} / accuracy : {:f}".format(gamma_opt, acc_opt))

In [None]:
# Accuracy as a function of gamma; the red dot marks the selected value.
plt.title("Gamma tuning for Least Squares GD")
plt.xlabel("gamma")
plt.ylabel("accuracy")
# One labelled curve per split so the two lines can be told apart.
plt.plot(gammas, accs_tr, label="train")
plt.plot(gammas, accs_te, label="test")
plt.plot(gamma_opt, acc_opt, 'ro', label="selected")
plt.legend();

In [None]:
# Performance Test for Least Squares GD with the selected gamma
test(y, tX_train, least_squares_GD, max_iters, k_fold, k_indices, gamma_opt, None)

### Least Squares SGD

In [None]:
# Cross-validated search for the step size (gamma) of Least Squares SGD

# 20 evenly spaced candidate step sizes in [0, 1].
gammas = np.linspace(0, 1, 20)
(gamma_opt, acc_opt, ls_tr, ls_te, accs_tr, accs_te) = hyperparameter_tuning(
    y, tX_train, least_squares_SGD, max_iters, k_fold, k_indices,
    params1=gammas, params2=None,
)


In [None]:
# Report the step size selected for Least Squares SGD and its mean held-out accuracy.
print("optimal gamma : {:f} / accuracy : {:f}".format(gamma_opt, acc_opt))

In [None]:
# Accuracy as a function of gamma; the red dot marks the selected value.
plt.title("Gamma tuning for Least Squares SGD")
plt.xlabel("gamma")
plt.ylabel("accuracy")
# One labelled curve per split so the two lines can be told apart.
plt.plot(gammas, accs_tr, label="train")
plt.plot(gammas, accs_te, label="test")
plt.plot(gamma_opt, acc_opt, 'ro', label="selected")
plt.legend();

In [None]:
# Performance Test for Least Squares SGD
# test() dispatches on the name of the function it receives, so the SGD
# routine itself must be passed to evaluate SGD rather than plain GD.
test(y, tX_train, least_squares_SGD, max_iters, k_fold, k_indices, gamma_opt, None)

### Least Squares

In [None]:
# Cross-validated search for the polynomial degree of Least Squares

# Candidate degrees 1..9.
degrees = list(range(1, 10))
(degree_opt, acc_opt, ls_tr, ls_te, accs_tr, accs_te) = hyperparameter_tuning(
    y, tX_train, least_squares, max_iters, k_fold, k_indices,
    params1=degrees, params2=None,
)


In [None]:
# Report the degree selected for Least Squares and its mean held-out accuracy.
print("optimal degree : {:d} / accuracy : {:f}".format(degree_opt, acc_opt))

In [None]:
# Accuracy as a function of the degree; the red dot marks the selected value.
plt.title("Degree tuning for Least Squares")
plt.xlabel("degree")
plt.ylabel("accuracy")
# One labelled curve per split so the two lines can be told apart.
plt.plot(degrees, accs_tr, label="train")
plt.plot(degrees, accs_te, label="test")
plt.plot(degree_opt, acc_opt, 'ro', label="selected")
plt.legend();

In [None]:
# Performance Test for Least Squares
# The selected degree is passed as param1; param2 is unused for this method.
test(y, tX_train, least_squares, max_iters, k_fold, k_indices, degree_opt, None)

### Ridge Regression

In [None]:
# Joint cross-validated search over degree and lambda for Ridge Regression

# Candidate degrees 1..9 and 15 log-spaced penalties in [1e-4, 1].
degrees = list(range(1, 10))
lambdas = np.logspace(-4, 0, 15)
(lambda_opt, degree_opt, acc_opt, ls_tr, ls_te, accs_tr, accs_te) = hyperparameter_tuning(
    y, tX_train, ridge_regression, max_iters, k_fold, k_indices,
    params1=lambdas, params2=degrees,
)


In [None]:
# Report the degree/lambda pair selected for Ridge Regression and its mean held-out accuracy.
print("optimal degree : {:d} and lambda : {:f} / accuracy : {:f}".format(degree_opt, lambda_opt, acc_opt))

In [None]:
# Accuracy as a function of the degree; the red dot marks the selected value.
plt.title("Degree tuning for Ridge Regression")
plt.xlabel("degree")
# The plotted series are accuracies, so the axis is labelled accordingly.
plt.ylabel("accuracy")
plt.plot(degrees, accs_tr, label="train")
plt.plot(degrees, accs_te, label="test")
plt.plot(degree_opt, acc_opt, 'ro', label="selected")
plt.legend();

In [None]:
# Performance Test for Ridge Regression
# test() takes the degree as param1 and lambda as param2 for this method.
test(y, tX_train, ridge_regression, max_iters, k_fold, k_indices, degree_opt, lambda_opt)

### Logistic Regression

In [None]:
# Labels for the logistic models: every -1 becomes 0, all other values are
# kept; `y` itself is left untouched.
y_logistic = np.where(y == -1, 0, y)

In [None]:
# Cross-validated search for the step size (gamma) of Logistic Regression

# 15 log-spaced candidate step sizes in [1e-5, 1].
gammas = np.logspace(-5, 0, 15)
(gamma_opt, acc_opt, ls_tr, ls_te, accs_tr, accs_te) = hyperparameter_tuning(
    y_logistic, tX_train, logistic_regression, max_iters, k_fold, k_indices,
    params1=gammas, params2=None,
)


In [None]:
# Report the step size selected for Logistic Regression and its mean held-out accuracy.
print("optimal gamma : {:f} / accuracy : {:f}".format(gamma_opt, acc_opt))

In [None]:
# Accuracy as a function of gamma; the red dot marks the selected value.
plt.title("Gamma tuning for Logistic Regression")
plt.xlabel("gamma")
plt.ylabel("accuracy")
# One labelled curve per split so the two lines can be told apart.
plt.plot(gammas, accs_tr, label="train")
plt.plot(gammas, accs_te, label="test")
plt.plot(gamma_opt, acc_opt, 'ro', label="selected")
plt.legend();

In [None]:
# Performance Test for Logistic Regression

# Uses the 0/1 labels and the gamma selected by the tuning cell.
test(y_logistic, tX_train, logistic_regression, max_iters, k_fold, k_indices, gamma_opt, None)

### Regularized Logistic Regression

In [None]:
# Hyperparameter Tuning for Regularized Logistic Regression using Cross-Validation

# 15 log-spaced step sizes in [1e-5, 1] and 15 log-spaced penalties in [1e-4, 1].
gammas = np.logspace(-5, 0, 15)
lambdas = np.logspace(-4, 0, 15)
# The logistic models are trained on the 0/1 labels, as in the Logistic Regression section.
lambda_opt, gamma_opt, acc_opt, ls_tr, ls_te, accs_tr, accs_te = hyperparameter_tuning(y_logistic, tX_train, reg_logistic_regression, max_iters, k_fold, k_indices, lambdas, gammas)


In [None]:
# Search over lambda only, with gamma pinned to a single value.
# NOTE(review): 0.000611 is presumably a gamma found by an earlier run — confirm
# and derive it from that result instead of hard-coding it. This cell also
# overwrites the outputs of the full grid search above.
gammas = [0.000611]
lambdas = np.logspace(-4, 0, 15)
# The logistic models are trained on the 0/1 labels, as in the Logistic Regression section.
lambda_opt, gamma_opt, acc_opt, ls_tr, ls_te, accs_tr, accs_te = hyperparameter_tuning(y_logistic, tX_train, reg_logistic_regression, max_iters, k_fold, k_indices, lambdas, gammas)


In [None]:
# Report the gamma/lambda pair selected for Regularized Logistic Regression and its mean held-out accuracy.
print("optimal gamma : {:f} and lambda : {:f} / accuracy : {:f}".format(gamma_opt, lambda_opt, acc_opt))

In [None]:
# Accuracy as a function of gamma; the red dot marks the selected value.
plt.title("Gamma tuning for Regularized Logistic Regression")
plt.xlabel("gamma")
plt.ylabel("accuracy")
# Markers keep the figure readable when `gammas` holds a single candidate.
plt.plot(gammas, accs_tr, marker='.', label="train")
plt.plot(gammas, accs_te, marker='.', label="test")
plt.plot(gamma_opt, acc_opt, 'ro', label="selected")
plt.legend();

In [None]:
# Performance Test for Regularized Logistic Regression
# test() takes gamma as param1 and lambda as param2 for this method.
test(y_logistic, tX_train, reg_logistic_regression, max_iters, k_fold, k_indices, gamma_opt, lambda_opt)

## Generate predictions and save output in csv format for submission:

In [None]:
# Location of the test CSV, relative to the notebook's working directory.
DATA_TEST_PATH = '../data/test.csv'
# Loaded with the same helper as the training set, so the three values are
# presumably the labels, feature matrix and event ids of the test set.
y_test, tX_test, ids_test = load_csv_data(DATA_TEST_PATH)

In [None]:
# Standardize the test features with the statistics of the TRAINING set.
# Re-running this cell standardizes `tX_test` a second time; reload the test
# data first if the cell has to be executed again.
tX_test = standardization(tX_test, mean, std)

# The final model is fitted on the training set only: training labels mapped
# to {0, 1} and the standardized training features. The test set must not be
# used for fitting.
y_logistic = y.copy()
y_logistic[y_logistic == -1] = 0

initial_w = np.zeros(tX_train.shape[1])
weights, _ = reg_logistic_regression(y_logistic, tX_train, lambda_opt, initial_w, max_iters, gamma_opt)

In [None]:
# File the submission is written to (relative to the working directory).
OUTPUT_PATH = 'output.csv'
# Predict a label for every test row from the fitted weights ...
y_pred = predict_labels(weights, tX_test)
# ... and write them to OUTPUT_PATH together with the event ids.
create_csv_submission(ids_test, y_pred, OUTPUT_PATH)

In [None]:
# Read the submission file back and score its `Prediction` column against y_test.
# Requires pandas to be imported as `pd`.
compute_accuracy(pd.read_csv(OUTPUT_PATH).Prediction.values.tolist(), y_test)