In [1]:
%load_ext autoreload
%autoreload 2

import numpy as np
import helpers as hp
import implementations as impl

In [2]:
# Load data

# Unpack the five values returned by the project loader for the "data/" folder.
(
    x_train,
    x_test,
    y_train,
    train_ids,
    test_ids,
) = hp.load_csv_data("data/")

In [3]:
# Store the labels as an (N, 1) column, matching the column layout of the weight vector
# created in the parameters cell. One reshape is sufficient, and it leaves an array that
# is already (N, 1)-shaped unchanged when this cell is re-run.
y_train = y_train.reshape((y_train.shape[0], 1))

In [4]:
# Sanity check: show the dimensions of the loaded matrices.
for caption, array in (
    ("x_train shape: ", x_train),
    ("y_train shape: ", y_train),
    ("x_test shape: ", x_test),
):
    print(caption, array.shape)

x_train shape:  (328135, 320)
y_train shape:  (328135, 1)
x_test shape:  (109379, 320)


In [5]:
# Build train data

import src.utils.constants as c
import src.features.build_features as bf

# Run the project's feature-building step on the raw training matrix.
# `removed_cols` is kept because the submission cell passes it to create_submission.
# NOTE(review): presumably columns whose NaN share exceeds c.PERCENTAGE_NAN are dropped —
# confirm in src/features/build_features.py.
x_train_nonans, removed_cols = bf.build_train_features(
    data=x_train,
    percentage=c.PERCENTAGE_NAN,
)

In [6]:
# PARAMETERS

# Regularization parameter (used by ridge and regularized logistic regression below)
lambda_ = 0.1
# Max number of iterations for the iterative solvers
max_iters = 10
# Threshold for the stopping criterion (not passed to any call in this notebook)
threshold = 1e-8
# Step size for the gradient-based solvers
gamma = 0.4
# Initial weights: a zero column with one entry per column of x_train_nonans
initial_w = np.zeros(shape=(x_train_nonans.shape[1], 1))

In [7]:
# Mean squared error gradient descent
w_mean_squared_error_gd, loss_mean_squared_error_gd = impl.mean_squared_error_gd(
    y_train,
    x_train_nonans,
    initial_w,
    max_iters,
    gamma,
)

# Convert the returned loss to an RMSE.
# NOTE(review): this formula assumes the loss is half the mean squared error — confirm in impl.
rmse_tr = np.sqrt(2 * loss_mean_squared_error_gd)

# NOTE(review): the output stored with this cell shows weights of order 1e15, which looks
# like a diverged run — consider a smaller gamma and/or feature scaling.
print(
    "Mean squared error gradient descent: W: {w}, Loss:{loss}".format(
        w=w_mean_squared_error_gd,
        loss=loss_mean_squared_error_gd,
    )
)
print("RMSE train: {rmse_tr}".format(rmse_tr=rmse_tr))

Mean squared error gradient descent: W: [[ 1.03434150e+15]
 [ 1.03434119e+15]
 [ 6.61042706e+14]
 [ 1.03434119e+15]
 [ 1.03434073e+15]
 [ 1.03422695e+15]
 [ 1.03427814e+15]
 [-1.13566882e+17]
 [-1.13566882e+17]
 [ 1.03434150e+15]
 [ 1.03434150e+15]
 [ 1.03434150e+15]
 [ 1.03434147e+15]
 [ 1.03434145e+15]
 [ 1.03434151e+15]
 [ 1.03434150e+15]
 [ 1.03434150e+15]
 [ 1.03434150e+15]
 [ 1.03434147e+15]
 [ 1.03434150e+15]
 [ 1.03434150e+15]
 [ 1.03434146e+15]
 [ 1.03434142e+15]
 [ 1.03434141e+15]
 [ 1.03433810e+15]
 [ 1.03433787e+15]
 [ 1.03433838e+15]
 [ 1.03434149e+15]
 [ 1.03434148e+15]
 [ 1.03434145e+15]
 [ 1.03434147e+15]
 [ 1.03434143e+15]
 [ 1.03434149e+15]
 [ 1.03434149e+15]
 [ 1.03434147e+15]
 [ 1.03434146e+15]
 [ 1.03434144e+15]
 [ 1.03434145e+15]
 [ 1.03434147e+15]
 [ 1.03434145e+15]
 [ 1.03434145e+15]
 [ 1.03434145e+15]
 [ 1.03434146e+15]
 [ 1.03434145e+15]
 [ 1.03434144e+15]
 [ 1.03434140e+15]
 [ 1.03433846e+15]
 [ 1.03434147e+15]
 [ 1.03434143e+15]
 [ 1.03434128e+15]
 [ 1.03434

In [8]:
# Mean squared error stochastic gradient descent
# NOTE(review): no random seed is set anywhere in this notebook; if the SGD sampling relies on
# NumPy's global RNG the result will differ between runs — confirm in impl.
w_mean_squared_error_sgd, loss_mean_squared_error_sgd = impl.mean_squared_error_sgd(
    y_train,
    x_train_nonans,
    initial_w,
    max_iters,
    gamma,
)

# Same loss-to-RMSE conversion as in the full-batch gradient descent cell.
rmse_tr = np.sqrt(2 * loss_mean_squared_error_sgd)

print(
    "Mean squared error stochastic gradient descent: W: {w}, Loss:{loss}".format(
        w=w_mean_squared_error_sgd,
        loss=loss_mean_squared_error_sgd,
    )
)
print("RMSE train: {rmse_tr}".format(rmse_tr=rmse_tr))

In [None]:
# Test Least Squares Regression using Normal Equations

# Closed-form fit: only the labels and the feature matrix are passed to the solver.
w_least_squares, loss_least_squares = impl.least_squares(
    y_train,
    x_train_nonans,
)

# Loss-to-RMSE conversion (assumes the loss is half the mean squared error — TODO confirm in impl).
rmse_tr = np.sqrt(2 * loss_least_squares)

print(
    "Least squares: W: {w}, Loss:{loss}".format(
        w=w_least_squares,
        loss=loss_least_squares,
    )
)
print("RMSE train: {rmse_tr}".format(rmse_tr=rmse_tr))

In [None]:
# Test Ridge Regression using Normal Equations

# Closed-form fit with the regularization parameter from the parameters cell.
w_ridge_regression, loss_ridge_regression = impl.ridge_regression(
    y_train,
    x_train_nonans,
    lambda_,
)

# Loss-to-RMSE conversion (assumes the loss is half the mean squared error — TODO confirm in impl).
rmse_tr = np.sqrt(2 * loss_ridge_regression)

print(
    "Ridge regression: W: {w}, Loss:{loss}".format(
        w=w_ridge_regression,
        loss=loss_ridge_regression,
    )
)
print("RMSE train: {rmse_tr}".format(rmse_tr=rmse_tr))

In [None]:
# Test Logistic Regression using gd

# Gradient-descent fit starting from initial_w, run for max_iters steps of size gamma.
# NOTE(review): the last cell of this notebook counts predictions equal to -1; if y_train uses
# the same -1/1 encoding, confirm that impl.logistic_regression handles it (a standard
# logistic loss expects labels in {0, 1}).
w_log_regression, loss_log_regression = impl.logistic_regression(
    y_train,
    x_train_nonans,
    initial_w,
    max_iters,
    gamma,
)

# No "RMSE train" line here: sqrt(2 * loss) only turns a halved mean squared error into an
# RMSE, and the loss returned by this fit is presumably the logistic loss (confirm in impl),
# so that number would be meaningless.
print(
    "Logistic regression: W: {w}, Loss:{loss}".format(
        w=w_log_regression,
        loss=loss_log_regression,
    )
)

Logistic regression: W: [[ 0.01982394]
 [ 0.01982394]
 [ 0.01278254]
 [ 0.01982394]
 [ 0.01982393]
 [ 0.01982175]
 [ 0.01982272]
 [-2.17659702]
 [-2.17659702]
 [ 0.01982394]
 [ 0.01982394]
 [ 0.01982394]
 [ 0.01982394]
 [ 0.01982394]
 [ 0.01982394]
 [ 0.01982394]
 [ 0.01982394]
 [ 0.01982394]
 [ 0.01982394]
 [ 0.01982394]
 [ 0.01982394]
 [ 0.01982394]
 [ 0.01982394]
 [ 0.01982394]
 [ 0.01982386]
 [ 0.01982387]
 [ 0.01982388]
 [ 0.01982394]
 [ 0.01982394]
 [ 0.01982394]
 [ 0.01982394]
 [ 0.01982394]
 [ 0.01982394]
 [ 0.01982394]
 [ 0.01982394]
 [ 0.01982394]
 [ 0.01982394]
 [ 0.01982394]
 [ 0.01982394]
 [ 0.01982394]
 [ 0.01982394]
 [ 0.01982394]
 [ 0.01982394]
 [ 0.01982394]
 [ 0.01982394]
 [ 0.01982394]
 [ 0.01982389]
 [ 0.01982394]
 [ 0.01982394]
 [ 0.01982394]
 [ 0.01982394]
 [ 0.01982394]
 [ 0.01982394]
 [ 0.01982394]
 [ 0.01982394]
 [ 0.0198239 ]
 [ 0.01982392]
 [ 0.01982394]
 [ 0.01982282]
 [ 0.01982302]
 [ 0.01982394]
 [ 0.01982394]
 [ 0.01982394]
 [ 0.01982394]
 [ 0.01982394]
 

In [None]:
# Test Regularized Logistic Regression using gd

# Gradient-descent fit with regularization parameter lambda_, starting from initial_w.
w_reg_log_regression, loss_reg_log_regression = impl.reg_logistic_regression(
    y_train,
    x_train_nonans,
    lambda_,
    initial_w,
    max_iters,
    gamma,
)

# The label names the regularized model so its output cannot be confused with the plain
# logistic-regression cell. No "RMSE train" line: sqrt(2 * loss) only applies to a halved
# mean squared error, and this loss is presumably the logistic loss (confirm in impl).
print(
    "Regularized logistic regression: W: {w}, Loss:{loss}".format(
        w=w_reg_log_regression,
        loss=loss_reg_log_regression,
    )
)

In [None]:
# Cross validation

import src.model.Models as model
import src.model.train_model as tm

# Hyper-parameters forwarded as keyword arguments to run_cross_validation.
kwargs = dict(
    lambda_=lambda_,
    initial_w=initial_w,
    max_iters=max_iters,
    gamma=gamma,
)

# 6-fold run of the regularized logistic regression on the training data.
# NOTE(review): the output stored with this cell reports Accuracy 0.0 and F1 0.0 — worth
# checking the label encoding and prediction handling inside run_cross_validation.
accuracy, f1, w = tm.run_cross_validation(
    x=x_train_nonans,
    y=y_train,
    k=6,
    algorithm=impl.reg_logistic_regression,
    model=model.Models.LOGISTIC,
    **kwargs,
)

print(
    "Accuracy: {accuracy}, F1: {f1}, W: {w}".format(
        accuracy=accuracy,
        f1=f1,
        w=w,
    )
)

Accuracy: 0.0, F1: 0.0, W: [[ 0.00759127]
 [ 0.00759127]
 [ 0.00493151]
 [ 0.00759127]
 [ 0.00759126]
 [ 0.00759043]
 [ 0.0075908 ]
 [-0.8334944 ]
 [-0.8334944 ]
 [ 0.00759127]
 [ 0.00759127]
 [ 0.00759127]
 [ 0.00759127]
 [ 0.00759127]
 [ 0.00759127]
 [ 0.00759127]
 [ 0.00759127]
 [ 0.00759127]
 [ 0.00759127]
 [ 0.00759127]
 [ 0.00759127]
 [ 0.00759127]
 [ 0.00759127]
 [ 0.00759127]
 [ 0.00759123]
 [ 0.00759124]
 [ 0.00759124]
 [ 0.00759127]
 [ 0.00759127]
 [ 0.00759127]
 [ 0.00759127]
 [ 0.00759127]
 [ 0.00759127]
 [ 0.00759127]
 [ 0.00759127]
 [ 0.00759127]
 [ 0.00759127]
 [ 0.00759127]
 [ 0.00759127]
 [ 0.00759127]
 [ 0.00759127]
 [ 0.00759127]
 [ 0.00759127]
 [ 0.00759127]
 [ 0.00759127]
 [ 0.00759127]
 [ 0.00759125]
 [ 0.00759127]
 [ 0.00759127]
 [ 0.00759127]
 [ 0.00759127]
 [ 0.00759127]
 [ 0.00759127]
 [ 0.00759127]
 [ 0.00759127]
 [ 0.00759126]
 [ 0.00759126]
 [ 0.00759127]
 [ 0.00759074]
 [ 0.00759088]
 [ 0.00759127]
 [ 0.00759127]
 [ 0.00759127]
 [ 0.00759127]
 [ 0.00759127

In [None]:
# Create submission

import src.utils.functions as utils

# Produce "submission.csv" from the test matrix with the weights of the plain
# logistic-regression fit. `removed_cols` comes from the feature-building cell, and `model`
# is the module imported in the cross-validation cell, so both cells must have run first.
utils.create_submission(
    filename="submission.csv",
    model=model.Models.LOGISTIC,
    removed_cols=removed_cols,
    w=w_log_regression,
    x_test=x_test,
)

In [None]:
# Quick check on positive/negative predictions

import os

# Read back the file produced by the "Create submission" cell, which uses the filename
# "submission.csv"; a different name here would inspect a stale or missing file.
# NOTE(review): assumes create_submission writes into c.MODELS_PATH — confirm in
# src/utils/functions.py.
submission_path = os.path.join(c.MODELS_PATH, "submission.csv")
pred = np.genfromtxt(submission_path, delimiter=",", skip_header=1)

# Keep only the second column (index 1) of the file; the header row was skipped above.
pred = pred[:, 1]

print("Positive predictions: ", np.sum(pred == 1))
print("Negative predictions: ", np.sum(pred == -1))