In [4]:
%load_ext autoreload
%autoreload 2

from helpers import *
from implementations import *

In [5]:
# Load data

x_train, x_test, y_train, train_ids, test_ids = load_csv_data("data/")

In [6]:
y_train = np.expand_dims(y_train, 1)
y_train = y_train.reshape((y_train.shape[0],1))

In [7]:
print("x_train shape: ", x_train.shape)
print("y_train shape: ", y_train.shape) 
print("x_test shape: ", x_test.shape)

x_train shape:  (328135, 320)
y_train shape:  (328135, 1)
x_test shape:  (109379, 320)


In [8]:
# Build train data

import src.constants as c
from src.data.build_data import build_train_data

x_train_nonans, removed_cols = build_train_data(data=x_train, percentage=c.PERCENTAGE_NAN)

(328135, 320)


In [10]:
# PARAMETERS

lambda_ = 0.1                                      # regularization parameter
max_iters = 50                                    # max number of iterations 
threshold = 1e-8                                   # threshold for stopping criterion
gamma = 0.4                                        # step size
initial_w = np.zeros((x_train_nonans.shape[1], 1)) # initial weights

In [11]:
# Mean squared error gradient descent
w_mean_squared_error_gd, loss_mean_squared_error_gd = mean_squared_error_gd(y_train, x_train_nonans, initial_w, max_iters, gamma)

rmse_tr = np.sqrt(2 * loss_mean_squared_error_gd)

print("Mean squared error gradient descent: W: {w}, Loss:{loss}".format(w=w_mean_squared_error_gd, loss=loss_mean_squared_error_gd))
print("RMSE train: {rmse_tr}".format(rmse_tr=rmse_tr))

Mean squared error gradient descent: W: [[ 5.68090924e+92]
 [ 5.68090756e+92]
 [ 3.63064193e+92]
 [ 5.68090755e+92]
 [ 5.68090502e+92]
 [ 5.68028012e+92]
 [ 5.68056125e+92]
 [-6.23742882e+94]
 [-6.23742882e+94]
 [ 5.68090924e+92]
 [ 5.68090924e+92]
 [ 5.68090924e+92]
 [ 5.68090907e+92]
 [ 5.68090899e+92]
 [ 5.68090930e+92]
 [ 5.68090924e+92]
 [ 5.68090924e+92]
 [ 5.68090924e+92]
 [ 5.68090908e+92]
 [ 5.68090924e+92]
 [ 5.68090922e+92]
 [ 5.68090904e+92]
 [ 5.68090879e+92]
 [ 5.68090875e+92]
 [ 5.68089058e+92]
 [ 5.68088933e+92]
 [ 5.68089212e+92]
 [ 5.68090921e+92]
 [ 5.68090911e+92]
 [ 5.68090895e+92]
 [ 5.68090906e+92]
 [ 5.68090886e+92]
 [ 5.68090918e+92]
 [ 5.68090916e+92]
 [ 5.68090910e+92]
 [ 5.68090904e+92]
 [ 5.68090893e+92]
 [ 5.68090896e+92]
 [ 5.68090909e+92]
 [ 5.68090895e+92]
 [ 5.68090895e+92]
 [ 5.68090894e+92]
 [ 5.68090902e+92]
 [ 5.68090898e+92]
 [ 5.68090893e+92]
 [ 5.68090869e+92]
 [ 5.68089253e+92]
 [ 5.68090906e+92]
 [ 5.68090884e+92]
 [ 5.68090801e+92]
 [ 5.68090

In [24]:
# Mean squared error stochastic gradient descent
w_mean_squared_error_sgd, loss_mean_squared_error_sgd = mean_squared_error_sgd(y_train, x_train_nonans, initial_w, max_iters, gamma)

rmse_tr = np.sqrt(2 * loss_mean_squared_error_sgd)

print("Mean squared error stochastic gradient descent: W: {w}, Loss:{loss}".format(w=w_mean_squared_error_sgd, loss=loss_mean_squared_error_sgd))
print("RMSE train: {rmse_tr}".format(rmse_tr=rmse_tr))

Mean squared error stochastic gradient descent: W: [[ 5.68090924e+92]
 [ 5.68090756e+92]
 [ 3.63064193e+92]
 [ 5.68090755e+92]
 [ 5.68090502e+92]
 [ 5.68028012e+92]
 [ 5.68056125e+92]
 [-6.23742882e+94]
 [-6.23742882e+94]
 [ 5.68090924e+92]
 [ 5.68090924e+92]
 [ 5.68090924e+92]
 [ 5.68090907e+92]
 [ 5.68090899e+92]
 [ 5.68090930e+92]
 [ 5.68090924e+92]
 [ 5.68090924e+92]
 [ 5.68090924e+92]
 [ 5.68090908e+92]
 [ 5.68090924e+92]
 [ 5.68090922e+92]
 [ 5.68090904e+92]
 [ 5.68090879e+92]
 [ 5.68090875e+92]
 [ 5.68089058e+92]
 [ 5.68088933e+92]
 [ 5.68089212e+92]
 [ 5.68090921e+92]
 [ 5.68090911e+92]
 [ 5.68090895e+92]
 [ 5.68090906e+92]
 [ 5.68090886e+92]
 [ 5.68090918e+92]
 [ 5.68090916e+92]
 [ 5.68090910e+92]
 [ 5.68090904e+92]
 [ 5.68090893e+92]
 [ 5.68090896e+92]
 [ 5.68090909e+92]
 [ 5.68090895e+92]
 [ 5.68090895e+92]
 [ 5.68090894e+92]
 [ 5.68090902e+92]
 [ 5.68090898e+92]
 [ 5.68090893e+92]
 [ 5.68090869e+92]
 [ 5.68089253e+92]
 [ 5.68090906e+92]
 [ 5.68090884e+92]
 [ 5.68090801e+92]

In [None]:
# Test Least Squares Regression using Normal Equations

w_least_squares, loss_least_squares = least_squares(y_train, x_train_nonans)

rmse_tr = np.sqrt(2 * loss_least_squares)

print("Least squares: W: {w}, Loss:{loss}".format(w=w_least_squares, loss=loss_least_squares))
print("RMSE train: {rmse_tr}".format(rmse_tr=rmse_tr))

LinAlgError: Singular matrix

In [14]:
# Test Ridge Regression using Normal Equations

w_ridge_regression, loss_ridge_regression = ridge_regression(y_train, x_train_nonans, lambda_)

rmse_tr = np.sqrt(2 * loss_ridge_regression)

print("Ridge regression: W: {w}, Loss:{loss}".format(w=w_ridge_regression, loss=loss_ridge_regression))
print("RMSE train: {rmse_tr}".format(rmse_tr=rmse_tr))

Ridge regression: W: [[-3.79799091e-05]
 [-3.79797249e-05]
 [ 1.16986226e-04]
 [-3.79797414e-05]
 [-3.79813997e-05]
 [-3.79757193e-05]
 [-3.79822931e-05]
 [ 4.16850488e-03]
 [ 4.16850488e-03]
 [-3.79799091e-05]
 [-3.79799090e-05]
 [-3.79799091e-05]
 [-3.79798791e-05]
 [-3.79801369e-05]
 [-3.79799742e-05]
 [-3.79800735e-05]
 [-3.79799091e-05]
 [-3.79799091e-05]
 [-3.79799945e-05]
 [-3.79799127e-05]
 [-3.79799244e-05]
 [-3.79799647e-05]
 [-3.79800227e-05]
 [-3.79778770e-05]
 [-3.80066694e-05]
 [-3.79819500e-05]
 [-3.79889414e-05]
 [-3.79800056e-05]
 [-3.79801672e-05]
 [-3.79799308e-05]
 [-3.79803943e-05]
 [-3.79814801e-05]
 [-3.79800511e-05]
 [-3.79802546e-05]
 [-3.79804086e-05]
 [-3.79804689e-05]
 [-3.79801427e-05]
 [-3.79800124e-05]
 [-3.79799537e-05]
 [-3.79800729e-05]
 [-3.79800767e-05]
 [-3.79801898e-05]
 [-3.79804763e-05]
 [-3.79800737e-05]
 [-3.79800337e-05]
 [-3.79807632e-05]
 [-3.79786418e-05]
 [-3.79801691e-05]
 [-3.79802835e-05]
 [-3.79805668e-05]
 [-3.79799812e-05]
 [-3.79799

In [15]:
# Test Logistic Regression using gd

w_log_regression, loss_log_regression = logistic_regression(y_train, x_train_nonans, initial_w, max_iters, gamma)

rmse_tr = np.sqrt(2 * loss_log_regression)

print("Logistic regression: W: {w}, Loss:{loss}".format(w=w_log_regression, loss=loss_log_regression))
print("RMSE train: {rmse_tr}".format(rmse_tr=rmse_tr))

Logistic regression: W: [[ 0.01390955]
 [ 0.01390955]
 [ 0.00945532]
 [ 0.01390955]
 [ 0.01390954]
 [ 0.01390801]
 [ 0.01390868]
 [-1.527224  ]
 [-1.527224  ]
 [ 0.01390955]
 [ 0.01390955]
 [ 0.01390955]
 [ 0.01390955]
 [ 0.01390955]
 [ 0.01390955]
 [ 0.01390955]
 [ 0.01390955]
 [ 0.01390955]
 [ 0.01390955]
 [ 0.01390955]
 [ 0.01390955]
 [ 0.01390955]
 [ 0.01390955]
 [ 0.01390956]
 [ 0.0139094 ]
 [ 0.0139095 ]
 [ 0.01390948]
 [ 0.01390955]
 [ 0.01390955]
 [ 0.01390955]
 [ 0.01390955]
 [ 0.01390955]
 [ 0.01390955]
 [ 0.01390955]
 [ 0.01390955]
 [ 0.01390955]
 [ 0.01390955]
 [ 0.01390955]
 [ 0.01390955]
 [ 0.01390955]
 [ 0.01390955]
 [ 0.01390955]
 [ 0.01390955]
 [ 0.01390955]
 [ 0.01390955]
 [ 0.01390955]
 [ 0.01390952]
 [ 0.01390955]
 [ 0.01390955]
 [ 0.01390955]
 [ 0.01390955]
 [ 0.01390955]
 [ 0.01390955]
 [ 0.01390955]
 [ 0.01390957]
 [ 0.01390963]
 [ 0.01390954]
 [ 0.01390956]
 [ 0.01390739]
 [ 0.0139084 ]
 [ 0.01390955]
 [ 0.01390955]
 [ 0.01390955]
 [ 0.01390955]
 [ 0.01390955]
 

In [16]:
# Test Regularized Logistic Regression using gd

w_reg_log_regression, loss_reg_log_regression = reg_logistic_regression(y_train, x_train_nonans, lambda_, initial_w, max_iters, gamma)

rmse_tr = np.sqrt(2 * loss_reg_log_regression)

print("Logistic regression: W: {w}, Loss:{loss}".format(w=w_reg_log_regression, loss=loss_reg_log_regression))
print("RMSE train: {rmse_tr}".format(rmse_tr=rmse_tr))

Logistic regression: W: [[ 0.02525463]
 [ 0.02525462]
 [ 0.01627935]
 [ 0.02525462]
 [ 0.02525461]
 [ 0.02525183]
 [ 0.02525308]
 [-2.77286642]
 [-2.77286642]
 [ 0.02525463]
 [ 0.02525463]
 [ 0.02525463]
 [ 0.02525463]
 [ 0.02525463]
 [ 0.02525463]
 [ 0.02525463]
 [ 0.02525463]
 [ 0.02525463]
 [ 0.02525463]
 [ 0.02525463]
 [ 0.02525463]
 [ 0.02525463]
 [ 0.02525462]
 [ 0.02525463]
 [ 0.02525452]
 [ 0.02525454]
 [ 0.02525454]
 [ 0.02525463]
 [ 0.02525463]
 [ 0.02525463]
 [ 0.02525463]
 [ 0.02525462]
 [ 0.02525463]
 [ 0.02525463]
 [ 0.02525463]
 [ 0.02525463]
 [ 0.02525463]
 [ 0.02525463]
 [ 0.02525463]
 [ 0.02525463]
 [ 0.02525463]
 [ 0.02525463]
 [ 0.02525463]
 [ 0.02525463]
 [ 0.02525463]
 [ 0.02525462]
 [ 0.02525455]
 [ 0.02525463]
 [ 0.02525462]
 [ 0.02525462]
 [ 0.02525463]
 [ 0.02525463]
 [ 0.02525463]
 [ 0.02525463]
 [ 0.02525463]
 [ 0.02525457]
 [ 0.0252546 ]
 [ 0.02525463]
 [ 0.02525321]
 [ 0.02525346]
 [ 0.02525463]
 [ 0.02525463]
 [ 0.02525463]
 [ 0.02525463]
 [ 0.02525463]
 

In [22]:
# Create submission

from src.data.build_data import build_test_data
from src.model.predictions import Models, create_submission

create_submission(
    x_test=x_test,
    w=w_log_regression,
    removed_cols=removed_cols,
    model=Models.LOGISTIC,
    filename="submission.csv"
)

(221,)


In [23]:
# Quick check on positive/negative predictions

import src.constants as c

pred = np.genfromtxt(
    os.path.join(c.MODELS_PATH, "sub.csv"), delimiter=",", skip_header=1
)
pred = pred[:, 1]

print("Positive predictions: ", np.sum(pred == 1))
print("Negative predictions: ", np.sum(pred == -1))

Positive predictions:  109379
Negative predictions:  0
