In [1]:
%load_ext autoreload
%autoreload 2

import numpy as np
from utils import *
from implementations import *
from helpers import *

In [41]:
# Load the data set from "data/". Going by the unpacked names, the helper returns
# the train and test feature matrices, the training labels and the row ids of
# both splits; no test labels are returned.
# NOTE(review): "(2.23m)" in the original comment presumably recorded this
# cell's run time (about 2.23 minutes) — confirm.

x_train, x_test, y_train, train_ids, test_ids = load_csv_data("data/")

In [42]:
# Turn the training labels into an (n_samples, 1) column vector and report the
# array shapes. A single reshape is enough, and it is a no-op on an array that
# is already (n_samples, 1), so re-running this cell is safe.
y_train = y_train.reshape((y_train.shape[0], 1))

print("x_train shape: ", x_train.shape)
print("y_train shape: ", y_train.shape)
print("x_test shape: ", x_test.shape)

x_train shape:  (328135, 320)
y_train shape:  (328135, 1)
x_test shape:  (109379, 320)


In [43]:
# Standardize the training features with the project helper `standardize`.
# NOTE(review): this cell overwrites its own input, so re-running it feeds
# already-transformed data back into the helper. x_test is not transformed in
# any visible cell. How `standardize` treats NaN entries is not visible here —
# confirm, since NaNs are only imputed in a later cell.
x_train = standardize(x_train)

In [75]:
def replace_nan_mean(x: np.ndarray) -> np.ndarray:
    """Return a copy of ``x`` in which every NaN is replaced by its column mean.

    The mean of a column is taken over its non-NaN entries only. A column made
    up entirely of NaNs has no defined mean; its entries are set to 0.0 so the
    result never contains NaN.

    Args:
        x: 2-D array of shape (n_samples, n_features). It is not modified.

    Returns:
        A new array with the same shape and dtype as ``x``.
    """
    filled = x.copy()
    nan_mask = np.isnan(filled)
    if not nan_mask.any():
        # Nothing to impute (always the case for integer input).
        return filled

    # Column means over the valid entries, computed for all columns at once.
    valid_counts = (~nan_mask).sum(axis=0)
    column_sums = np.nansum(filled, axis=0)
    column_means = np.zeros_like(column_sums)
    # `where` skips all-NaN columns: no 0/0 warning, and they keep the 0.0 fill.
    np.divide(column_sums, valid_counts, out=column_means, where=valid_counts > 0)

    # Broadcast each column's mean into that column's NaN positions only.
    np.copyto(filled, column_means, where=nan_mask)
    return filled

def less_than_percent_nans(x: np.ndarray, percentage: float) -> np.ndarray:
    """Keep only the columns whose share of NaN entries is strictly below ``percentage``.

    Args:
        x: 2-D array of shape (n_samples, n_features). It is not modified.
        percentage: Threshold on the 0-100 scale. A column whose NaN share
            equals the threshold exactly is dropped.

    Returns:
        A new array holding the retained columns in their original order.
    """
    # With zero rows there are no NaNs to count; dividing by 1 instead of 0
    # gives every column a NaN share of 0 rather than an undefined 0/0.
    n_rows = max(x.shape[0], 1)
    nan_share = np.isnan(x).sum(axis=0) / n_rows
    keep_column = nan_share < (percentage / 100)
    # Boolean-mask indexing already yields a fresh array, so no up-front copy
    # of the full input is needed.
    return x[:, keep_column]
    

In [84]:
# Drop every feature whose NaN share reaches MAX_NAN_PERCENT, then mean-impute
# the NaNs that remain in the surviving columns.
MAX_NAN_PERCENT = 90
x_train_nonans = replace_nan_mean(less_than_percent_nans(x_train, MAX_NAN_PERCENT))

In [87]:
# Sanity check: total number of NaN entries left after filtering and imputation.
np.isnan(x_train_nonans).sum()

0

In [48]:
# Count the NaNs in column 100 (not displayed: only a cell's last expression is shown).
np.isnan(x_train[:, 100]).sum()

# NOTE(review): the return value is discarded. replace_nan_mean works on a copy
# and returns it, so x_train itself is left untouched by this call.
replace_nan_mean(x_train)

# Same count again; it still reports the NaNs of the unmodified x_train.
np.isnan(x_train[:, 100]).sum()


186763

In [88]:
# PARAMETERS

max_iters = 20      # max number of iterations
threshold = 1e-8    # threshold for stopping criterion
gamma = 1e-3        # step size

# initial weights: one zero per column of x_train_nonans, as a column vector
initial_w = np.zeros(shape=(x_train_nonans.shape[1], 1))

In [89]:
# Mean squared error gradient descent
w_mean_squared_error_gd, loss_mean_squared_error_gd = mean_squared_error_gd(y_train, x_train_nonans, initial_w, max_iters, gamma)

# RMSE = sqrt(2 * loss) assumes the helper returns the 1/(2N)-scaled MSE — TODO confirm.
rmse_tr = np.sqrt(2 * loss_mean_squared_error_gd)

print("Mean squared error gradient descent: W: {w}, Loss:{loss}".format(w=w_mean_squared_error_gd, loss=loss_mean_squared_error_gd))
# Only the training RMSE can be reported: load_csv_data returns no test labels.
print("RMSE train: {rmse_tr}".format(rmse_tr=rmse_tr))

# Make a diverged run obvious instead of leaving inf/NaN buried in the weight dump.
if not (np.all(np.isfinite(loss_mean_squared_error_gd)) and np.all(np.isfinite(w_mean_squared_error_gd))):
    print("WARNING: gradient descent diverged (non-finite loss or weights); "
          "check feature scaling or lower gamma = {gamma}".format(gamma=gamma))


[[0.04415104]]
[[2.57087432e+29]]
[[1.69531403e+61]]
[[1.1179425e+93]]
[[7.37205856e+124]]
[[4.86136339e+156]]
[[3.20573336e+188]]
[[2.11395971e+220]]
[[1.39401041e+252]]
[[9.19253572e+283]]
[[inf]]
[[inf]]
[[inf]]
[[inf]]
[[inf]]
[[inf]]
[[inf]]
[[inf]]
[[inf]]
[[inf]]
Mean squared error gradient descent: W: [[-1.07543494e+299]
 [            -inf]
 [-1.08489838e+299]
 [-2.45095000e+299]
 [            -inf]
 [            -inf]
 [            -inf]
 [            -inf]
 [-1.69074928e+298]
 [-1.69103597e+298]
 [-1.69074928e+298]
 [-2.61158578e+298]
 [-3.03301256e+298]
 [-1.35525461e+298]
 [-1.67711464e+298]
 [-1.69074928e+298]
 [-1.69074928e+298]
 [-2.54377639e+298]
 [-1.70047550e+298]
 [-1.80389757e+298]
 [-2.76002206e+298]
 [-4.11391854e+298]
 [-4.34862207e+298]
 [            -inf]
 [            -inf]
 [            -inf]
 [-1.85982687e+298]
 [-2.35691913e+298]
 [-3.23925017e+298]
 [-2.66082265e+298]
 [-3.74018807e+298]
 [-1.98251040e+298]
 [-2.09637780e+298]
 [-2.45097822e+298]
 [-2.7542

In [66]:
# Test Least Squares Regression using Normal Equations
# NOTE(review): y_tr, x_tr, y_te and x_te are not assigned in any visible cell;
# they presumably come from a train/test split made elsewhere — confirm before
# re-running on a fresh kernel.

w_least_squares, loss_least_squares = least_squares(y_tr, x_tr)
# RMSE = sqrt(2 * loss) assumes both losses are the 1/(2N)-scaled MSE — TODO confirm.
rmse_tr = np.sqrt(2 * loss_least_squares)
rmse_te = np.sqrt(2 * compute_loss(y_te, x_te, w_least_squares))

print("Least squares: W: {w}, Loss:{loss}".format(w=w_least_squares, loss=loss_least_squares))
print("RMSE train: {rmse_tr}, RMSE test: {rmse_te}".format(rmse_tr=rmse_tr, rmse_te=rmse_te))

Least squares: W: [ 3.13356376e-02 -3.09741815e+00 -2.70522700e+00 -2.06801610e-01
 -1.33912480e+00  1.94835177e-01 -1.03232898e+01  1.48327058e+02
 -4.77128549e-02  4.80738057e+00 -9.37298630e+01  4.14169258e+01
  1.95969787e+01 -1.09387125e+00 -1.14289354e-01 -4.31650570e-01
  8.45753704e-01 -1.53106392e-01  3.67680263e-01  1.60072822e+00
  9.50537668e-02 -2.16964200e-01 -9.17137406e+01 -9.50814758e-02
  8.77034174e-02  1.48635485e-01 -3.04442023e-02 -2.85312478e+00
 -5.12725271e+00 -5.13349749e+00], Loss:0.33944681722221687
RMSE train: 0.8239500193849344, RMSE test: 205.03819387349836


In [67]:
# Test Ridge Regression using Normal Equations
# NOTE(review): lambda_, y_tr, x_tr, y_te and x_te are not assigned in any
# visible cell; presumably they are defined elsewhere — confirm before
# re-running on a fresh kernel.

w_ridge_regression, loss_ridge_regression = ridge_regression(y_tr, x_tr, lambda_)
# RMSE = sqrt(2 * loss) assumes both losses are the 1/(2N)-scaled MSE — TODO confirm.
rmse_tr = np.sqrt(2 * loss_ridge_regression)
rmse_te = np.sqrt(2 * compute_loss(y_te, x_te, w_ridge_regression))

print("Ridge regression: W: {w}, Loss:{loss}".format(w=w_ridge_regression, loss=loss_ridge_regression))
print("RMSE train: {rmse_tr}, RMSE test: {rmse_te}".format(rmse_tr=rmse_tr, rmse_te=rmse_te))

Ridge regression: W: [ 0.03285406 -0.01522637 -0.01105648 -0.00676256  0.0155015   0.02760007
  0.01531225 -0.00745091 -0.00890914 -0.01231396 -0.00753058 -0.00725283
  0.01547001 -0.00656988 -0.00737802 -0.00738148 -0.01009919 -0.00737697
 -0.00737479 -0.00953921 -0.00737048 -0.01489131 -0.00743624  0.01443978
  0.01488679  0.01488641  0.01422847  0.01545997  0.01545747 -0.01039903], Loss:0.4339121896746597
RMSE train: 0.9315709201930465, RMSE test: 132.4973086708635
