In [2]:
%load_ext autoreload
%autoreload 2

import numpy as np
from utils import *
from implementations import *
from helpers import *

In [3]:
# Load training data (2.23m)

x_train, x_test, y_train, train_ids, test_ids = load_csv_data("data/")

In [14]:
# Load test data
y_train = np.expand_dims(y_train, 1)
y_train = y_train.reshape((y_train.shape[0],1))

print("x_train shape: ", x_train.shape)
print("y_train shape: ", y_train.shape) 
print("x_test shape: ", x_test.shape)

x_train shape:  (328135, 320)
y_train shape:  (328135, 1)
x_test shape:  (109379, 320)


In [15]:
def replace_nan_mean(x: np.ndarray) -> np.ndarray:
    x = x.copy()
    for col in range(x.shape[1]):
        mean = np.nanmean(x[:, col])
        x[np.isnan(x[:,col]), col] = mean
    return x

def less_than_percent_nans(x: np.ndarray, percentage: int) -> np.ndarray:
    x = x.copy()
    nan_percentage_per_column = np.isnan(x).sum(0) / len(x)
    less_than_percent_nans_columns_mask = nan_percentage_per_column < (percentage / 100)
    return x[:, less_than_percent_nans_columns_mask]
    

In [16]:
x_train_nonans = replace_nan_mean(less_than_percent_nans(x_train, 90))

In [7]:
np.isnan(x_train_nonans).sum()

0

In [17]:
x_train_nonans = standardize(x_train_nonans)

In [9]:
# get onòy the first 3 columns
x_train_nonans = x_train_nonans[:, :3]

In [10]:
x_train_nonans.shape

(328135, 3)

In [11]:
np.isnan(x_train[:, 100]).sum()

replace_nan_mean(x_train)

np.isnan(x_train[:, 100]).sum()


186763

In [23]:
# PARAMETERS

max_iters = 20                            # max number of iterations 
threshold = 1e-8                            # threshold for stopping criterion
gamma = 0.4                 # step size
initial_w = np.zeros((x_train_nonans.shape[1], 1)) # initial weights

In [24]:
# Mean squared error gradient descent
w_mean_squared_error_gd, loss_mean_squared_error_gd = mean_squared_error_gd(y_train, x_train_nonans, initial_w, max_iters, gamma)

rmse_tr = np.sqrt(2 * loss_mean_squared_error_gd)
#rmse_te = np.sqrt(2 * compute_loss(y_test, x_test, w_mean_squared_error_gd))

print("Mean squared error gradient descent: W: {w}, Loss:{loss}".format(w=w_mean_squared_error_gd, loss=loss_mean_squared_error_gd))
#print("RMSE train: {rmse_tr}, RMSE test: {rmse_te}".format(rmse_tr=rmse_tr, rmse_te=rmse_te))


0.04415103539701647
30.094030883250454
231679.09678235275
1785971280.9470463
13767724471942.688
1.0613285844051994e+17
8.181587061617403e+20
6.3070351473047e+24
4.861977517290225e+28
3.7480091400376114e+32
2.889271384708802e+36
2.227286226525484e+40
1.7169740305894428e+44
1.3235837345958824e+48
1.0203263830876962e+52
7.865508624905023e+55
6.063376087682339e+59
4.6741452249215735e+63
3.603212676851863e+67
2.7776504515524736e+71
Mean squared error gradient descent: W: [[ 2.81580402e+34]
 [ 2.81580319e+34]
 [ 1.79956688e+34]
 [ 2.81580318e+34]
 [ 2.81580193e+34]
 [ 2.81549219e+34]
 [ 2.81563154e+34]
 [-3.09164897e+36]
 [-3.09164897e+36]
 [ 2.81580402e+34]
 [ 2.81580402e+34]
 [ 2.81580402e+34]
 [ 2.81580394e+34]
 [ 2.81580390e+34]
 [ 2.81580405e+34]
 [ 2.81580402e+34]
 [ 2.81580402e+34]
 [ 2.81580402e+34]
 [ 2.81580394e+34]
 [ 2.81580402e+34]
 [ 2.81580401e+34]
 [ 2.81580392e+34]
 [ 2.81580380e+34]
 [ 2.81580378e+34]
 [ 2.81579477e+34]
 [ 2.81579415e+34]
 [ 2.81579554e+34]
 [ 2.81580401e+3

In [None]:
# Test Least Squares Regression using Normal Equations

w_least_squares, loss_least_squares = least_squares(y_tr, x_tr)
rmse_tr = np.sqrt(2 * loss_least_squares)
rmse_te = np.sqrt(2 * compute_loss(y_te, x_te, w_least_squares))

print("Least squares: W: {w}, Loss:{loss}".format(w=w_least_squares, loss=loss_least_squares))
print("RMSE train: {rmse_tr}, RMSE test: {rmse_te}".format(rmse_tr=rmse_tr, rmse_te=rmse_te))

Least squares: W: [ 3.13356376e-02 -3.09741815e+00 -2.70522700e+00 -2.06801610e-01
 -1.33912480e+00  1.94835177e-01 -1.03232898e+01  1.48327058e+02
 -4.77128549e-02  4.80738057e+00 -9.37298630e+01  4.14169258e+01
  1.95969787e+01 -1.09387125e+00 -1.14289354e-01 -4.31650570e-01
  8.45753704e-01 -1.53106392e-01  3.67680263e-01  1.60072822e+00
  9.50537668e-02 -2.16964200e-01 -9.17137406e+01 -9.50814758e-02
  8.77034174e-02  1.48635485e-01 -3.04442023e-02 -2.85312478e+00
 -5.12725271e+00 -5.13349749e+00], Loss:0.33944681722221687
RMSE train: 0.8239500193849344, RMSE test: 205.03819387349836


In [None]:
# Test Ridge Regression using Normal Equations

w_ridge_regression, loss_ridge_regression = ridge_regression(y_tr, x_tr, lambda_)
rmse_tr = np.sqrt(2 * loss_ridge_regression)
rmse_te = np.sqrt(2 * compute_loss(y_te, x_te, w_ridge_regression))

print("Ridge regression: W: {w}, Loss:{loss}".format(w=w_ridge_regression, loss=loss_ridge_regression))
print("RMSE train: {rmse_tr}, RMSE test: {rmse_te}".format(rmse_tr=rmse_tr, rmse_te=rmse_te))

Ridge regression: W: [ 0.03285406 -0.01522637 -0.01105648 -0.00676256  0.0155015   0.02760007
  0.01531225 -0.00745091 -0.00890914 -0.01231396 -0.00753058 -0.00725283
  0.01547001 -0.00656988 -0.00737802 -0.00738148 -0.01009919 -0.00737697
 -0.00737479 -0.00953921 -0.00737048 -0.01489131 -0.00743624  0.01443978
  0.01488679  0.01488641  0.01422847  0.01545997  0.01545747 -0.01039903], Loss:0.4339121896746597
RMSE train: 0.9315709201930465, RMSE test: 132.4973086708635
