In [78]:
%load_ext autoreload
%autoreload 2

import numpy as np
from utils import *
from implementations import *
from helpers import load_csv_data

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [95]:
# Load training data

y_tr, x_tr, ids_tr = load_csv_data("data/train.csv")

In [80]:
# Load test data

y_te, x_te, ids_te = load_csv_data("data/test.csv")

In [None]:
print(x_tr)

In [100]:
# Print only "PRI_jet_num" column
# We can see that the range of values is 0,1,2,3
print(np.unique(x_tr[:,-8]))

[0. 1. 2. 3.]


In [112]:
# When "PRI_jet_num" is 0, then all the jet columns are -999
# Verify that this is the case

print(np.unique(x_tr[x_tr[:,-8] == 0][:, -7:]))

[-999.    0.]


In [115]:
# When "PRI_jet_num" is 1, then the lsubleading jet columns are -999
# Verify that this is the case

print(np.unique(x_tr[x_tr[:,-8] == 1][:, -4:-1]))

[-999.]


In [110]:
# The last column "PRI_jet_all_pt" is the sum of all the "jet" columns ???
# Verify that this is true


193.66
146.97699999999998
-46.68300000000002


In [84]:
x_tr = standardize(x_tr)

In [85]:
print(x_tr)

[[ 0.96922337  0.36758382  0.68924419 ...  0.36814539  0.35945282
   0.6308109 ]
 [ 0.96922337  0.36758382  0.7418138  ... -1.97227484 -1.97227484
   0.47340627]
 [ 0.96922337  0.36758382 -1.97227484 ... -1.97227484 -1.97227484
   0.46878505]
 ...
 [ 0.96922337  0.36758382  0.61199844 ... -1.97227484 -1.97227484
   0.46349931]
 [ 0.96922337  0.36758382  0.58741589 ... -1.97227484 -1.97227484
   0.36524397]
 [ 0.96922337  0.36758382 -1.97227484 ... -1.97227484 -1.97227484
   0.36524397]]


In [64]:
# PARAMETERS

initial_w = np.zeros(x_tr.shape[1])   # initial weights
max_iters = 100                       # max number of iterations
gamma = 0.1                           # step size 
lambda_ = 2                           # regularization parameter


In [68]:
# Test Linear Regression with Gradient Descent

w_mean_squared_error_gd, loss_mean_squared_error_gd = mean_squared_error_gd(y_tr, x_tr, initial_w, max_iters, gamma)

# Fit of the model tested using RMSE, TODO: try R2 statistic
rmse_tr = np.sqrt(2 * loss_mean_squared_error_gd)
rmse_te = np.sqrt(2 * compute_loss(y_te, x_te, w_mean_squared_error_gd))

print("Mean squared error gradient descent: W: {w}, Loss:{loss}".format(w=w_mean_squared_error_gd, loss=loss_mean_squared_error_gd))
print("RMSE train: {rmse_tr}, RMSE test: {rmse_te}".format(rmse_tr=rmse_tr, rmse_te=rmse_te))


Mean squared error gradient descent: W: [ 1.30015261e+09  4.33387880e+09  4.85694746e+09  3.87300290e+09
 -1.64765853e+10 -1.69977021e+10 -1.64716554e+10  3.35069512e+09
  3.57345601e+09  5.12029269e+09  3.32888687e+09  3.29247363e+09
 -1.64741946e+10  3.96961299e+09  3.30196337e+09  3.30202850e+09
  4.12814541e+09  3.30169481e+09  3.30309479e+09  3.93840048e+09
  3.30193414e+09  6.08447174e+09  3.30778391e+09 -8.44268746e+09
 -8.83863667e+09 -8.83874473e+09 -1.65297807e+10 -1.64736646e+10
 -1.64736767e+10  3.62708267e+09], Loss:1.6373367723157755e+22
RMSE train: 180960590865.29175, RMSE test: 112611664608973.78


In [66]:
# Test Least Squares Regression using Normal Equations

w_least_squares, loss_least_squares = least_squares(y_tr, x_tr)
rmse_tr = np.sqrt(2 * loss_least_squares)
rmse_te = np.sqrt(2 * compute_loss(y_te, x_te, w_least_squares))

print("Least squares: W: {w}, Loss:{loss}".format(w=w_least_squares, loss=loss_least_squares))
print("RMSE train: {rmse_tr}, RMSE test: {rmse_te}".format(rmse_tr=rmse_tr, rmse_te=rmse_te))

Least squares: W: [ 3.13356376e-02 -3.09741815e+00 -2.70522700e+00 -2.06801610e-01
 -1.33912480e+00  1.94835177e-01 -1.03232898e+01  1.48327058e+02
 -4.77128549e-02  4.80738057e+00 -9.37298630e+01  4.14169258e+01
  1.95969787e+01 -1.09387125e+00 -1.14289354e-01 -4.31650570e-01
  8.45753704e-01 -1.53106392e-01  3.67680263e-01  1.60072822e+00
  9.50537668e-02 -2.16964200e-01 -9.17137406e+01 -9.50814758e-02
  8.77034174e-02  1.48635485e-01 -3.04442023e-02 -2.85312478e+00
 -5.12725271e+00 -5.13349749e+00], Loss:0.33944681722221687
RMSE train: 0.8239500193849344, RMSE test: 205.03819387349836


In [67]:
# Test Ridge Regression using Normal Equations

w_ridge_regression, loss_ridge_regression = ridge_regression(y_tr, x_tr, lambda_)
rmse_tr = np.sqrt(2 * loss_ridge_regression)
rmse_te = np.sqrt(2 * compute_loss(y_te, x_te, w_ridge_regression))

print("Ridge regression: W: {w}, Loss:{loss}".format(w=w_ridge_regression, loss=loss_ridge_regression))
print("RMSE train: {rmse_tr}, RMSE test: {rmse_te}".format(rmse_tr=rmse_tr, rmse_te=rmse_te))

Ridge regression: W: [ 0.03285406 -0.01522637 -0.01105648 -0.00676256  0.0155015   0.02760007
  0.01531225 -0.00745091 -0.00890914 -0.01231396 -0.00753058 -0.00725283
  0.01547001 -0.00656988 -0.00737802 -0.00738148 -0.01009919 -0.00737697
 -0.00737479 -0.00953921 -0.00737048 -0.01489131 -0.00743624  0.01443978
  0.01488679  0.01488641  0.01422847  0.01545997  0.01545747 -0.01039903], Loss:0.4339121896746597
RMSE train: 0.9315709201930465, RMSE test: 132.4973086708635
