In [1]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2

In [18]:
from implementations import *
from cross_validation import *
from losses import *
from helpers import *
from poly_exp import lambda_gamma_degree_sgd_cv

In [30]:
DATA_TRAIN_PATH = '../data/train.csv'
y, tX, ids = load_csv_data(DATA_TRAIN_PATH, sub_sample=False)
indices = np.random.permutation(len(y))
train_percentage = 0.8
cutoff_idx = int(train_percentage*len(y))
y_train = y[:cutoff_idx]
tX_train = tX[:cutoff_idx]
y_test = y[cutoff_idx:]
tX_test = tX[cutoff_idx:]

In [32]:
tX_train.shape

(200000, 30)

In [31]:
y_train_0 =y_train.copy()
y_train_0[y_train_0 == -1] = 0

## First make a dataset where the NaNs are replaced by the mean
tX_train_replace_invalid = tX_train.copy()

# Make the -999 into NaNs
tX_train_replace_invalid[tX_train_replace_invalid == -999.] = np.nan
# Replace NaNs by the column average
for i in range(tX_train.shape[1]):
  a = tX_train_replace_invalid[:, i]
  mean = np.average(a[ ~np.isnan(a) ])
  a[ np.isnan(a) ] = mean
# Now, tX_train_replace_invalid is the way we need it.

## Now make a dataset where the columns with NaNs are dropped
tX_train_drop_invalid = tX_train.copy()
tX_train_drop_invalid = tX_train_drop_invalid[:, ~np.any(tX_train_drop_invalid == -999., axis=0)]
# Now, tX_train_drop_invalid is the way we need it.

In [33]:
tX_train_drop_invalid.shape

(200000, 19)

In [86]:
def lambda_gamma_degree_sgd_cv_apo(y, tx, algorithm, lambdas, gammas, degrees, K, max_iters, batch_size):
  """Do K-fold cross-validation for each value in lambdas and gammas and each degree of polynomial expansion, at every iteration.
  
  Inputs:
  ========
  data_set : string
    Which dataset to work with
    Can take any value in { "NO_NANS", "REPLACE_NANS" }
  algorithm : string
    The algorithm to use for training
    Can take any value in { "LEAST_SQUARE" , "LOGISTIC_REGRESSION", "REGULARIZED_LOGISTIC_REGRESSION"}
  lambdas : iterable
    Regularisation parameters for cost function
  gammas : iterable
    Learning rates for SGD
  degrees : int
    The polynomial degree
  K : int
    Number of folds
  max_iters : int
    Maxium number of iterations for SGD
  batch_size : int
    Size of mini-batches
  seed : int
    Seed for pseudo-random number generation
  
  Outputs:
  w_best : np array
    (D, len(lambdas), len(lambdas))
    Trained weights that produced the smallest validation error
    over all folds, for each lambda and gamma
  training_errors : np array
    (K, len(lambdas), len(lambdas))
    Training loss for each fold, for each degree, for each lambda and gamma
  validation_errors : np array
    (K, len(lambdas), len(lambdas))
    Validation loss for each fold, for each degree, for each lambda and gamma
  """
  y, tx = prepare_dimensions(y, tx)
  

  N = len(y)
  len_degrees = len(degrees)
  len_lambdas = len(lambdas)
  len_gammas = len(gammas)

  training_errors = np.zeros((K, len_degrees, len_lambdas, len_gammas))
  validation_errors = np.zeros((K, len_degrees, len_lambdas, len_gammas))

  k_indices = build_k_indices(y, K, seed)

  for d, degree in enumerate(degrees):
    print("Degree = {}".format(degree))
    tx_poly = build_poly(tx, degree)
    tx_poly, *_ = standardize(tx_poly)
    initial_w = np.ones((tx_poly.shape[1], 1))
    print(tx_poly.shape)
    for k in range(K):
      print("Fold = {}".format(k+1))
      # Take all but the k-th row of tx and y
      tx_train, y_train = map(lambda a: a[np.delete(k_indices, k).flatten()], (tx_poly, y))
      # Take the k-th row of tx and y
      tx_test, y_test = map(lambda a: a[k_indices[k]], (tx_poly, y))

      for i, lambda_ in enumerate(lambdas):
        for j, gamma in enumerate(gammas):
          # Train
          w, loss_tr = SGD(y_train, tx_train, initial_w, max_iters, gamma, algorithm, batch_size, lambda_)
          # Test
          loss_te = compute_mse_loss(y_test, tx_test, w)

          training_errors[k, d, i, j] = loss_tr
          validation_errors[k, d, i, j] = loss_te

  return training_errors, validation_errors

In [119]:
def lambda_degree_ridge_cv_apo(y, tx, lambdas, degrees, K):
  """Do K-fold cross-validation for each value in lambdas and gammas and each degree of polynomial expansion, at every iteration.
  
  Inputs:
  ========
  data_set : string
    Which dataset to work with
    Can take any value in { "NO_NANS", "REPLACE_NANS" }
  algorithm : string
    The algorithm to use for training
    Can take any value in { "LEAST_SQUARE" , "LOGISTIC_REGRESSION", "REGULARIZED_LOGISTIC_REGRESSION"}
  lambdas : iterable
    Regularisation parameters for cost function
  gammas : iterable
    Learning rates for SGD
  degrees : int
    The polynomial degree
  K : int
    Number of folds
  max_iters : int
    Maxium number of iterations for SGD
  batch_size : int
    Size of mini-batches
  seed : int
    Seed for pseudo-random number generation
  
  Outputs:
  w_best : np array
    (D, len(lambdas), len(lambdas))
    Trained weights that produced the smallest validation error
    over all folds, for each lambda and gamma
  training_errors : np array
    (K, len(lambdas), len(lambdas))
    Training loss for each fold, for each degree, for each lambda and gamma
  validation_errors : np array
    (K, len(lambdas), len(lambdas))
    Validation loss for each fold, for each degree, for each lambda and gamma
  """
  y, tx = prepare_dimensions(y, tx)
  

  N = len(y)
  len_degrees = len(degrees)
  len_lambdas = len(lambdas)

  training_errors = np.zeros((K, len_degrees, len_lambdas))
  validation_errors = np.zeros((K, len_degrees, len_lambdas))

  k_indices = build_k_indices(y, K, seed)

  for d, degree in enumerate(degrees):
    print("Degree = {}".format(degree))
    tx_poly = build_poly(tx, degree)
    tx_poly, *_ = standardize(tx_poly)
    initial_w = np.ones((tx_poly.shape[1], 1))
    print(tx_poly.shape)
    for k in range(K):
      print("Fold = {}".format(k+1))
      # Take all but the k-th row of tx and y
      tx_train, y_train = map(lambda a: a[np.delete(k_indices, k).flatten()], (tx_poly, y))
      # Take the k-th row of tx and y
      tx_test, y_test = map(lambda a: a[k_indices[k]], (tx_poly, y))

      for i, lambda_ in enumerate(lambdas):
        # Train
        w, loss_tr = ridge_regression(y_train, tx_train, lambda_)
        # Test
        loss_te = compute_mse_loss(y_test, tx_test, w)

        training_errors[k, d, i] = loss_tr
        validation_errors[k, d, i] = loss_te

  return training_errors, validation_errors

# Least Squares

In [90]:
gammas = np.logspace(-8, -1, 8)
lambdas = np.array([0])
degrees = np.array([1,2,3,4])
K=4
max_iters=1000
batch_size =1
training_errors, validation_errors =\
lambda_gamma_degree_sgd_cv_apo(y_train_0, tX_train_drop_invalid, "LEAST_SQUARE", lambdas, gammas, degrees, K, max_iters, batch_size)

val_error_deg_lambda_gamma = np.mean(validation_errors, axis=0)
np.nanargmin(val_error_deg_lambda_gamma)
idx_min = np.unravel_index(np.nanargmin(val_error_deg_lambda_gamma), val_error_deg_lambda_gamma.shape)
print(idx_min)
print(val_error_deg_lambda_gamma[idx_min])

Degree = 1
(200000, 20)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
Degree = 2
(200000, 39)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
Degree = 3
(200000, 58)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
Degree = 4
(200000, 77)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
(0, 0, 6)
0.12061365972229865


In [92]:
gammas = np.logspace(-8, -1, 8)
lambdas = np.array([0])
degrees = np.array([1,2,3,4])
K=4
max_iters=1000
batch_size =1
training_errors, validation_errors =\
lambda_gamma_degree_sgd_cv_apo(y_train_0, tX_train_replace_invalid, "LEAST_SQUARE", lambdas, gammas, degrees, K, max_iters, batch_size)

val_error_deg_lambda_gamma = np.mean(validation_errors, axis=0)
np.nanargmin(val_error_deg_lambda_gamma)
idx_min = np.unravel_index(np.nanargmin(val_error_deg_lambda_gamma), val_error_deg_lambda_gamma.shape)
print(idx_min)
print(val_error_deg_lambda_gamma[idx_min])

Degree = 1
(200000, 31)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
Degree = 2
(200000, 61)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
Degree = 3
(200000, 91)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
Degree = 4
(200000, 121)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
(0, 0, 6)
0.33882490920110714


# Logistic regression

In [93]:
gammas = np.logspace(-8, -1, 8)
lambdas = np.array([0])
degrees = np.array([1,2,3,4])
K=4
max_iters=1000
batch_size =1
training_errors, validation_errors =\
lambda_gamma_degree_sgd_cv_apo(y_train_0, tX_train_drop_invalid, "LOGISTIC_REGRESSION", lambdas, gammas, degrees, K, max_iters, batch_size)

val_error_deg_lambda_gamma = np.mean(validation_errors, axis=0)
np.nanargmin(val_error_deg_lambda_gamma)
idx_min = np.unravel_index(np.nanargmin(val_error_deg_lambda_gamma), val_error_deg_lambda_gamma.shape)
print(idx_min)
print(val_error_deg_lambda_gamma[idx_min])

Degree = 1
(200000, 20)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
Degree = 2
(200000, 39)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
Degree = 3
(200000, 58)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
Degree = 4
(200000, 77)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
(0, 0, 6)
1.360089820235805


In [94]:
gammas = np.logspace(-8, -1, 8)
lambdas = np.array([0])
degrees = np.array([1,2,3,4])
K=4
max_iters=1000
batch_size =1
training_errors, validation_errors =\
lambda_gamma_degree_sgd_cv_apo(y_train_0, tX_train_replace_invalid, "LOGISTIC_REGRESSION", lambdas, gammas, degrees, K, max_iters, batch_size)

val_error_deg_lambda_gamma = np.mean(validation_errors, axis=0)
np.nanargmin(val_error_deg_lambda_gamma)
idx_min = np.unravel_index(np.nanargmin(val_error_deg_lambda_gamma), val_error_deg_lambda_gamma.shape)
print(idx_min)
print(val_error_deg_lambda_gamma[idx_min])

Degree = 1
(200000, 31)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
Degree = 2
(200000, 61)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
Degree = 3
(200000, 91)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
Degree = 4
(200000, 121)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
(0, 0, 6)
3.403233738372165


# Regularized logistic regression

In [95]:
gammas = np.logspace(-8, -1, 8)
lambdas = np.logspace(-8, -1, 8)
degrees = np.array([1,2,3,4])
K=4
max_iters=1000
batch_size =1
training_errors, validation_errors =\
lambda_gamma_degree_sgd_cv_apo(y_train_0, tX_train_drop_invalid, "REGULARIZED_LOGISTIC_REGRESSION", lambdas, gammas, degrees, K, max_iters, batch_size)

val_error_deg_lambda_gamma = np.mean(validation_errors, axis=0)
np.nanargmin(val_error_deg_lambda_gamma)
idx_min = np.unravel_index(np.nanargmin(val_error_deg_lambda_gamma), val_error_deg_lambda_gamma.shape)
print(idx_min)
print(val_error_deg_lambda_gamma[idx_min])

Degree = 1
(200000, 20)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
Degree = 2
(200000, 39)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
Degree = 3
(200000, 58)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
Degree = 4
(200000, 77)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
(0, 7, 6)
0.5345946151629031


In [99]:
gammas = np.logspace(-8, -1, 8)
lambdas = np.logspace(-8, -1, 8)
degrees = np.array([1,2,3,4])
K=4
max_iters=1000
batch_size =1
training_errors, validation_errors =\
lambda_gamma_degree_sgd_cv_apo(y_train_0, tX_train_replace_invalid, "REGULARIZED_LOGISTIC_REGRESSION", lambdas, gammas, degrees, K, max_iters, batch_size)

val_error_deg_lambda_gamma = np.mean(validation_errors, axis=0)
np.nanargmin(val_error_deg_lambda_gamma)
idx_min = np.unravel_index(np.nanargmin(val_error_deg_lambda_gamma), val_error_deg_lambda_gamma.shape)
print(idx_min)
print(val_error_deg_lambda_gamma[idx_min])

Degree = 1
(200000, 31)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
Degree = 2
(200000, 61)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
Degree = 3
(200000, 91)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
Degree = 4
(200000, 121)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
(0, 7, 6)
0.7078951059809448


# Least squares closed form

In [108]:
lambdas = np.array([0])
degrees = np.array([1,2,3,4, 5, 6, 7, 8, 9, 10, 11, 12, 13])
K=4
max_iters=1000
batch_size =1
training_errors, validation_errors =\
lambda_degree_ridge_cv_apo(y_train_0, tX_train_drop_invalid, lambdas,degrees, K)

val_error_deg_lambda_gamma = np.mean(validation_errors, axis=0)
np.nanargmin(val_error_deg_lambda_gamma)
idx_min = np.unravel_index(np.nanargmin(val_error_deg_lambda_gamma), val_error_deg_lambda_gamma.shape)
print(idx_min)
print(val_error_deg_lambda_gamma[idx_min])

Degree = 1
(200000, 20)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
Degree = 2
(200000, 39)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
Degree = 3
(200000, 58)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
Degree = 4
(200000, 77)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
Degree = 5
(200000, 96)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
Degree = 6
(200000, 115)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
Degree = 7
(200000, 134)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
Degree = 8
(200000, 153)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
Degree = 9
(200000, 172)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
Degree = 10
(200000, 191)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
Degree = 11
(200000, 210)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
Degree = 12
(200000, 229)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
Degree = 13
(200000, 248)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
(9, 0)
0.07594246565704504


In [109]:
val_error_deg_lambda_gamma

array([[0.08802954],
       [0.08331952],
       [0.08136525],
       [0.08038432],
       [0.07953894],
       [0.07909897],
       [0.0788376 ],
       [0.07850604],
       [0.07726183],
       [0.07594247],
       [0.07634854],
       [0.07667922],
       [0.07694727]])

In [110]:
lambdas = np.array([0])
degrees = np.array([1,2,3,4, 5, 6, 7, 8, 9, 10, 11, 12, 13])
K=4
max_iters=1000
batch_size =1
training_errors, validation_errors =\
lambda_degree_ridge_cv_apo(y_train_0, tX_train_replace_invalid, lambdas,degrees, K)

val_error_deg_lambda_gamma = np.mean(validation_errors, axis=0)
np.nanargmin(val_error_deg_lambda_gamma)
idx_min = np.unravel_index(np.nanargmin(val_error_deg_lambda_gamma), val_error_deg_lambda_gamma.shape)
print(idx_min)
print(val_error_deg_lambda_gamma[idx_min])

Degree = 1
(200000, 31)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
Degree = 2
(200000, 61)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
Degree = 3
(200000, 91)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
Degree = 4
(200000, 121)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
Degree = 5
(200000, 151)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
Degree = 6
(200000, 181)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
Degree = 7
(200000, 211)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
Degree = 8
(200000, 241)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
Degree = 9
(200000, 271)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
Degree = 10
(200000, 301)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
Degree = 11
(200000, 331)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
Degree = 12
(200000, 361)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
Degree = 13
(200000, 391)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
(10, 0)
0.0690634488451319


# Ridge regression

In [121]:
lambdas = np.logspace(-8, -1, 8)
degrees = np.array([1,2,3,4, 5, 6, 7, 8, 9, 10, 11, 12, 13])
K=4
max_iters=1000
batch_size =1
training_errors, validation_errors =\
lambda_degree_ridge_cv_apo(y_train_0, tX_train_drop_invalid, lambdas,degrees, K)

val_error_deg_lambda_gamma = np.mean(validation_errors, axis=0)
np.nanargmin(val_error_deg_lambda_gamma)
idx_min = np.unravel_index(np.nanargmin(val_error_deg_lambda_gamma), val_error_deg_lambda_gamma.shape)
print(idx_min)
print(val_error_deg_lambda_gamma[idx_min])

Degree = 1
(200000, 20)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
Degree = 2
(200000, 39)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
Degree = 3
(200000, 58)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
Degree = 4
(200000, 77)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
Degree = 5
(200000, 96)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
Degree = 6
(200000, 115)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
Degree = 7
(200000, 134)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
Degree = 8
(200000, 153)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
Degree = 9
(200000, 172)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
Degree = 10
(200000, 191)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
Degree = 11
(200000, 210)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
Degree = 12
(200000, 229)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
Degree = 13
(200000, 248)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
(12, 0)
0.07857525875072088


In [118]:
val_error_deg_lambda_gamma

array([[0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.09184094],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.08864611],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.08813942]])

In [122]:
lambdas = np.logspace(-8, -1, 8)
degrees = np.array([1,2,3,4, 5, 6, 7, 8, 9, 10, 11, 12, 13])
K=4
max_iters=1000
batch_size =1
training_errors, validation_errors =\
lambda_degree_ridge_cv_apo(y_train_0, tX_train_replace_invalid, lambdas,degrees, K)

val_error_deg_lambda_gamma = np.mean(validation_errors, axis=0)
np.nanargmin(val_error_deg_lambda_gamma)
idx_min = np.unravel_index(np.nanargmin(val_error_deg_lambda_gamma), val_error_deg_lambda_gamma.shape)
print(idx_min)
print(val_error_deg_lambda_gamma[idx_min])

Degree = 1
(200000, 31)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
Degree = 2
(200000, 61)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
Degree = 3
(200000, 91)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
Degree = 4
(200000, 121)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
Degree = 5
(200000, 151)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
Degree = 6
(200000, 181)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
Degree = 7
(200000, 211)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
Degree = 8
(200000, 241)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
Degree = 9
(200000, 271)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
Degree = 10
(200000, 301)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
Degree = 11
(200000, 331)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
Degree = 12
(200000, 361)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
Degree = 13
(200000, 391)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
(12, 0)
0.07108794367635787
