In [1]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2

In [2]:
from implementations import *
from cross_validation import *
from losses import *
from helpers import *

In [3]:
# Experiment-wide settings read by the cells below
constants = {
    "max_iters": 1000,              # iteration budget handed to the SGD-based training
    "lambda": 1,
    "gamma": 1,
    "degree_expansion": 3,          # degree passed to build_poly
    "K": 4,                         # number of cross-validation folds
    "batch_size": 1,                # mini-batch size handed to SGD
    "dataset": "REPLACE_INVALID",   # preprocessing mode: "DROP_INVALID" or "REPLACE_INVALID"
    "algorithm": "REGULARIZED_LOGISTIC_REGRESSION",
}

# Fix the global NumPy seed so that repeated runs draw the same random numbers
np.random.seed(1)

## Load the data

In [4]:
# Path to the training CSV, relative to the notebook's working directory
DATA_TRAIN_PATH = '../data/train.csv'
# Unpacks the three values returned by load_csv_data; y_train holds labels in {-1, 1}
# (relabelled in the preprocessing cell). NOTE(review): sub_sample=False presumably
# loads every row - confirm against load_csv_data.
y_train, tX_train, ids_train = load_csv_data(DATA_TRAIN_PATH, sub_sample=False)

## Data preprocessing

In [5]:
### Map the labels from {-1, 1} onto {0, 1} (done in place)
y_train[y_train == -1] = 0

### Handle the -999 placeholders, then standardize every feature
# Any value of constants["dataset"] other than the two below leaves tX_train untouched.
if constants["dataset"] == "DROP_INVALID":
    # a. Keep only the columns in which no entry equals -999
    tX_train = tX_train[:, np.all(tX_train != -999., axis=0)]

    tX_train_means = np.mean(tX_train, axis=0)
    tX_train_stds = np.std(tX_train, axis=0)
    tX_train = (tX_train - tX_train_means) / tX_train_stds
elif constants["dataset"] == "REPLACE_INVALID":
    # b. Mark every -999 as NaN so it is ignored by the column statistics ...
    tX_train[tX_train == -999.] = np.nan

    tX_train_means = np.nanmean(tX_train, axis=0)
    tX_train_stds = np.nanstd(tX_train, axis=0)
    tX_train = (tX_train - tX_train_means) / tX_train_stds

    # ... then set the NaNs to 0, which is the column mean after standardization
    tX_train[np.isnan(tX_train)] = 0

### Polynomial feature expansion up to the configured degree
tX_train = build_poly(tX_train, constants["degree_expansion"])

In [6]:
## Model training

In [12]:
def lambda_gamma_sgd_cv(y, tx, algorithm, lambdas, gammas, K, max_iters, batch_size):
  """Grid-search lambdas x gammas for SGD training using K-fold cross-validation.

  For every fold k, each (lambda, gamma) pair is trained with SGD on the
  remaining K-1 folds, starting from an all-ones weight vector, and is then
  scored on fold k with compute_mse_loss.

  Inputs:
  y : np array
    (N, 1) or (N,)
  tx : np array
    (N, D)
  algorithm : string
    The algorithm to use for training (forwarded to SGD)
    Can take any value in { "LEAST_SQUARE" , "LOGISTIC_REGRESSION", "REGULARIZED_LOGISTIC_REGRESSION"}
  lambdas : sequence
    Regularisation parameters for cost function
  gammas : sequence
    Learning rates for SGD
  K : int
    Number of folds
  max_iters : int
    Maximum number of iterations for SGD
  batch_size : int
    Size of mini-batches

  Outputs:
  w_best : np array
    (D, len(lambdas), len(gammas))
    For each (lambda, gamma), the trained weights of the fold that produced
    the smallest validation error
  training_errors : np array
    (K, len(lambdas), len(gammas))
    Training loss returned by SGD for each fold, for each lambda and gamma
  validation_errors : np array
    (K, len(lambdas), len(gammas))
    compute_mse_loss on the held-out fold, for each fold, lambda and gamma
  """
  y, tx = prepare_dimensions(y, tx)

  n_features = tx.shape[1]
  grid_shape = (len(lambdas), len(gammas))

  initial_w = np.ones((n_features, 1))
  w_best = np.zeros((n_features,) + grid_shape)

  training_errors = np.zeros((K,) + grid_shape)
  validation_errors = np.zeros((K,) + grid_shape)
  min_error = np.full(grid_shape, np.inf)

  # Assumes build_k_indices returns one row of sample indices per fold - TODO confirm
  k_indices = build_k_indices(y, K)

  for k in range(K):
    # axis=0 removes the whole k-th row (fold). Without it np.delete flattens
    # the array and drops a single sample, so the held-out fold would leak
    # into the training set.
    train_idx = np.delete(k_indices, k, axis=0).flatten()
    test_idx = k_indices[k]

    tx_tr, y_tr = tx[train_idx], y[train_idx]
    tx_te, y_te = tx[test_idx], y[test_idx]

    for i, lambda_ in enumerate(lambdas):
      for j, gamma in enumerate(gammas):
        # Train
        w, loss_tr = SGD(y_tr, tx_tr, initial_w, max_iters, gamma, algorithm, batch_size, lambda_)
        # Test: the validation score is always the MSE, whatever `algorithm` is
        loss_te = compute_mse_loss(y_te, tx_te, w)

        training_errors[k, i, j] = loss_tr
        validation_errors[k, i, j] = loss_te

        # Keep the weights that give the lowest loss_te
        if loss_te < min_error[i, j]:
          min_error[i, j] = loss_te
          w_best[:, i, j] = w.ravel()

  return w_best, training_errors, validation_errors
# -
def lambda_gamma_ridge_cv(y, tx, lambdas, K, max_iters, batch_size, seed):
  """Do K-fold cross-validation with ridge regression for each value in lambdas.

  For every fold k, ridge_regression is fitted on the remaining K-1 folds for
  each lambda and is scored on fold k with compute_mse_loss.

  Inputs:
  y : np array
    (N, 1) or (N,)
  tx : np array
    (N, D)
  lambdas : sequence
    Regularisation parameters for cost function
  K : int
    Number of folds
  max_iters : int
    Accepted but not used by this function
  batch_size : int
    Accepted but not used by this function
  seed : any
    Accepted but not used by this function

  Outputs:
  w_best : np array
    (D, len(lambdas))
    For each lambda, the weights of the fold that produced the smallest
    validation error
  training_errors : np array
    (K, len(lambdas))
    Training loss returned by ridge_regression for each fold, for each lambda
  validation_errors : np array
    (K, len(lambdas))
    compute_mse_loss on the held-out fold, for each fold and lambda
  """
  y, tx = prepare_dimensions(y, tx)

  n_lambdas = len(lambdas)

  w_best = np.zeros((tx.shape[1], n_lambdas))

  training_errors = np.zeros((K, n_lambdas))
  validation_errors = np.zeros((K, n_lambdas))
  min_error = np.full(n_lambdas, np.inf)

  # Assumes build_k_indices returns one row of sample indices per fold - TODO confirm
  k_indices = build_k_indices(y, K)

  for k in range(K):
    # axis=0 removes the whole k-th row (fold). Without it np.delete flattens
    # the array and drops a single sample only.
    train_idx = np.delete(k_indices, k, axis=0).flatten()
    test_idx = k_indices[k]

    tx_tr, y_tr = tx[train_idx], y[train_idx]
    tx_te, y_te = tx[test_idx], y[test_idx]

    for i, lambda_ in enumerate(lambdas):
      # Train on the K-1 training folds only, so that fold k stays unseen
      w, loss_tr = ridge_regression(y_tr, tx_tr, lambda_)
      # Test
      loss_te = compute_mse_loss(y_te, tx_te, w)

      training_errors[k, i] = loss_tr
      validation_errors[k, i] = loss_te

      # Keep the weights that give the lowest loss_te
      if loss_te < min_error[i]:
        min_error[i] = loss_te
        w_best[:, i] = w.ravel()

  return w_best, training_errors, validation_errors

In [14]:
# Hyper-parameter grid: 10 regularisation strengths and 5 learning rates, log-spaced
lambdas = np.logspace(-10, -1, 10)
gammas = np.logspace(-5, 1, 5)

# Alternative: cross-validate ridge regression over the lambdas only
# w_best, training_errors, validation_errors = lambda_gamma_ridge_cv(
#     y_train, tX_train, lambdas, constants["K"], constants["max_iters"], constants["batch_size"], seed=1)

# The algorithm is read from the configuration cell instead of being repeated here
w_best, training_errors, validation_errors = lambda_gamma_sgd_cv(
    y_train,
    tX_train,
    constants["algorithm"],
    lambdas,
    gammas,
    constants["K"],
    constants["max_iters"],
    constants["batch_size"],
)

In [15]:
# Run a convergence test
# Short tag per algorithm name; convergence_test uses it to build the log file name
algo_dict = dict(
  REGULARIZED_LOGISTIC_REGRESSION = "reg_logist",
  LEAST_SQUARE = "least_sq",
  LOGISTIC_REGRESSION = "logist",
)

def convergence_test(algorithm, y, tx, K, max_iters, lambda_, gamma):
  """Train once with SGD on a single train/validation split and log the error curves.

  The data is split with build_k_indices; fold 0 is held out for validation
  and the remaining folds are used for training. SGD is run in verbose mode
  with the validation data, the errors it records are divided by the number
  of samples in the corresponding split, and the curves are written to a text
  file under '../../latex-example-paper/figures/' (np.savetxt does not create
  the directory).

  Reads the module-level `constants` ("batch_size", "dataset") and `algo_dict`.

  Inputs:
  algorithm : string
    Key of algo_dict; forwarded to SGD as loss_kind
  y : np array
    Targets, indexed by row
  tx : np array
    (N, D)
  K : int
    Number of folds used to build the split
  max_iters : int
    Maximum number of iterations for SGD
  lambda_ : float
    Regularisation parameter
  gamma : float
    Learning rate

  Outputs:
  w, loss :
    First two values returned by SGD
  training_errors, validation_errors : list
    Errors recorded by SGD, each divided by the size of its split
  """
  k_indices = build_k_indices(y, K)
  k = 0  # always validate on the first fold

  # axis=0 removes the whole k-th row (fold). Without it np.delete flattens
  # the array and drops a single sample only.
  train_idx = np.delete(k_indices, k, axis=0).flatten()
  test_idx = k_indices[k]

  # Index the `tx` argument (the previous code read an undefined global `tX`)
  tx_train, y_train_ = tx[train_idx], y[train_idx]
  tx_test, y_test = tx[test_idx], y[test_idx]

  w, loss, training_errors, validation_errors = SGD(
    y_train_,
    tx_train,
    initial_w = np.ones(tx.shape[1]),
    max_iters = max_iters,
    gamma = gamma,
    loss_kind = algorithm,
    batch_size = constants["batch_size"],
    lambda_ = lambda_,
    verbose = True,
    validation_y = y_test,
    validation_tx = tx_test)

  # Normalise each recorded error by the number of samples in its split
  training_errors = [err / len(y_train_) for err in training_errors]
  validation_errors = [err / len(y_test) for err in validation_errors]

  # Save to file
  # The first entry is skipped: when epoch is 0 LaTeX complains
  a = np.column_stack((np.arange(1, max_iters), training_errors[1:], validation_errors[1:]))
  np.savetxt('../../latex-example-paper/figures/conv_{}_dataset{}.txt'.format(algo_dict[algorithm], constants["dataset"]),
             a,
             header = 'epoch tr_loss val_loss\n# gamma = {}, lambda = {}'.format(gamma, lambda_),
             comments = '')

  return w, loss, training_errors, validation_errors


## Generate predictions and save output in csv format for submission:

In [None]:
# Path to the test CSV, relative to the notebook's working directory
DATA_TEST_PATH = '../data/test.csv'
# The load is currently disabled; re-enable it before running the prediction cells below,
# which refer to y_test, tX_test and ids_test.
# y_test, tX_test, ids_test = load_csv_data(DATA_TEST_PATH, sub_sample=False)

Relabel the output y from {-1,1} to {0,1}

In [None]:
# Disabled together with the test-set load above: y_test is not defined until
# load_csv_data(DATA_TEST_PATH, ...) is re-enabled, so running this line on a
# fresh kernel raised a NameError. Re-enable both lines together.
# y_test[y_test == -1] = 0

In [None]:
OUTPUT_PATH = '' # TODO: fill in desired name of output file for submission
# NOTE(review): `weights` is not defined in the cells shown here (the cross-validation
# cell produces `w_best`); choose the weight vector to submit before enabling these lines.
# NOTE(review): tX_test presumably needs the same preprocessing and feature expansion
# as tX_train before predicting - confirm.
# y_pred = predict_labels(weights, tX_test)
# create_csv_submission(ids_test, y_pred, OUTPUT_PATH)