from implementations import *
from helpers import *
import matplotlib.pyplot as plt
import pandas as pd

# Load the training set; N is the number of samples, D the number of features.
y, tx, ids = load_csv_data("../data/train.csv", True)
N, D = tx.shape

# Plain least-squares fit (w and loss are overwritten by the ridge sweep below).
(w, loss) = least_squares(y, tx)

# np.logspace takes *exponents*: this spans lambda = 1e-10 ... 1e10.
# (Passing 1e-10 and 1e10 directly would ask for 10**1e10, which overflows.)
lambdas = np.logspace(-10, 10)
weights = []
losses = []

# Ridge regression on the full training set for every lambda.
# An explicit loop is kept so that lambda_, w and loss stay bound at module
# level afterwards, as they were in the original cell.
for lambda_ in lambdas:
  (w, loss) = ridge_regression(y, tx, lambda_)
  weights.append(w)
  losses.append(loss)

# Loss as a function of lambda, with a logarithmic x axis.
plt.semilogx(lambdas, losses)

# # Boxplots for each feature

# +

# One horizontal boxplot per feature, split by label, ignoring the -999 sentinel.
for i in range(D):
  a_all = tx[:, i]
  is_valid = a_all != -999
  a, y_a = a_all[is_valid], y[is_valid]

  per_label = (a[y_a == -1], a[y_a == 1])

  fig, ax = plt.subplots()
  ax.set_title(f'Feature {i}')
  ax.boxplot(per_label, labels=('-1', '1'), vert=False)
# -

# # Some scatter plots of a few features
# There are 30 features so `29*30/2 = 435` pairs. We start with those that
# have `NaN`s so that we can decide what to do with them.

# All data points
df = pd.read_csv("../data/train.csv")
# -999 is the "missing value" sentinel; turn it into a real NaN.
df.replace(-999, np.nan, inplace=True)
df

# Pairwise Pearson correlations.  Only numeric columns are correlated:
# recent pandas versions raise on non-numeric columns instead of silently
# dropping them, so the selection is made explicit here.
pearson = df.select_dtypes(include="number").corr().to_numpy()
# Flattened list of coefficients above 0.9.  This includes the diagonal,
# since every column correlates perfectly with itself.
pearson[pearson > 0.9]

def _project(index, lst):
  """Return a lazy iterator over item `index` of every tuple in `lst`.

  `lst` is copied into a list immediately, so a one-shot iterator passed
  as `lst` is consumed at call time, not when the result is iterated.
  """
  return map(lambda tup: tup[index], list(lst))


def fst(lst):
  """Iterate over the first item of each tuple in `lst`."""
  return _project(0, lst)


def snd(lst):
  """Iterate over the second item of each tuple in `lst`."""
  return _project(1, lst)


def thd(lst):
  """Iterate over the third item of each tuple in `lst`."""
  return _project(2, lst)

# Which features have NaNs, and how many?
# Each entry is (column name, feature index, NaN count); the first two
# columns of df are skipped.
nan_list = [ (col, i, df[col].isna().sum()) for i, col in enumerate(df.columns[2:]) ]

# Plain lists rather than filter() objects: a filter object can be iterated
# only once, which left the inner loop below empty after the first outer
# iteration and made the later list(nan_features) calls return [].
nan_features = [ tup for tup in nan_list if tup[-1] != 0 ]
notnan_features = [ tup for tup in nan_list if tup[-1] == 0 ]
nan_list

# Scatter every complete feature against every feature that has NaNs.
for nan_feature in fst(nan_features):
  for notnan_feature in fst(notnan_features):
    fig, ax = plt.subplots()

    ax.set_title('{} against {}'.format(notnan_feature, nan_feature))
    ax.scatter(df[nan_feature], df[notnan_feature])

# Names of the features that contain NaNs.
list(fst(nan_features))

list(nan_features)

df

# - Examine data with regularised least squares SGD
# (which hyperparameters give the best results?)
# - Try to change learning rate and maximum degree of polynomial
# - Get rid of all features that have nans
# - Try to write a function that tests all different hyperparameters
# at each run of the cross-validation (rather than doing
# a CV for each param) -> is that feasible?

# +
# 1. Get rid of NaNs
# -

# A feature column is kept only if none of its entries equals the -999 sentinel.
no_nan_mask = np.logical_not((tx == -999).any(axis=0))
tx_no_nans = tx[:, no_nan_mask]
tx_no_nans.shape

# +
# 2. Write function to get best regularisation parameter
# -

from cross_validation import build_k_indices

# In [4]:
from project1 import *

# In [None]:

def lambda_eval_cv(y, tx, lambdas, K, max_iters, gamma, batch_size, seed):
  """Do K-fold cross-validation of regularised logistic regression (SGD) for each value in lambdas.

  Inputs:
  y : np array
    N x 1
  tx : np array
    N x D
  lambdas : sequence
    Regularisation parameters to evaluate (must support len())
  K : int
    Number of folds
  max_iters : int
    Maximum number of iterations for SGD
  gamma : float
    Learning rate for SGD
  batch_size : int
    Size of mini-batches
  seed : int
    Seed for pseudo-random number generation

  Outputs:
  w_best : list of np arrays
    For each lambda, the weights that gave the lowest validation loss
    over all folds (the all-ones initial weights if no fold improved on inf)
  training_errors : list of lists
    training_errors[i][k] is the training loss returned by SGD for
    lambdas[i] on fold k
  validation_errors : list of lists
    validation_errors[i][k] is the regularised logistic loss on the
    held-out fold k for lambdas[i]
  """
  y, tx = prepare_dimensions(y, tx)
  n_lambdas = len(lambdas)

  # Every SGD run starts from the same all-ones weight vector.
  initial_w = np.ones((tx.shape[1], 1))
  w_best = [ initial_w for _ in range(n_lambdas) ]

  k_indices = build_k_indices(y, K, seed)

  training_errors = [ [] for _ in range(n_lambdas) ]
  validation_errors = [ [] for _ in range(n_lambdas) ]
  min_error = [ np.inf for _ in range(n_lambdas) ]

  for k in range(K):
    # Training rows: every fold except the k-th.  axis=0 is essential here:
    # without it np.delete flattens k_indices and removes one single index,
    # so the held-out fold would leak into the training set.
    # (assumes k_indices holds one row of sample indices per fold)
    train_idx = np.delete(k_indices, k, axis=0).flatten()
    # Validation rows: the k-th fold.
    test_idx = k_indices[k]

    tx_train, y_train = tx[train_idx], y[train_idx]
    tx_test, y_test = tx[test_idx], y[test_idx]

    for i, lambda_ in enumerate(lambdas):
      # Train
      w, loss_tr = SGD(y_train, tx_train, initial_w, max_iters, gamma, "REGULARIZED_LOGISTIC_REGRESSION", batch_size, lambda_)
      # Test
      loss_te = compute_regularized_logistic_loss(y_test, tx_test, w, lambda_)

      training_errors[i].append(loss_tr)
      validation_errors[i].append(loss_te)

      # Keep the weights that give the lowest loss_te
      if loss_te < min_error[i]:
        min_error[i] = loss_te
        w_best[i] = w

  return w_best, training_errors, validation_errors

# In [None]:
# Five regularisation strengths, log-spaced from 1e-6 to 1e-1.
lambdas = np.logspace(-6, -1, num=5)

# 4-fold cross-validation on the features that contain no -999 entries.
w_best, training_errors, validation_errors = lambda_eval_cv(
  y, tx_no_nans, lambdas, K=4, max_iters=1000, gamma=0.05, batch_size=1, seed=1
)

# In [None]:
# Train
# Scratch cell: one regularised logistic regression fit on the full data set.
# NOTE(review): initial_w, max_iters, gamma and batch_size are not assigned at
# module level in the visible part of this file (they only appear as function
# parameters/locals) — unless a star import provides them, this cell fails
# with NameError.  lambda_ is whatever value an earlier loop left behind.
w, loss_tr = SGD(y, tx, initial_w, max_iters, gamma, "REGULARIZED_LOGISTIC_REGRESSION", batch_size, lambda_)
# Test
# NOTE(review): y_test and tx_test are likewise only bound inside the
# cross-validation functions — confirm where they are meant to come from.
loss_te = compute_regularized_logistic_loss(y_test, tx_test, w, lambda_)

# In [146]:
training_errors

# (pasted notebook cell output, kept for reference)
# array([1., 0., 0., ..., 1., 0., 1.])

# In [None]:
# I want a function from which tests can be carried out;
# I want it to do cross-validation, say, for different values of
# lambda and degree and gamma, and a given algorithm
# Syntax: eval(degree = 3, lambdas = np.logspace(-10, 1, 10), algo = "REGULARIZED_LOGISTIC_REGRESSION")

# In [None]:
from project1 import tX_train_drop_invalid, tX_train_replace_invalid

# Named variants of the training features, keyed by the preprocessing applied.
data_sets = {
  # Built from tX_train_drop_invalid
  "DROP_NANS": tX_train_drop_invalid,
  # Built from tX_train_replace_invalid
  "NAN_AVGS": tX_train_replace_invalid,
  # Placeholder: no data set is attached to this key yet
  "CATEGORIES": None,
}

# In [None]:

# def cv_eval(**kwargs):
def cv_eval(
  data_set   = "DROP_NANS",
  algo       = "REGULARIZED_LOGISTIC_REGRESSION",
  degree     = 1,
  degrees    = None,
  lambda_    = 1e-10,
  lambdas    = None,
  gamma      = 0.1,
  gammas     = None,
  K          = 4,
  max_iters  = 100,
  batch_size = 1,
  seed       = 1
):
  """Train and test models by K-fold cross-validation over regularisation parameters.

  Currently only the sweep over lambdas is implemented; it delegates to
  lambda_eval_cv on the module-level y and tx.

  Inputs:
  lambda_ : float
    Single regularisation parameter, used when lambdas is None
  lambdas : sequence or None
    Regularisation parameters to evaluate
  gamma : float
    Learning rate for SGD
  K : int
    Number of folds
  max_iters : int
    Maximum number of iterations for SGD
  batch_size : int
    Size of mini-batches
  seed : int
    Seed for pseudo-random number generation
  data_set, algo, degree, degrees, gammas :
    Accepted but not used yet.
    NOTE(review): data_set is presumably meant to select an entry of
    data_sets instead of the global tx — confirm before wiring it in.

  Outputs:
  w_best, training_errors, validation_errors :
    Exactly what lambda_eval_cv returns.
  """
  # `if lambdas:` raised ValueError for numpy arrays with several elements,
  # and left the results unbound when lambdas was None or empty.  Test for
  # None explicitly and fall back to the single lambda_ value.
  if lambdas is None:
    lambdas = [lambda_]

  return lambda_eval_cv(y, tx, lambdas, K, max_iters, gamma, batch_size, seed)

# In [175]:

def lambda_gamma_eval_cv(y, tx, lambdas, gammas, K, max_iters, batch_size, seed):
  """Do K-fold cross-validation of regularised logistic regression (SGD) for each (lambda, gamma) pair.

  Inputs:
  y : np array
    N x 1
  tx : np array
    N x D
  lambdas : sequence
    Regularisation parameters for cost function
  gammas : sequence
    Learning rates for SGD
  K : int
    Number of folds
  max_iters : int
    Maximum number of iterations for SGD
  batch_size : int
    Size of mini-batches
  seed : int
    Seed for pseudo-random number generation

  Outputs:
  w_best : np array
    (D, len(lambdas), len(gammas))
    Trained weights that produced the smallest validation loss
    over all folds, for each lambda and gamma
  training_errors : np array
    (K, len(lambdas), len(gammas))
    Training loss for each fold, for each lambda and gamma
  validation_errors : np array
    (K, len(lambdas), len(gammas))
    Validation loss for each fold, for each lambda and gamma
  """
  y, tx = prepare_dimensions(y, tx)
  len_lambdas = len(lambdas)
  len_gammas = len(gammas)

  # Every SGD run starts from the same all-ones weight vector.
  initial_w = np.ones((tx.shape[1], 1))
  w_best = np.zeros((tx.shape[1], len_lambdas, len_gammas))

  training_errors = np.zeros((K, len_lambdas, len_gammas))
  validation_errors = np.zeros((K, len_lambdas, len_gammas))
  min_error = np.full((len_lambdas, len_gammas), np.inf)

  k_indices = build_k_indices(y, K, seed)

  for k in range(K):
    # Training rows: every fold except the k-th.  axis=0 is essential here:
    # without it np.delete flattens k_indices and removes one single index,
    # so the held-out fold would leak into the training set.
    # (assumes k_indices holds one row of sample indices per fold)
    train_idx = np.delete(k_indices, k, axis=0).flatten()
    # Validation rows: the k-th fold.
    test_idx = k_indices[k]

    tx_train, y_train = tx[train_idx], y[train_idx]
    tx_test, y_test = tx[test_idx], y[test_idx]

    for i, lambda_ in enumerate(lambdas):
      for j, gamma in enumerate(gammas):
        # Train
        w, loss_tr = SGD(y_train, tx_train, initial_w, max_iters, gamma, "REGULARIZED_LOGISTIC_REGRESSION", batch_size, lambda_)
        # Test
        loss_te = compute_regularized_logistic_loss(y_test, tx_test, w, lambda_)

        training_errors[k, i, j] = loss_tr
        validation_errors[k, i, j] = loss_te

        # Keep the weights that give the lowest loss_te
        if loss_te < min_error[i, j]:
          min_error[i, j] = loss_te
          w_best[:, i, j] = w.ravel()

  return w_best, training_errors, validation_errors

# In [176]:
# Five regularisation strengths and five learning rates, each log-spaced
# from 1e-6 to 1e-1.
lambdas = np.logspace(-6, -1, num=5)
gammas = np.logspace(-6, -1, num=5)

# 4-fold cross-validation over the full lambda x gamma grid.
w_best, training_errors, validation_errors = lambda_gamma_eval_cv(
  y, tx_no_nans, lambdas, gammas, K=4, max_iters=1000, batch_size=1, seed=1
)

# In [179]:
def lambda_gamma_sgd_cv(algorithm, lambdas, gammas, K, max_iters, batch_size, seed):
  """Do K-fold cross-validation with SGD for each (lambda, gamma) pair.

  The data are read from the module-level y and tx.

  Inputs:
  algorithm : string
    The algorithm to use for training
    Can take any value in { "LEAST_SQUARE" , "LOGISTIC_REGRESSION", "REGULARIZED_LOGISTIC_REGRESSION"}
  lambdas : sequence
    Regularisation parameters for cost function
  gammas : sequence
    Learning rates for SGD
  K : int
    Number of folds
  max_iters : int
    Maximum number of iterations for SGD
  batch_size : int
    Size of mini-batches
  seed : int
    Seed for pseudo-random number generation

  Outputs:
  w_best : np array
    (D, len(lambdas), len(gammas))
    Trained weights that produced the smallest validation error
    over all folds, for each lambda and gamma
  training_errors : np array
    (K, len(lambdas), len(gammas))
    Training loss for each fold, for each lambda and gamma
  validation_errors : np array
    (K, len(lambdas), len(gammas))
    Validation (MSE) loss for each fold, for each lambda and gamma
  """
  # NOTE(review): the looked-up loss is not used below — validation is always
  # scored with compute_mse_loss.  The lookup is kept so that an unknown
  # algorithm name still fails here (presumably loss_kinds is a mapping).
  loss = loss_kinds[algorithm]

  # The prepared arrays get their own local names.  Assigning to `y`/`tx`
  # would make them function locals and the read of the module-level data
  # on the right-hand side would raise UnboundLocalError.
  labels, features = prepare_dimensions(y, tx)

  len_lambdas = len(lambdas)
  len_gammas = len(gammas)

  # Every SGD run starts from the same all-ones weight vector.
  initial_w = np.ones((features.shape[1], 1))
  w_best = np.zeros((features.shape[1], len_lambdas, len_gammas))

  training_errors = np.zeros((K, len_lambdas, len_gammas))
  validation_errors = np.zeros((K, len_lambdas, len_gammas))
  min_error = np.full((len_lambdas, len_gammas), np.inf)

  k_indices = build_k_indices(labels, K, seed)

  for k in range(K):
    # Training rows: every fold except the k-th.  axis=0 is essential here:
    # without it np.delete flattens k_indices and removes one single index,
    # so the held-out fold would leak into the training set.
    # (assumes k_indices holds one row of sample indices per fold)
    train_idx = np.delete(k_indices, k, axis=0).flatten()
    # Validation rows: the k-th fold.
    test_idx = k_indices[k]

    tx_train, y_train = features[train_idx], labels[train_idx]
    tx_test, y_test = features[test_idx], labels[test_idx]

    for i, lambda_ in enumerate(lambdas):
      for j, gamma in enumerate(gammas):
        # Train
        w, loss_tr = SGD(y_train, tx_train, initial_w, max_iters, gamma, algorithm, batch_size, lambda_)
        # Test
        loss_te = compute_mse_loss(y_test, tx_test, w)

        training_errors[k, i, j] = loss_tr
        validation_errors[k, i, j] = loss_te

        # Keep the weights that give the lowest loss_te
        if loss_te < min_error[i, j]:
          min_error[i, j] = loss_te
          w_best[:, i, j] = w.ravel()

  return w_best, training_errors, validation_errors

# (pasted notebook cell output, kept for reference)
# array([[624771.33037806, 152164.69076527,  19185.57181618,
#         277776.97650153, 364959.49006667],
#        [624771.33068091, 149144.65144178,  32124.23636751,
#         236981.89825162, 306403.34900747],
#        [624771.3360619 , 156035.01500659,  25314.26831659,
#         183019.36184067, 310246.89527931],
#        [624771.43174624, 143114.17188239,  46662.77706104,
#         230771.11501926, 280082.48721357],
#        [624773.12172302, 120286.0114853 , 104933.81866281,
#         211812.95327603, 289105.63468022]])

# In [None]:
def lambda_gamma_ridge_cv(lambdas, gammas, K, max_iters, batch_size, seed):
  """Do K-fold cross-validation with ridge regression for each value in lambdas.

  The data are read from the module-level y and tx.  Ridge regression is
  called with lambda only, so gamma, max_iters and batch_size do not affect
  the fit; the gamma axis of the outputs is kept so that the shapes match
  the SGD variants, and every gamma entry holds the same value.

  Inputs:
  lambdas : sequence
    Regularisation parameters for cost function
  gammas : sequence
    Only its length is used (size of the gamma axis of the outputs)
  K : int
    Number of folds
  max_iters : int
    Unused
  batch_size : int
    Unused
  seed : int
    Seed for pseudo-random number generation

  Outputs:
  w_best : np array
    (D, len(lambdas), len(gammas))
    Trained weights that produced the smallest validation error
    over all folds, for each lambda
  training_errors : np array
    (K, len(lambdas), len(gammas))
    Training loss for each fold, for each lambda
  validation_errors : np array
    (K, len(lambdas), len(gammas))
    Validation (MSE) loss for each fold, for each lambda
  """
  # The prepared arrays get their own local names.  Assigning to `y`/`tx`
  # would make them function locals and the read of the module-level data
  # on the right-hand side would raise UnboundLocalError.
  labels, features = prepare_dimensions(y, tx)

  len_lambdas = len(lambdas)
  len_gammas = len(gammas)

  w_best = np.zeros((features.shape[1], len_lambdas, len_gammas))

  training_errors = np.zeros((K, len_lambdas, len_gammas))
  validation_errors = np.zeros((K, len_lambdas, len_gammas))
  min_error = np.full((len_lambdas, len_gammas), np.inf)

  k_indices = build_k_indices(labels, K, seed)

  for k in range(K):
    # Training rows: every fold except the k-th.  axis=0 is essential here:
    # without it np.delete flattens k_indices and removes one single index,
    # so the held-out fold would leak into the training set.
    # (assumes k_indices holds one row of sample indices per fold)
    train_idx = np.delete(k_indices, k, axis=0).flatten()
    # Validation rows: the k-th fold.
    test_idx = k_indices[k]

    tx_train, y_train = features[train_idx], labels[train_idx]
    tx_test, y_test = features[test_idx], labels[test_idx]

    for i, lambda_ in enumerate(lambdas):
      # Train on the training fold only (not on the full data set, which
      # would include the validation rows).  The fit does not depend on
      # gamma, so it is computed once per lambda.
      w, loss_tr = ridge_regression(y_train, tx_train, lambda_)
      # Test
      loss_te = compute_mse_loss(y_test, tx_test, w)

      for j in range(len_gammas):
        training_errors[k, i, j] = loss_tr
        validation_errors[k, i, j] = loss_te

        # Keep the weights that give the lowest loss_te
        if loss_te < min_error[i, j]:
          min_error[i, j] = loss_te
          w_best[:, i, j] = w.ravel()

  return w_best, training_errors, validation_errors

# In [None]:
# Work on a copy so that tX_train itself keeps its -999 sentinels.
tX_train_replace_invalid = tX_train.copy()
tX_train_replace_invalid[tX_train_replace_invalid == -999.] = np.nan

# Replace NaNs by the column average
# (assumes a 2-D float array with one column per feature — TODO confirm)
# nanmean ignores the NaNs when averaging; a column made up entirely of NaNs
# stays NaN.
column_means = np.nanmean(tX_train_replace_invalid, axis=0)
nan_rows, nan_cols = np.where(np.isnan(tX_train_replace_invalid))
tX_train_replace_invalid[nan_rows, nan_cols] = column_means[nan_cols]

# In [None]:
def lambda_gamma_degree_sgd_cv(algorithm, lambdas, gammas, degrees, K, max_iters, batch_size, seed=1):
  """Do K-fold cross-validation with SGD for each polynomial degree and each (lambda, gamma) pair.

  The data are read from the module-level y and tx.

  Inputs:
  algorithm : string
    The algorithm to use for training
    Can take any value in { "LEAST_SQUARE" , "LOGISTIC_REGRESSION", "REGULARIZED_LOGISTIC_REGRESSION"}
  lambdas : sequence
    Regularisation parameters for cost function
  gammas : sequence
    Learning rates for SGD
  degrees : sequence
    Highest polynomial degree of each expansion to evaluate
  K : int
    Number of folds
  max_iters : int
    Maximum number of iterations for SGD
  batch_size : int
    Size of mini-batches
  seed : int
    Seed for pseudo-random number generation (default 1)

  Outputs:
  w_best : np array
    (P, len(degrees), len(lambdas), len(gammas))
    Trained weights that produced the smallest validation error over all
    folds, for each degree, lambda and gamma.  P is the largest number of
    columns among tx and its polynomial expansions; weight vectors of
    narrower expansions are zero-padded at the end.
  training_errors : np array
    (K, len(degrees), len(lambdas), len(gammas))
    Training loss for each fold, for each degree, for each lambda and gamma
  validation_errors : np array
    (K, len(degrees), len(lambdas), len(gammas))
    Validation (MSE) loss for each fold, for each degree, for each lambda and gamma
  """
  # NOTE(review): the looked-up loss is not used below — validation is always
  # scored with compute_mse_loss.  The lookup is kept so that an unknown
  # algorithm name still fails here (presumably loss_kinds is a mapping).
  loss = loss_kinds[algorithm]

  # The prepared arrays get their own local names.  Assigning to `y`/`tx`
  # would make them function locals and the read of the module-level data
  # on the right-hand side would raise UnboundLocalError.
  labels, features = prepare_dimensions(y, tx)

  len_degrees = len(degrees)
  len_lambdas = len(lambdas)
  len_gammas = len(gammas)

  # Grown below whenever a polynomial expansion has more columns than this.
  w_best = np.zeros((features.shape[1], len_degrees, len_lambdas, len_gammas))

  training_errors = np.zeros((K, len_degrees, len_lambdas, len_gammas))
  validation_errors = np.zeros((K, len_degrees, len_lambdas, len_gammas))
  min_error = np.full((len_degrees, len_lambdas, len_gammas), np.inf)

  # The same folds are used for every degree.
  k_indices = build_k_indices(labels, K, seed)

  for d, degree in enumerate(degrees):
    # TODO: replace NaNs by the mean and normalise before/after expanding.
    tx_poly = build_poly(features, degree)
    # assumes build_poly returns a 2-D array with one row per sample — TODO confirm
    n_features = tx_poly.shape[1]

    # The starting weights must match the width of the expanded features,
    # not the width of the raw tx.
    initial_w = np.ones((n_features, 1))

    if n_features > w_best.shape[0]:
      padding = np.zeros((n_features - w_best.shape[0],) + w_best.shape[1:])
      w_best = np.concatenate((w_best, padding), axis=0)

    for k in range(K):
      # Training rows: every fold except the k-th.  axis=0 is essential here:
      # without it np.delete flattens k_indices and removes one single index,
      # so the held-out fold would leak into the training set.
      # (assumes k_indices holds one row of sample indices per fold)
      train_idx = np.delete(k_indices, k, axis=0).flatten()
      # Validation rows: the k-th fold.
      test_idx = k_indices[k]

      tx_train, y_train = tx_poly[train_idx], labels[train_idx]
      tx_test, y_test = tx_poly[test_idx], labels[test_idx]

      for i, lambda_ in enumerate(lambdas):
        for j, gamma in enumerate(gammas):
          # Train
          w, loss_tr = SGD(y_train, tx_train, initial_w, max_iters, gamma, algorithm, batch_size, lambda_)
          # Test
          loss_te = compute_mse_loss(y_test, tx_test, w)

          training_errors[k, d, i, j] = loss_tr
          validation_errors[k, d, i, j] = loss_te

          # Keep the weights that give the lowest loss_te
          if loss_te < min_error[d, i, j]:
            min_error[d, i, j] = loss_te
            w_best[:n_features, d, i, j] = w.ravel()

  return w_best, training_errors, validation_errors