## 0. Imports

In [None]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
from implementations import * 
from proj1_helpers import *
%load_ext autoreload
%autoreload 2

## 1. Load data

In [None]:
# Load the training data set only.
data_path = "../dataset/train.csv"
# NOTE(review): the second argument (5000) presumably limits or sub-samples the loaded rows -- confirm in the
# definition of load_csv_data
y, x_raw, col_labels = load_csv_data(data_path, 5000)

# NOTE(review): presumably balances the classes found in y -- confirm in the definition of equalize_predictions
y,x_raw = equalize_predictions(y,x_raw)
# last expression of the cell: displays the shapes of y and x_raw
y.shape, x_raw.shape

## 2. Replace invalid entries with NaN

In [None]:
# Toy example: show how invalid entries are replaced by NaN and then substituted
k = np.array([
    [1,     0, -999],
    [7,     2,   -5],
    [-7,    0,    5],
    [-999, -1,    5],
    [-7,    2, -999]
])

print(k)
print()

# fill: every entry equal to one of nan_values is considered invalid
nan_values = [-999, 0]
k = fill_with_nan_list(k, nan_values)
print(k)
print()

# ->>>> NB: on the real data it is probably better to substitute after the standardization (see step 4) <<<<-

# meaningful substitution values can be generated with:
# - np.nanmean(x, axis=0)   (mean of each column, ignoring nan values)
# - np.nanmedian(x, axis=0) (median of each column, ignoring nan values)
# - np.nanstd(x, axis=0)    (standard deviation of each column, ignoring nan values)
# here arbitrary values are used (presumably one substitution value per column)
k = sustitute_nans(k, substitutions=[-100, -200, -300]) 
# e.g. x = sustitute_nans(x, substitutions=np.nanmean(x, axis=0)) 
print(k)

In [None]:
# On the real data both 0 and -999 are treated as invalid entries (same values as in the toy example above).
x = fill_with_nan_list(x_raw, nan_values=[0, -999])
# last expression of the cell: displays the shape of x
x.shape

## 3. Standardize (the np.nan entries are not taken into account)

In [None]:
# Standardize x and keep the two extra values returned by standardize.
# NOTE(review): mean_x and std_x are presumably the per-column statistics used for the standardization -- confirm
# in the definition of standardize
x, mean_x, std_x = standardize(x)
# last expression of the cell: displays the shape of x
x.shape

## 4. Either drop the NaN rows/columns (drop_nan_rows/columns) or substitute the NaN values with something meaningful

In [None]:
# See drop_nan_rows/columns and sustitute_nans in implementations.py for the available options.
# np.nanmean(x, axis=0) should be an array of 0s (the columns are standardized), so here the nan entries
# are replaced by the mean of their column.
x = sustitute_nans(x, substitutions=np.nanmean(x, axis=0)) 
# last expression of the cell: displays the shape of x
x.shape

## 5. Train and test models

### Ridge regression with cross validation

In [None]:
# Demo only: show how the function ratios_visualization works, using made-up ratios

# each model is a matrix with one row per degree and one column per lambda
model1 = np.array([
    [0.65, 0.9], # relative to degree_list[0]
    [0.7, 0.8],  # relative to degree_list[1]
    [0.7, 0.7]]) # relative to degree_list[2]
model2 =  np.array([
    [0.7, 0.6],  # relative to degree_list[0]
    [0.9, 1],    # relative to degree_list[1]
    [0.8, 0.8]]) # relative to degree_list[2]
degree_list = np.array([2, 4, 7])
lambdas = np.array([0.1, 0.5]) # x axis
 
# three models are passed: model1, model2 and model1 shifted down by 0.1
# NOTE(review): presumably the figure is also saved under the given name -- confirm in ratios_visualization
ratios_visualization([model1, model2, model1-0.1], degree_list, lambdas, x_label="x label", log_axis_x=False, 
                     save_figure_with_name="test_figure")

In [None]:
def cross_validation_ridge_regression(x, y, num_K_sets, degree_list, lambdas, seed=2):
    """Evaluate ridge regression over a grid of degrees and lambdas with K-fold cross validation.

    For every degree in degree_list the features are expanded with build_poly; for every lambda the
    model is trained on each of the num_K_sets train/test splits and the value returned by
    compute_loss with CostFunction.SUCCESS_RATIO is averaged over the splits.

    Args:
        x: input data, passed to build_poly.
        y: targets, used to build the fold indices and to train/evaluate the model.
        num_K_sets: number of sets into which the dataset is split to run the cross validation.
        degree_list: list of degrees to be tried.
        lambdas: list of lambdas to be tried.
        seed: seed forwarded to build_k_indices. Defaults to 2, the value that was previously
            hard-coded, so existing calls behave as before.

    Returns:
        A tuple (ratio_tr, ratio_te) of numpy arrays with one row per degree and one column per
        lambda: the averaged success ratio on the training sets and on the test sets.
    """
    # split the indices in k sets (done once: the split does not depend on degree or lambda)
    k_indices = build_k_indices(y, num_K_sets, seed)

    ratio_tr = []  # matrix of success ratios obtained with the training sets
    ratio_te = []  # matrix of success ratios obtained with the test sets

    for degree in degree_list:  # one row (figure) per degree
        tx = build_poly(x, degree)

        row_tr = []
        row_te = []

        for lambda_ in lambdas:  # one point of the figure per lambda
            sum_tr = 0
            sum_te = 0

            for k_curr in range(num_K_sets):
                train, test = get_kth_set(y, tx, k_indices, k_curr)

                # train the model; only this line should change to try another training method
                # (e.g. a gradient descent using lambda_ as gamma)
                _, w = ridge_regression(train.y, train.tx, lambda_)

                # compute how good the model is on both parts of the split
                sum_tr += compute_loss(train.y, train.tx, w, costfunc=CostFunction.SUCCESS_RATIO)
                sum_te += compute_loss(test.y, test.tx, w, costfunc=CostFunction.SUCCESS_RATIO)

            # average the ratios obtained with the cross validation
            row_tr.append(sum_tr / num_K_sets)
            row_te.append(sum_te / num_K_sets)

        ratio_tr.append(row_tr)
        ratio_te.append(row_te)

    return np.array(ratio_tr), np.array(ratio_te)

In [None]:
# Run the cross validation on a small grid.
# NOTE: the lambda of the ridge regression does not affect the model much
nlambdas = 3
lambdas = np.linspace(0, 1000, nlambdas)

num_K_sets = 5
degree_list = [2, 3, 4]

# ratio_tr / ratio_te: one row per degree, one column per lambda
ratio_tr, ratio_te = cross_validation_ridge_regression(x, y, num_K_sets, degree_list, lambdas)

In [None]:
# There is A LOT of data => there is no overfitting => the training error is close to the test error => it does
# not make sense to do the cross validation (just use the whole set to train the model and compute the
# SUCCESS_RATIO on the training data)
cross_validation_visualization(ratio_tr, ratio_te, degree_list, lambdas, "lambda (ridge_regression)")

# last expression of the cell: displays the shapes of the two ratio matrices
ratio_tr.shape, ratio_te.shape

### Ridge regression without cross validation

In [None]:
def ridge_regression_tuning(x, y, degree_list, lambdas):
    """Evaluate ridge regression over a grid of degrees and lambdas without cross validation.

    The model is trained and evaluated on the same (whole) data set.

    Args:
        x: input data, passed to build_poly.
        y: targets used to train and to evaluate the model.
        degree_list: list of degrees to be tried.
        lambdas: list of lambdas to be tried.

    Returns:
        A numpy array with one row per degree and one column per lambda, holding the value
        returned by compute_loss with CostFunction.SUCCESS_RATIO.
    """
    def success_ratio(poly, lambda_):
        # train the model; only this line should change to try another training method
        _, weights = ridge_regression(y, poly, lambda_)
        return compute_loss(y, poly, weights, costfunc=CostFunction.SUCCESS_RATIO)

    rows = []
    for degree in degree_list:
        # the polynomial expansion is shared by all the lambdas of this degree
        expanded = build_poly(x, degree)
        rows.append([success_ratio(expanded, lambda_) for lambda_ in lambdas])

    return np.array(rows)

In [None]:
# Run the tuning without cross validation on the same grid as the cross-validated run.
# NOTE: the lambda of the ridge regression does not affect the model much
nlambdas = 3
lambdas = np.linspace(0, 1000, nlambdas)
degree_list = [2, 3, 4]

# ratios: one row per degree, one column per lambda
ratios = ridge_regression_tuning(x, y, degree_list, lambdas)

In [None]:
# Plot the ratios of the tuning run (no cross validation), with the lambdas on the x axis.
x_axis = lambdas
# the figure name records the preprocessing used to obtain these ratios
figure_name = "ridge regression,nan_values=[0, -999],substitutions=nanmean(x, axis=0)"
ratios_visualization([ratios], degree_list, x_axis, x_label="lambdas", log_axis_x=False,
                    save_figure_with_name=figure_name)

### Gradient descent try

In [None]:
# Build the degree-4 polynomial expansion of x; tx is reused by the next cell.
degree = 4
tx = build_poly(x, degree) 

### Logistic regression

In [None]:
# NOTE(review): the heading of this section says "Logistic regression", but the cell runs gradient_descent
# with the MSE cost function -- confirm which one is intended.
initial_w = np.zeros(tx.shape[1])-1  # every initial weight is set to -1
max_iters = 5 # try a small one first to check that it is converging
gamma = 0.000000000002 # very small (it diverges for higher values)

# batch_size=-1 presumably means "use the whole data set" -- TODO confirm in gradient_descent
_, w = gradient_descent(y, tx, initial_w, max_iters, gamma, batch_size=-1, 
                        print_output_with_weights=[0, 1, 2], plot_losses=True, costfunc=CostFunction.MSE)

# evaluate the trained weights on the same data used for training
print(compute_loss(y, tx, w, costfunc=CostFunction.SUCCESS_RATIO))