In [None]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
from implementations import * 
from proj1_helpers import *
%load_ext autoreload
%autoreload 2

In [None]:
# Import some data to test the functions.
# NOTE(review): wildcard import placed mid-notebook; it hides which module provides
# load_boson_data, standardize and build_model_data.
from helpers import *

# presumably training ids / labels / features followed by test ids / features — TODO confirm
ids_tr,predictions_tr,data_tr, ids_te,data_te = load_boson_data()

y = predictions_tr

# standardize() returns three values; the last two are kept as mean_x and std_x.
x, mean_x, std_x = standardize(data_tr)

# Build the matrix tx that the training cells below operate on.
y,tx = build_model_data(x,y)

# Display the shapes as a quick sanity check.
y.shape, x.shape, tx.shape, data_tr.shape

In [None]:
# Gradient descent, starting from an all-zero weight vector with one entry per column of tx.
initial_w = np.zeros(tx.shape[1])
max_iters = 20
gamma = 0.1  # presumably the step size — TODO confirm against gradient_descent
loss, w = gradient_descent(y, tx, initial_w, max_iters, gamma, print_output=False, plot_losses=True)
# Display the returned loss and weights.
loss,w

In [None]:
# Stochastic gradient descent: the same gradient_descent routine, called with batch_size=1.
# The initial weights are sized from tx (one float per column, as in the gradient-descent
# cell) instead of being hard-coded to two integer entries, so they always match the
# design matrix built from the loaded features.
initial_w = np.zeros(tx.shape[1])
max_iters = 20
gamma = 0.5
loss, w = gradient_descent(y, tx, initial_w, max_iters, gamma, batch_size=1, print_output=False, plot_losses=True)
# Display the returned loss and weights.
loss, w

In [None]:
# Analytic solution: the cell displays whatever least_squares returns for (y, tx).
least_squares(y, tx)

In [None]:
# Ridge regression, used to favour smaller weights (the simpler the model the better).
lambda_ = 0.02 # TODO: how to properly choose lambda? (this value is not tuned)
# The cell displays whatever ridge_regression returns.
ridge_regression(y, tx, lambda_)

In [None]:
# Logistic regression (it is gradient descent with a different loss function).
# NOTE: this cell rebinds the notebook-level names y, x, mean_x, std_x, tx and w that the
# cells above filled from the boson data.
from helpers import sample_data, load_data
from plots import visualization

# Load data.
height, weight, gender = load_data()

# Build sampled x and y.
seed = 1
y = np.expand_dims(gender, axis=1)  # insert a new axis at position 1
X = np.c_[height.reshape(-1), weight.reshape(-1)]  # two columns: height, weight
y, X = sample_data(y, X, seed, size_samples=200)
x, mean_x, std_x = standardize(X)

tx = build_poly(x, 1)
initial_w = np.ones(tx.shape[1])  # start from all-one weights
max_iters = 50
gamma = 0.2
# batch_size=-1 presumably means "use every sample" — TODO confirm; the cost is CostFunction.PROB.
minloss, w = gradient_descent(y, tx, initial_w, max_iters, gamma, batch_size=-1, print_output=False, plot_losses=True, costfunc=CostFunction.PROB)
# w = np.array(w).reshape(-1, 1)
# NOTE(review): the label passed below says "least_square" although the PROB cost was used.
visualization(y, x, mean_x, std_x, w, "classification_by_least_square")
# Display the returned loss and weights.
minloss, w

In [None]:
# Standardize the test features on their own and build the degree-1 feature matrix from them.
tes_std, _, _ = standardize(data_te)
tx_std =  build_poly(tes_std, 1)

print(tx_std.shape)
# NOTE(review): `w` was last assigned in the height/weight cell above, whereas tx_std is
# built from data_te — confirm that their dimensions agree before trusting these labels.
predict_labels(w, tx_std)

## 0. Imports

In [None]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
from implementations import * 
from proj1_helpers import *
%load_ext autoreload
%autoreload 2

## 1. Load data

In [None]:
# Just load the training dataset.
data_path = "../dataset/train.csv"
# NOTE(review): the third return value is stored as `ids_te` although this is the training
# file; the name is reassigned when the test file is loaded later in the notebook.
y_loaded, x_loaded, ids_te = load_csv_data(data_path, sub_sample=False)
# Display the shapes of the loaded labels and features.
y_loaded.shape, x_loaded.shape

In [None]:
# Index tuples of the rows labelled 1 and of the rows labelled -1.
y1 = np.where(y_loaded == 1)
y0 = np.where(y_loaded == -1)
# For each of the two classes, count how many rows have a last-column value equal to 0.
np.sum(x_loaded[y1, -1] == 0), np.sum(x_loaded[y0, -1] == 0)

## 2. Clean data

In [None]:
# Work on a copy of the labels and on the features returned by clean_x2.
# NOTE: the following cells rebind x_all / y_all with other cleaning routines.
y_all = y_loaded.copy()
x_all = clean_x2(x_loaded)
# Display the resulting shapes.
x_all.shape, y_all.shape

In [None]:
# Decide the maximum correlation allowed between the columns.
corr = 0.8
# Clean the input features.
x_all, keptCols = clean_x(x_loaded, corr, subs_func=np.nanstd, bool_col=True)
y_all = y_loaded

# Extract a random subsample for the training.
# The shuffled indices are used to pick the rows, so the subsample is a random draw rather
# than simply the first `subsample` rows. A local, seeded generator keeps the draw
# reproducible without touching the global NumPy random state.
subsample = 50000
indices = np.random.RandomState(1).permutation(y_all.shape[0])
sub_indices = indices[:subsample]
x_sub, y_sub = x_all[sub_indices], y_all[sub_indices]

# Display the shapes of the subsample and of the full set.
x_sub.shape, y_sub.shape, x_all.shape, y_all.shape

In [None]:
def norm(x, y):
    """Balance the two classes by truncating the larger one.

    Rows labelled -1 and rows labelled 1 are located in ``y``; the more numerous class is cut
    down to its first occurrences so that both classes contribute the same number of rows.

    Args:
        x: 2-D feature array, indexed by row.
        y: 1-D label array with values 1 and -1, aligned with the rows of ``x``.

    Returns:
        (x_balanced, y_balanced): the kept rows of ``x`` and their labels, with all the
        -1 samples first, followed by the 1 samples.
    """
    pos_idx = np.where(y == 1)[0]
    neg_idx = np.where(y == -1)[0]

    # Both classes keep only as many leading indices as the smaller class has.
    n_keep = min(len(pos_idx), len(neg_idx))
    pos_idx = pos_idx[:n_keep]
    neg_idx = neg_idx[:n_keep]

    x_balanced = np.concatenate([x[neg_idx, :], x[pos_idx, :]], axis=0)
    y_balanced = np.append(y[neg_idx], y[pos_idx])
    return x_balanced, y_balanced
    
# Optionally equalize the number of 1s and -1s (just for the training!).
x_all_, y_all_ = norm(x_all, y_all)

# Display the shapes of the balanced set.
x_all_.shape, y_all_.shape

### clean_x3

In [None]:
# Rebind y_all / x_all using clean_x3, which returns a second value that is kept as bool_cols.
# bool_cols is stacked next to the polynomial features in the ridge-regression cells below.
y_all = y_loaded.copy()
x_all, bool_cols = clean_x3(x_loaded)

## 3. Train and test models

### Ridge regression with cross validation

In [None]:
# Just show how the function ratios_visualization works, using made-up numbers.

# Each model is a (number of degrees) x (number of lambdas) matrix.
model1 = np.array([
    [0.65, 0.9], # relative to degrees[0]
    [0.7, 0.8],  # relative to degrees[1]
    [0.7, 0.7]]) # relative to degrees[2]
model2 =  np.array([
    [0.7, 0.6],  # relative to degrees[0]
    [0.9, 1],    # relative to degrees[1]
    [0.8, 0.8]]) # relative to degrees[2]
degree_list = np.array([2, 4, 7])
lambdas = np.array([0.1, 0.5]) # x axis
 
# The third entry is model1 shifted down by 0.1, just to have a third series to draw.
ratios_visualization([model1, model2, model1-0.1], degree_list, lambdas, x_label="x label", log_axis_x=False, 
                     save_figure_with_name="test_figure")

In [None]:
def cross_validation_ridge_regression(x, y, num_K_sets, degree_list, lambdas, seed=2):
    """Cross-validate ridge regression over a grid of polynomial degrees and lambdas.

    For every (degree, lambda) pair the model is trained once per fold and the
    SUCCESS_RATIO obtained on the training part and on the test part of the fold is
    averaged over the folds.

    Args:
        x: input features; expanded with ``build_poly(x, degree)`` for each degree.
        y: labels aligned with the rows of ``x``.
        num_K_sets: number of sets the dataset is split into for the cross validation.
        degree_list: degrees to try (one row of the result per degree).
        lambdas: lambdas to try (one column of the result per lambda).
        seed: seed forwarded to ``build_k_indices``. Defaults to 2, the value that was
            previously hard-coded, so existing calls behave as before.

    Returns:
        (ratio_tr, ratio_te): two arrays with one row per degree and one column per lambda,
        holding the averaged success ratio on the training and on the test parts.
    """
    # Split the indices in k sets once; every (degree, lambda) pair reuses the same folds.
    k_indices = build_k_indices(y, num_K_sets, seed)

    ratio_tr = []  # success ratios obtained with the training sets
    ratio_te = []  # success ratios obtained with the test sets

    for degree in degree_list:
        # The polynomial expansion depends only on the degree, so build it once per degree.
        tx = build_poly(x, degree)

        row_tr = []
        row_te = []
        for lambda_ in lambdas:
            total_tr = 0
            total_te = 0
            for k_curr in range(num_K_sets):
                train, test = get_kth_set(y, tx, k_indices, k_curr)

                # Train the model; only this line changes if another trainer is tried.
                _, w = ridge_regression(train.y, train.tx, lambda_)

                # Measure how good the model is on both parts of the fold.
                total_tr += compute_loss(train.y, train.tx, w, costfunc=CostFunction.SUCCESS_RATIO)
                total_te += compute_loss(test.y, test.tx, w, costfunc=CostFunction.SUCCESS_RATIO)

            # Average the ratios obtained over the folds.
            row_tr.append(total_tr / num_K_sets)
            row_te.append(total_te / num_K_sets)

        ratio_tr.append(row_tr)
        ratio_te.append(row_te)

    return np.array(ratio_tr), np.array(ratio_te)

In [None]:
# Sanity check: shapes of the training subsample and of the full set.
x_sub.shape, y_sub.shape, x_all.shape, y_all.shape

In [None]:
# The lambda of the ridge regression does not affect the model much.
nlambdas = 5
lambdas = np.linspace(0, 1000, nlambdas)  # nlambdas evenly spaced values from 0 to 1000

num_K_sets = 2
degree_list = [5, 7, 9, 13, 15, 17]

# One row per degree and one column per lambda in each returned matrix.
ratio_tr, ratio_te = cross_validation_ridge_regression(x_sub, y_sub, num_K_sets, degree_list, lambdas)

In [None]:
# There is A LOT of data => there is no overfitting => the training error is close to the
# test error => it does not make sense to do the cross validation (just use the whole set
# to train the model and compute the SUCCESS_RATIO on the training data).
cross_validation_visualization(ratio_tr, ratio_te, degree_list, lambdas, "lambda (ridge_regression)")

# Display the shapes of the two ratio matrices.
ratio_tr.shape, ratio_te.shape

### Ridge regression without cross validation

In [None]:
def ridge_regression_tuning(x, y, degree_list, lambdas, x_eval=None, y_eval=None, bool_cols_eval=None):
    """Grid-search ridge regression over degrees and lambdas without cross validation.

    For every (degree, lambda) pair the model is trained on ``(x, y)`` and its
    SUCCESS_RATIO is measured both on that training data and on the evaluation data.

    Args:
        x: training features. They are paired with the first ``len(x)`` rows of
            ``bool_cols_eval``, so ``x`` must be the leading rows of the evaluation set.
        y: training labels aligned with the rows of ``x``.
        degree_list: degrees to try (one row of the result per degree).
        lambdas: lambdas to try (one column of the result per lambda).
        x_eval: evaluation features. Defaults to the notebook-level ``x_all``.
        y_eval: evaluation labels. Defaults to the notebook-level ``y_all``.
        bool_cols_eval: extra columns stacked in front of the polynomial features, one row
            per evaluation sample. Defaults to the notebook-level ``bool_cols``.

    Returns:
        (ratios_tr, ratios_te): two arrays with one row per degree and one column per lambda,
        holding the success ratio on the training data and on the evaluation data.
    """
    # Fall back to the notebook-level arrays this function used to read implicitly, so that
    # existing four-argument calls keep working.
    if x_eval is None:
        x_eval = x_all
    if y_eval is None:
        y_eval = y_all
    if bool_cols_eval is None:
        bool_cols_eval = bool_cols

    ratios_tr = []  # success ratios obtained on the training data
    ratios_te = []  # success ratios obtained on the evaluation data

    for degree in degree_list:
        # Both feature matrices depend only on the degree, so build them once per degree.
        tx = build_poly(x, degree)
        tx = np.hstack((bool_cols_eval[:tx.shape[0], :], tx))

        tx_eval = build_poly(x_eval, degree)
        tx_eval = np.hstack((bool_cols_eval, tx_eval))

        row_tr = []
        row_te = []
        for lambda_ in lambdas:
            # Train the model; only this line changes if another trainer is tried.
            _, w = ridge_regression(y, tx, lambda_)

            row_tr.append(compute_loss(y, tx, w, costfunc=CostFunction.SUCCESS_RATIO))
            row_te.append(compute_loss(y_eval, tx_eval, w, costfunc=CostFunction.SUCCESS_RATIO))

        ratios_tr.append(row_tr)
        ratios_te.append(row_te)

    return np.array(ratios_tr), np.array(ratios_te)

In [None]:
# Sanity check: shapes of the full features, the labels and bool_cols.
x_all.shape, y_all.shape, bool_cols.shape

In [None]:
# The lambda of the ridge regression does not affect the model much.
nlambdas = 5
lambdas = np.linspace(0, 0.1, nlambdas)
degree_list = [5, 7, 9, 11, 13, 15]

# Train on the first 50000 rows; ridge_regression_tuning measures the second set of ratios
# on the full x_all / y_all.
ratios_tr, ratios_te = ridge_regression_tuning(x_all[:50000], y_all[:50000], degree_list, lambdas)

# Plot both ratio matrices against the lambdas (axis label spelling fixed: "lambdas").
x_axis = lambdas
figure_name = ""
ratios_visualization([ratios_tr, ratios_te], degree_list, x_axis, x_label="lambdas", log_axis_x=False,
                     save_figure_with_name=figure_name)

In [None]:
# Locate the (degree index, lambda index) pairs whose ratio exceeds 0.81.
# The boolean mask is passed directly: wrapping it in a list added a spurious leading axis,
# so the result started with an extra array of zeros.
np.where(ratios_te > 0.81)

In [None]:
# NOTE(review): neither `tx_all_` nor `txy_all` is assigned in any visible cell, so this
# cell presumably raises NameError on a fresh kernel — confirm the intended names or drop it.
tx_all_.shape, txy_all.shape

In [None]:
# Choose the best parameters and compute the weights.
# NOTE(review): degree 18 is not among the degrees explored by the tuning cell above — confirm.
degree = 18
lambda_ = 0
tx_all = build_poly(x_all, degree) 
# Stack bool_cols in front of the polynomial features, as ridge_regression_tuning does.
tx_all = np.hstack((bool_cols, tx_all))

_, w = ridge_regression(y_all, tx_all, lambda_)
# Success ratio measured on the same data the weights were fitted on.
compute_loss(y_all, tx_all, w, costfunc=CostFunction.SUCCESS_RATIO)

# Alternative kept for reference: fit on the subsample only, evaluate on the full set.
# tx_sub = build_poly(x_sub, degree) 
# _, w = ridge_regression(y_sub, tx_sub, lambda_)
# compute_loss(y_all, tx_all, w, costfunc=CostFunction.SUCCESS_RATIO)

### Gradient descent attempt

In [None]:
# Degree-4 feature matrix for the gradient-descent experiment below.
# NOTE(review): `x` is whatever an earlier cell last bound to that name (the height/weight
# cell rebinds it) — confirm this is the intended input rather than x_all.
degree = 4
tx = build_poly(x, degree) 

### Logistic regression

In [None]:
# Start from weights that are all equal to -1.
initial_w = np.zeros(tx.shape[1])-1
max_iters = 5 # try a small one first to check it is converging 
gamma = 0.000000000002 # very small (it diverges for higher values)

# NOTE(review): this cell sits under the "Logistic regression" heading but passes
# costfunc=CostFunction.MSE — confirm which cost was intended.
_, w = gradient_descent(y, tx, initial_w, max_iters, gamma, batch_size=-1, 
                        print_output_with_weights=[0, 1, 2], plot_losses=True, costfunc=CostFunction.MSE)

# Success ratio of the resulting weights on the same data.
print(compute_loss(y, tx, w, costfunc=CostFunction.SUCCESS_RATIO))

## 4. Create submission file

In [None]:
# Compute the weights used for the submission.
# The training features are cleaned here with clean_x2, the same routine that is applied to
# the test set further down, so the polynomial features of both sets are built from inputs
# cleaned the same way and `w` matches the test feature matrix. Previously this cell used
# whatever `x_all` the last executed cleaning cell had left behind.
degree = 13
lambda_ = 0
x_submission = clean_x2(x_loaded)
tx_all = build_poly(x_submission, degree)
_, w = ridge_regression(y_all, tx_all, lambda_)
# Success ratio measured on the training data.
compute_loss(y_all, tx_all, w, costfunc=CostFunction.SUCCESS_RATIO)

In [None]:
# Load the test dataset; this rebinds `ids_te`, which is later passed to create_csv_submission.
data_path = "../dataset/test.csv"
y_te_loaded, x_te_loaded, ids_te = load_csv_data(data_path, sub_sample=False)
# Display the shapes of the loaded labels and features.
y_te_loaded.shape, x_te_loaded.shape

In [None]:
# Clean the test features in the same way as the training set (clean_x2).
x_te = clean_x2(x_te_loaded)
# Display the shape of the cleaned test features.
x_te.shape

In [None]:
# Create the polynomial features; the degree must match the one used to compute `w`.
degree = 13
tx_te = build_poly(x_te, degree)

# Predict the labels of the test set.
y_te_pred = predict_labels(w, tx_te)
# Display the shape of the predictions and how many are -1 and how many are 1.
y_te_pred.shape, (y_te_pred==-1).sum(), (y_te_pred==1).sum()

In [None]:
# num_correct = np.sum(y_te_pred==y_te_loaded)
# num_correct/len(y_te_pred)

In [None]:
# Store the predictions, together with the test ids, under the given submission name.
create_csv_submission(ids_te, y_te_pred, "ridge_regression_44_columns_keepall_add14columns")