In [15]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [16]:
import implementations as imp
%run pre_processing.ipynb

In [17]:
train_path = '../resources/train.csv'

In [18]:
def split_indices(size_dataset,k):
    len_gps = int(size_dataset/k)
    #shuffle rows for randomness
    shuffle = np.random.permutation(size_dataset)
    partition = [shuffle[i * len_gps: (i + 1) * len_gps] for i in range(k)]
    return len_gps, np.array(partition)

In [19]:
def split_data(y,tx,ids,k,ind_test):
    size_dataset = y.shape[0]
    len_gps, partition = split_indices(size_dataset,k)
    test = partition[ind_test]
    a = partition[0:ind_test]
    b = partition[ind_test+1:]
    train = np.ndarray.flatten(np.concatenate((a,b)))
    tx_test = tx[test]
    y_test = y[test]
    ids_test = ids[test]
    tx_train = tx[train]
    y_train = y[train]
    ids_train = ids[train]
    return (tx_test,y_test,ids_test), (tx_train,y_train,ids_train) 

In [23]:
def train_and_predict(tx,y,ids,k,ind_test,lambda_):
    test_set, train_set = split_data(y,tx,ids,k,ind_test)
    (x_tr1,y_tr1,ids_tr1), (x_tr2,y_tr2,ids_tr2), (x_tr3,y_tr3,ids_tr3),ind1,ind2,ind3,ind4,ind5,ind6 = process_data_cross_val(train_set)
    (xtest_1,ytest_1,ids_1), (xtest_2,ytest_2,ids_2), (xtest_3,ytest_3,ids_3) = process_test_cross_val(test_set,ind1,ind2,ind3,ind4,ind5,ind6)
    
    #Train the model on the training set LEAST SQUARES
    #w1 , l1 = imp.least_squares(y_tr1,x_tr1)
    #w2 , l2 = imp.least_squares(y_tr2,x_tr2)
    #w3 , l3 = imp.least_squares(y_tr3,x_tr3)
    
    #Train the model on the training set RIDGE REGRESSION
    w1 , l1 = imp.ridge_regression(y_tr1, x_tr1, lambda_)
    w2 , l2 = imp.ridge_regression(y_tr2, x_tr2, lambda_)
    w3 , l3 = imp.ridge_regression(y_tr3, x_tr3, lambda_)
    
    #Predict 'y' of the test set using the weight found in the training set
    y_pred1 = predict_labels(w1, xtest_1)
    y_pred2 = predict_labels(w2, xtest_2)
    y_pred3 = predict_labels(w3, xtest_3)
    
    true_l1 = imp.compute_loss(ytest_1, xtest_1, w1)
    true_l2 = imp.compute_loss(ytest_2, xtest_2, w2)
    true_l3 = imp.compute_loss(ytest_3, xtest_3, w3)

    accuracy_1 = np.mean((ytest_1==y_pred1))
    accuracy_2 = np.mean((ytest_2==y_pred2))
    accuracy_3 = np.mean((ytest_3==y_pred3))
    
    #Compute generalized error
    gen_err1 = np.abs(l1-true_l1)
    gen_err2 = np.abs(l2-true_l2)
    gen_err3 = np.abs(l3-true_l3)
    
    return (accuracy_1,accuracy_2,accuracy_3), (gen_err1,gen_err2,gen_err3), (w1,w2,w3)


In [24]:
def cross_validation(train_path,k, lambda_):
    train_data = LoadTrainingDataset(train_path)
    tx , y ,ids = train_data.get_data()
    accuracy =[]
    gen_err = []
    weights = []
    for i in range (k):
        acc,err,w = train_and_predict(tx,y,ids,k,i, lambda_)
        accuracy.append(acc)
        gen_err.append(err)
        weights.append(w)
    acc = np.mean(np.mean(accuracy,0))
    var = np.mean(np.std(accuracy,0))
    return acc,var

In [25]:
cross_validation(train_path,5,0.0007)

(0.7071047964367638, 0.002373605824552488)

In [38]:
#Test best k parameter for least square
def test_best_k_ls():
    accuracies = []
    variances = []
    best_k = -1
    best_acc = 0
    for k in range (2,11):
        bias,var = cross_validation(train_path,k,1)
        accuracies.append(bias)
        variances.append(var)
    for i in range (len(accuracies)):
        if (accuracies[i] > best_acc):
            best_acc = accuracies[i]
            best_k = i+2         
    return accuracies, variances, best_acc,best_k
    

In [41]:
#Test best k and lambda parameters for ridge regression (takes a lot of time)
def test_best_k_rreg(): 
    best_k = -1
    best_l = 0
    best_acc = 0
    possible_lambda = np.logspace(-8, 0, 20)
    for l in possible_lambda :
        accuracies = []
        variances = []
        for k in range (2,11) :
            bias,var = cross_validation(train_path,k,l)
            accuracies.append(bias)
            variances.append(var)
        for i in range (len(accuracies)):
            if (accuracies[i] > best_acc):
                best_acc = accuracies[i]
                best_k = i+2 
                best_l = l
    return best_acc,best_k, best_l
    

In [42]:
test_best_k_rreg()

(0.7103933738861331, 7, 1.2742749857031322e-06)

In [40]:
test_best_k_ls()

([0.689516710015098,
  0.690510281624157,
  0.6878286301355706,
  0.6898419134872622,
  0.689800856259955,
  0.6903875276215677,
  0.6886618579825022,
  0.6903841241260928,
  0.6889836667211137],
 [0.0006092938640544684,
  0.0017486179776842288,
  0.0020919887847824074,
  0.002892445376448173,
  0.002793706532225762,
  0.0032014108257130604,
  0.0024794894795790575,
  0.003881105833419689,
  0.004074373610819477],
 0.690510281624157,
 3)