In [10]:
import pandas as pd
import numpy as np
import random
from implementations import *
from helpers import load_csv_data

In [1]:
# Random seed; using_method passes it to build_k_indices to shuffle the fold indices.
seed = 7

In [4]:
# Load the training set; load_csv_data returns three values, unpacked here as
# labels, feature data and sample ids.
train_labels, train_data, train_ids = load_csv_data("data/train.csv")
# Loading of the test set is currently disabled:
#test_labels, test_data, test_ids = load_csv_data("data/test.csv")

Prepend a column of constant ones to the feature matrix so the linear models can learn an intercept term.

In [14]:
# Column vector of ones, one entry per training row.
ones = np.ones((train_data.shape[0], 1))
# Design matrix: the ones column followed by the original feature columns.
tx = np.concatenate((ones, train_data), axis=1)
# The same augmentation for the test set is disabled while the test data is not loaded:
#test_ones = np.ones((test_data.shape[0],1))
#test_tx = np.hstack((test_ones, test_data))
# Short alias for the training labels, used by the helper functions below.
y = train_labels

Helper functions for testing different regression methods

In [23]:
def compute_loss(y, tx, model):
    """Mean squared error of a linear model.

    Args:
        y:     targets; the number of samples is taken from ``y.shape[0]``
        tx:    feature matrix, multiplied with ``model`` to get predictions
        model: weight vector of the linear model

    Returns:
        The sum of squared residuals ``y - tx @ model`` divided by the
        number of samples.
    """
    n_samples = y.shape[0]
    residual = y - (tx @ model)
    squared_error = residual.T @ residual
    return squared_error / n_samples

def build_k_indices(y, k_fold, seed):
    """Split shuffled row indices into k equally sized folds.

    The global NumPy random generator is re-seeded with ``seed`` before the
    permutation is drawn, so the result is reproducible. Each fold holds
    ``int(N / k_fold)`` indices; any leftover indices at the end of the
    permutation are not assigned to a fold.

    Args:
        y:      shape=(N,)
        k_fold: K in K-fold, i.e. the fold num
        seed:   the random seed

    Returns:
        A 2D array of shape=(k_fold, N/k_fold) that indicates the data indices for each fold

    >>> build_k_indices(np.array([1., 2., 3., 4.]), 2, 1)
    array([[3, 2],
           [0, 1]])
    """
    n_samples = y.shape[0]
    fold_size = int(n_samples / k_fold)
    np.random.seed(seed)
    shuffled = np.random.permutation(n_samples)
    folds = []
    for fold in range(k_fold):
        start = fold * fold_size
        folds.append(shuffled[start: start + fold_size])
    return np.array(folds)

def using_method(train_model):
    """
    Evaluates a training function on a single seeded hold-out split.

    The row indices of the module-level data ``y`` / ``tx`` are shuffled into
    4 folds by ``build_k_indices`` (seeded with the module-level ``seed``).
    The first 3 folds are used for training and the last fold for testing.
    Note that this is one train/test split, not an average over all 4 folds.

    arguments:
    train_model: func(y, tx) -> (model, train_loss)
    returns: model, test_loss (loss of the model on the held-out fold)
    """
    k = 4
    # Use `y`, the label array the notebook actually defines; the previous
    # `train_y` is not bound anywhere and fails on a fresh kernel.
    k_indices = build_k_indices(y, k, seed)
    test_idx = k_indices[k - 1]
    train_idx = k_indices[:k - 1].flatten()
    # The training loss returned by train_model is not needed here.
    model, _ = train_model(y[train_idx], tx[train_idx])
    test_loss = compute_loss(y[test_idx], tx[test_idx], model)
    return model, test_loss

Now try different methods

In [24]:
# Fit with least_squares on the training folds and show (model, held-out loss).
using_method(least_squares)

(array([-3.69200662e-01,  7.25959416e-05, -7.21325174e-03, -6.22529353e-03,
        -6.04962266e-04, -2.70293764e-03,  4.31706296e-04, -2.60819014e-02,
         3.41196625e-01, -8.91027605e-05, -2.29693777e+00, -2.16773412e-01,
         9.71974519e-02,  4.58128562e-02,  2.30549749e+00, -4.38304925e-04,
        -1.35401773e-03,  2.31010787e+00, -9.28875488e-04,  4.98961766e-04,
         3.84718609e-03,  8.73514078e-04, -4.93964898e-04, -2.08798128e-01,
        -1.16919903e-04,  9.11650566e-04, -4.70952829e-04,  3.10052697e-05,
        -6.32614235e-03, -1.08960417e-02,  2.29612132e+00]),
 0.6824535262405655)

In [29]:
# Same evaluation for ridge_regression; the lambda fixes its third argument at 0.00001
# (presumably the regularisation strength - confirm against implementations.ridge_regression).
using_method(lambda y,tx: ridge_regression(y,tx,0.00001))

(array([-3.67453313e-01,  7.26504386e-05, -7.21408261e-03, -6.22365469e-03,
        -6.05763429e-04, -2.78355919e-03,  4.31854217e-04, -2.60894162e-02,
         3.41060111e-01, -8.87995896e-05, -3.33784877e-03, -2.16750759e-01,
         9.71972335e-02,  4.58988585e-02,  1.18964084e-02, -4.40617687e-04,
        -1.35361461e-03,  1.65061054e-02, -9.27074597e-04,  4.98585092e-04,
         3.84757476e-03,  8.76254227e-04, -4.94122327e-04, -2.09314554e-01,
        -1.21965503e-04,  9.13166563e-04, -4.66909825e-04,  2.37616071e-05,
        -6.32337966e-03, -1.08887263e-02,  2.52647085e-03]),
 0.6824576588149239)