In [10]:
import pandas as pd
import numpy as np
import random
from implementations import *
from helpers import load_csv_data

In [1]:
# Random seed handed to build_k_indices so the fold split is reproducible.
seed = 7

In [72]:
# Read the training set (labels, feature matrix, sample ids) from disk.
train_labels, train_data, train_ids = load_csv_data("data/train.csv")
# Test-set loading is currently disabled:
# test_labels, test_data, test_ids = load_csv_data("data/test.csv")
# Short aliases for the features and labels, read as globals further down.
x, y = train_data, train_labels

Helper functions for testing different regression methods.

In [73]:
def compute_loss(y, tx, model):
    """Mean squared error of the linear prediction tx @ model against y.

    Args:
        y:     targets; the first axis indexes the samples
        tx:    feature matrix that is multiplied with `model`
        model: weight vector

    Returns:
        The sum of squared residuals divided by y.shape[0].
    """
    residual = y - (tx @ model)
    squared_error_sum = residual.T @ residual
    n_samples = y.shape[0]
    return squared_error_sum / n_samples

def build_k_indices(y, k_fold, seed):
    """Shuffle the sample indices of y and cut them into k_fold equal folds.

    The global NumPy random state is re-seeded with `seed` before shuffling.
    Each fold holds int(N / k_fold) indices; when N is not a multiple of
    k_fold the remaining shuffled indices are left out of every fold.

    Args:
        y:      shape=(N,)
        k_fold: K in K-fold, i.e. the number of folds
        seed:   the random seed

    Returns:
        A 2D array of shape=(k_fold, N/k_fold); row i lists the data indices of fold i

    >>> build_k_indices(np.array([1., 2., 3., 4.]), 2, 1)
    array([[3, 2],
           [0, 1]])
    """
    n_samples = y.shape[0]
    fold_size = int(n_samples / k_fold)
    np.random.seed(seed)
    shuffled = np.random.permutation(n_samples)
    folds = []
    for fold in range(k_fold):
        start = fold * fold_size
        folds.append(shuffled[start:start + fold_size])
    return np.array(folds)

def build_poly(x, degree):
    """Polynomial basis expansion of x, for powers j=0 up to j=degree.

    The output columns are laid out as [1, x, x**2, ..., x**degree]: one bias
    column of ones followed by one block of D columns per power, each power
    being taken element-wise.

    Args:
        x: numpy array of shape (N,D). A 1-D array of shape (N,) is treated
           as a single feature, i.e. as shape (N,1).
        degree: non-negative integer, the highest power to include.

    Returns:
        poly: numpy array of shape (N,D*degree+1). For degree=0 this is only
        the (N,1) column of ones.

    Raises:
        ValueError: if degree is negative, or x is neither 1-D nor 2-D.
    """
    if degree < 0:
        raise ValueError("degree must be non-negative, got {}".format(degree))

    x = np.asarray(x)
    if x.ndim == 1:
        x = x[:, np.newaxis]
    if x.ndim != 2:
        raise ValueError("x must be 1-D or 2-D, got {} dimensions".format(x.ndim))

    bias = np.ones((x.shape[0], 1))
    blocks = [bias]
    # Start from the bias column: it broadcasts against x, so the first
    # product is x itself (as floats) and each later product raises the
    # previous block by one more power. With degree=0 the loop never runs.
    power = bias
    for _ in range(degree):
        power = power * x
        blocks.append(power)

    # Single concatenation instead of re-stacking the whole matrix per power.
    return np.hstack(blocks)

def using_method(train_model, degree, k_fold=4):
    """Fit a model on polynomial features and score it on one held-out fold.

    Reads the module-level `x`, `y` and `seed`. The samples are shuffled into
    `k_fold` folds with build_k_indices; the last fold is held out and the
    model is trained once on all the other folds. This is a single hold-out
    split, not a full cross-validation loop over every fold.

    Args:
        train_model: func(y, tx) -> (model, train_loss)
        degree: polynomial degree passed to build_poly
        k_fold: number of folds to split the data into (default 4, must be
                at least 2 so that some data remains for training)

    Returns:
        model, test_loss: the fitted model and its compute_loss value on the
        held-out fold

    Raises:
        ValueError: if k_fold is smaller than 2.
    """
    if k_fold < 2:
        raise ValueError("k_fold must be at least 2, got {}".format(k_fold))

    tx = build_poly(x, degree)
    # Fold indices are drawn from `y`, the same labels that are indexed below.
    k_indices = build_k_indices(y, k_fold, seed)
    test_idx = k_indices[k_fold - 1]
    train_idx = k_indices[:k_fold - 1].flatten()
    # The training loss returned by train_model is not used here.
    model, _ = train_model(y[train_idx], tx[train_idx])
    test_loss = compute_loss(y[test_idx], tx[test_idx], model)
    return model, test_loss

Now try different regression methods and compare their held-out losses.

In [77]:
# Least squares on a degree-2 polynomial expansion of the features.
using_method(least_squares, degree=2)

(array([-6.33628698e+04,  2.35897173e-04, -9.75903429e-03, -6.48521706e-03,
         1.64044485e-03, -1.87504042e-01,  2.97110008e-05, -3.71795590e-02,
         7.93370273e-01, -7.79859026e-04, -2.21744739e+00, -3.03757107e-01,
         5.55407922e-02,  4.04958059e-01,  2.23018240e+00,  1.26922131e-03,
        -1.12742370e-03,  2.23308190e+00, -1.54277284e-03,  5.21838956e-04,
         2.06113671e-03,  1.19421211e-03,  2.19488164e-04,  5.28010893e+04,
         1.68495455e-03,  1.51445388e-03,  6.52376858e-04,  1.39485679e-03,
         1.96730263e-03,  1.65156252e-04,  2.21699525e+00,  2.28977022e-07,
         2.91465713e-05,  3.57118576e-06,  1.90352931e-06,  2.26641463e-02,
        -2.59872453e-08, -2.13541682e-03, -9.52295124e-02,  4.51415930e-06,
        -6.78990219e-07,  2.84720743e-02,  4.11213161e-02, -3.13029753e-02,
        -3.56035748e-05, -2.23472788e-02, -9.17581397e-04, -4.74546649e-05,
        -3.57204275e-02, -6.26093822e-04, -7.39044768e-06,  1.34614834e-04,
        -1.2

In [82]:
# Ridge regression with its third argument fixed to 1e-5, on the degree-1 features.
using_method(
    lambda labels, features: ridge_regression(labels, features, 1e-5),
    degree=1,
)

(array([-3.67453313e-01,  7.26504386e-05, -7.21408261e-03, -6.22365469e-03,
        -6.05763429e-04, -2.78355919e-03,  4.31854217e-04, -2.60894162e-02,
         3.41060111e-01, -8.87995896e-05, -3.33784877e-03, -2.16750759e-01,
         9.71972335e-02,  4.58988585e-02,  1.18964084e-02, -4.40617687e-04,
        -1.35361461e-03,  1.65061054e-02, -9.27074597e-04,  4.98585092e-04,
         3.84757476e-03,  8.76254227e-04, -4.94122327e-04, -2.09314554e-01,
        -1.21965503e-04,  9.13166563e-04, -4.66909825e-04,  2.37616071e-05,
        -6.32337966e-03, -1.08887263e-02,  2.52647085e-03]),
 0.6824576588149239)