In [10]:
import pandas as pd
import numpy as np
import random
from implementations import *
from helpers import load_csv_data

In [1]:
seed = 7

In [113]:
train_labels, train_data, train_ids = load_csv_data("data/train.csv")
_, test_data, test_ids = load_csv_data("data/train.csv")


In [119]:
train_data_mean = train_data.mean(axis=0)
train_data_std = train_data.std(axis=0)
x = (train_data - train_data_mean)/train_data_std
test_x = (test_data - train_data_mean)/train_data_std
y = train_labels

functions to test different methods for regression

In [103]:
def compute_loss(y, tx, model):
    err = y - (tx @ model)
    return (err.T @ err)/y.shape[0]

def build_k_indices(y, k_fold, seed):
    """build k indices for k-fold.
    
    Args:
        y:      shape=(N,)
        k_fold: K in K-fold, i.e. the fold num
        seed:   the random seed

    Returns:
        A 2D array of shape=(k_fold, N/k_fold) that indicates the data indices for each fold

    >>> build_k_indices(np.array([1., 2., 3., 4.]), 2, 1)
    array([[3, 2],
           [0, 1]])
    """
    num_row = y.shape[0]
    interval = int(num_row / k_fold)
    np.random.seed(seed)
    indices = np.random.permutation(num_row)
    k_indices = [indices[k * interval: (k + 1) * interval] for k in range(k_fold)]
    return np.array(k_indices)

def build_poly(x, degree):
    """polynomial basis functions for input data x, for j=0 up to j=degree.
    
    Args:
        x: numpy array of shape (N,D)
        degree: integer.
        
    Returns:
        poly: numpy array of shape (N,D*d+1)
    """    
    N = x.shape[0]
    D = x.shape[1]
    #poly_base = np.zeros((N, D*degree + 1))
    poly_base = np.ones((N,1))
    poly_base = np.hstack((poly_base, x.copy()))
    for i in range(degree-1):
        range_start = 1 + i*D
        range_stop = 1 + (i+1)*D
        next_power = poly_base[:,range_start:range_stop]*x
        poly_base = np.hstack((poly_base, next_power))
    
    return poly_base

def using_method(train_model, degree):
    """
    Tests a certain training function using 4-fold cross-validation
    arguments: 
    train_model: func(y, tx) -> model
    returns: model, expected_loss
    """
    tx = build_poly(x, degree)
    k = 4
    k_indices = build_k_indices(train_y, k, seed)
    test_idx = k_indices[k-1]
    train_idx = (k_indices[:k-1]).flatten()
    model, train_loss = train_model(y[train_idx], tx[train_idx])
    test_loss = compute_loss(y[test_idx], tx[test_idx], model)
    return model, test_loss

Now try different methods

In [129]:
using_method(least_squares,2)

(array([-4.45304550e+03,  8.64846646e-02, -2.43657221e-01, -2.41641346e-01,
         1.18422980e-01, -1.47201046e+04,  3.79406884e-02,  1.36063684e+03,
         2.68124417e-01, -1.35710250e-02, -2.57282077e+02, -1.87218726e-01,
         5.38064206e-02,  2.03773787e+04,  5.00541573e+01, -1.13227276e-03,
         1.92713269e-04,  4.93056675e+01,  4.96668781e-04,  9.14434997e-04,
         4.75368576e-02,  2.18275446e-03, -3.79968526e-02,  3.14001763e+04,
         1.94578616e+00, -1.63052605e+04, -2.37205201e+02,  3.06500002e-01,
        -2.16379340e+04,  9.58845737e+02,  2.17898803e+02,  3.78063163e-02,
         3.64114507e-02,  5.95306023e-03,  7.71002751e-03,  4.69456425e+03,
        -1.11906469e-02, -4.39827442e+02, -5.83697764e-02,  2.23967611e-03,
        -9.08764862e-03,  2.03170056e-02,  5.85812603e-02, -6.45995214e+03,
        -1.78836558e-02, -3.29401946e-02, -3.03005306e-03, -2.31037648e-02,
        -5.71582613e-02, -2.06890845e-03, -7.99635408e-03,  4.40089202e-04,
        -1.9

In [125]:
ridge_model, ridge_loss = using_method(lambda y,tx: ridge_regression(y,tx,0.0001), 4)
(ridge_model, ridge_loss)

(array([-2.09671375e-01,  2.52179304e+00, -2.49792019e-01, -2.47148269e-01,
         5.97554928e-02, -6.47578197e-02,  2.49525527e-02, -6.24010231e-02,
         1.79726427e-01, -2.75558172e-02,  1.86355251e-01, -1.32825617e-01,
        -6.01522367e-02, -1.43557107e-02,  2.71457984e-01, -3.04615639e-03,
        -2.65751637e-03,  1.49181685e-01,  2.53683487e-03,  2.13060272e-03,
         3.56122485e-02,  3.36977951e-04, -4.71440315e-02, -7.04979133e-02,
        -1.72704850e-01, -7.55140042e-02, -1.00418856e-01, -3.63921229e-03,
        -5.42749408e-02, -5.80396618e-02,  1.24363551e-01, -1.47568827e+00,
         4.99829627e-02, -3.51576060e-02,  4.70225804e-02, -1.96310500e-01,
         3.35643771e-01, -1.84885922e-01, -7.09063587e-02,  1.46375025e-02,
         5.92873971e-02,  6.21042799e-02, -2.53860873e-02, -3.93296862e-02,
        -9.20580859e-02, -2.46562938e-02, -4.04645216e-03, -5.88045361e-02,
        -7.45620705e-02, -5.96065680e-03,  1.15088438e-02,  1.46509386e-02,
        -5.5

In [120]:
predictions = build_poly(test_x, 3) @ ridge_model
pd.Series(predictions).describe()

count    250000.000000
mean         -0.313658
std           0.527732
min          -4.919059
25%          -0.680911
50%          -0.327088
75%           0.048461
max          52.211127
dtype: float64