In [1]:
import numpy as np
from helpers import *
from implementations import *

## Loading data

In [2]:
# load_csv_data comes from the star imports above and returns a 3-tuple.
# From how the results are used below: y is the target vector, ox/otest are the
# raw feature matrices and ids/testid the row identifiers (see the shapes printed later).
(y, ox, ids) = load_csv_data('data/train.csv')
# The first element returned for the test file is discarded.
(_, otest, testid) = load_csv_data('data/test.csv')

In [3]:
# Work on copies so the arrays loaded above (ox, otest) stay untouched and the
# cleaning / expansion cells below can be re-run starting from this cell.
x = ox.copy()
test = otest.copy()

In [4]:
# Sanity check: shapes of the targets, the feature matrix and the ids.
(y.shape, x.shape, ids.shape)

((250000,), (250000, 30), (250000,))

## Cleaning data

In [5]:
# Treat -999 as the missing-value marker and impute every missing entry with the
# mean of the observed values of its column.
# np.nan is used instead of the np.NaN alias, which was removed in NumPy 2.0.
x[x == -999] = np.nan
col_mean = np.nanmean(x, axis=0)  # per-column mean ignoring NaNs; reused for the test set
inds = np.where(np.isnan(x))      # (row_indices, column_indices) of the missing entries
x[inds] = np.take(col_mean, inds[1])

In [6]:
# Same imputation for the test set. The column means come from the training set
# (col_mean, computed in the previous cell) so no test statistics are used.
# np.nan is used instead of the np.NaN alias, which was removed in NumPy 2.0.
test[test == -999] = np.nan
inds = np.where(np.isnan(test))   # (row_indices, column_indices) of the missing entries
test[inds] = np.take(col_mean, inds[1])

## Expanding data

In [7]:
def build_poly(x, degree):
    """Feature-wise polynomial basis expansion of x for powers j=0 up to j=degree.

    The result is laid out as [1, x, x**2, ..., x**degree]: one bias column of
    ones followed by `degree` blocks of D columns, block j holding every feature
    raised element-wise to the power j (no cross terms between features).

    Args:
        x: numpy array of shape (N,D)
        degree: non-negative integer, the highest power to generate.

    Returns:
        poly: numpy array of shape (N,D*degree+1). For degree=0 this is only
        the bias column, shape (N,1).

    Raises:
        ValueError: if degree is negative.
    """
    if degree < 0:
        raise ValueError("degree must be non-negative")

    x = np.asarray(x)
    # Use the dtype np.hstack would pick when x sits next to the float64 bias
    # column, so the powers are computed in that dtype (e.g. no integer overflow).
    x = x.astype(np.result_type(x.dtype, np.float64), copy=False)

    blocks = [np.ones((x.shape[0], 1))]
    current_power = None
    for _ in range(degree):
        # Each block is the previous block times x, i.e. the next power.
        current_power = x if current_power is None else current_power * x
        blocks.append(current_power)

    # A single stack at the end avoids re-copying the growing matrix per power.
    return np.hstack(blocks)


def expand_features(x, degree=6, log_offset=0.01):
    """Augment x with polynomial, sine and log-magnitude features.

    The output columns are, in order: build_poly(x, degree), sin(x) and
    log(|x| + log_offset), stacked horizontally.

    Args:
        x: numpy array of shape (N,D)
        degree: highest polynomial power passed to build_poly (default 6).
        log_offset: positive constant added to |x| before the logarithm so that
            zero entries do not produce -inf (default 0.01).

    Returns:
        numpy array with N rows; with the defaults and D input columns it has
        1 + 6*D + D + D columns.
    """
    polynomial = build_poly(x, degree)
    sines = np.sin(x)
    log_magnitudes = np.log(np.absolute(x) + log_offset)
    return np.hstack((polynomial, sines, log_magnitudes))

In [8]:
# Apply the same feature expansion to the training and test matrices.
# NOTE: x and test are overwritten, so re-running this cell alone would expand
# the already-expanded features; re-run from the copy cell (ox/otest) instead.
x = expand_features(x)
test = expand_features(test)

In [9]:
# With 30 input features the expansion gives 1 bias + 6*30 powers + 30 sin + 30 log = 241 columns.
x.shape

(250000, 241)

## Training model

In [10]:
# Standardize every feature with statistics computed on the training set only;
# the same shift and scale are then applied to the test set.
trainmean = x.mean(axis=0)
trainstd = x.std(axis=0)

# Any constant column has std 0 and would turn into NaN/inf when divided.
# Give such columns unit scale so they are only shifted.
trainstd[trainstd == 0] = 1.0

# The leading column of ones (bias term added by build_poly) must stay equal
# to 1, so it is neither shifted nor scaled.
trainmean[0] = 0
trainstd[0] = 1.0

x = (x - trainmean) / trainstd
test = (test - trainmean) / trainstd

In [11]:
# Fit on the full standardized training set. least_squares comes from the star
# imports; presumably it returns (weights, training loss) - confirm in implementations.
# w is the model used for the final submission at the end of the notebook.
(w, loss) = least_squares(y, x)

## Cross validation

In [12]:
def compute_opposite_of_accuracy(y, tx, model):
    """Return the mean of |y - sign(tx @ model)| / 2.

    Scores below zero are mapped to -1 and scores at or above zero to +1. When
    y holds -1/+1 labels, the returned value is the fraction of misclassified
    rows, i.e. 1 - accuracy.

    Args:
        y: target vector, one entry per row of tx.
        tx: feature matrix.
        model: weight vector applied as tx @ model.

    Returns:
        scalar: mean absolute half-difference between y and the predicted labels.
    """
    scores = tx @ model
    # Threshold in place: negatives first, then everything at or above zero.
    scores[scores < 0] = -1
    scores[scores >= 0] = 1
    half_difference = (y - scores) / 2
    return np.mean(np.abs(half_difference))
    
def cross_validation(train_model, y, tx, k=4, seed=12345):
    """Train on k-1 folds and evaluate on the one remaining fold.

    Despite the name, only ONE split is evaluated: the rows are shuffled into k
    folds, the last fold is held out for validation and the other k-1 folds are
    used for training. Nothing is averaged across folds.

    Args:
        train_model: func(y, tx) -> (model, train_loss)
        y: target vector, one entry per row of tx.
        tx: feature matrix.
        k: number of folds the data is split into (default 4, i.e. a 75/25 split).
        seed: random seed used for the shuffle (default 12345).

    Returns:
        model: the model returned by train_model on the training folds.
        train_loss: the loss returned by train_model on the training folds.
        one_minus_acc: misclassification rate on the held-out fold, as computed
            by compute_opposite_of_accuracy.

    Raises:
        ValueError: if k < 2 (no rows would be left for training).
    """
    if k < 2:
        raise ValueError("k must be at least 2")

    k_indices = build_k_indices(y, k, seed=seed)
    test_idx = k_indices[-1]               # last fold: held-out rows
    train_idx = k_indices[:-1].flatten()   # remaining folds merged: training rows

    model, train_loss = train_model(y[train_idx], tx[train_idx])
    one_minus_acc = compute_opposite_of_accuracy(y[test_idx], tx[test_idx], model)
    return model, train_loss, one_minus_acc

def build_k_indices(y, k_fold, seed):
    """Build the row indices of each fold for k-fold splitting.

    The rows are shuffled with a generator seeded by `seed` and cut into k_fold
    consecutive chunks of equal length; leftover rows (when N is not a multiple
    of k_fold) are dropped. The global NumPy random state is left untouched.

    Args:
        y:      shape=(N,)
        k_fold: K in K-fold, i.e. the fold num
        seed:   the random seed

    Returns:
        A 2D array of shape=(k_fold, N//k_fold) that indicates the data indices for each fold

    >>> build_k_indices(np.array([1., 2., 3., 4.]), 2, 1)
    array([[3, 2],
           [0, 1]])
    """
    num_row = y.shape[0]
    interval = num_row // k_fold
    # A private legacy generator yields the same permutation as
    # np.random.seed(seed) followed by np.random.permutation, without reseeding
    # the global generator as a hidden side effect.
    rng = np.random.RandomState(seed)
    indices = rng.permutation(num_row)
    return indices[: k_fold * interval].reshape(k_fold, interval)

In [13]:

# Evaluate least_squares with cross_validation (defined above); the returned model is discarded.
_, trainloss, one_minus_acc = cross_validation(least_squares, y, x)

In [14]:
# Training loss returned by least_squares and misclassification rate on the held-out fold.
trainloss, one_minus_acc

(0.28290147429606793, 0.189808)

In [15]:
# Accuracy on the held-out fold.
1-one_minus_acc

0.810192

## Model output

In [16]:
def makeFinalPrediction(test, w, ids=None, filename="output.csv"):
    """Predict -1/+1 labels for `test` with weights `w` and write a submission CSV.

    Scores test @ w below zero become -1, scores at or above zero become +1.
    The file is written through create_csv_submission (from the star imports).

    Args:
        test: feature matrix, preprocessed the same way as the training matrix.
        w: weight vector applied as test @ w.
        ids: row identifiers written next to the predictions. Defaults to the
            notebook-level `testid` loaded together with the test set.
        filename: path of the CSV file to write (default "output.csv").

    Returns:
        None. The only effect is the file written by create_csv_submission.
    """
    if ids is None:
        ids = testid  # notebook-level ids loaded with the test data
    res = test @ w
    # Threshold in place: negatives first, then everything at or above zero.
    res[res < 0] = -1
    res[res >= 0] = 1
    create_csv_submission(ids, res, filename)

In [17]:
# Write the submission for the test set using w, the weights fitted on the full training set.
makeFinalPrediction(test, w)