In [1]:
import numpy as np
import random
from implementations import *
from helpers import *


In [2]:
# Seed handed to np.random.seed (via build_validation_sets) when cross_validation
# shuffles the rows, so the train/validation split is reproducible.
seed = 7

In [3]:
# Load the raw training and test sets. The first value returned for the test
# file is discarded.
# NOTE(review): load_csv_data comes from a star import; the (labels, data, ids)
# return order is assumed from how the result is unpacked here — confirm.
train_labels_raw, train_data_raw, train_ids = load_csv_data("data/train.csv")
_, test_data_raw, test_ids = load_csv_data("data/test.csv")


## Data cleaning and grouping
This part cleans the data, then splits the rows into three sets according to the number of jets (0, 1, or 2 and more).

In [4]:
# Median of column 0 over the training rows where that column is not the
# -999 sentinel; used further down to fill the same column in the test set.
mass_is_known = train_data_raw[:, 0] != -999
med_over = train_data_raw[mass_is_known]
med_DER_mass_MMC = np.median(med_over[:, 0])

In [5]:
# --- Training set -----------------------------------------------------------
# Keep only the rows whose column 0 is above the -999 sentinel; the same mask
# is applied to the labels so rows and labels stay aligned.
has_mass = train_data_raw[:, 0] > -999
train_data = train_data_raw[has_mass].copy()
train_labels = train_labels_raw[has_mass].copy()

# Replace every NaN entry by the NaN-ignoring mean of its column.
means = np.nanmean(train_data, axis=0)
inds = np.where(np.isnan(train_data))
train_data[inds] = np.take(means, inds[1])

# --- Test set ---------------------------------------------------------------
# Test rows are never dropped: a sentinel in column 0 is replaced by the
# training median instead. Only column 0 is written. Indexing with the row
# mask alone would overwrite the whole row, including the jet-count column
# used later to route each row to a model.
test_data = test_data_raw.copy()
test_data[test_data_raw[:, 0] == -999, 0] = med_DER_mass_MMC
# Remaining NaNs are filled with the *training* column means.
inds = np.where(np.isnan(test_data))
test_data[inds] = np.take(means, inds[1])

In [6]:
# Split the cleaned training rows by the value in column 22 (the jet count,
# going by the section text) and drop a fixed list of columns per subset.
# NOTE(review): the dropped columns are presumably the ones that carry no
# information for that jet count — confirm against the dataset description.
jet_num = train_data[:, 22]
dropped_cols_zero_jet = [4, 5, 6, 12, 22, 23, 24, 25, 26, 27, 28, 29]
dropped_cols_one_jet = [4, 5, 6, 12, 22, 26, 27, 28, 29]

zero_jet = np.delete(train_data[jet_num == 0], dropped_cols_zero_jet, axis=1)
one_jet = np.delete(train_data[jet_num == 1], dropped_cols_one_jet, axis=1)
two_more_jets = train_data[jet_num >= 2].copy()

# Index in this list == jet group (0, 1, 2-or-more).
jet_sets = [zero_jet, one_jet, two_more_jets]

In [7]:
# Labels for each jet group, selected with the same column-22 masks as the
# feature subsets so that jet_sets[i] and jet_sets_labels[i] stay row-aligned.
# The label vectors get their own names: reusing zero_jet / one_jet /
# two_more_jets here would silently replace the feature matrices bound to
# those names by the previous cell.
jet_num_train = train_data[:, 22]
zero_jet_labels = train_labels[jet_num_train == 0].copy()
one_jet_labels = train_labels[jet_num_train == 1].copy()
two_more_jets_labels = train_labels[jet_num_train >= 2].copy()

# Index in this list == jet group (0, 1, 2-or-more).
jet_sets_labels = [zero_jet_labels, one_jet_labels, two_more_jets_labels]

In [8]:
# Same grouping as for the training set, applied to the test rows: split on
# column 22 and drop the same per-group column lists so that each test subset
# has the same columns as the training subset its model is fitted on.
jet_num_test = test_data[:, 22]
dropped_cols_zero_jet_test = [4, 5, 6, 12, 22, 23, 24, 25, 26, 27, 28, 29]
dropped_cols_one_jet_test = [4, 5, 6, 12, 22, 26, 27, 28, 29]

zero_jet_test = np.delete(test_data[jet_num_test == 0], dropped_cols_zero_jet_test, axis=1)
one_jet_test = np.delete(test_data[jet_num_test == 1], dropped_cols_one_jet_test, axis=1)
two_more_jets_test = test_data[jet_num_test >= 2].copy()

# Index in this list == jet group (0, 1, 2-or-more).
jet_sets_test = [zero_jet_test, one_jet_test, two_more_jets_test]

### Normalisation

In [9]:
# Standardise each jet group column-wise. The statistics come from the
# training subset only and are reused unchanged for the matching test subset.
for i in range(3):
    mean = jet_sets[i].mean(axis=0)
    std = jet_sets[i].std(axis=0)
    # A constant column has std == 0; dividing by it would turn the column
    # into NaN/inf. Use 1 as divisor there so the column becomes all zeros
    # in the training subset.
    std = np.where(std == 0, 1.0, std)
    jet_sets[i] = (jet_sets[i] - mean) / std
    jet_sets_test[i] = (jet_sets_test[i] - mean) / std

## Compute the models

Helper functions for evaluating different regression methods.

In [10]:
def compute_loss(y, tx, model):
    """
    Misclassification rate of a linear model.

    The scores ``tx @ model`` are thresholded at 0 (below 0 -> -1,
    otherwise -> +1) and compared with ``y``. The returned value is the mean
    of ``|prediction - y| / 2``, which is the fraction of wrong predictions
    when ``y`` holds -1/+1 labels.
    """
    threshold = 0
    scores = tx @ model
    is_negative = scores < threshold
    is_non_negative = scores >= threshold
    scores[is_negative] = -1
    scores[is_non_negative] = 1
    half_gap = (scores - y) / 2
    return np.absolute(half_gap).mean()

def build_validation_sets(y, k_fold, seed):
    """
    Draw one shuffled hold-out split of the row indices of ``y``.

    The global NumPy RNG is reseeded with ``seed``, the indices
    ``0 .. len(y)-1`` are permuted, and the first ``len(y) // (k_fold + 1)``
    of them form the held-out set.

    returns: (train_indices, test_indices)
    """
    np.random.seed(seed)
    n_samples = y.shape[0]
    shuffled = np.random.permutation(n_samples)
    n_held_out = shuffled.shape[0] // (k_fold + 1)
    train_part = shuffled[n_held_out:]
    held_out_part = shuffled[:n_held_out]
    return train_part, held_out_part

def build_poly(x, degree):
    """Polynomial basis functions for input data x, for j=0 up to j=degree.

    The output columns are laid out as ``[1, x, x**2, ..., x**degree]`` where
    each power is taken element-wise (no cross terms between features).

    Args:
        x: numpy array of shape (N,D); a 1-D array of length N is treated
            as a single feature, i.e. shape (N,1).
        degree: non-negative integer. ``degree == 0`` yields only the
            constant column.

    Returns:
        poly: numpy array of shape (N,D*degree+1)

    Raises:
        ValueError: if ``degree`` is negative.
    """
    if degree < 0:
        raise ValueError("degree must be non-negative")

    x = np.asarray(x)
    if x.ndim == 1:
        x = x[:, np.newaxis]
    N, D = x.shape

    # Allocate the full matrix once instead of re-copying it with hstack at
    # every power. The dtype matches what stacking a float column with x
    # would produce.
    poly_base = np.empty((N, D * degree + 1), dtype=np.result_type(x.dtype, np.float64))
    poly_base[:, 0] = 1

    for power in range(1, degree + 1):
        range_start = 1 + (power - 1) * D
        range_stop = range_start + D
        if power == 1:
            poly_base[:, range_start:range_stop] = x
        else:
            # x**power = x**(power-1) * x, reusing the block just written.
            previous_block = poly_base[:, range_start - D:range_start]
            poly_base[:, range_start:range_stop] = previous_block * x

    return poly_base

def cross_validation(y, x, train_model, degree, k=4):
    """
    Evaluate a training function on a single shuffled hold-out split.

    Despite the name, this is not a k-fold procedure: one split is drawn
    (seeded with the module-level ``seed``), ``1/(k+1)`` of the rows are held
    out, and the model is trained once on the remaining rows.

    arguments:
    y: labels, indexable with an integer index array
    x: feature matrix; it is expanded with build_poly(x, degree) before use
    train_model: func(y, tx) -> (model, loss)
    degree: polynomial degree passed to build_poly
    k: hold-out size parameter; the default of 4 holds out one fifth of the rows

    returns: model, training_loss, test_loss
    training_loss is the loss value returned by train_model itself, while
    test_loss is the misclassification rate from compute_loss on the held-out
    rows, so the two are not necessarily on the same scale.
    """
    tx = build_poly(x, degree)
    # Only the row count of the first argument is used to build the split.
    train_idx, test_idx = build_validation_sets(x, k, seed)
    model, train_loss = train_model(y[train_idx], tx[train_idx])
    test_loss = compute_loss(y[test_idx], tx[test_idx], model)
    return model, train_loss, test_loss

In [11]:
# Baseline: a single degree-2 model on all retained training rows, without
# jet grouping. train_data is used as-is here; only the per-jet subsets are
# normalised.
degree_all = 2
model_all, train_loss, test_loss = cross_validation(train_labels, train_data,least_squares, degree_all)
# NOTE(review): train_loss is the value returned by least_squares, test_loss
# is compute_loss's misclassification rate — presumably different metrics, so
# confirm before comparing the two numbers below.
train_loss, test_loss

(0.3411996011080886, 0.24999410057342425)

In [12]:
# Jet group 0: hold-out misclassification rate for the chosen polynomial degree.
# degree0 is reused later for the final fit and for the test predictions.
degree0 = 1
model0, train_loss, test_loss = cross_validation(jet_sets_labels[0], jet_sets[0],least_squares, degree0)
test_loss

0.21554411166824772

In [13]:
# Jet group 1: hold-out misclassification rate for the chosen polynomial degree.
# degree1 is reused later for the final fit and for the test predictions.
degree1 = 6
model1, train_loss, test_loss = cross_validation(jet_sets_labels[1], jet_sets[1],least_squares,degree1)
test_loss

0.22863675335810232

In [14]:
# Jet group 2 (two or more jets): hold-out misclassification rate for the
# chosen polynomial degree. degree2 is reused later for the final fit and for
# the test predictions.
degree2 = 4
model2, train_loss, test_loss = cross_validation(jet_sets_labels[2], jet_sets[2],least_squares, degree2)
test_loss

0.20305388342387315

### Final models: training on the full training subsets (no validation split)

In [15]:
# Refit one model per jet group on the whole training subset, using the
# degrees chosen above. These replace the model0/model1/model2 obtained from
# the hold-out runs; the second value returned by least_squares is discarded.
model0,_ = least_squares(jet_sets_labels[0], build_poly(jet_sets[0], degree0))
model1,_ = least_squares(jet_sets_labels[1], build_poly(jet_sets[1], degree1))
model2,_ = least_squares(jet_sets_labels[2], build_poly(jet_sets[2], degree2))

## Applying the models to the test set

In [16]:
def make_predictions(test_sets, degrees, models):
    """
    Predict a -1/+1 label for every row of the module-level ``test_data``.

    ``test_sets[i]``, ``degrees[i]`` and ``models[i]`` belong to jet group i
    (0, 1, 2-or-more). Each subset is expanded with build_poly, scored with
    its model, and the scores are written back to the positions of the rows
    whose column 22 in ``test_data`` selects that group. Scores are then
    thresholded at 0 (below 0 -> -1, otherwise -> +1).

    Note: the function reads the global ``test_data`` for the row count and
    the group masks; ``test_sets`` must have been derived from it in the same
    row order.
    """
    n_rows = test_data.shape[0]
    assert n_rows == test_sets[0].shape[0] + test_sets[1].shape[0] + test_sets[2].shape[0]

    jet_num = test_data[:, 22]
    group_masks = [jet_num == 0, jet_num == 1, jet_num >= 2]

    pred = np.zeros(n_rows)
    for group in (2, 1, 0):
        expanded = build_poly(test_sets[group], degrees[group])
        pred[group_masks[group]] = expanded @ models[group]

    decision = 0
    below = pred < decision
    at_or_above = pred >= decision
    pred[below] = -1
    pred[at_or_above] = 1
    return pred

prediction = make_predictions(jet_sets_test, [degree0, degree1, degree2], [model0, model1, model2])

In [17]:
# Write the test ids and their predicted labels to the submission file.
# NOTE(review): the output path has no ".csv" extension — confirm that
# create_csv_submission does not require one.
create_csv_submission(test_ids, prediction, "data/Erwin")