In [1]:
# Useful starting lines
%matplotlib inline
import numpy as np
from proj1_helpers import *
from preprocessing import *
from crossvalidation import *
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2
%reload_ext autoreload

## Load the training data into feature matrix, class labels, and event ids:

In [2]:


# Load the training set: load_csv_data is given the CSV path and its result is
# unpacked into class labels (y), feature matrix (x) and event ids (ids).
DATA_TRAIN_PATH = '../data/train.csv' # download the training data and supply its path here
y, x, ids = load_csv_data(DATA_TRAIN_PATH)

# Currently disabled: prepending a constant (all-ones) column to the features.
#tx = np.c_[np.ones((y.shape[0], 1)), x]

# Sanity check on the shapes of the three loaded arrays.
print(y.shape, x.shape, ids.shape)

(250000,) (250000, 30) (250000,)


In [3]:
def split_jet(y, tx):
    '''
        Splits the data into four subsets according to the number of jets.

        y -> label data set (one entry per row of tx)
        tx -> examples data set; column 22 is read as the jet number

        Rows are grouped by the value found in column 22 (0, 1, 2 or 3); rows
        holding any other value are not placed in any subset. The jet number
        column is removed from every subset, and the last remaining column
        (PRI_jet_all_pt according to the original comment) is additionally
        removed from the jet 0 subset.
        A jet number without any matching row yields empty arrays instead of
        raising an error.
        -----
        Returns (in this order)
        4 arrays of examples data sets for each jet_num
        4 arrays of label data sets for each jet_num
        4 arrays of indexes (row positions in tx) for each jet_num
    '''
    y = np.asarray(y)
    tx = np.asarray(tx)
    jet_num = tx[:, 22]

    features, labels, indexes = [], [], []
    for jet in range(4):
        # Row positions of the examples that have exactly `jet` jets.
        rows = np.flatnonzero(jet_num == jet)

        # Fancy indexing copies the rows, so the subsets never alias tx.
        # Removing the column for jet_num, which has index 22.
        subset = np.delete(tx[rows], 22, axis=1)
        if jet == 0:
            # Removing the last column from jet_0 only.
            subset = np.delete(subset, -1, axis=1)

        features.append(subset)
        labels.append(y[rows])
        indexes.append(rows)

    return (*features, *labels, *indexes)

In [4]:
# Split the full training set by jet number: features, then labels, then the
# row positions of each subset inside the original arrays.
(
    jet_0_tx, jet_1_tx, jet_2_tx, jet_3_tx,
    jet_0_y, jet_1_y, jet_2_y, jet_3_y,
    index_0, index_1, index_2, index_3,
) = split_jet(y, x)

In [5]:
def removeNaN(jet_x):
    '''Drops every column whose entries are all equal to the -999 placeholder'''
    all_missing = np.all(jet_x == -999, axis=0)
    return jet_x[:, ~all_missing]

In [6]:
# Drop the columns that contain nothing but the -999 placeholder from the
# jet 0 and jet 1 subsets. Per the original note, jet 2 and jet 3 have no such
# columns, so they are left as they are.
jet_0_tx, jet_1_tx = (removeNaN(subset) for subset in (jet_0_tx, jet_1_tx))

In [7]:
# Shape of the jet 0 feature matrix once its all-missing columns were dropped.
jet_0_tx.shape

(99913, 18)

In [8]:
def replaceNaN(jet_x):
    '''
        Replaces the missing entries of each column by that column's mean.

        jet_x -> 2-D examples data set; it is modified in place and nothing
                 is returned

        An entry counts as valid when it is strictly greater than -999; every
        other entry of a column is overwritten with the mean of the valid
        entries of that column.
        Columns without any missing entry are skipped. Columns without any
        valid entry have no mean to fall back on, so they are left untouched
        instead of being filled with NaN.
    '''
    for col in range(jet_x.shape[1]):
        valid = jet_x[:, col] > -999
        # Nothing to replace, or nothing to compute a mean from.
        if valid.all() or not valid.any():
            continue
        jet_x[~valid, col] = jet_x[valid, col].mean()

In [9]:
# Mean-impute the remaining missing entries of the jet 0 subset (modifies jet_0_tx in place).
replaceNaN(jet_0_tx)

In [10]:
# Mean-impute the remaining missing entries of the jet 1 subset (modifies jet_1_tx in place).
replaceNaN(jet_1_tx)

In [11]:
# Mean-impute the missing entries of the jet 2 subset (modifies jet_2_tx in place).
# Unlike jet 0 and jet 1, this subset did not go through removeNaN beforehand.
replaceNaN(jet_2_tx)

In [12]:
# Mean-impute the missing entries of the jet 3 subset (modifies jet_3_tx in place).
# Unlike jet 0 and jet 1, this subset did not go through removeNaN beforehand.
replaceNaN(jet_3_tx)

In [13]:
# Standardize the jet 0 features. standardize comes from one of the star imports;
# presumably it rescales each column -- TODO confirm against its definition.
# The result overwrites jet_0_tx, so re-running this cell feeds it already-processed data.
jet_0_tx = standardize(jet_0_tx)

In [14]:
# Standardize the jet 1 features with the imported standardize helper (overwrites jet_1_tx).
jet_1_tx = standardize(jet_1_tx)

In [15]:
# Standardize the jet 2 features with the imported standardize helper (overwrites jet_2_tx).
jet_2_tx = standardize(jet_2_tx)

In [16]:
# Standardize the jet 3 features with the imported standardize helper (overwrites jet_3_tx).
jet_3_tx = standardize(jet_3_tx)

In [None]:
# Hyper-parameters of the jet 0 cross-validation run.
degree = 3
k_fold = 5
max_iters = 2000
gamma = 1e-6
lambda_ = 0.001

# Per-fold results. The *_RLR lists are only initialised here; this cell
# fills the *_LR lists exclusively.
acc_LR, total_loss_te_LR = [], []
acc_RLR, total_loss_te_RLR = [], []

# Features handed to the model come from build_poly applied to the jet 0 subset
# (presumably a polynomial expansion up to `degree` -- confirm in its definition).
jet_0_tx_poly = build_poly(jet_0_tx, degree)
# Start from an all-zero weight vector, one weight per expanded feature.
initial_w = np.zeros(jet_0_tx_poly.shape[1])

# Fold assignment; the trailing 1 is presumably the random seed -- TODO confirm.
k_indices = build_k_indices(jet_0_y, k_fold, 1)

# Run every fold with the 'logistic_regression' method and collect its results.
for k in range(k_fold):
    acc, loss_te_LR = cross_validation(
        jet_0_y, jet_0_tx_poly, k_indices, k, initial_w,
        'logistic_regression', max_iters, gamma, lambda_,
    )
    acc_LR.append(acc)
    total_loss_te_LR.append(loss_te_LR)

print(acc_LR)
print(total_loss_te_LR)


In [93]:
# Shape check on jet_0_tx again: the cross-validation cell stores its expanded
# features under a separate name (jet_0_tx_poly) and does not reassign jet_0_tx.
jet_0_tx.shape

(99913, 18)