In [1]:
import numpy as np
from implementations_clean import *
from proj1_helpers import *

In [2]:
# Load the training set. `load_csv_data` comes from one of the star imports in
# the first cell (presumably proj1_helpers — TODO confirm).
# y is later passed to MLP.train as the targets and X is the raw feature matrix.
y,X,ids = load_csv_data("train.csv")

In [3]:
# -999. is treated as "missing": turn it into NaN so NaN-aware numpy routines can be used.
X = np.where(X == -999., np.nan, X)

# Boolean mask over rows: True where the row has at least one missing value.
row_has_nan = np.isnan(X).any(axis=1)

# Subset 1: complete rows.  Subset 2: rows with at least one missing entry.
y_1, X_1 = y[~row_has_nan], X[~row_has_nan]
y_2, X_2 = y[row_has_nan], X[row_has_nan]

In [4]:
# Sanity check on the raw feature matrix: (number of rows, number of columns).
X.shape

(250000, 30)

In [5]:
def poly_features(X):
    """Append every degree-2 product of the columns of X.

    For each pair of column indices (i, j) with i <= j the product
    X[:, i] * X[:, j] is appended as a new column. Products are ordered with
    i as the outer index and j as the inner one:
    (0,0), (0,1), ..., (0,d-1), (1,1), (1,2), ...

    Parameters
    ----------
    X : array-like of shape (n_samples, d)
        Input features. The input is not modified.

    Returns
    -------
    np.ndarray of shape (n_samples, d + d*(d+1)/2)
        The original columns followed by the pairwise products.
    """
    X = np.asarray(X)
    # triu_indices enumerates the upper triangle (diagonal included) row by
    # row, which is exactly the (i, j >= i) order described above. Building
    # all products in one shot avoids re-copying the growing matrix once per
    # appended column.
    left, right = np.triu_indices(X.shape[1])
    return np.column_stack((X, X[:, left] * X[:, right]))

In [6]:
def make_features(X):
    """Standardize X, zero-fill missing entries and prepend a column of ones.

    Parameters
    ----------
    X : np.ndarray of shape (n_samples, n_features)
        Entries equal to -999. are treated as missing.

    Returns
    -------
    np.ndarray of shape (n_samples, n_features + 1)
        Column 0 is the constant 1, the remaining columns are the
        standardized features with missing values replaced by 0.
    """
    # Mark the -999. placeholder as NaN so that `standardize` can skip it
    # (presumably via np.nanmean / np.nanstd — `standardize` is defined outside this notebook).
    with_nans = np.where(X == -999., np.nan, X)

    # Standardize each feature: (x - mean) / std. Only the transformed matrix
    # is needed here; the returned statistics are discarded.
    standardized, _means, _stds = standardize(with_nans)

    # After standardization each feature is centred on (roughly) zero, so
    # filling the gaps with 0 amounts to imputing the feature mean.
    filled = np.where(np.isnan(standardized), 0, standardized)

    # Prepend the constant column.
    ones = np.ones(filled.shape[0])
    return np.column_stack((ones, filled))

In [7]:
def _cartesian_momentum(pt, eta, phi):
    """Return (px, py, pz) = (pt*cos(phi), pt*sin(phi), pt*sinh(eta)), element-wise."""
    return pt * np.cos(phi), pt * np.sin(phi), pt * np.sinh(eta)


def preproc(X):
    """Impute, engineer features and standardize one data subset.

    Steps, in order:
      1. NaN entries are replaced by the median of their column.
      2. Column 0 (der_mass_MMC) is split at 140: the column itself is capped
         at 140 and a new column max(col0, 140) is appended.
      3. Momentum components (px, py, pz) are appended for the four
         (pt, eta, phi) column triples (13,14,15), (16,17,18), (23,24,25), (26,27,28).
      4. Columns 11, 15, 18, 20, 24, 27 and 7 are replaced by their absolute value.
      5. log(1 / (1 + x)) of a fixed set of columns is appended.
      6. The ratios col13/col16 and col19/col21 and cos(col24 - col27) are appended.
      7. Everything is passed through `make_features`.
      8. Four 0/1 indicator columns for col22 == 0, 1, 2, 3 are appended
         (after step 7, so they are not standardized).

    Parameters
    ----------
    X : np.ndarray of shape (n_samples, n_columns)
        Raw features with NaN marking missing values. Column indices up to 28
        are read. The array is NOT modified; the function works on a copy.

    Returns
    -------
    np.ndarray
        The engineered feature matrix, one row per input row.

    Notes
    -----
    The medians used for imputation are computed from the array passed in, and
    `make_features` is called on that same array, so two subsets preprocessed
    separately (e.g. train and test) are each transformed with their own
    statistics. NOTE(review): confirm this is intended.
    """
    # Work on a private copy: the imputation and the capping below write into
    # the array, and the caller's data must not change as a side effect.
    X = np.array(X, copy=True)

    # 1. median imputation
    col_medians = np.nanmedian(X, axis=0)
    nan_rows, nan_cols = np.where(np.isnan(X))
    X[nan_rows, nan_cols] = col_medians[nan_cols]

    # 2. feature 1: der_mass_MMC split at 140
    mmc_above = np.maximum(X[:, 0], 140)   # values <= 140 become 140
    X[:, 0] = np.minimum(X[:, 0], 140)     # values > 140 become 140
    X = np.column_stack((X, mmc_above))

    # 3. feature 2: momentum components, appended in the order
    #    tau, lep, leading jet, subleading jet (px, py, pz each)
    momentum_cols = []
    for pt_col, eta_col, phi_col in ((13, 14, 15), (16, 17, 18), (23, 24, 25), (26, 27, 28)):
        momentum_cols.extend(_cartesian_momentum(X[:, pt_col], X[:, eta_col], X[:, phi_col]))
    X = np.column_stack([X] + momentum_cols)

    # 4. feature 3: absolute values.
    #    11: der_met_phi_centrality, 15: tau phi, 18: lep phi, 20: met phi, 7: R separation.
    #    NOTE(review): the original comments call columns 24 and 27 "lead jet phi"
    #    and "sublead jet phi", but in step 3 columns 24/27 are the ones fed to
    #    sinh while 25/28 are fed to cos/sin. Behaviour is kept as it was —
    #    TODO confirm whether 25/28 were intended here and in step 6.
    for col in (11, 15, 18, 20, 24, 27, 7):
        X[:, col] = np.abs(X[:, col])

    # 5. feature 9: log(1 / (1 + x)). This uses the capped column 0 and the
    #    absolute value of column 7 computed above, so the order matters.
    inv_log_cols = [0, 1, 2, 3, 4, 5, 7, 8, 9, 10, 12, 13, 16, 19, 21, 23, 26]
    X = np.hstack((X, np.log(1 / (1 + X[:, inv_log_cols]))))

    # feature 4: categorical PRI_jet_num as 0/1 indicators (appended at the very end)
    jet_num_indicators = [(X[:, 22] == k).astype(int) for k in range(4)]

    # 6a. feature 5: pt ratios
    tau_lep_ratio = X[:, 13] / X[:, 16]    # PRI_tau_pt / PRI_lep_pt
    met_tot_ratio = X[:, 19] / X[:, 21]    # PRI_met / PRI_met_sumet
    X = np.column_stack((X, tau_lep_ratio, met_tot_ratio))

    # 6b. feature 6: jets_diff_angle (see the NOTE(review) in step 4 about 24/27)
    jets_diff_angle = np.cos(X[:, 24] - X[:, 27])
    X = np.column_stack((X, jets_diff_angle))

    # 7. standardize / zero-fill / add the constant column
    X = make_features(X)

    # 8. indicators go in last so that they stay exactly 0/1
    return np.column_stack([X] + jet_num_indicators)

In [8]:
# Build the engineered feature matrix for each subset independently.
# NOTE: this rebinds X_1 / X_2, so the cell is not idempotent — running it a
# second time would feed already-engineered features back into preproc.
# Re-run the cell that creates X_1 / X_2 before re-running this one.
X_1 = preproc(X_1)
X_2 = preproc(X_2)

In [69]:
# Rows are split in their existing order (no shuffling). If a shuffle is ever
# added, X and y must be permuted with the SAME permutation.

# Row index at which the last 20% of each subset starts.
cutoff_1 = int(0.8 * X_1.shape[0])
cutoff_2 = int(0.8 * X_2.shape[0])

# True  -> fit on every row (the behaviour this cell had so far). The *_test
#          slices below are then part of the training data, so any score
#          computed on them is NOT a held-out estimate.
# False -> fit on the first 80% only, which makes the *_test slices a genuine
#          hold-out set.
TRAIN_ON_ALL_ROWS = True
train_stop_1 = None if TRAIN_ON_ALL_ROWS else cutoff_1
train_stop_2 = None if TRAIN_ON_ALL_ROWS else cutoff_2

X_1_train, y_1_train = X_1[:train_stop_1], y_1[:train_stop_1]
X_1_test, y_1_test = X_1[cutoff_1:], y_1[cutoff_1:]

X_2_train, y_2_train = X_2[:train_stop_2], y_2[:train_stop_2]
X_2_test, y_2_test = X_2[cutoff_2:], y_2[cutoff_2:]

In [70]:
class MLP:
    """Fully connected feed-forward network trained with stochastic gradient descent.

    Supported activation names are 'relu', 'sigmoid' and 'linear'; any other
    name is treated as linear. The loss is binary cross-entropy (BCE).

    Layers are numbered from 1 (the input). `weights[n]` / `bias[n]` /
    `activations[n]` map layer n to layer n + 1.

    Parameters
    ----------
    gamma : float
        Learning rate.
    dimensions : sequence of int
        Layer sizes, input first, output last.
    activations : sequence of str
        One activation name per transition, i.e. len(dimensions) - 1 entries.
    weight_decay : float
        L2 penalty coefficient, applied to weights and biases in every update.
    """

    def __init__(self, gamma = 0.001,  dimensions = (2,10,1), activations = ('relu','sigmoid') ,weight_decay = 0):
        assert (len(dimensions)-1) == len(activations), "Number of dimensions and activation functions do not match"
        # number of layers of our MLP (input layer included)
        self.num_layers = len(dimensions)
        self.gamma = gamma
        self.weight_decay = weight_decay

        self.weights = {}
        self.bias = {}
        self.activations = {}
        self.activations_grad = {}

        for n in range(self.num_layers - 1):
            fan_in, fan_out = dimensions[n], dimensions[n + 1]
            # weights are drawn from a standard normal distribution and scaled
            # by 1/sqrt(size of the layer they read from)
            self.weights[n + 1] = np.random.randn(fan_out, fan_in) / np.sqrt(fan_in)
            # biases start at zero
            self.bias[n + 1] = np.zeros(fan_out)

            if activations[n] == 'relu':
                self.activations[n+1] = self.relu
                self.activations_grad[n+1] = self.relu_gradient
            elif activations[n] == 'sigmoid':
                self.activations[n+1] = self.sigmoid
                self.activations_grad[n+1] = self.sigmoid_gradient
            else:
                # 'linear' and any unrecognised name
                self.activations[n+1] = self._identity
                self.activations_grad[n+1] = self._identity_gradient

    def feed_forward(self, x):
        """Forward pass for a single sample.

        Parameters
        ----------
        x : np.ndarray of shape (dimensions[0],)

        Returns
        -------
        y_pred : np.ndarray
            Output of the last layer.
        a : dict
            Activations per layer; a[1] is the input, a[num_layers] is y_pred.
        z : dict
            Pre-activations; z[n + 1] = weights[n] @ a[n] + bias[n].
        """
        z = {}
        # the first layer is the input data
        a = {1: x}
        for n in range(1, self.num_layers):
            z[n + 1] = self.weights[n] @ a[n] + self.bias[n]
            a[n + 1] = self.activations[n](z[n + 1])
        return a[self.num_layers], a, z

    def _forward_batch(self, X):
        """Forward pass for many samples at once; returns an (n_samples, output size) array."""
        batch = np.asarray(X)
        if batch.ndim != 2:
            # e.g. rows stored as (1, d): flatten each sample to a vector
            batch = batch.reshape(batch.shape[0], -1)
        for n in range(1, self.num_layers):
            batch = self.activations[n](batch @ self.weights[n].T + self.bias[n])
        return batch

    def predict(self, X):
        """Return hard 0./1. predictions (probability > 0.5) as a float array of shape (n_samples,)."""
        return (self.predict_proba(X) > 0.5).astype(float)

    def predict_proba(self, X):
        """Return the network output for every row of X as an array of shape (n_samples,).

        Requires a single output unit (the reshape fails otherwise). The whole
        matrix is pushed through the network in one pass rather than sample by
        sample; this also avoids storing size-1 arrays into scalar slots, which
        recent NumPy releases deprecate.
        """
        X = np.asarray(X)
        return self._forward_batch(X).reshape(X.shape[0])

    def back_propagate(self, y,y_pred, a, z):
        """Backward pass for a single sample.

        Parameters
        ----------
        y : target for the sample.
        y_pred, a, z : the three values returned by `feed_forward` for it.

        Returns
        -------
        (weights_gradient, bias_gradient) : two dicts keyed like
        `self.weights` / `self.bias`.
        """
        weights_gradient = {}
        bias_gradient = {}

        # derivative of the loss with respect to the network output
        nabla = self.BCE_gradient(y,y_pred)

        # walk the layers from last to first
        for n in range(self.num_layers - 1, 0, -1):
            nabla = nabla * self.activations_grad[n](z[n+1])
            weights_gradient[n] = np.outer(nabla, a[n])
            bias_gradient[n] = nabla
            nabla = nabla @ self.weights[n]

        return weights_gradient, bias_gradient

    #weight decay : l2 reg
    def gradient_descent_step(self, weights_gradient, bias_gradient):
        """Apply one gradient step (with L2 weight decay) to every layer."""
        for n in range(1, self.num_layers):
            self.weights[n] = self.weights[n] - self.gamma * (weights_gradient[n] + self.weight_decay*self.weights[n])
            self.bias[n] = self.bias[n] - self.gamma * (bias_gradient[n] + self.weight_decay*self.bias[n])

    def _batch_gradients(self, X, y, idxs):
        """Average the per-sample gradients over the rows selected by idxs."""
        w_total, b_total = None, None
        for idx in idxs:
            x_i = np.atleast_1d(X[idx].squeeze())
            y_pred, a, z = self.feed_forward(x_i)
            w_grad, b_grad = self.back_propagate(y[idx], y_pred, a, z)
            if w_total is None:
                w_total, b_total = w_grad, b_grad
            else:
                for n in w_grad:
                    w_total[n] = w_total[n] + w_grad[n]
                    b_total[n] = b_total[n] + b_grad[n]
        count = len(idxs)
        if count > 1:
            for n in w_total:
                w_total[n] = w_total[n] / count
                b_total[n] = b_total[n] / count
        return w_total, b_total

    def train(self, X, y, max_iter, batch_size = 1, decay = False, decay_rate = 3, decay_iteration = 0):
        """Run `max_iter` SGD updates and return the final loss on (X, y).

        Parameters
        ----------
        X, y : training features (one row per sample) and targets.
        max_iter : int
            Number of updates.
        batch_size : int
            Samples drawn (with replacement) per update; their gradients are averaged.
        decay : bool
            If True, divide the learning rate by `decay_rate` every
            `decay_iteration` updates (not at update 0).
        decay_rate : float
        decay_iteration : int
            Must be positive when `decay` is True.

        The loss on the full (X, y) is printed five times during training
        (every max_iter // 5 updates) and once at the end.

        Raises
        ------
        ValueError
            If batch_size < 1, or decay is True and decay_iteration <= 0.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be at least 1")
        if decay and decay_iteration <= 0:
            raise ValueError("decay_iteration must be a positive integer when decay is True")

        # never 0, otherwise the modulo below fails for max_iter < 5
        log_every = max(1, max_iter // 5)

        for i in range(max_iter):
            if decay and i != 0 and i % decay_iteration == 0:
                print("Iteration: {}".format(i))
                print("Decay, lr : {}".format(self.gamma))
                self.gamma = self.gamma/decay_rate
                print("Decay, lr : {}".format(self.gamma))
                print("")
            idxs = np.random.randint(0, X.shape[0],batch_size)
            weights_gradient, bias_gradient = self._batch_gradients(X, y, idxs)
            self.gradient_descent_step(weights_gradient, bias_gradient)
            if (i % log_every) == 0:
                loss = self.BCE_loss(X,y)
                print("Iteration : {}, loss : {}".format(i,loss))
        loss = self.BCE_loss(X,y)
        # index of the last update that ran (0 if none ran)
        print("Iteration : {}, loss : {}".format(max(max_iter - 1, 0),loss))
        return loss

    def sigmoid(self,z):
        """Logistic function 1 / (1 + exp(-z)), element-wise."""
        return 1 / (1 + np.exp(-z))

    def sigmoid_gradient(self,z):
        """Derivative of the logistic function at z."""
        # call the method, not a module-level `sigmoid` that may or may not exist
        s = self.sigmoid(z)
        return s * (1 - s)

    def relu(self,z):
        """max(z, 0), element-wise."""
        return np.where(z < 0, 0, z)

    def relu_gradient(self, z):
        """0 where z < 0, 1 elsewhere."""
        return np.where(z < 0, 0, 1)

    def _identity(self, z):
        """Linear activation."""
        return z

    def _identity_gradient(self, z):
        """Derivative of the linear activation."""
        return 1

    def BCE_loss(self,X, y):
        """Mean binary cross-entropy of the network on (X, y).

        Returns an array with one entry per output unit (shape (1,) for a
        single-output network), or 0 when y is empty.
        """
        N = len(y)
        if N == 0:
            return 0
        eps = 1e-7  # keeps the logs finite when a prediction is exactly 0 or 1
        y_pred = self._forward_batch(np.asarray(X)[:N])
        targets = np.asarray(y).reshape(N, -1)
        per_sample = -(targets*np.log(y_pred+eps) + (1-targets)*np.log(1-y_pred+eps))
        return per_sample.mean(axis=0)

    def BCE_gradient(self,y,y_pred):
        """Derivative of the single-sample BCE (with the same eps as BCE_loss) w.r.t. y_pred."""
        eps = 1e-7
        return (-y/(y_pred+eps) + (1-y)/(1-y_pred+eps))

In [71]:
# Fix the global numpy seed: it drives both the weight initialisation and the
# sampling of training rows.
np.random.seed(1)

# Architecture: input -> 60 -> 60 -> 30 -> 1, ReLU on the hidden layers and a
# sigmoid on the output.
in_dim, out_dim = X_1.shape[1], 1
n_h1, n_h2, n_h3 = 60, 60, 30
dimensions = [in_dim, n_h1, n_h2, n_h3, out_dim]
activations = ['relu'] * 3 + ['sigmoid']

# Optimiser settings shared by both networks.
gamma, weight_decay = 0.01, 0.001
model_kwargs = dict(gamma=gamma, dimensions=dimensions,
                    activations=activations, weight_decay=weight_decay)

# One network per subset; both are created before any training starts.
mlp_1 = MLP(**model_kwargs)
mlp_2 = MLP(**model_kwargs)

# 3.5M updates, learning rate divided by 5 every 1.5M updates.
schedule = dict(max_iter=3500000, decay_rate=5, decay_iteration=1500000, decay=True)
mlp_1.train(X_1_train, y_1_train, **schedule)
mlp_2.train(X_2_train, y_2_train, **schedule)

Iteration : 0, loss : [0.70279451]
Iteration : 700000, loss : [0.36633528]
Iteration : 1400000, loss : [0.36845012]
Iteration: 1500000
Decay, lr : 0.01
Decay, lr : 0.002

Iteration : 2100000, loss : [0.33969262]
Iteration : 2800000, loss : [0.33148]
Iteration: 3000000
Decay, lr : 0.002
Decay, lr : 0.0004

Iteration : 3499999, loss : [0.31556049]
Iteration : 0, loss : [0.6911654]
Iteration : 700000, loss : [0.37842204]
Iteration : 1400000, loss : [0.38899488]
Iteration: 1500000
Decay, lr : 0.01
Decay, lr : 0.002

Iteration : 2100000, loss : [0.36531528]
Iteration : 2800000, loss : [0.36212953]
Iteration: 3000000
Decay, lr : 0.002
Decay, lr : 0.0004

Iteration : 3499999, loss : [0.35352894]


array([0.35352894])

In [72]:
# train accuracy of both models, printed as model 1 then model 2.
# Accuracy is 1 - mean(|prediction - label|); predictions are 0/1, so this
# assumes the labels are 0/1 as well — TODO confirm against load_csv_data.
y_1_pred = mlp_1.predict(X_1_train)
acc1 = 1-np.sum(np.abs(y_1_pred - y_1_train)) / X_1_train.shape[0]
print(acc1)
y_2_pred = mlp_2.predict(X_2_train)
acc2 = 1-np.sum(np.abs(y_2_pred - y_2_train)) / X_2_train.shape[0]
print(acc2)

0.8631558857209971
0.8422583376400603


In [73]:
# accuracy of both models on the *_test slices, printed as model 1 then model 2.
# NOTE: this is a held-out score only if the *_test rows were excluded from
# the *_train arrays the models were fitted on.
# Accuracy is 1 - mean(|prediction - label|); predictions are 0/1, so this
# assumes the labels are 0/1 as well — TODO confirm against load_csv_data.
y_1_pred = mlp_1.predict(X_1_test)
acc1 = 1-np.sum(np.abs(y_1_pred - y_1_test)) / X_1_test.shape[0]
print(acc1)
y_2_pred = mlp_2.predict(X_2_test)
acc2 = 1-np.sum(np.abs(y_2_pred - y_2_test)) / X_2_test.shape[0]
print(acc2)

0.8675034867503486
0.8403430644895267


In [74]:
# Load the submission set; note that `ids` now refers to this file's ids.
_, X_sub, ids = load_csv_data("test.csv")
X_sub = np.where(X_sub == -999., np.nan, X_sub)

# Same routing as for training: rows with a missing value go to model 2,
# complete rows go to model 1.
nan_idxs = np.isnan(X_sub).any(axis=1)
no_nan_idxs = ~nan_idxs

# NOTE(review): preproc derives its imputation values from the array it is
# given, so these rows are not transformed with the training-set statistics.
X_sub_1 = preproc(X_sub[no_nan_idxs])
X_sub_2 = preproc(X_sub[nan_idxs])

# Map the 0/1 predictions to -1/+1.
sub_1_pred = mlp_1.predict(X_sub_1) * 2 - 1
sub_2_pred = mlp_2.predict(X_sub_2) * 2 - 1

# Put the two groups of predictions back in the original row order.
sub_pred = np.zeros(X_sub.shape[0])
sub_pred[no_nan_idxs] = sub_1_pred
sub_pred[nan_idxs] = sub_2_pred

In [75]:
# Quick look at the class balance of the predictions: the entries are -1/+1,
# so (mean + 1) / 2 is the fraction of rows predicted as +1.
sub_pred.mean()

-0.3693839553144985

In [76]:
# Write the ids and their -1/+1 predictions to the submission file
# (create_csv_submission comes from one of the star imports — presumably proj1_helpers).
create_csv_submission(ids, sub_pred, "nn_split_submission.csv")