In [None]:
import numpy as np
from implementations_clean import *
from proj1_helpers import *

In [None]:
# Load the training file. load_csv_data is a project helper; presumably it returns
# (labels, feature matrix, sample ids) in that order -- confirm against proj1_helpers.
y,X,ids = load_csv_data("train.csv")
# NOTE: no bias column is added at this point; make_features prepends the column of ones later.


In [None]:
import pandas as pd
# Replace the -999. sentinel by NaN so that the NaN-aware statistics used in the
# following cells (np.nanmean) ignore those entries.
X = np.where(X == -999., np.nan, X)
# DataFrame copy used only to preview the first rows.
df = pd.DataFrame(X)
df.head()

In [None]:
# Feature engineering for the training matrix X.
# Precondition: the -999. sentinels in X have already been replaced by NaN.
# NOTE: this cell appends columns to X, so it must be run exactly once per load,
# and the identical transformation has to be applied to any data the model is
# later evaluated on.

#feature 1: correlations der_mass_MMC
# Impute every NaN with the mean of its column (computed while ignoring NaNs).
col_means = np.nanmean(X, axis=0)
idxs = np.where(np.isnan(X))
X[idxs] = np.take(col_means, idxs[1])
# Split column 0 at 140 into two clipped copies:
#   X_gt_mmc = max(col0, 140)  (appended as a new column)
#   col0     = min(col0, 140)  (overwritten in place)
X_gt_mmc = np.maximum(X[:,0], 140)
X[:,0] = np.minimum(X[:,0], 140)
X = np.column_stack((X, X_gt_mmc))

#feature 2: add momentums
# Cartesian components from (pt, eta, phi):
#   px = pt*cos(phi), py = pt*sin(phi), pz = pt*sinh(eta)
# NOTE(review): the index triples below are kept from the original cell. In the
# public Higgs-challenge column layout index 22 is PRI_jet_num and the jet
# (pt, eta, phi) triples sit at 23/24/25 and 26/27/28 -- verify the jet indices
# (here and in the "abs angles" section) against the header of train.csv.
#tau momentum
tau_px = X[:,13]*np.cos(X[:,15])
tau_py = X[:,13]*np.sin(X[:,15])
tau_pz = X[:,13]*np.sinh(X[:,14])
X = np.column_stack((X, tau_px,tau_py,tau_pz))
#lep momentum
lep_px = X[:,16]*np.cos(X[:,18])
lep_py = X[:,16]*np.sin(X[:,18])
lep_pz = X[:,16]*np.sinh(X[:,17])
X = np.column_stack((X, lep_px,lep_py,lep_pz))
#leading jet momentum
jet_px = X[:,22]*np.cos(X[:,24])
jet_py = X[:,22]*np.sin(X[:,24])
jet_pz = X[:,22]*np.sinh(X[:,23])
X = np.column_stack((X, jet_px,jet_py,jet_pz))
#subleading jet momentum
subjet_px = X[:,25]*np.cos(X[:,27])
subjet_py = X[:,25]*np.sin(X[:,27])
subjet_pz = X[:,25]*np.sinh(X[:,26])
X = np.column_stack((X, subjet_px,subjet_py,subjet_pz))

# feature 3: abs angles
# Done after the momentum features so that sin(phi) still sees the signed angle.
#der_met_phi_centrality
X[:,11] = np.abs(X[:,11])
#tau phi
X[:,15] = np.abs(X[:,15])
#lep phi
X[:,18] = np.abs(X[:,18])
#met phi
X[:,20] = np.abs(X[:,20])
#lead jet phi
X[:,24] = np.abs(X[:,24])
#sublead jet phi
X[:,27] = np.abs(X[:,27])

df = pd.DataFrame(X)
# Cell output: mean of column 11 after taking absolute values.
X[:,11].mean()

In [None]:
# Sanity check: column 11 was replaced by its absolute value above, so this should be >= 0.
X[:,11].min()

In [None]:
def make_features(X):
    """Turn a raw feature matrix into the design matrix fed to the models.

    Steps, in order:
      1. every entry equal to -999. becomes NaN;
      2. the matrix is passed to the external helper ``standardize`` (which
         must return a 3-tuple whose first item is the standardized matrix;
         the two other items are discarded here);
      3. entries that are NaN after standardization are set to 0;
      4. a leading column of ones (the bias term) is prepended.

    Parameters
    ----------
    X : array of shape (n_samples, n_features)

    Returns
    -------
    array of shape (n_samples, n_features + 1)
    """
    with_nans = np.where(X == -999., np.nan, X)
    scaled, _, _ = standardize(with_nans)
    # After standardization 0 stands in for "missing" (presumably the column
    # mean, assuming standardize centres each column -- it is defined elsewhere).
    filled = np.where(np.isnan(scaled), 0, scaled)
    bias_column = np.ones(filled.shape[0])
    return np.column_stack((bias_column, filled))

In [None]:
# Build the design matrix: make_features standardizes X, zero-fills remaining NaNs
# and prepends the bias column of ones (so X gains one column here).
X = make_features(X)

In [None]:
# Preview the first rows of the design matrix produced by make_features.
df = pd.DataFrame(X)
df.head()

In [None]:
# Hold-out split: the first 80% of the rows are used for fitting, the remaining
# 20% for evaluation. Rows are taken in file order (no shuffling); note that
# shuffling X on its own would break the row alignment with y.
cutoff = int(0.8 * X.shape[0])
X_train, X_test = X[:cutoff], X[cutoff:]
y_train, y_test = y[:cutoff], y[cutoff:]

In [None]:
# Hyper-parameters for the (currently commented-out) regression baselines further down.
# They are not used by the MLP; lambda_ is reassigned before the MLP is built.
initial_w = np.zeros(X.shape[1])
# initial_w = np.random.normal(scale = 0.1, size = X.shape[1])
# max_iters = 10*200000
max_iters = 1000
gamma = 0.1
lambda_ = 0.1

In [None]:
# Show the all-zeros initial weight vector (one entry per column of X).
print(initial_w)

In [None]:
class MLP:
    """Fully connected feed-forward network trained with stochastic gradient descent.

    Supported activation names are 'relu', 'sigmoid' and 'linear'; any other
    name falls back to the identity ('linear'). The loss is binary cross-entropy
    (BCE), so targets are expected to be encoded as 0/1 and the last activation
    should normally be 'sigmoid'.

    Layers are numbered from 1. ``weights[n]`` / ``bias[n]`` / ``activations[n]``
    map layer ``n`` to layer ``n + 1`` for ``n = 1 .. num_layers - 1``.

    Parameters
    ----------
    lambda_ : float
        Step size (learning rate) of the gradient-descent update.
    dimensions : sequence of int
        Number of units per layer, input layer first.
    activations : sequence of str
        One activation name per weight layer (``len(dimensions) - 1`` entries).
    weight_decay : float
        L2 coefficient added to the gradients of weights and biases; 0 disables it.
    """
    #activations: 'relu', 'sigmoid', 'linear'
    #loss assumed to be BCE
    def __init__(self, lambda_ = 0.001,  dimensions = [2,10,1], activations = ['relu','sigmoid'] ,weight_decay = 0):
        # The default lists are only read, never mutated.
        assert (len(dimensions)-1) == len(activations), "Number of dimensions and activation functions do not match"
        # number of layers of our MLP (input layer included)
        self.num_layers = len(dimensions)
        self.lambda_ = lambda_
        self.weight_decay = weight_decay

        self.weights = {}
        self.bias = {}
        self.activations = {}
        self.activations_grad = {}

        for n in range(self.num_layers - 1):
            # weights ~ N(0, 1) scaled by 1/sqrt(fan_in) to keep pre-activations at unit scale
            self.weights[n + 1] = np.random.randn(dimensions[n + 1],dimensions[n]) / np.sqrt(dimensions[n])
            # biases are all initialized to zero
            self.bias[n + 1] = np.zeros(dimensions[n + 1])
            if activations[n] == 'relu':
                self.activations[n+1] = self.relu
                self.activations_grad[n+1] = self.relu_gradient
            elif activations[n] == 'sigmoid':
                self.activations[n+1] = self.sigmoid
                self.activations_grad[n+1] = self.sigmoid_gradient
            else:
                self.activations[n+1] = self._identity
                self.activations_grad[n+1] = self._identity_gradient

        # With a sigmoid output the BCE gradient w.r.t. the output logit has the
        # closed form y_pred - y, which back_propagate uses directly.
        self._sigmoid_output = len(activations) > 0 and activations[-1] == 'sigmoid'

    def feed_forward(self, x):
        """Forward pass for a single sample.

        Parameters
        ----------
        x : 1-D array with ``dimensions[0]`` entries.

        Returns
        -------
        (y_pred, a, z) where ``a[n]`` is the activation of layer ``n``
        (``a[1]`` is ``x``), ``z[n + 1] = weights[n] @ a[n] + bias[n]`` and
        ``y_pred`` is the activation of the last layer.
        """
        # keep track of all z and a to compute the gradient in the backpropagation
        z = {}
        # the first layer is the input data
        a = {1:x}
        for n in range(1, self.num_layers):
            z[n + 1] = self.weights[n] @ a[n] + self.bias[n]
            a[n + 1] = self.activations[n](z[n + 1])

        y_pred = a[self.num_layers]
        return y_pred,a, z

    def _forward_batch(self, X):
        """Forward pass for a whole (n_samples, n_features) matrix; returns the output activations."""
        out = np.asarray(X)
        for n in range(1, self.num_layers):
            out = self.activations[n](out @ self.weights[n].T + self.bias[n])
        return out

    # returns a prediction
    def predict(self, X):
        """Return a float array of 0./1. labels, one per row of ``X`` (output > 0.5 -> 1)."""
        X = np.asarray(X)
        n_samples = X.shape[0]
        if n_samples == 0:
            return np.zeros(0)
        proba = self._forward_batch(X.reshape(n_samples, -1)).reshape(n_samples, -1)
        return (proba[:, 0] > 0.5).astype(float)

    def back_propagate(self, y,y_pred, a, z):
        """Gradients of the BCE loss for one sample.

        Parameters
        ----------
        y : target (scalar or 1-element array, 0/1 encoded).
        y_pred, a, z : values returned by ``feed_forward`` for that sample.

        Returns
        -------
        (weights_gradient, bias_gradient): dicts keyed like ``weights`` / ``bias``.
        """
        weights_gradient = {}
        bias_gradient = {}

        last = self.num_layers - 1
        if last < 1:
            return weights_gradient, bias_gradient

        # nabla holds dLoss/dz for the layer currently being processed.
        if self._sigmoid_output:
            # sigmoid + BCE: the derivative w.r.t. the output logit is y_pred - y
            nabla = np.atleast_1d(self.BCE_gradient(y,y_pred))
        else:
            # generic output activation: dLoss/dy_pred (same eps smoothing as BCE_loss)
            # chained with the derivative of the output activation
            eps = 1e-7
            loss_grad = -(y / (y_pred + eps)) + (1 - y) / (1 - y_pred + eps)
            nabla = np.atleast_1d(loss_grad * self.activations_grad[last](z[last + 1]))

        for n in range(last, 0, -1):
            weights_gradient[n] = np.outer(nabla, a[n])
            bias_gradient[n] = nabla
            if n > 1:
                # push the error through weights[n], then through the derivative of
                # the activation that produced a[n] from z[n]
                nabla = (nabla @ self.weights[n]) * self.activations_grad[n - 1](z[n])

        return weights_gradient, bias_gradient

    #weight decay : l2 reg
    def gradient_descent_step(self, weights_gradient, bias_gradient):
        """Apply one update ``p <- p - lambda_ * (grad + weight_decay * p)`` to all weights and biases."""
        for n in range(1, self.num_layers):
            self.weights[n] = self.weights[n] - self.lambda_ * (weights_gradient[n] + self.weight_decay*self.weights[n])
            self.bias[n] = self.bias[n] - self.lambda_ * (bias_gradient[n] + self.weight_decay*self.bias[n])

    def train(self, X, y, max_iter, batch_size = 1, log_every = 200000):
        """Run ``max_iter`` SGD steps and return the final BCE loss on ``(X, y)``.

        Each step draws ``batch_size`` row indices uniformly with replacement and
        applies the average of the per-sample gradients. Every ``log_every``
        iterations (starting at iteration 0) the loss on the full ``(X, y)`` is
        printed; pass 0 or None to disable logging.

        Raises
        ------
        ValueError
            If ``batch_size`` is smaller than 1.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be at least 1")
        for i in range(int(max_iter)):
            idxs = np.random.randint(0, X.shape[0],batch_size)
            weights_gradient = {}
            bias_gradient = {}
            for idx in idxs:
                y_pred,a, z = self.feed_forward(X[idx])
                sample_w, sample_b = self.back_propagate(y[idx],y_pred,a, z)
                for n in sample_w:
                    weights_gradient[n] = weights_gradient.get(n, 0) + sample_w[n] / batch_size
                    bias_gradient[n] = bias_gradient.get(n, 0) + sample_b[n] / batch_size
            self.gradient_descent_step(weights_gradient, bias_gradient)
            if log_every and (i % log_every == 0):
                loss = self.BCE_loss(X,y)
                print("Iteration : {}, loss : {}".format(i,loss))
        loss = self.BCE_loss(X,y)
        return loss

    def sigmoid(self,z):
        """Logistic function 1 / (1 + exp(-z)), evaluated without overflow for large |z|."""
        return np.exp(-np.logaddexp(0, -z))

    def sigmoid_gradient(self,z):
        """Derivative of the logistic function at ``z``."""
        s = self.sigmoid(z)
        return s * (1 - s)

    def relu(self,z):
        """Element-wise max(z, 0)."""
        return np.where(z < 0, 0, z)

    def relu_gradient(self,z):
        """Derivative of relu: 0 where z < 0, 1 elsewhere (including z == 0)."""
        return np.where(z < 0, 0.0, 1.0)

    def _identity(self, z):
        """'linear' activation: returns its input unchanged."""
        return z

    def _identity_gradient(self, z):
        """Derivative of the 'linear' activation: ones with the shape of ``z``."""
        return np.ones_like(z, dtype=float)

    def BCE_loss(self,X, y):
        """Mean binary cross-entropy of the network over the first ``len(y)`` rows of ``X``.

        Returns an array with one entry per output unit (shape ``(1,)`` for a
        single-output network), or 0 when ``y`` is empty.
        """
        N = len(y)
        if N == 0:
            return 0
        eps = 1e-7  # keeps log() finite when the output saturates at 0 or 1
        targets = np.asarray(y, dtype=float).reshape(N, 1)
        y_pred = self._forward_batch(np.asarray(X)[:N].reshape(N, -1)).reshape(N, -1)
        per_sample = -(targets*np.log(y_pred+eps) + (1-targets)*np.log(1-y_pred+eps))
        return per_sample.mean(axis=0)

    def BCE_gradient(self,y,y_pred):
        """Gradient of BCE w.r.t. the output *logit* when the output activation is a sigmoid."""
        return y_pred-y


In [None]:
# Network layout: input layer -> three hidden ReLU layers of 30 units -> one sigmoid output unit.
in_dim = X_train.shape[1]
n_h1 = n_h2 = n_h3 = 30
out_dim = 1
dimensions = [in_dim, n_h1, n_h2, n_h3, out_dim]
activations = ['relu'] * 3 + ['sigmoid']
# lambda_ is the step size of MLP.gradient_descent_step; weight_decay = 0 disables the L2 term.
lambda_ = 0.0001
weight_decay = 0
mlp = MLP(
    lambda_ = lambda_,
    dimensions = dimensions,
    activations = activations,
    weight_decay = weight_decay,
)

In [None]:
# Long-running cell: 3,000,000 SGD steps with the default batch_size of 1.
# The cell output is the final training loss returned by train.
mlp.train(X_train,y_train,max_iter = 3000000)

In [None]:
# Training accuracy.
# NOTE(review): 1 - mean(|y_pred - y|) is an accuracy only when both arrays are 0/1
# encoded. predict returns 0/1, so confirm that y_train is not encoded as -1/+1.
y_pred = mlp.predict(X_train)
acc = 1-np.sum(np.abs(y_pred - y_train)) / X_train.shape[0]
print(acc)

In [None]:
# Accuracy on the held-out 20% of the training file (X_test / y_test).
# NOTE(review): same 0/1 label-encoding assumption as for the training accuracy.
y_pred = mlp.predict(X_test)
acc = 1-np.sum(np.abs(y_pred - y_test)) / X_test.shape[0]
print(acc)

In [None]:
# # loss,w = GD_logistic_regression(y = y_train, tx = X_train, initial_w = initial_w,
# #                              max_iters = max_iters, gamma = gamma)
# loss,w = GD_reg_logistic_regression(y = y_train, tx = X_train, initial_w = initial_w,
#                              max_iters = max_iters, gamma = gamma, lambda_ = lambda_)
# # loss,w = logistic_regression(y = y_train, tx = X_train, initial_w = initial_w,
# #                              max_iters = max_iters, gamma = gamma)
# # loss,w = least_squares_GD(y = y_train,tx = X_train, initial_w = initial_w, max_iters = max_iters, gamma = gamma)

In [None]:
# #train accuracy
# y_pred = classification(X_train @ w)
# acc = 1-np.sum(np.abs(y_pred - y_train)) / X_train.shape[0]
# print(acc)

In [None]:
# #test accuracy
# y_pred = classification(X_test @ w)
# acc = 1-np.sum(np.abs(y_pred - y_test)) / X_test.shape[0]

In [None]:
# Display the most recently computed accuracy (whichever accuracy cell ran last).
acc

In [None]:
# Feature engineering for the submission file. This must stay identical to the
# transformation applied to the training matrix.
# The labels of the submission file are bound to y_sub so the training labels
# held in the global y are not overwritten.
y_sub,X_sub,ids = load_csv_data("test.csv")
# Replace the -999. sentinel by NaN first; without this the mean imputation below
# finds nothing to impute and the sentinels leak into every derived feature.
X_sub = np.where(X_sub == -999., np.nan, X_sub)

#feature 1: correlations der_mass_MMC
# Impute every NaN with the mean of its column (computed while ignoring NaNs).
# NOTE(review): these are the means of the submission file itself, not those of
# the training data -- confirm this is intended.
col_means = np.nanmean(X_sub, axis=0)
idxs = np.where(np.isnan(X_sub))
X_sub[idxs] = np.take(col_means, idxs[1])
# Split column 0 at 140 into two clipped copies:
#   X_gt_mmc = max(col0, 140)  (appended as a new column)
#   col0     = min(col0, 140)  (overwritten in place)
X_gt_mmc = np.maximum(X_sub[:,0], 140)
X_sub[:,0] = np.minimum(X_sub[:,0], 140)
X_sub = np.column_stack((X_sub, X_gt_mmc))

#feature 2: add momentums
# Cartesian components from (pt, eta, phi):
#   px = pt*cos(phi), py = pt*sin(phi), pz = pt*sinh(eta)
# NOTE(review): the index triples below are kept from the original cell. In the
# public Higgs-challenge column layout index 22 is PRI_jet_num and the jet
# (pt, eta, phi) triples sit at 23/24/25 and 26/27/28 -- verify the jet indices
# (here and in the "abs angles" section) against the header of the csv.
#tau momentum
tau_px = X_sub[:,13]*np.cos(X_sub[:,15])
tau_py = X_sub[:,13]*np.sin(X_sub[:,15])
tau_pz = X_sub[:,13]*np.sinh(X_sub[:,14])
X_sub = np.column_stack((X_sub, tau_px,tau_py,tau_pz))
#lep momentum
lep_px = X_sub[:,16]*np.cos(X_sub[:,18])
lep_py = X_sub[:,16]*np.sin(X_sub[:,18])
lep_pz = X_sub[:,16]*np.sinh(X_sub[:,17])
X_sub = np.column_stack((X_sub, lep_px,lep_py,lep_pz))
#leading jet momentum
jet_px = X_sub[:,22]*np.cos(X_sub[:,24])
jet_py = X_sub[:,22]*np.sin(X_sub[:,24])
jet_pz = X_sub[:,22]*np.sinh(X_sub[:,23])
X_sub = np.column_stack((X_sub, jet_px,jet_py,jet_pz))
#subleading jet momentum
subjet_px = X_sub[:,25]*np.cos(X_sub[:,27])
subjet_py = X_sub[:,25]*np.sin(X_sub[:,27])
subjet_pz = X_sub[:,25]*np.sinh(X_sub[:,26])
X_sub = np.column_stack((X_sub, subjet_px,subjet_py,subjet_pz))

# feature 3: abs angles
# Done after the momentum features so that sin(phi) still sees the signed angle.
#der_met_phi_centrality
X_sub[:,11] = np.abs(X_sub[:,11])
#tau phi
X_sub[:,15] = np.abs(X_sub[:,15])
#lep phi
X_sub[:,18] = np.abs(X_sub[:,18])
#met phi
X_sub[:,20] = np.abs(X_sub[:,20])
#lead jet phi
X_sub[:,24] = np.abs(X_sub[:,24])
#sublead jet phi
X_sub[:,27] = np.abs(X_sub[:,27])

# Standardize, zero-fill and prepend the bias column, exactly as for the training matrix.
X_sub = make_features(X_sub)

In [None]:
# Predict with the trained MLP (the linear-model alternative is kept below for reference).
# sub_pred = predict_labels(w,X_sub) 
sub_pred = mlp.predict(X_sub)
# predict returns 0/1; map to -1/+1 (0 -> -1, 1 -> +1)
sub_pred = sub_pred*2 -1

In [None]:
# Quick balance check: with -1/+1 labels the mean equals (share of +1) - (share of -1).
sub_pred.mean()

In [None]:
# Write ids and predicted labels to disk via the project helper create_csv_submission.
create_csv_submission(ids, sub_pred, "submission_test.csv")

In [None]:
# Read the submission file back into a DataFrame (it is re-saved in the next cell).
t = pd.read_csv("submission_test.csv")

In [None]:
# Overwrite the file with pandas' own CSV formatting, without an index column.
# NOTE(review): presumably this normalises the file written by create_csv_submission -- confirm it is still needed.
t.to_csv("submission_test.csv",index = False)