## data preperation

### file opening

In [1]:
import numpy as np
from implementations import *
from proj1_helpers import *
from datetime import datetime
np.random.seed(2)

In [2]:
y,X,ids = load_csv_data("train.csv")
#ADD BIAS
import pandas as pd
X = np.where(X == -999., np.nan, X)
df = pd.DataFrame(X)

In [3]:
def make_features(X):
    # converting -999. to nan to use np.nanmean and np.nanstd
    X = np.where(X == -999., np.nan, X)
    # standardizing the data Xd = (X_d - E[X_d])/(std(X_d))
    X, means, stds = standardize(X)
    # since data is standirdized, the mean is more or less 0 for each feature so replacing by zero is reasonable and helps computations
    X = np.where(np.isnan(X), 0, X)
    # adding the 1 padding
    return np.column_stack((np.ones(X.shape[0]), X))

In [4]:
#feature 1: correlations der_mass_MMC
col_means = np.nanmean(X, axis=0)
idxs = np.where(np.isnan(X))
X[idxs] = np.take(col_means, idxs[1])
X_gt_mmc = np.array(X[:,0], copy=True)
X_gt_mmc[X_gt_mmc <= 140] = 140
# X = np.column_stack((X, X_gt_mmc))
X[:,0][X[:,0] > 140] = 140
X = np.column_stack((X, X_gt_mmc))

#feature 2: add momentums
#tau momentum
tau_px = X[:,13]*np.cos(X[:,15])
tau_py = X[:,13]*np.sin(X[:,15])
tau_pz = X[:,13]*np.sinh(X[:,14])
X = np.column_stack((X, tau_px,tau_py,tau_pz))
#lep momentum
lep_px = X[:,16]*np.cos(X[:,18])
lep_py = X[:,16]*np.cos(X[:,18])
lep_pz = X[:,16]*np.cos(X[:,17])
X = np.column_stack((X, lep_px,lep_py,lep_pz))
#leading jet momentum
jet_px = X[:,22]*np.cos(X[:,24])
jet_py = X[:,22]*np.cos(X[:,24])
jet_pz = X[:,22]*np.cos(X[:,23])
X = np.column_stack((X, jet_px,jet_py,jet_pz))
#subleading jet momentum
subjet_px = X[:,25]*np.cos(X[:,27])
subjet_py = X[:,25]*np.cos(X[:,27])
subjet_pz = X[:,25]*np.cos(X[:,26])
X = np.column_stack((X, subjet_px,subjet_py,subjet_pz))

# feature 3: abs angles
#der_met_phi_centrality
X[:,11] = np.abs(X[:,11])
#tau phi
X[:,15] = np.abs(X[:,15])
#lep phi
X[:,18] = np.abs(X[:,18])
#met phi
X[:,20] = np.abs(X[:,20])
#lead jet phi
X[:,24] = np.abs(X[:,24])
#sublead jet phi
X[:,27] = np.abs(X[:,27])

df = pd.DataFrame(X)
df.head(20)
X[:,11].mean()

X = make_features(X)

cutoff = int(0.8*((X.shape)[0]))
X_train = X[:cutoff]
y_train = y[:cutoff]
X_test = X[cutoff:]
y_test = y[cutoff:]

# Least Squares

## gradient descent Least Squares

In [5]:
max_iter = 50
losses = []
ws = []
gammas = np.linspace(0.1,0.3,21)
for gamma in gammas:
    w, loss = least_squares_GD(y_train, X_train, np.zeros(X_train.shape[1]), max_iter, gamma)
    losses.append(loss)
    ws.append(w)

In [6]:
index = np.argmin(losses)
w = ws[index]
loss = losses[index]
gammas[index]

0.29

In [7]:
pred_tr = X_train @ w
pred_tr = np.where(pred_tr > 1/2, 1, 0)
accuracy = 100 - 100 * np.sum(np.abs(y_train - pred_tr)) / X_train.shape[0]
print("accuracy on training set is of {}%, loss is of {}".format(accuracy, loss))

accuracy on training set is of 79.01599999999999%, loss is of 0.15698247190125425


In [8]:
pred = X_test @ w
pred = np.where(pred > 1/2, 1, 0)
accuracy = 100 - 100 * np.sum(np.abs(y_test - pred)) / X_test.shape[0]
print("accuracy on test set is of {} %".format(accuracy))

accuracy on test set is of 79.242 %


## stochastic gradient descent least squares

In [23]:
max_iter = 1000
losses = []
ws = []
gammas = np.linspace(0.01,0.03,21)
for gamma in gammas:
    w, loss = least_squares_SGD(y_train, X_train, np.zeros(X_train.shape[1]), max_iter, gamma)
    losses.append(loss)
    ws.append(w)

In [24]:
index = np.argmin(losses)
w = ws[index]
loss = losses[index]
gammas[index]

0.01

In [25]:
pred_tr = X_train @ w
pred_tr = np.where(pred_tr > 1/2, 1, 0)
accuracy = 100 - 100 * np.sum(np.abs(y_train - pred_tr)) / X_train.shape[0]
print("accuracy on training set is of {}%, loss is of {}".format(accuracy, loss))

accuracy on training set is of 73.3605%, loss is of 0.2126655179650995


In [26]:
pred = X_test @ w
pred = np.where(pred > 1/2, 1, 0)
accuracy = 100 - 100 * np.sum(np.abs(y_test - pred)) / X_test.shape[0]
print("accuracy on test set is of {} %".format(accuracy))

accuracy on test set is of 73.394 %


## least squares

In [27]:
w, loss = least_squares(X_train, y_train)

In [28]:
pred_tr = X_train @ w
pred_tr = np.where(pred_tr > 1/2, 1, 0)
accuracy = 100 - 100 * np.sum(np.abs(y_train - pred_tr)) / X_train.shape[0]
print("accuracy on training set is of {}%, loss is of {}".format(accuracy, loss))

accuracy on training set is of 69.9195%, loss is of 0.2785096416848562


In [29]:
pred = X_test @ w
pred = np.where(pred > 1/2, 1, 0)
accuracy = 100 - 100 * np.sum(np.abs(y_test - pred)) / X_test.shape[0]
print("accuracy on test set is of {} %".format(accuracy))

accuracy on test set is of 70.248 %


## ridge regression

In [30]:
lambdas = np.linspace(0.01, 0.1, 11)
losses = []
ws = []
for lambda_ in lambdas:
    w, loss = ridge_regression(X_train, y_train, lambda_)
    ws.append(w)
    losses.append(loss)

In [31]:
accuracies = []
for w in ws:
    pred_test = X_test @ w
    pred_test = np.where(pred_test > 1/2, 1, 0)
    accuracy = 100 - 100 * np.sum(np.abs(y_test - pred_test)) / X_test.shape[0]
    accuracies.append(accuracy)
index = np.argmax(accuracies)
w = ws[index]

In [32]:
pred_tr = X_train @ w
pred_tr = np.where(pred_tr > 1/2, 1, 0)
accuracy = 100 - 100 * np.sum(np.abs(y_train - pred_tr)) / X_train.shape[0]
print("accuracy on training set is of {}%, loss is of {}".format(accuracy, loss))

accuracy on training set is of 78.998%, loss is of 0.16702375764989005


In [33]:
pred = X_test @ w
pred = np.where(pred > 1/2, 1, 0)
accuracy = 100 - 100 * np.sum(np.abs(y_test - pred)) / X_test.shape[0]
print("accuracy on test set is of {} %".format(accuracy))

accuracy on test set is of 79.166 %


## logistic regression

In [34]:
max_iter = 1000
gamma = 0.05
w, loss = logistic_regression(y_train, X_train, np.zeros(X.shape[1]), max_iter, gamma)

In [35]:
pred_tr = X_train @ w
pred_tr = np.where(pred_tr > 1/2, 1, 0)
accuracy = 100 - 100 * np.sum(np.abs(y_train - pred_tr)) / X_train.shape[0]
print("accuracy on training set is of {}%, loss is of {}".format(accuracy, loss))

accuracy on training set is of 69.9105%, loss is of 2.7688947890856888


## reg logistic regression

In [36]:
max_iter = 1000
gamma = 0.05
lambda_ = 0.01
w, loss = reg_logistic_regression(y_train, X_train, lambda_, np.zeros(X.shape[1]), max_iter, gamma)

In [37]:
pred_tr = X_train @ w
pred_tr = np.where(pred_tr > 1/2, 1, 0)
accuracy = 100 - 100 * np.sum(np.abs(y_train - pred_tr)) / X_train.shape[0]
print("accuracy on training set is of {}%, loss is of {}".format(accuracy, loss))

accuracy on training set is of 75.7025%, loss is of 0.4936587877193814


# MLP

### backprop

For MSE:

$ 
    \frac{\delta L}{a_n} = \frac{\delta (a_n - y)^2}{\delta a_{n}} = 2(a_n - y)  \\
    \frac{\delta a_{i}}{\delta z_{i}} = \frac{\delta S(z_{i})}{\delta z_{i}} = S(z_{i})(1 - S(z_{i})) \\ 
    \frac{\delta z_{i+1}}{\delta w_{i}} = \frac{\delta (a_{i} * w_{i} + b_{i})}{\delta w_{i}} = a_{i} \\
    \frac{\delta z_{i+1}}{\delta b_{i}} = \frac{\delta (a_{i} * w_{i} + b_{i})}{\delta b_{i}} = 1  \\
    \frac{\delta z_{i+1}}{\delta a_{i}} = \frac{\delta (a_{i} * w_{i} + b_{i})}{\delta a_{i}}= w_{i} $

# tryin' ma best to vectorize baby

In [None]:
def relu(z):
    return np.where(z < 0, 0, z)

def relu_gradient(z):
    return np.where(z < 0, 0, 1)

def sigmoid(z):
    return 1 / (1 + np.exp(-z))

def sigmoid_gradient(z):
    return sigmoid(z) * (1 - sigmoid(z))

def grad_loss(y_pred, y):
    return y_pred - y

def BCE_gradient(y,y_pred):
    #return y_pred-y
    return (-y/y_pred + (1-y)/(1-y_pred))

class layer:

    def __init__(self, dim_0, dim_1, activation):
        self.w = np.random.randn(dim_0, dim_1) / np.sqrt(dim_0)
        self.b = np.zeros(dim_0)
        if activation == 'relu':
            self.f = relu
            self.f_grad = relu_gradient
        if activation == 'sigmoid':
            self.f = sigmoid
            self.f_grad = sigmoid_gradient

    def feed_forward(self, a):
        self.a_prev = a
        if len(a.shape) == 1:
            # for batch_size 1
            self.z = np.dot(self.w, a) + self.b
        else:
            self.z = np.dot(self.w, a) + np.tile(self.b, (a.shape[1],1)).T
        self.a = self.f(self.z)
        return self.a

    def back_propagate(self, grad):
        grad = grad * self.f_grad(self.z)
        self.w_grad = grad @ self.a_prev.T
        self.b_grad = np.sum(grad, axis = 1)
        return (grad.T @ self.w).T


class MLP:
    # trying a vectorized MLP
    def __init__(self, dim, activations):
        
        self.layers = []
        for n in range(len(dim)-1):
            self.layers.append(layer(dim[n + 1], dim[n], activations[n]))
        
    def feed_forward(self, X):
        a = X
        for l in self.layers:
            a = l.feed_forward(a)
        return a
    
    def back_propagate(self, y_pred, y):
        grad = BCE_gradient(y, y_pred)
        for l in np.flip(self.layers):
            grad = l.back_propagate(grad)
        
            
    def gradient_descent_step(self, gamma, weight_decay):
        for l in self.layers:
            l.w -= (l.w_grad + l.w * weight_decay) * gamma
            l.b -= (l.b_grad + l.b * weight_decay) * gamma
            
    def train(self, X, Y, batch_size, max_iter, gamma, weight_decay, number_of_loss_computations = 5):
        
        start = datetime.now()
        gamma = gamma
        div = int(max_iter / number_of_loss_computations)
        
        for i in range(max_iter):
            
            if i == int(max_iter / 3):
                gamma = 0.1 * gamma
            if i % div == 0:
                print("{}% of the way".format(int(i/max_iter * 100)))
                print(self.BCE_loss(X, Y))
                
            idxs = np.arange(X.shape[0])
            np.random.shuffle(idxs)
            idxs = idxs[:batch_size]
            
            X_batch = X[idxs]
            y_batch = y[idxs]
            y_pred = self.feed_forward(X_batch.T)
            self.back_propagate(y_pred, y_batch)
            self.gradient_descent_step(gamma / batch_size, weight_decay)

        end = datetime.now()
        print("time taken:", end - start)
                
    def BCE_loss(self,X, y):
        loss = 0
        N = len(y)
        for i in range(N):
            y_pred = self.feed_forward(X[i])
            eps = 1e-7
            loss_i = -(y[i]*np.log(y_pred+eps) + (1-y[i])*np.log(1-y_pred+eps))
            loss = loss + loss_i/N
        
        return loss
    
    
    def BCE_loss_vect(self, X, y):
        y_pred = self.feed_forward(X.T)
        return np.mean(y_pred * np.log(y_pred + eps) + (1 - y_pred) * np.log(1 - y_pred + eps))

    
    def predict(self, X):
        y = self.feed_forward(X.T)
        return np.where(y < 0.5, 0, 1)

In [None]:
np.random.seed(1)
in_dim = X_train.shape[1]
n_h1 = 100
n_h2 = 100
n_h3 = 100
n_h4 = 100
n_h5 = 100
n_h6 = 100
n_h7 = 100
out_dim = 1
dimensions = [in_dim, n_h1,n_h2,n_h3,n_h4,n_h5,n_h6,n_h7,out_dim]
activations = ['relu','relu','relu','relu','relu','relu','relu','sigmoid']

# mlp = MLP(gamma = gamma, dimensions = dimensions, activations = activations, weight_decay = weight_decay)

dimensions = [in_dim, n_h1,n_h2,n_h3,n_h4,n_h5,n_h6,n_h7,out_dim]
activations = ['relu','relu','relu','relu','relu','relu','relu','sigmoid']
mlp_1 = MLP(dimensions, activations)
dimensions = [in_dim, 30,30,30,out_dim]
activations = ['relu','relu','relu','sigmoid']
mlp_2 = MLP(dimensions, activations)
dimensions = [in_dim, 50,50,50,50,50,out_dim]
activations = ['relu','relu','relu','relu','relu','sigmoid']
mlp_3 = MLP(dimensions, activations)

In [None]:
gamma = 0.0001
weight_decay = 0.001
max_iter = 50000
batch_size = 1

In [None]:
mlp_1.train(X, y, batch_size, max_iter, gamma, weight_decay)

In [None]:
y_pred1 = mlp_1.feed_forward(X_train.T)
acc_train_1 = 1-np.sum(np.abs(y_pred1 - y_train)) / X_train.shape[0]
y_pred1 = mlp_1.feed_forward(X_test.T)
acc_test_1 = 1-np.sum(np.abs(y_pred1 - y_test)) / X_test.shape[0]
print("first MLP")
print("training accuracy: {}% | test accuracy: {}%".format(acc_train_1 * 100, acc_test_1 * 100))

In [None]:
mlp_2.train(X, y, batch_size, max_iter, gamma, weight_decay * 0.1)

In [None]:
y_pred2 = mlp_2.feed_forward(X_train.T)
acc_train_2 = 1-np.sum(np.abs(y_pred2 - y_train)) / X_train.shape[0]
y_pred2 = mlp_1.feed_forward(X_test.T)
acc_test_2 = 1-np.sum(np.abs(y_pred2 - y_test)) / X_test.shape[0]
print("second MLP")
print("training accuracy: {}% | test accuracy: {}%".format(acc_train_2 * 100, acc_test_2 * 100))

In [None]:
mlp_3.train(X, y, batch_size, max_iter, gamma, weight_decay * 0.1)

In [None]:
y_pred3 = mlp_3.feed_forward(X_train.T)
acc_train_3 = 1-np.sum(np.abs(y_pred3 - y_train)) / X_train.shape[0]
y_pred3 = mlp_3.feed_forward(X_test.T)
acc_test_3 = 1-np.sum(np.abs(y_pred3 - y_test)) / X_test.shape[0]
print("third MLP")
print("training accuracy: {}% | test accuracy: {}%".format(acc_train_3 * 100, acc_test_3 * 100))

In [None]:
_,X_sub,ids = load_csv_data("test.csv")
#feature 1: correlations der_mass_MMC
X_sub = np.where(X_sub == -999., np.nan, X_sub)
col_means = np.nanmean(X_sub, axis=0)
idxs = np.where(np.isnan(X_sub))
X_sub[idxs] = np.take(col_means, idxs[1])
X_gt_mmc = np.array(X_sub[:,0], copy=True)
X_gt_mmc[X_gt_mmc <= 140] = 140
# X = np.column_stack((X, X_gt_mmc))
X_sub[:,0][X_sub[:,0] > 140] = 140
X_sub = np.column_stack((X_sub, X_gt_mmc))

#feature 2: add momentums
#tau momentum
tau_px = X_sub[:,13]*np.cos(X_sub[:,15])
tau_py = X_sub[:,13]*np.sin(X_sub[:,15])
tau_pz = X_sub[:,13]*np.sinh(X_sub[:,14])
X_sub = np.column_stack((X_sub, tau_px,tau_py,tau_pz))
#lep momentum
lep_px = X_sub[:,16]*np.cos(X_sub[:,18])
lep_py = X_sub[:,16]*np.cos(X_sub[:,18])
lep_pz = X_sub[:,16]*np.cos(X_sub[:,17])
X_sub = np.column_stack((X_sub, lep_px,lep_py,lep_pz))
#leading jet momentum
jet_px = X_sub[:,22]*np.cos(X_sub[:,24])
jet_py = X_sub[:,22]*np.cos(X_sub[:,24])
jet_pz = X_sub[:,22]*np.cos(X_sub[:,23])
X_sub = np.column_stack((X_sub, jet_px,jet_py,jet_pz))
#subleading jet momentum
subjet_px = X_sub[:,25]*np.cos(X_sub[:,27])
subjet_py = X_sub[:,25]*np.cos(X_sub[:,27])
subjet_pz = X_sub[:,25]*np.cos(X_sub[:,26])
X_sub = np.column_stack((X_sub, subjet_px,subjet_py,subjet_pz))

# feature 3: abs angles
#der_met_phi_centrality
X_sub[:,11] = np.abs(X_sub[:,11])
#tau phi
X_sub[:,15] = np.abs(X_sub[:,15])
#lep phi
X_sub[:,18] = np.abs(X_sub[:,18])
#met phi
X_sub[:,20] = np.abs(X_sub[:,20])
#lead jet phi
X_sub[:,24] = np.abs(X_sub[:,24])
#sublead jet phi
X_sub[:,27] = np.abs(X_sub[:,27])

X_sub = make_features(X_sub)

In [None]:
p_1 = mlp_1.feed_forward(X_sub.T)
p_2 = mlp_2.feed_forward(X_sub.T)
p_3 = mlp_3.feed_forward(X_sub.T)
p = np.mean((p_1,p_2,p_3),axis = 0)
p = p > 0.5
sub_pred = p*2 -1

In [None]:
sub_pred = sub_pred.squeeze()

In [None]:
create_csv_submission(ids, sub_pred, "submission_test.csv")

In [None]:
t = pd.read_csv("submission_test.csv")

In [None]:
t.to_csv("submission_test.csv",index = False)