In [None]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import csv
from implementations import *
%load_ext autoreload
%autoreload 2

## Data preparation

### file opening

In [None]:
# Read both CSV files fully into 2-D string arrays (header row included).
# The context managers close each file even if parsing raises, and newline=''
# is the open mode the csv module documents for files handed to csv.reader.
with open('test.csv', newline='') as File_test:
    data_test = np.array(list(csv.reader(File_test)))

with open('train.csv', newline='') as File_train:
    data_train = np.array(list(csv.reader(File_train)))

### features 

In [None]:
def make_features(X):
    """Build the design matrix: standardized features preceded by a column of ones.

    Entries equal to -999. are treated as missing: they are turned into NaN
    before `standardize` is called and replaced by 0 afterwards.
    """
    # flag the -999. sentinel as NaN so that it can be told apart from real values
    missing = (X == -999.)
    X = np.where(missing, np.nan, X)
    # Xd = (X_d - E[X_d]) / std(X_d); the returned statistics are not used here
    X, _, _ = standardize(X)
    # after standardization 0 is (roughly) the column mean, so it is a neutral fill value
    still_nan = np.isnan(X)
    X = np.where(still_nan, 0, X)
    # prepend the constant-1 column
    ones = np.ones(X.shape[0])
    return np.column_stack((ones, X))

In [None]:


# converting the strings into floats and removing features names, labels and indexes
# (the `np.float` alias was removed in NumPy 1.24; the builtin `float` selects the same dtype)
X = np.array(data_train[1:,2:]).astype(float)

# sizes of the three consecutive row ranges; they must cover the file exactly
num_train = 150000
num_val = 50000
num_test = 50000
N = X.shape[0]
assert num_train + num_val + num_test == N

# NOTE: make_features is called once per split, so each split is standardized
# with its own statistics rather than with the training statistics.
training_set = make_features(X[:num_train])
validation_set = make_features(X[num_train:num_train+num_val])
test_set = make_features(X[-num_test:])

In [None]:
# one line per split: (rows, columns) of each design matrix
for split in (training_set, validation_set, test_set):
    print(split.shape)

### labels

In [None]:
def make_labels(data):
    """Return 0/1 labels read from column 1 of `data`: 1 where the entry is 'b', else 0.

    The first row of `data` is skipped before the column is read.
    """
    raw = np.array(data[1:, 1])
    is_b = (raw == 'b')
    return np.where(is_b, 1, 0)
    
# Each slice starts one row before the labels that are wanted, because
# make_labels skips the first row it receives (the header row for the first
# slice, the last row of the previous split for the other two).
training_labels = make_labels(data_train[:num_train+1])
validation_labels = make_labels(data_train[num_train:num_train+num_val+1])
test_labels = make_labels(data_train[-num_test-1:])

In [None]:
# one line per split: shape of each label vector
for split_labels in (training_labels, validation_labels, test_labels):
    print(split_labels.shape)

## learning Least Squares

In [None]:
# Run least_squares_GD once per step size and keep every (loss, weights) pair.
# NOTE(review): the model is fitted on the validation split here, not on the
# training split -- confirm that this is intended.
losses, ws = [], []

for gamma in np.linspace(0.1,1,10):
    # fresh zero start for every run, 100 iterations each
    initial_w = np.zeros(validation_set.shape[1])
    loss, w = least_squares_GD(validation_labels, validation_set, initial_w, 100, gamma)
    losses.append(loss)
    ws.append(w)

In [None]:
# Keep the weights of the run with the smallest loss. np.argmin would return
# the position of a NaN loss (a diverged run); np.nanargmin ignores NaN entries
# and raises ValueError if every run produced NaN.
w = ws[np.nanargmin(losses)]

## test Least Squares

In [None]:
# linear scores thresholded at 1/2 give the 0/1 predictions on the training split
scores_tr = training_set @ w
pred_tr = np.where(scores_tr > 1/2, 1, 0)
# every wrong prediction contributes |label - prediction| = 1
n_wrong_tr = np.sum(np.abs(training_labels - pred_tr))
accuracy = 100 - 100 * n_wrong_tr / training_labels.shape[0]
print("accuracy on training set is of {} %".format(accuracy))

In [None]:
# linear scores thresholded at 1/2 give the 0/1 predictions on the test split
scores = test_set @ w
pred = np.where(scores > 1/2, 1, 0)
# every wrong prediction contributes |label - prediction| = 1
n_wrong = np.sum(np.abs(test_labels - pred))
accuracy = 100 - 100 * n_wrong / test_labels.shape[0]
print("accuracy on test set is of {} %".format(accuracy))

# MLP

In [None]:
import numpy as np
from implementations_clean import *
from proj1_helpers import *
import pandas as pd

y,X,ids = load_csv_data("train.csv")
#ADD BIAS

# -999. marks a missing measurement; switch to NaN so np.nanmean can skip it
X = np.where(X == -999., np.nan, X)

#feature 1: correlations der_mass_MMC
# impute every missing entry with the mean of its column
col_means = np.nanmean(X, axis=0)
idxs = np.where(np.isnan(X))
X[idxs] = np.take(col_means, idxs[1])
# split column 0 at 140: column 0 keeps min(x, 140) and a new trailing column
# holds max(x, 140)
X_gt_mmc = np.array(X[:,0], copy=True)
X_gt_mmc[X_gt_mmc <= 140] = 140
X[:,0][X[:,0] > 140] = 140
X = np.column_stack((X, X_gt_mmc))

#feature 2: add momentums
# Cartesian components from (pt, eta, phi):
#   px = pt * cos(phi), py = pt * sin(phi), pz = pt * sinh(eta)
# New columns are appended at the end, so the original column indices stay valid.
#tau momentum (pt, eta, phi in columns 13, 14, 15)
tau_px = X[:,13]*np.cos(X[:,15])
tau_py = X[:,13]*np.sin(X[:,15])
tau_pz = X[:,13]*np.sinh(X[:,14])
X = np.column_stack((X, tau_px,tau_py,tau_pz))
#lep momentum (pt, eta, phi in columns 16, 17, 18)
lep_px = X[:,16]*np.cos(X[:,18])
lep_py = X[:,16]*np.sin(X[:,18])
lep_pz = X[:,16]*np.sinh(X[:,17])
X = np.column_stack((X, lep_px,lep_py,lep_pz))
# Column 22 holds small discrete values (0, 1, 2 in the displayed head of the
# data), so the jet measurements are taken to start at column 23.
# NOTE(review): confirm the (pt, eta, phi) layout against the dataset's column list.
#leading jet momentum (pt, eta, phi in columns 23, 24, 25)
jet_px = X[:,23]*np.cos(X[:,25])
jet_py = X[:,23]*np.sin(X[:,25])
jet_pz = X[:,23]*np.sinh(X[:,24])
X = np.column_stack((X, jet_px,jet_py,jet_pz))
#subleading jet momentum (pt, eta, phi in columns 26, 27, 28)
subjet_px = X[:,26]*np.cos(X[:,28])
subjet_py = X[:,26]*np.sin(X[:,28])
subjet_pz = X[:,26]*np.sinh(X[:,27])
X = np.column_stack((X, subjet_px,subjet_py,subjet_pz))

# feature 3: abs angles
# done after the momenta so that those are computed from the signed angles
#der_met_phi_centrality
X[:,11] = np.abs(X[:,11])
#tau phi
X[:,15] = np.abs(X[:,15])
#lep phi
X[:,18] = np.abs(X[:,18])
#met phi
X[:,20] = np.abs(X[:,20])
#lead jet phi
X[:,25] = np.abs(X[:,25])
#sublead jet phi
X[:,28] = np.abs(X[:,28])

df = pd.DataFrame(X)
# displayed as the cell output: mean of column 11 after taking absolute values
X[:,11].mean()

In [None]:
# DataFrame view of the current feature matrix, used by the inspection cells below
df = pd.DataFrame(X)

In [None]:
# the 22nd column shouldn't be normalized; it should be expanded into one new column per discrete value
df[22].head()

In [None]:
# the columns with index 1, 3 and 31 have a high weight value in the first layer and could be expanded into polynomials
df.iloc[:,[1,3,31]].head()

In [None]:
# Standardize X and prepend the column of ones (see make_features).
# NOTE: this cell is not idempotent -- X is overwritten, so running it a second
# time standardizes again and prepends another column of ones.
X = make_features(X)
df = pd.DataFrame(X)
df.head()

In [None]:
# 80/20 split in file order: the first 80% of the rows train, the remainder tests
cutoff = int(0.8 * X.shape[0])
X_train, X_test = X[:cutoff], X[cutoff:]
y_train, y_test = y[:cutoff], y[cutoff:]

### Loss and standardization helpers

In [None]:
def compute_loss(y, tx, w, mse = True):
    """Loss of the linear model `tx @ w` against the targets `y`.

    With `mse` true (the default) this is e.T @ e / (2 * N), where
    e = y - tx @ w and N = y.shape[0]; otherwise it is the mean absolute
    value of e.
    """
    n_samples = y.shape[0]
    residual = y - tx @ w
    if not mse:
        return np.mean(np.abs(residual))
    return 1/(2 * n_samples) * residual.T @ residual

def standardize(x):
    """Standardize the original data set column by column, ignoring NaNs.

    Parameters
    ----------
    x : array-like
        Data with samples along axis 0. NaN entries are skipped when the
        statistics are computed and stay NaN in the output.

    Returns
    -------
    x : ndarray
        (x - mean_x) / std_x.
    mean_x : ndarray
        Mean along axis 0, computed over the non-NaN entries.
    std_x : ndarray
        Standard deviation along axis 0 actually used for the division:
        computed over the non-NaN entries, with 0 replaced by 1 so that
        constant columns come out as 0 instead of NaN.
    """
    x = np.asarray(x, dtype=float)
    # nan-aware statistics: with np.mean / np.std a single NaN would turn the
    # whole column into NaN
    mean_x = np.nanmean(x, axis = 0)
    x = x - mean_x
    std_x = np.nanstd(x, axis = 0)
    # a constant column has std 0; dividing by 1 leaves its centered zeros intact
    std_x = np.where(std_x == 0, 1.0, std_x)
    x = x / std_x
    return x, mean_x, std_x

### Class

### backprop

For MSE:

$ 
    \frac{\partial L}{\partial a_n} = \frac{\partial (a_n - y)^2}{\partial a_{n}} = 2(a_n - y)  \\
    \frac{\partial a_{i}}{\partial z_{i}} = \frac{\partial S(z_{i})}{\partial z_{i}} = S(z_{i})(1 - S(z_{i})) \\ 
    \frac{\partial z_{i+1}}{\partial w_{i}} = \frac{\partial (a_{i} * w_{i} + b_{i})}{\partial w_{i}} = a_{i} \\
    \frac{\partial z_{i+1}}{\partial b_{i}} = \frac{\partial (a_{i} * w_{i} + b_{i})}{\partial b_{i}} = 1  \\
    \frac{\partial z_{i+1}}{\partial a_{i}} = \frac{\partial (a_{i} * w_{i} + b_{i})}{\partial a_{i}}= w_{i} $

In [None]:
class MLP:
    """Fully connected network trained by mini-batch gradient descent.

    Layers are numbered from 1 (the input). `weights[n]` and `bias[n]` map the
    activations of layer n to the pre-activations `z[n + 1]`, and
    `activations[n]` turns `z[n + 1]` into `a[n + 1]`.

    Parameters
    ----------
    lambda_ : float
        Step size of the gradient descent update.
    dimensions : list of int
        Width of every layer, input first, output last.
    activations : list of str
        One name per weight layer: 'relu', 'sigmoid' or 'linear'. Any name
        other than 'relu' or 'sigmoid' is treated as 'linear'.
    weight_decay : float
        L2 coefficient added to both the weight and the bias gradients.

    The loss is binary cross-entropy with a single output unit.
    NOTE(review): BCE assumes labels in {0, 1} -- confirm the label encoding
    of the data that is passed to `train`.
    """
    #activations: 'relu', 'sigmoid', 'linear'
    #loss assumed to be BCE
    def __init__(self, lambda_ = 0.001,  dimensions = [2,10,1], activations = ['relu','sigmoid'] ,weight_decay = 0):
        assert (len(dimensions)-1) == len(activations), "Number of dimensions and activation functions do not match"
        # copies, so that the shared default lists are never aliased by an instance
        self.dimensions = list(dimensions)
        self.activation_names = list(activations)
        # number of layers of our MLP (input layer included)
        self.num_layers = len(self.dimensions)
        self.lambda_ = lambda_
        self.weight_decay = weight_decay

        # parameters and activation functions, keyed by layer number 1..num_layers-1
        self.weights = {}
        self.bias = {}
        self.activations = {}
        self.activations_grad = {}

        for n in range(self.num_layers - 1):
            fan_in, fan_out = self.dimensions[n], self.dimensions[n + 1]
            # standard normal weights scaled by 1 / sqrt(size of the layer they read from)
            self.weights[n + 1] = np.random.randn(fan_out, fan_in) / np.sqrt(fan_in)
            # bias are all initialized to zero
            self.bias[n + 1] = np.zeros(fan_out)

            name = self.activation_names[n]
            if name == 'relu':
                self.activations[n+1] = self.relu
                self.activations_grad[n+1] = self.relu_gradient
            elif name == 'sigmoid':
                self.activations[n+1] = self.sigmoid
                self.activations_grad[n+1] = self.sigmoid_gradient
            else:
                self.activations[n+1] = self.linear
                self.activations_grad[n+1] = self.linear_gradient

    def feed_forward(self, x):
        """Forward pass for a batch.

        `x` is (batch_size, dimensions[0]); a single 1-D sample is accepted and
        treated as a batch of one. Returns `(y_pred, a, z)`: `y_pred` is the
        output layer, (batch_size, dimensions[-1]); `a` maps layer number ->
        activations (a[1] is the input); `z` maps layer number ->
        pre-activations (keys 2..num_layers). Both are kept for back_propagate.
        """
        z = {}
        # the first layer is the input data
        a = {1: np.atleast_2d(x)}
        for n in range(1, self.num_layers):
            # z[n+1] = a[n] W[n]^T + b[n], for the whole batch at once
            z[n + 1] = a[n] @ self.weights[n].T + self.bias[n]
            a[n + 1] = self.activations[n](z[n + 1])

        # the prediction is the final layer
        y_pred = a[self.num_layers]
        return y_pred, a, z

    # returns a prediction
    def predict(self, X):
        """Return one 0./1. label per row of `X`: 1. where the output exceeds 0.5."""
        y_proba, _, _ = self.feed_forward(X)
        return (y_proba[:, 0] > 0.5).astype(float)

    def back_propagate(self, y, y_pred, a, z):
        """Gradients of the batch-mean loss with respect to every weight and bias.

        `y_pred`, `a` and `z` are the values returned by `feed_forward` for the
        batch whose labels are `y`. Returns `(weights_gradient, bias_gradient)`,
        two dicts keyed by layer number with the same shapes as `self.weights`
        and `self.bias`.
        """
        batch_size = y_pred.shape[0]
        last = self.num_layers - 1

        weights_gradient = {}
        bias_gradient = {}

        # delta = dL/dz of the output layer, one row per sample.
        # BCE_gradient returns y_pred - y, which for a sigmoid output already is
        # dL/dz of the cross-entropy; for any other output activation it is
        # used as dL/da and multiplied by the activation derivative.
        delta = self.BCE_gradient(y, y_pred).reshape(y_pred.shape)
        if self.activation_names[last - 1] != 'sigmoid':
            delta = delta * self.activations_grad[last](z[last + 1])

        for n in range(last, 0, -1):
            # mean over the batch of outer(delta_i, a[n]_i)
            weights_gradient[n] = delta.T @ a[n] / batch_size
            bias_gradient[n] = delta.mean(axis = 0)
            if n > 1:
                # chain rule: dL/da[n] = delta W[n], then through the activation
                # that produced a[n] from z[n]
                delta = (delta @ self.weights[n]) * self.activations_grad[n - 1](z[n])

        return weights_gradient, bias_gradient

    #weight decay : l2 reg
    def gradient_descent_step(self, weights_gradient, bias_gradient):
        """Apply one gradient descent update (with L2 weight decay) to every layer."""
        for n in range(1, self.num_layers):
            self.weights[n] = self.weights[n] - self.lambda_ * (weights_gradient[n] + self.weight_decay*self.weights[n])
            self.bias[n] = self.bias[n] - self.lambda_ * (bias_gradient[n] + self.weight_decay*self.bias[n])

    def train(self, X, y, max_iter, batch_size = 5):
        """Run `max_iter` mini-batch steps and return the final loss on (X, y).

        Each step draws `batch_size` row indices uniformly with replacement.
        The loss on the full (X, y) is printed every 1500 iterations.
        """
        for i in range(max_iter):
            idxs = np.random.randint(0, X.shape[0],batch_size)
            # no squeeze: a batch of one must stay 2-D for feed_forward
            X_batch = X[idxs]
            y_batch = y[idxs]

            y_pred, a, z = self.feed_forward(X_batch)
            weights_gradient, bias_gradient = self.back_propagate(y_batch,y_pred,a, z)

            self.gradient_descent_step(weights_gradient, bias_gradient)

            if (i % 1500 == 0):
                loss = self.BCE_loss(X,y)
                print("computing loss...")
                print("Iteration : {}, loss : {}".format(i,loss))
        loss = self.BCE_loss(X,y)
        return loss

    def sigmoid(self,z):
        """Logistic function, element-wise."""
        return 1 / (1 + np.exp(-z))

    def sigmoid_gradient(self,z):
        """Derivative of the logistic function at `z`."""
        s = self.sigmoid(z)
        return s * (1 - s)

    def relu(self,z):
        """max(z, 0), element-wise."""
        return np.where(z < 0, 0, z)

    def relu_gradient(self, z):
        """1 where z >= 0 and 0 elsewhere."""
        return np.where(z < 0, 0, 1)

    def linear(self, z):
        """Identity activation."""
        return z

    def linear_gradient(self, z):
        """Derivative of the identity: ones with the shape of `z`."""
        return np.ones_like(z)

    def BCE_loss(self,X, y):
        """Mean binary cross-entropy of the network on (X, y), as a scalar.

        A small eps inside the logarithms keeps the value finite when a
        prediction is exactly 0 or 1.
        """
        eps = 1e-7
        y_pred, _, _ = self.feed_forward(X)
        p = y_pred.reshape(-1)
        t = np.asarray(y, dtype = float).reshape(-1)
        return -np.mean(t*np.log(p+eps) + (1-t)*np.log(1-p+eps))

    def BCE_gradient(self,y,y_pred):
        """Return `y_pred - y` as a flat vector with one entry per sample."""
        return np.asarray(y_pred).reshape(-1) - np.asarray(y).reshape(-1)


In [None]:
# architecture: input -> three hidden ReLU layers of 30 units -> one sigmoid output
in_dim = X_train.shape[1]
n_h1 = n_h2 = n_h3 = 30
out_dim = 1
dimensions = [in_dim, n_h1, n_h2, n_h3, out_dim]
activations = ['relu'] * 3 + ['sigmoid']
# step size and weight-decay coefficient handed to the MLP constructor
lambda_ = 0.001
weight_decay = 0.001
mlp = MLP(
    lambda_ = lambda_,
    dimensions = dimensions,
    activations = activations,
    weight_decay = weight_decay,
)

In [None]:
# 3000 mini-batch steps with MLP.train's default batch_size; the value displayed
# by the cell is the loss returned by train
mlp.train(X_train,y_train,max_iter = 3000)

In [None]:
# accuracy = 1 - mean absolute difference between predictions and labels (training rows)
y_pred = mlp.predict(X_train)
abs_err_train = np.sum(np.abs(y_pred - y_train))
acc = 1 - abs_err_train / X_train.shape[0]
print("accuracy at training: {} % ".format(acc * 100))

In [None]:
# accuracy = 1 - mean absolute difference between predictions and labels (held-out rows)
y_pred = mlp.predict(X_test)
abs_err_test = np.sum(np.abs(y_pred - y_test))
acc = 1 - abs_err_test / X_test.shape[0]
print("accuracy at testing: {} %".format(acc * 100))

# tryin' ma best to vectorize baby

In [3]:
import numpy as np
from implementations_clean import *
from proj1_helpers import *
import pandas as pd

y, X, ids = load_csv_data("train.csv")
#ADD BIAS

# -999. entries become NaN; every other entry is kept as is
is_missing = (X == -999.)
X = np.where(is_missing, np.nan, X)
df = pd.DataFrame(X)
# cell output: first rows of the raw features with NaN in place of -999.
df.head()


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,20,21,22,23,24,25,26,27,28,29
0,138.47,51.655,97.827,27.98,0.91,124.711,2.666,3.064,41.928,197.76,...,-0.277,258.733,2.0,67.435,2.15,0.444,46.062,1.24,-2.475,113.497
1,160.937,68.768,103.235,48.146,,,,3.473,2.078,125.157,...,-1.916,164.546,1.0,46.226,0.725,1.158,,,,46.226
2,,162.172,125.953,35.635,,,,3.148,9.336,197.814,...,-2.186,260.414,1.0,44.251,2.053,-2.028,,,,44.251
3,143.905,81.417,80.943,0.414,,,,3.31,0.414,75.968,...,0.06,86.062,0.0,,,,,,,0.0
4,175.864,16.915,134.805,16.405,,,,3.891,16.405,57.983,...,-0.871,53.131,0.0,,,,,,,0.0


In [4]:
def make_features(X):
    """Build the design matrix: standardized features preceded by a column of ones.

    Entries equal to -999. are treated as missing: they are turned into NaN
    before `standardize` is called and replaced by 0 afterwards.
    """
    # flag the -999. sentinel as NaN so that it can be told apart from real values
    missing = (X == -999.)
    X = np.where(missing, np.nan, X)
    # Xd = (X_d - E[X_d]) / std(X_d); the returned statistics are not used here
    X, _, _ = standardize(X)
    # after standardization 0 is (roughly) the column mean, so it is a neutral fill value
    still_nan = np.isnan(X)
    X = np.where(still_nan, 0, X)
    # prepend the constant-1 column
    ones = np.ones(X.shape[0])
    return np.column_stack((ones, X))

In [5]:
#feature 1: correlations der_mass_MMC
# impute every missing (NaN) entry with the mean of its column
col_means = np.nanmean(X, axis=0)
idxs = np.where(np.isnan(X))
X[idxs] = np.take(col_means, idxs[1])
# split column 0 at 140: column 0 keeps min(x, 140) and a new trailing column
# holds max(x, 140)
X_gt_mmc = np.array(X[:,0], copy=True)
X_gt_mmc[X_gt_mmc <= 140] = 140
X[:,0][X[:,0] > 140] = 140
X = np.column_stack((X, X_gt_mmc))

#feature 2: add momentums
# Cartesian components from (pt, eta, phi):
#   px = pt * cos(phi), py = pt * sin(phi), pz = pt * sinh(eta)
# New columns are appended at the end, so the original column indices stay valid.
#tau momentum (pt, eta, phi in columns 13, 14, 15)
tau_px = X[:,13]*np.cos(X[:,15])
tau_py = X[:,13]*np.sin(X[:,15])
tau_pz = X[:,13]*np.sinh(X[:,14])
X = np.column_stack((X, tau_px,tau_py,tau_pz))
#lep momentum (pt, eta, phi in columns 16, 17, 18)
lep_px = X[:,16]*np.cos(X[:,18])
lep_py = X[:,16]*np.sin(X[:,18])
lep_pz = X[:,16]*np.sinh(X[:,17])
X = np.column_stack((X, lep_px,lep_py,lep_pz))
# Column 22 holds small discrete values (0, 1, 2 in the displayed head of the
# data), so the jet measurements are taken to start at column 23.
# NOTE(review): confirm the (pt, eta, phi) layout against the dataset's column list.
#leading jet momentum (pt, eta, phi in columns 23, 24, 25)
jet_px = X[:,23]*np.cos(X[:,25])
jet_py = X[:,23]*np.sin(X[:,25])
jet_pz = X[:,23]*np.sinh(X[:,24])
X = np.column_stack((X, jet_px,jet_py,jet_pz))
#subleading jet momentum (pt, eta, phi in columns 26, 27, 28)
subjet_px = X[:,26]*np.cos(X[:,28])
subjet_py = X[:,26]*np.sin(X[:,28])
subjet_pz = X[:,26]*np.sinh(X[:,27])
X = np.column_stack((X, subjet_px,subjet_py,subjet_pz))

# feature 3: abs angles
# done after the momenta so that those are computed from the signed angles
#der_met_phi_centrality
X[:,11] = np.abs(X[:,11])
#tau phi
X[:,15] = np.abs(X[:,15])
#lep phi
X[:,18] = np.abs(X[:,18])
#met phi
X[:,20] = np.abs(X[:,20])
#lead jet phi
X[:,25] = np.abs(X[:,25])
#sublead jet phi
X[:,28] = np.abs(X[:,28])

# standardize and prepend the column of ones
# NOTE: not idempotent -- re-running this cell transforms X a second time
X = make_features(X)

df = pd.DataFrame(X)

# 80/20 split in file order (rows are not shuffled)
cutoff = int(0.8*((X.shape)[0]))
X_train = X[:cutoff]
y_train = y[:cutoff]
X_test = X[cutoff:]
y_test = y[cutoff:]

In [None]:
def relu(z):
    """Rectified linear unit: entries of `z` below zero become 0, the others pass through."""
    keep = np.logical_not(z < 0)
    return np.where(keep, z, 0)

def relu_gradient(z):
    """Derivative of relu: 0 where `z` is below zero, 1 everywhere else."""
    active = np.logical_not(z < 0)
    return np.where(active, 1, 0)

def sigmoid(z):
    """Logistic function 1 / (1 + exp(-z)), applied element-wise."""
    denominator = 1 + np.exp(-z)
    return 1 / denominator

def sigmoid_gradient(z):
    """Derivative of the logistic function: sigmoid(z) * (1 - sigmoid(z))."""
    s = sigmoid(z)
    return s * (1 - s)

def grad_loss(y_pred, y):
    """Difference between predictions and targets, `y_pred - y`."""
    residual = y_pred - y
    return residual

def BCE_gradient(y,y_pred):
    """Derivative of binary cross-entropy with respect to the predicted probability.

    Returns -y / p + (1 - y) / (1 - p), where p is `y_pred` clipped to
    [eps, 1 - eps] with eps = 1e-7. Without the clipping a prediction of
    exactly 0 or 1 (a saturated sigmoid) divides by zero and yields inf or NaN.
    This is the gradient with respect to the activation, so the caller still
    has to multiply by the derivative of the output activation.
    """
    eps = 1e-7
    p = np.clip(y_pred, eps, 1 - eps)
    return (-y/p + (1-y)/(1-p))

class layer:
    """One dense layer, `a -> f(w @ a + b)`, with samples stored as columns.

    Parameters
    ----------
    dim_0 : int
        Number of units of this layer (rows of `w`, length of `b`).
    dim_1 : int
        Number of inputs of this layer (columns of `w`).
    activation : str
        'relu', 'sigmoid' or 'linear'; any other value raises ValueError.
    """

    def __init__(self, dim_0, dim_1, activation):
        # standard normal weights scaled by 1 / sqrt(number of inputs)
        self.w = np.random.randn(dim_0, dim_1) / np.sqrt(dim_1)
        self.b = np.zeros(dim_0)
        if activation == 'relu':
            self.f = relu
            self.f_grad = relu_gradient
        elif activation == 'sigmoid':
            self.f = sigmoid
            self.f_grad = sigmoid_gradient
        elif activation == 'linear':
            self.f = lambda z: z
            self.f_grad = lambda z: np.ones_like(z)
        else:
            # fail here rather than with an AttributeError on the first forward pass
            raise ValueError("unknown activation: {!r}".format(activation))

    def feed_forward(self, a):
        """Return the activations for input `a` and cache what back_propagate needs.

        `a` is either one sample of shape (dim_1,) or a batch of shape
        (dim_1, batch_size); the result has shape (dim_0,) or
        (dim_0, batch_size) accordingly.
        """
        self.a_prev = a
        if len(a.shape) == 1:
            # for batch_size 1
            self.z = np.dot(self.w, a) + self.b
        else:
            # the bias is added to every column (sample)
            self.z = np.dot(self.w, a) + self.b[:, None]
        self.a = self.f(self.z)
        return self.a

    def back_propagate(self, grad):
        """Store `w_grad` / `b_grad` and return the gradient for the previous layer.

        `grad` is the gradient of the loss with respect to this layer's
        activations, with the same shape as the last `feed_forward` output.
        For a batch, `w_grad` and `b_grad` are summed (not averaged) over the
        samples.
        """
        grad = grad * self.f_grad(self.z)
        if grad.ndim == 1:
            # single sample: the weight gradient is an outer product
            self.w_grad = np.outer(grad, self.a_prev)
            self.b_grad = grad
        else:
            self.w_grad = grad @ self.a_prev.T
            self.b_grad = np.sum(grad, axis = 1)
        return self.w.T @ grad


class MLP_2:
    """Vectorized multi-layer perceptron built from `layer` objects.

    `dim` lists the layer widths (input first, output last) and `activations`
    gives one activation name per weight layer. Batches are passed to the
    layers with samples as columns.

    NOTE(review): the cross-entropy used here assumes labels in {0, 1} --
    confirm the label encoding of the data passed to `train`.
    """
    # trying a vectorized MLP
    def __init__(self, dim, activations):
        self.layers = [
            layer(dim[n + 1], dim[n], activations[n])
            for n in range(len(dim) - 1)
        ]

    def feed_forward(self, X):
        """Run `X` (features x samples, or a single 1-D sample) through every layer."""
        a = X
        for l in self.layers:
            a = l.feed_forward(a)
        return a

    def back_propagate(self, y_pred, y):
        """Back-propagate the BCE gradient; every layer stores its own w_grad / b_grad."""
        grad = BCE_gradient(y, y_pred)
        for l in reversed(self.layers):
            grad = l.back_propagate(grad)

    def gradient_descent_step(self, lambda_, weight_decay):
        """Update every layer in place with step size `lambda_` and L2 `weight_decay`."""
        for l in self.layers:
            l.w -= (l.w_grad + l.w * weight_decay) * lambda_
            l.b -= (l.b_grad + l.b * weight_decay) * lambda_

    def train(self, X, Y, batch_size, max_iter, lambda_, weight_decay):
        """Run `max_iter` mini-batch gradient steps on (X, Y).

        X is (samples, features) and Y holds one label per row of X. Every
        step uses `batch_size` distinct rows drawn at random. The loss on the
        full (X, Y) is printed every 500 iterations.
        """
        for i in range(max_iter):
            idxs = np.random.permutation(X.shape[0])[:batch_size]
            X_batch = X[idxs]
            # labels come from the Y argument, not from a global variable
            y_batch = Y[idxs]
            y_pred = self.feed_forward(X_batch.T)
            self.back_propagate(y_pred, y_batch)
            # layer.back_propagate sums the gradients over the batch, so the
            # step size is divided by the batch size
            self.gradient_descent_step(lambda_ / batch_size, weight_decay)
            if i % 500 == 0:
                print(self.BCE_loss(X, Y))

    def BCE_loss(self,X, y):
        """Mean binary cross-entropy over the first len(y) rows of X.

        Returns an array with one entry per output unit. The rows are pushed
        through the network in chunks so that only one chunk of activations is
        cached in the layers at a time.
        """
        N = len(y)
        if N == 0:
            return 0
        eps = 1e-7
        chunk = 4096
        y = np.asarray(y)
        total = 0
        for start in range(0, N, chunk):
            stop = min(start + chunk, N)
            y_pred = self.feed_forward(X[start:stop].T)
            y_chunk = y[start:stop]
            losses = -(y_chunk*np.log(y_pred+eps) + (1-y_chunk)*np.log(1-y_pred+eps))
            total = total + np.sum(losses, axis = 1)
        return total / N
    


In [None]:
# architecture: input -> six hidden ReLU layers of 70 units -> one sigmoid output
in_dim = X_train.shape[1]
n_h1 = n_h2 = n_h3 = n_h4 = n_h5 = n_h6 = 70
out_dim = 1
dimensions = [in_dim, n_h1, n_h2, n_h3, n_h4, n_h5, n_h6, out_dim]
activations = ['relu'] * 6 + ['sigmoid']
# step size and weight-decay coefficient
lambda_ = 0.001
weight_decay = 0

In [None]:
# build the network; this rebinds `mlp`, which previously held the MLP instance
mlp = MLP_2(dimensions, activations)

In [None]:
# training hyper-parameters; lambda_ and weight_decay replace the values
# assigned in the configuration cell above
batch_size = 16
max_iter = 1000
lambda_ = 0.01
weight_decay = 0.01

# MLP_2.train prints the loss on the full training data every 500 iterations
mlp.train(X_train, y_train, batch_size, max_iter,  lambda_, weight_decay)

[0.68873247]
[0.68237118]
[0.68742344]
[0.68040697]
[0.67766475]
[0.66861691]
[0.66450832]
