In [52]:
import numpy as np

In [53]:
def init_layers(nn_architecture, seed = 99):
    """Create the initial weights and biases for every layer of the network.

    Parameters
    ----------
    nn_architecture : iterable of dict
        One dict per layer, in network order. Each dict must provide the
        keys "input_dim" and "output_dim".
    seed : int, optional
        Passed to ``np.random.seed`` before any value is drawn, so two calls
        with the same seed and architecture return identical parameters.

    Returns
    -------
    dict
        "W<k>" maps to an array of shape (output_dim, input_dim) and "b<k>"
        to an array of shape (output_dim, 1); layers are numbered from 1.

    Notes
    -----
    Calling this function reseeds NumPy's global random state.
    """
    np.random.seed(seed)
    params_values = {}

    # Layers are numbered from 1. W is drawn before b for each layer so the
    # random stream is consumed in a fixed order.
    for layer_idx, layer in enumerate(nn_architecture, start=1):
        layer_input_size = layer["input_dim"]
        layer_output_size = layer["output_dim"]

        # standard-normal draws scaled down by 0.1
        params_values['W' + str(layer_idx)] = np.random.randn(
            layer_output_size, layer_input_size) * 0.1
        params_values['b' + str(layer_idx)] = np.random.randn(
            layer_output_size, 1) * 0.1

    return params_values

In [54]:
def sigmoid(Z):
    """Element-wise logistic function 1 / (1 + exp(-Z)).

    Evaluated as exp(-log(1 + exp(-Z))) through ``np.logaddexp``, which is
    the same function mathematically but does not overflow (and so does not
    emit a RuntimeWarning) when Z holds large negative values.

    Parameters
    ----------
    Z : scalar or numpy array

    Returns
    -------
    Values in [0, 1] with the same shape as Z.
    """
    # logaddexp(0, -Z) == log(exp(0) + exp(-Z)) == log(1 + exp(-Z))
    return np.exp(-np.logaddexp(0, -Z))

def relu(Z):
    """Rectified linear unit: the element-wise maximum of 0 and Z."""
    rectified = np.maximum(0, Z)
    return rectified

def sigmoid_backward(dA, Z):
    """Push an upstream gradient back through the sigmoid.

    Returns dA * s * (1 - s), where s = sigmoid(Z).
    """
    s = sigmoid(Z)
    scaled = dA * s
    return scaled * (1 - s)

def relu_backward(dA, Z):
    """Push an upstream gradient back through ReLU.

    Returns a copy of dA in which every entry whose matching Z is <= 0 has
    been set to 0. dA itself is left unmodified.
    """
    inactive = Z <= 0
    dZ = np.copy(dA)
    dZ[inactive] = 0
    return dZ

In [55]:
def single_layer_forward_propagation(A_prev, W_curr, b_curr, activation="relu"):
    """Run one layer forward: an affine transform followed by an activation.

    Parameters
    ----------
    A_prev : array
        Activations coming from the previous layer.
    W_curr, b_curr : array
        Weight matrix and bias of this layer; Z = W_curr . A_prev + b_curr.
    activation : str
        "relu" or "sigmoid".

    Returns
    -------
    (A_curr, Z_curr)
        The activated output and the pre-activation values.

    Raises
    ------
    Exception
        If `activation` is neither "relu" nor "sigmoid".
    """
    # pre-activation input of the activation function
    Z_curr = np.dot(W_curr, A_prev) + b_curr

    # Compare by value. `is` tests object identity, which only happens to
    # work for interned literals and fails for equal strings built at runtime
    # (e.g. read from a config file).
    if activation == "relu":
        activation_func = relu
    elif activation == "sigmoid":
        activation_func = sigmoid
    else:
        raise Exception('Non-supported activation function')

    return activation_func(Z_curr), Z_curr


  if activation is "relu":
  elif activation is "sigmoid":


In [None]:
def full_forward_propagation(X, params_values, nn_architecture):
    """Feed X through every layer and record what the backward pass needs.

    Parameters
    ----------
    X : array
        Network input; treated as the activation of "layer 0".
    params_values : dict
        Holds "W<k>" and "b<k>" for every layer k, numbered from 1.
    nn_architecture : iterable of dict
        One dict per layer; the "activation" key names the activation.

    Returns
    -------
    (A_last, memory)
        A_last is the output of the final layer. memory maps "A<k-1>" to the
        input of layer k and "Z<k>" to its pre-activation values.
    """
    cache = {}
    activation_out = X

    for layer_no, layer_spec in enumerate(nn_architecture, start=1):
        # the previous layer's output becomes this layer's input
        layer_input = activation_out

        activation_name = layer_spec["activation"]
        weights = params_values["W" + str(layer_no)]
        bias = params_values["b" + str(layer_no)]

        activation_out, pre_activation = single_layer_forward_propagation(
            layer_input, weights, bias, activation_name)

        # inputs are stored under the index of the layer that produced them
        cache["A" + str(layer_no - 1)] = layer_input
        cache["Z" + str(layer_no)] = pre_activation

    return activation_out, cache

In [57]:
def get_cost_value(Y_hat, Y, eps=1e-15):
    """Mean binary cross-entropy between predictions and labels.

    Parameters
    ----------
    Y_hat : array of shape (1, m)
        Predicted probabilities; m (the second axis) is the example count.
    Y : array of shape (1, m)
        Target labels.
    eps : float, optional
        Predictions are clipped to [eps, 1 - eps] before the logarithm, so a
        saturated prediction of exactly 0 or 1 yields a large finite cost
        rather than NaN or inf.

    Returns
    -------
    The cost, squeezed to a 0-d array.
    """
    # number of examples
    m = Y_hat.shape[1]
    # keep log() away from 0; without this 0 * log(0) turns the cost into NaN
    Y_hat = np.clip(Y_hat, eps, 1 - eps)
    cost = -1 / m * (np.dot(Y, np.log(Y_hat).T) + np.dot(1 - Y, np.log(1 - Y_hat).T))
    return np.squeeze(cost)

In [58]:
def convert_prob_into_class(probs):
    """Threshold probabilities into hard 0/1 labels.

    Works on a copy, so `probs` is left untouched. Entries strictly above 0.5
    become 1 and entries at or below 0.5 become 0.
    """
    labels = np.copy(probs)
    # Both masks come from the untouched copy. The two sets are disjoint, so
    # the order of the assignments does not matter.
    is_positive = labels > 0.5
    is_negative = labels <= 0.5
    labels[is_positive] = 1
    labels[is_negative] = 0
    return labels

In [59]:
def get_accuracy_value(Y_hat, Y):
    """Fraction of examples whose thresholded prediction matches the label.

    Probabilities are first turned into classes by convert_prob_into_class.
    An example (a column) counts as correct only when every row of that
    column agrees with Y.
    """
    predicted_classes = convert_prob_into_class(Y_hat)
    matches = predicted_classes == Y
    column_correct = matches.all(axis=0)
    return column_correct.mean()

In [60]:
def single_layer_backward_propagation(dA_curr, W_curr, b_curr, Z_curr, A_prev, activation="relu"):
    """Back-propagate through one layer.

    Parameters
    ----------
    dA_curr : array
        Gradient of the cost with respect to this layer's activation.
    W_curr : array
        Weight matrix of this layer.
    b_curr : array
        Bias of this layer. Part of the signature but not read here.
    Z_curr : array
        Pre-activation values stored during the forward pass.
    A_prev : array
        Input this layer received during the forward pass; its second axis
        is used as the number of examples.
    activation : str
        "relu" or "sigmoid".

    Returns
    -------
    (dA_prev, dW_curr, db_curr)
        Gradients with respect to the layer input, the weights and the bias.

    Raises
    ------
    Exception
        If `activation` is neither "relu" nor "sigmoid".
    """
    # number of examples
    m = A_prev.shape[1]

    # Compare by value. `is` tests object identity, which only happens to
    # work for interned literals and fails for equal strings built at runtime.
    if activation == "relu":
        backward_activation_func = relu_backward
    elif activation == "sigmoid":
        backward_activation_func = sigmoid_backward
    else:
        raise Exception('Non-supported activation function')

    # gradient with respect to the pre-activation values
    dZ_curr = backward_activation_func(dA_curr, Z_curr)

    # gradients of the weights and bias, averaged over the m examples
    dW_curr = np.dot(dZ_curr, A_prev.T) / m
    db_curr = np.sum(dZ_curr, axis=1, keepdims=True) / m
    # gradient handed to the previous layer
    dA_prev = np.dot(W_curr.T, dZ_curr)

    return dA_prev, dW_curr, db_curr

  if activation is "relu":
  elif activation is "sigmoid":


In [61]:
def full_backward_propagation(Y_hat, Y, memory, params_values, nn_architecture, eps=1e-15):
    """Back-propagate the binary cross-entropy cost through every layer.

    Parameters
    ----------
    Y_hat : array
        Network output from the forward pass.
    Y : array
        Target labels. Reshaped to Y_hat's shape, so a flat label vector is
        accepted.
    memory : dict
        "A<k-1>" / "Z<k>" entries recorded by the forward pass.
    params_values : dict
        "W<k>" / "b<k>" for every layer k, numbered from 1.
    nn_architecture : iterable of dict
        One dict per layer; the "activation" key names the activation.
    eps : float, optional
        Y_hat is clipped to [eps, 1 - eps] before it is used as a divisor, so
        a prediction saturated at exactly 0 or 1 cannot produce inf or NaN
        gradients.

    Returns
    -------
    dict
        "dW<k>" and "db<k>" for every layer k.
    """
    grads_values = {}

    # make labels and predictions the same shape
    Y = Y.reshape(Y_hat.shape)

    # guard the divisions below against predictions of exactly 0 or 1
    Y_hat = np.clip(Y_hat, eps, 1 - eps)

    # derivative of the cross-entropy cost with respect to the network output
    dA_prev = - (np.divide(Y, Y_hat) - np.divide(1 - Y, 1 - Y_hat))

    # walk the layers from last to first
    for layer_idx_prev, layer in reversed(list(enumerate(nn_architecture))):
        # layers are numbered from 1, enumerate counts from 0
        layer_idx_curr = layer_idx_prev + 1
        activ_function_curr = layer["activation"]

        # the gradient produced for the layer above feeds this layer
        dA_curr = dA_prev

        A_prev = memory["A" + str(layer_idx_prev)]
        Z_curr = memory["Z" + str(layer_idx_curr)]

        W_curr = params_values["W" + str(layer_idx_curr)]
        b_curr = params_values["b" + str(layer_idx_curr)]

        dA_prev, dW_curr, db_curr = single_layer_backward_propagation(
            dA_curr, W_curr, b_curr, Z_curr, A_prev, activ_function_curr)

        grads_values["dW" + str(layer_idx_curr)] = dW_curr
        grads_values["db" + str(layer_idx_curr)] = db_curr

    return grads_values

In [62]:
def update(params_values, grads_values, nn_architecture, learning_rate):
    """Apply one gradient-descent step to every layer's parameters.

    For each layer k (numbered from 1) "W<k>" and "b<k>" are decreased in
    place by learning_rate times "dW<k>" / "db<k>". The same params_values
    dict is returned.
    """
    for layer_no, _ in enumerate(nn_architecture, start=1):
        # weights first, then bias
        for prefix in ("W", "b"):
            key = prefix + str(layer_no)
            params_values[key] -= learning_rate * grads_values["d" + key]

    return params_values

In [63]:
def train(X, Y, nn_architecture, epochs, learning_rate, verbose=True, callback=None,
          seed=2, log_every=10):
    """Train the network with full-batch gradient descent.

    Parameters
    ----------
    X : array
        Training inputs, handed unchanged to full_forward_propagation.
    Y : array
        Training labels.
    nn_architecture : list of dict
        Layer descriptions (see init_layers / full_forward_propagation).
    epochs : int
        Number of gradient-descent iterations.
    learning_rate : float
        Step size used by update.
    verbose : bool, optional
        When true, print cost and accuracy on every reporting iteration.
    callback : callable, optional
        Called as ``callback(i, params_values)`` on every reporting iteration.
    seed : int, optional
        Seed handed to init_layers. Defaults to 2, the value that used to be
        hard-coded here.
    log_every : int, optional
        Report every `log_every` iterations, starting with iteration 0. Must
        be positive. Defaults to 10, the value that used to be hard-coded.

    Returns
    -------
    dict
        The trained "W<k>" / "b<k>" parameters.

    Notes
    -----
    Cost and accuracy are only evaluated on iterations where they are
    printed. The per-iteration history lists that used to be filled here were
    never read or returned, so they are no longer built.
    """
    params_values = init_layers(nn_architecture, seed)

    for i in range(epochs):
        report = i % log_every == 0

        # step forward
        Y_hat, cache = full_forward_propagation(X, params_values, nn_architecture)

        # metrics describe the parameters *before* this iteration's update
        if report and verbose:
            cost = get_cost_value(Y_hat, Y)
            accuracy = get_accuracy_value(Y_hat, Y)

        # step backward, then move the parameters
        grads_values = full_backward_propagation(Y_hat, Y, cache, params_values, nn_architecture)
        params_values = update(params_values, grads_values, nn_architecture, learning_rate)

        if report:
            if verbose:
                print("Iteration: {:05} - cost: {:.5f} - accuracy: {:.5f}".format(i, cost, accuracy))
            if callback is not None:
                callback(i, params_values)

    return params_values

In [75]:
from proj1_helpers import load_csv_data
from implementations import lasso_reg
# Location of the training CSV, relative to the notebook's working directory.
DATA_TRAIN_PATH = 'data/train.csv'
# GAMMA and LAMBDA are handed to lasso_reg as its last two positional arguments.
# NOTE(review): presumably step size and L1 regularisation strength — confirm
# against implementations.lasso_reg.
GAMMA = 0.01
# NOTE(review): MAX_ITER is not referenced by any code visible in this notebook
# (the lasso_reg call passes a literal iteration count instead).
MAX_ITER = 100
LAMBDA = 1e-9

# Labels, feature matrix and row ids of the training set.
y, tX, ids = load_csv_data(DATA_TRAIN_PATH)

# Disabled earlier variant of the lasso-based feature selection; a live
# version of the same steps appears in a later cell.
# init_weights = np.random.random_sample((tX.shape[1], 1))
# w, _ = lasso_reg(y.reshape((-1, 1)), tX, init_weights, 40, GAMMA, LAMBDA)
#
# removed_features = np.where(w == 0)
# tX = np.delete(tX, removed_features, axis=1)
# np.isnan(tX)

In [76]:
# -999 is treated as the missing-value marker: turn it into NaN so the
# nan-aware statistics below ignore it.
inds = np.where(tX == -999)
tX[inds] = np.nan

# Per-column median of the observed values (the variable name is historical;
# this is a median, not a mean).
col_mean = np.nanmedian(tX, axis=0)
print(col_mean)

# Positions (row, column) that still need a value
inds = np.where(np.isnan(tX))

# Fill each missing entry with the median of its column; inds[1] holds the
# column index of every missing entry, so np.take lines the medians up.
tX[inds] = np.take(col_mean, inds[1])
print(np.nanmean(tX, axis=0))

# Min-max scale every column to [0, 1].
xmin, xmax = np.min(tX, axis=0), np.max(tX, axis=0)
# A constant column has zero range; divide by 1 instead so it becomes all
# zeros rather than NaN.
col_range = xmax - xmin
col_range[col_range == 0] = 1
tX = (tX - xmin) / col_range

print(tX.shape)

[ 1.124060e+02  4.652400e+01  7.375200e+01  3.846750e+01  2.107000e+00
  2.258850e+02 -2.440000e-01  2.491500e+00  1.231550e+01  1.206645e+02
  1.280000e+00 -3.560000e-01  4.540000e-01  3.180400e+01 -2.300000e-02
 -3.300000e-02  4.051600e+01 -4.500000e-02  8.600000e-02  3.480200e+01
 -2.400000e-02  1.797390e+02  1.000000e+00  6.556100e+01  0.000000e+00
 -3.300000e-02  4.790200e+01 -1.000000e-02 -2.000000e-03  4.051250e+01]
[ 1.20417434e+02  4.92398193e+01  8.11819816e+01  5.78959617e+01
  2.19310420e+00  2.68220619e+02 -4.11628932e-01  2.37309984e+00
  1.89173324e+01  1.58432217e+02  1.43760943e+00 -1.28304708e-01
  4.55244780e-01  3.87074191e+01 -1.09730480e-02 -8.17107200e-03
  4.66602072e+01 -1.95074680e-02  4.35429640e-02  4.17172345e+01
 -1.01191920e-02  2.09797178e+02  9.79176000e-01  7.71243656e+01
 -1.96589200e-03 -2.06285240e-02  5.07391493e+01 -1.05354440e-02
 -1.87879200e-03  7.30645914e+01]
(250000, 30)


In [None]:
# Fit a lasso model and drop every feature whose weight came out exactly 0.
init_weights = np.random.random_sample((tX.shape[1], 1))
w, _ = lasso_reg(y.reshape((-1, 1)), tX, init_weights, 100, GAMMA, LAMBDA)

# Flatten w before looking for zeros. On a 2-D (n_features, 1) array
# np.where returns a (rows, cols) pair, and passing that pair to np.delete
# would also remove column 0, because every entry of `cols` is 0.
# NOTE(review): the shape of w is assumed from init_weights — confirm against
# implementations.lasso_reg. Flattening is correct for (n,) and (n, 1) alike.
removed_features = np.where(np.ravel(w) == 0)[0]
tX = np.delete(tX, removed_features, axis=1)

print("remaining features {}".format(len(tX[0])))

# Each layer's input_dim must equal the previous layer's output_dim.
NN_ARCHITECTURE = [
    {"input_dim": len(tX[0]), "output_dim": 16, "activation": "relu"},
    {"input_dim": 16, "output_dim": 16, "activation": "relu"},
    {"input_dim": 16, "output_dim": 8, "activation": "relu"},
    {"input_dim": 8, "output_dim": 4, "activation": "relu"},
    {"input_dim": 4, "output_dim": 1, "activation": "sigmoid"},
]

In [79]:
# Layer widths from input to output: four ReLU hidden layers, then a single
# sigmoid output unit. Consecutive widths become input_dim / output_dim.
layer_widths = [len(tX[0]), 32, 64, 128, 64, 1]
layer_activations = ["relu", "relu", "relu", "relu", "sigmoid"]
NN_ARCHITECTURE = [
    {"input_dim": n_in, "output_dim": n_out, "activation": act}
    for n_in, n_out, act in zip(layer_widths[:-1], layer_widths[1:], layer_activations)
]

# Recode the labels in place from {-1, 1} to {0, 1}.
y[y == -1] = 0

# Build a class-balanced subset by discarding randomly chosen class-0 rows
# until both classes have the same count.
# NOTE(review): y_balanced, tX_balanced and p are computed but never used —
# the train() call below still receives the full tX / y.
inds = np.where(y == 0)[0]
np.random.shuffle(inds)
size = inds.shape[0]
surplus_count = size - len(y[y == 1])
surplus_inds = inds[:surplus_count]
y_balanced = np.delete(y, surplus_inds)

tX_balanced = np.delete(tX, surplus_inds, 0)

p = np.random.permutation(len(tX))

# train() receives the transposed feature matrix and the labels as a
# single-row matrix.
features_by_column = np.transpose(tX)
labels_row = np.transpose(y.reshape((-1 , 1)))
params_values = train(features_by_column, labels_row, NN_ARCHITECTURE, 10000, 0.1)

Iteration: 00000 - cost: 0.68314 - accuracy: 0.65733
Iteration: 00010 - cost: 0.64877 - accuracy: 0.65733
Iteration: 00020 - cost: 0.64307 - accuracy: 0.65733
Iteration: 00030 - cost: 0.64197 - accuracy: 0.65733
Iteration: 00040 - cost: 0.64154 - accuracy: 0.65733
Iteration: 00050 - cost: 0.64121 - accuracy: 0.65733
Iteration: 00060 - cost: 0.64089 - accuracy: 0.65733
Iteration: 00070 - cost: 0.64054 - accuracy: 0.65733
Iteration: 00080 - cost: 0.64016 - accuracy: 0.65733
Iteration: 00090 - cost: 0.63972 - accuracy: 0.65733
Iteration: 00100 - cost: 0.63922 - accuracy: 0.65733
Iteration: 00110 - cost: 0.63864 - accuracy: 0.65733
Iteration: 00120 - cost: 0.63798 - accuracy: 0.65733
Iteration: 00130 - cost: 0.63725 - accuracy: 0.65733
Iteration: 00140 - cost: 0.63643 - accuracy: 0.65733
Iteration: 00150 - cost: 0.63551 - accuracy: 0.65733
Iteration: 00160 - cost: 0.63446 - accuracy: 0.65733
Iteration: 00170 - cost: 0.63327 - accuracy: 0.65733
Iteration: 00180 - cost: 0.63190 - accuracy: 0

In [80]:
# Load the test set. `ids` now holds the test-row ids needed for the
# submission file.
y_test, tX_test, ids = load_csv_data('data/test.csv')

# -999 is treated as the missing-value marker: turn it into NaN so the
# nan-aware statistics below ignore it.
inds = np.where(tX_test == -999)
tX_test[inds] = np.nan

# Per-column median of the observed values (the variable name is historical;
# this is a median, not a mean).
# NOTE(review): imputation and scaling use statistics of the *test* set, and
# overwrite col_mean / xmin / xmax from the training cell. The network was
# trained on features scaled with training statistics — consider reusing them.
col_mean = np.nanmedian(tX_test, axis=0)
print(col_mean)

# Positions (row, column) that still need a value
inds = np.where(np.isnan(tX_test))

# Fill each missing entry with the median of its column
tX_test[inds] = np.take(col_mean, inds[1])
print(np.nanmean(tX_test, axis=0))

# Min-max scale every column to [0, 1]; a constant column has zero range, so
# divide by 1 instead and let it become all zeros rather than NaN.
xmin, xmax = np.min(tX_test, axis=0), np.max(tX_test, axis=0)
col_range = xmax - xmin
col_range[col_range == 0] = 1
tX_test = (tX_test - xmin) / col_range

Y_test_pred, _ = full_forward_propagation(np.transpose(tX_test), params_values, NN_ARCHITECTURE)
# The network outputs probabilities. Threshold them to 0/1 first; without
# this step the `== 0` test below matches almost nothing and raw
# probabilities would be written to the submission.
Y_test_pred = convert_prob_into_class(Y_test_pred)
# The submission uses -1 / 1 labels.
Y_test_pred[Y_test_pred == 0] = -1

[ 1.12541e+02  4.64670e+01  7.37400e+01  3.84720e+01  2.09900e+00
  2.26345e+02 -2.44000e-01  2.49200e+00  1.24130e+01  1.20666e+02
  1.28200e+00 -3.56000e-01  4.56000e-01  3.17655e+01 -2.20000e-02
 -4.20000e-02  4.05530e+01 -3.80000e-02  9.70000e-02  3.47540e+01
 -1.60000e-02  1.79940e+02  1.00000e+00  6.58390e+01  1.00000e-03
 -4.80000e-02  4.80370e+01 -1.20000e-02 -2.70000e-02  4.05040e+01]
[ 1.20451557e+02  4.92583872e+01  8.11223377e+01  5.78290937e+01
  2.18809638e+00  2.68855809e+02 -4.15146982e-01  2.37421083e+00
  1.89926203e+01  1.58668286e+02  1.43928858e+00 -1.26825167e-01
  4.56746303e-01  3.86940752e+01 -1.19663944e-02 -1.53522257e-02
  4.67065833e+01 -1.88911706e-02  5.20638271e-02  4.16269376e+01
 -7.98097804e-03  2.09957809e+02  9.80251233e-01  7.72944149e+01
  1.87115258e-04 -3.22294215e-02  5.08991836e+01 -9.78708569e-03
 -2.33218123e-02  7.32676287e+01]


In [85]:
from proj1_helpers import create_csv_submission

# Show the predictions, then write them out. Y_test_pred is a single-row
# matrix, so row 0 is the flat vector of labels.
print(Y_test_pred)
submission_labels = Y_test_pred[0]
create_csv_submission(ids, submission_labels, 'data/nn.csv')

[[-1. -1. -1. ... -1. -1. -1.]]


In [92]:
import pickle

# Persist the trained parameters. The context manager closes the file even
# if pickling raises part-way through.
with open("data/weights.pkl", "wb") as weights_file:
    pickle.dump(params_values, weights_file)