# Modular Neural Network

_Objective_: To build a modular neural network program for experimentation with mimimum use of libraries

_References:_

Overall base: deeplearning.ai specialization, Andrew Ng, Coursera  
[CS231n](cs231n.github.io), Andrej Karpathy   
[Stats385](stats385.github.io)


In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

### Importing required libraries

In [3]:
import pandas as pd
import os
import struct
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
#import dill as pickle

## Parsing through data [MNIST idx]

In [2]:
# source: https://gist.github.com/akesling/5358964 + corrections

def read(dataset = "train", path = "../data/"):
    """
    Python function for importing the MNIST data set.  It returns an iterator
    of 2-tuples with the first element being the label and the second element
    being a numpy.uint8 2D array of pixel data for the given image.
    """

    if dataset is "train":
        fname_img = os.path.join(path, 'train-images.idx3-ubyte')
        fname_lbl = os.path.join(path, 'train-labels.idx1-ubyte')
    elif dataset is "test":
        fname_img = os.path.join(path, 't10k-images.idx3-ubyte')
        fname_lbl = os.path.join(path, 't10k-labels.idx1-ubyte')
    else:
        raise ValueError("dataset must be 'test' or 'train'")

    # Load everything in some numpy arrays
    with open(fname_lbl, 'rb') as flbl:
        magic, num = struct.unpack(">II", flbl.read(8))
        lbl = np.fromfile(flbl, dtype=np.int8)

    with open(fname_img, 'rb') as fimg:
        magic, num, rows, cols = struct.unpack(">IIII", fimg.read(16))
        img = np.fromfile(fimg, dtype=np.uint8).reshape(len(lbl), rows, cols)
        
   # return img, lbl

    get_img = lambda idx: (lbl[idx], img[idx])

    # Create an iterator which returns each image in turn
    for i in range(len(lbl)):
        yield get_img(i)


In [5]:
def show(image):
    ax = plt.figure().add_subplot(1,1,1).imshow(image, cmap=mpl.cm.Greens)
    plt.show()

In [38]:
training_data = read(dataset="train")

In [29]:
test_data = read(dataset="test")

### MNIST [kaggle csv]

In [8]:
df = pd.read_csv("../data/train.csv")
train = df.as_matrix()

train_y = train[:,0].astype('int8')
train_x = train[:,1:].astype('float64')

train = None

print("Shape Train Images: (%d,%d)" % train_x.shape)
print("Shape Labels: (%d)" % train_y.shape)

Shape Train Images: (42000,784)
Shape Labels: (42000)


In [4]:
``train = df.as_matrix()
train_y = train[:,0].astype('int8')
train_x = train[:,1:].astype('float64')

In [5]:
train_x.shape

(42000, 784)

In [6]:
train_y.shape

(42000,)

In [5]:
df = pd.read_csv("../data/test.csv")
test = df.as_matrix().astype('float64')
print("Shape Test Images: (%d,%d)" % test.shape)

Shape Test Images: (28000,784)


In [8]:
train_x /=255
test /= 255

In [9]:
train_y = pd.get_dummies(train_y).as_matrix()

In [10]:
train_y.shape

(42000, 10)

In [11]:
train_x

array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]])

In [12]:
train_y

array([[0, 1, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       ..., 
       [0, 0, 0, ..., 1, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 1]], dtype=uint8)

## Basic functions:

In [7]:
def image2vector(image):
    '''
    Flattening array
    '''
    v = image.reshape(image.shape[0]*image.shape[1],1)    
    return v

In [8]:
def normalizeRows(x):
    '''
    Normalising rows
    '''
    x_norm = np.linalg.norm(x,ord=2,axis=1,keepdims=True)
    x = x/x_norm
    return x

## Initialization Functions

In [11]:
def initialize_parameters_zeros(layers_dims):    
    parameters = {}
    L = len(layers_dims)                
    for l in range(1, L):
        parameters['W' + str(l)] = np.zeros((layers_dims[l], layers_dims[l - 1]))
        parameters['b' + str(l)] = np.zeros((layers_dims[l], 1))
    return parameters

In [12]:
def initialize_parameters_random(layers_dims):
    parameters = {}
    L = len(layers_dims)
    for l in range(1, L):
        parameters['W' + str(l)] = np.random.randn(layers_dims[l], layers_dims[l - 1]) * 10
        parameters['b' + str(l)] = np.zeros((layers_dims[l], 1))
    return parameters

In [13]:
def initialize_parameters_he(layers_dims):
    parameters = {}
    L = len(layers_dims)
    for l in range(1, L):
        parameters['W' + str(l)] = np.random.randn(layers_dims[l], layers_dims[l - 1]) * np.sqrt(2 / layers_dims[l - 1])
        parameters['b' + str(l)] = np.zeros((layers_dims[l], 1))
    return parameters

In [14]:
def init_params(layers_dims, md="he"):
    if md=="he":
        return initialize_parameters_he(layers_dims)
    elif md=="random":
        return initialize_parameters_random(layers_dims)
    elif md=="zeros":
        return initialize_parameters_zeros(layers_dims)
    else:
        return ValueError("Enter mode as either he, random or zeros")

## Activation functions

In [15]:
def sigmoid(z):
    return 1/(1+np.exp(-z))

In [16]:
def softmax(x):
    x_exp = np.exp(x)
    x_sum = np.sum(x_exp, axis = 1, keepdims=True )
    s = x_exp/x_sum
    return s

In [17]:
def tanh(x):
    return np.tanh(x)

In [18]:
def relu(x):
    return np.maximum(0,x)

## Optimization Functions

In [19]:
def update_parameters_with_gd(parameters, grads, learning_rate):
    L = len(parameters) // 2 # number of layers in the neural networks
    for l in range(L):
        parameters["W" + str(l + 1)] = parameters["W" + str(l + 1)] - learning_rate * grads["dW" + str(l + 1)]
        parameters["b" + str(l + 1)] = parameters["b" + str(l + 1)] - learning_rate * grads["db" + str(l + 1)]
    return parameters

In [20]:
def random_mini_batches(X, Y, mini_batch_size = 64, seed = 0):
    
    m = X.shape[1]                  # number of training examples
    mini_batches = []
        
    # Step 1: Shuffle (X, Y)
    permutation = list(np.random.permutation(m))
    shuffled_X = X[:, permutation]
    shuffled_Y = Y[:, permutation].reshape((1,m))

    # Step 2: Partition (shuffled_X, shuffled_Y). Minus the end case.
    num_complete_minibatches = math.floor(m/mini_batch_size) # number of mini batches of size mini_batch_size in your partitionning
    for k in range(0, num_complete_minibatches):
        mini_batch_X = shuffled_X[:,k * mini_batch_size:(k + 1) * mini_batch_size]
        mini_batch_Y = shuffled_Y[:,k * mini_batch_size:(k + 1) * mini_batch_size]
        mini_batch = (mini_batch_X, mini_batch_Y)
        mini_batches.append(mini_batch)
    
    # Handling the end case (last mini-batch < mini_batch_size)
    if m % mini_batch_size != 0:
        end = m - mini_batch_size * math.floor(m / mini_batch_size)
        mini_batch_X = shuffled_X[:,num_complete_minibatches * mini_batch_size:]
        mini_batch_Y = shuffled_Y[:,num_complete_minibatches * mini_batch_size:]
        mini_batch = (mini_batch_X, mini_batch_Y)
        mini_batches.append(mini_batch)
    
    return mini_batches

In [21]:
def initialize_velocity(parameters):
    L = len(parameters) // 2 # number of layers in the neural networks
    v = {}
    # Initialize velocity
    for l in range(L):
        v["dW" + str(l + 1)] = np.zeros_like(parameters["W" + str(l+1)])
        v["db" + str(l + 1)] = np.zeros_like(parameters["b" + str(l+1)])
    return v

In [22]:
def update_parameters_with_momentum(parameters, grads, v, beta, learning_rate):
    L = len(parameters) // 2 # number of layers in the neural networks
    # Momentum update for each parameter
    for l in range(L):
        # compute velocities
        v["dW" + str(l + 1)] = beta * v["dW" + str(l + 1)] + (1 - beta) * grads['dW' + str(l + 1)]
        v["db" + str(l + 1)] = beta * v["db" + str(l + 1)] + (1 - beta) * grads['db' + str(l + 1)]
        # update parameters
        parameters["W" + str(l + 1)] = parameters["W" + str(l + 1)] - learning_rate * v["dW" + str(l + 1)]
        parameters["b" + str(l + 1)] = parameters["b" + str(l + 1)] - learning_rate * v["db" + str(l + 1)]
    return parameters, v

In [23]:
def initialize_adam(parameters) :
    L = len(parameters) // 2 # number of layers in the neural networks
    v = {}
    s = {}
    # Initialize v, s. Input: "parameters". Outputs: "v, s".
    for l in range(L):
        v["dW" + str(l + 1)] = np.zeros_like(parameters["W" + str(l + 1)])
        v["db" + str(l + 1)] = np.zeros_like(parameters["b" + str(l + 1)])

        s["dW" + str(l+1)] = np.zeros_like(parameters["W" + str(l + 1)])
        s["db" + str(l+1)] = np.zeros_like(parameters["b" + str(l + 1)])
    return v, s

In [24]:
def update_parameters_with_adam(parameters, grads, v, s, t, learning_rate=0.01,
                                beta1=0.9, beta2=0.999, epsilon=1e-8):
  
    L = len(parameters) // 2                 # number of layers in the neural networks
    v_corrected = {}                         # Initializing first moment estimate, python dictionary
    s_corrected = {}                         # Initializing second moment estimate, python dictionary
    
    # Perform Adam update on all parameters
    for l in range(L):
        # Moving average of the gradients. Inputs: "v, grads, beta1". Output: "v".
        v["dW" + str(l + 1)] = beta1 * v["dW" + str(l + 1)] + (1 - beta1) * grads['dW' + str(l + 1)]
        v["db" + str(l + 1)] = beta1 * v["db" + str(l + 1)] + (1 - beta1) * grads['db' + str(l + 1)]

        # Compute bias-corrected first moment estimate. Inputs: "v, beta1, t". Output: "v_corrected".
        v_corrected["dW" + str(l + 1)] = v["dW" + str(l + 1)] / (1 - np.power(beta1, t))
        v_corrected["db" + str(l + 1)] = v["db" + str(l + 1)] / (1 - np.power(beta1, t))

        # Moving average of the squared gradients. Inputs: "s, grads, beta2". Output: "s".
        s["dW" + str(l + 1)] = beta2 * s["dW" + str(l + 1)] + (1 - beta2) * np.power(grads['dW' + str(l + 1)], 2)
        s["db" + str(l + 1)] = beta2 * s["db" + str(l + 1)] + (1 - beta2) * np.power(grads['db' + str(l + 1)], 2)

        # Compute bias-corrected second raw moment estimate. Inputs: "s, beta2, t". Output: "s_corrected".
        s_corrected["dW" + str(l + 1)] = s["dW" + str(l + 1)] / (1 - np.power(beta2, t))
        s_corrected["db" + str(l + 1)] = s["db" + str(l + 1)] / (1 - np.power(beta2, t))

        # Update parameters. Inputs: "parameters, learning_rate, v_corrected, s_corrected, epsilon". Output: "parameters".
        parameters["W" + str(l + 1)] = parameters["W" + str(l + 1)] - learning_rate * v_corrected["dW" + str(l + 1)] / np.sqrt(s["dW" + str(l + 1)] + epsilon)
        parameters["b" + str(l + 1)] = parameters["b" + str(l + 1)] - learning_rate * v_corrected["db" + str(l + 1)] / np.sqrt(s["db" + str(l + 1)] + epsilon)

    return parameters, v, s

## Cost Functions

In [25]:
def compute_cost(a3, Y):
    m = Y.shape[1]
    
    logprobs = np.multiply(-np.log(a3),Y) + np.multiply(-np.log(1 - a3), 1 - Y)
    cost = 1./m * np.sum(logprobs)
    
    return cost

## Model Creation

### Forward Prop

In [26]:
def forward_propagation(X, parameters):    
    # retrieve parameters
    W1 = parameters["W1"]
    b1 = parameters["b1"]
    W2 = parameters["W2"]
    b2 = parameters["b2"]
    W3 = parameters["W3"]
    b3 = parameters["b3"]
    
    # LINEAR -> RELU -> LINEAR -> RELU -> LINEAR -> SIGMOID
    z1 = np.dot(W1, X) + b1
    a1 = relu(z1)
    z2 = np.dot(W2, a1) + b2
    a2 = relu(z2)
    z3 = np.dot(W3, a2) + b3
    a3 = softmax(z3)
    
    cache = (z1, a1, W1, b1, z2, a2, W2, b2, z3, a3, W3, b3)
    
    return a3, cache


### Backward Prop

In [57]:
def backward_propagation(X, Y, cache):
    m = X.shape[1]
    (z1, a1, W1, b1, z2, a2, W2, b2, z3, a3, W3, b3) = cache
    print(m)
    print(cache)
    dz3 = 1./m * (a3 - Y)
    dW3 = np.dot(dz3, a2.T)
    db3 = np.sum(dz3, axis=1, keepdims = True)
    
    da2 = np.dot(W3.T, dz3)
    dz2 = np.multiply(da2, np.int64(a2 > 0))
    dW2 = np.dot(dz2, a1.T)
    db2 = np.sum(dz2, axis=1, keepdims = True)
    
    da1 = np.dot(W2.T, dz2)
    dz1 = np.multiply(da1, np.int64(a1 > 0))
    dW1 = np.dot(dz1, X.T)
    db1 = np.sum(dz1, axis=1, keepdims = True)
    
    gradients = {"dz3": dz3, "dW3": dW3, "db3": db3,
                 "da2": da2, "dz2": dz2, "dW2": dW2, "db2": db2,
                 "da1": da1, "dz1": dz1, "dW1": dW1, "db1": db1}
    
    return gradients

### Accuracy and prediction

In [28]:
def predict(X, y, parameters):
    m = X.shape[1]
    p = np.zeros((1,m), dtype = np.int)
    
    # Forward propagation
    a3, caches = forward_propagation(X, parameters)
    
    # convert probas to 0/1 predictions
    for i in range(0, a3.shape[1]):
        if a3[0,i] > 0.5:
            p[0,i] = 1
        else:
            p[0,i] = 0

    # print results

    print ("predictions: " + str(p[0,:]))
    print ("true labels: " + str(y[0,:]))
    print("Accuracy: "  + str(np.mean((p[0,:] == y[0,:]))))
    
    return p

### Actual model

In [63]:
def model(X, Y, learning_rate = 0.3, num_iterations = 30000, print_cost = True, lambd = 0, keep_prob = 1):
    """
    Implements a three-layer neural network: LINEAR->RELU->LINEAR->RELU->LINEAR->SIGMOID.
    
    Arguments:
    X -- input data, of shape (input size, number of examples)
    Y -- true "label" vector (1 for blue dot / 0 for red dot), of shape (output size, number of examples)
    learning_rate -- learning rate of the optimization
    num_iterations -- number of iterations of the optimization loop
    print_cost -- If True, print the cost every 10000 iterations
    lambd -- regularization hyperparameter, scalar
    keep_prob - probability of keeping a neuron active during drop-out, scalar.
    
    Returns:
    parameters -- parameters learned by the model. They can then be used to predict.
    """
        
    grads = {}
    costs = []                            # to keep track of the cost
    m = X.shape[1]                        # number of examples
    layers_dims = [train_x.shape[0], 5, 2, 10]

    # Initialize parameters dictionary.
    parameters = init_params(layers_dims)

    # Loop (gradient descent)

    for i in range(0, num_iterations):
        a3, cache = forward_propagation(X, parameters)
        cost = compute_cost(a3,Y)
        grads = backward_propagation(X, Y, cache)
        parameters = update_parameters_with_gd(parameters, grads, learning_rate)
        
        # Print the loss every 10000 iterations
        if print_cost and i % 10000 == 0:
            print("Cost after iteration {}: {}".format(i, cost))
        if print_cost and i % 1000 == 0:
            costs.append(cost)
    
    # plot the cost
    plt.plot(costs)
    plt.ylabel('cost')
    plt.xlabel('iterations (x1,000)')
    plt.title("Learning rate =" + str(learning_rate))
    plt.show()
    
    return parameters

### Random experimentation

In [None]:
tr

In [None]:
tr[1].shape

In [None]:
print(tr[0])
show(tr[1])

In [None]:
tr = next(training_data)

In [None]:
tr.count

In [None]:
tr.index(1)

In [21]:
tr_new = list(training_data)

In [24]:
len(tr_new)

59999

In [25]:
tr_img, tr_lbls = read()

ValueError: too many values to unpack (expected 2)

In [30]:
training_data

<generator object read at 0x0000029679084D00>

In [34]:
tr_data.shape

(784, 60000)

In [55]:
training_data = read(dataset="train")

In [56]:
tr_labels = np.empty((10, 60000))

In [57]:
tr_data = np.empty((28*28, 60000)

In [None]:
tr_n = next(training_data)
    #tr_data= np.hstack((tr_data, image2vector(tr_n[1]))
    

In [60]:
nu = image2vector(tr_n[1])

In [67]:
nu = np.hstack((nu, nu))

In [70]:
for i in range(60000):
    tr_n = next(training_data)
    tr_data= np.hstack((tr_data, image2vector(tr_n[1]))
    #tr_labels[tr_n[0], i] = 1

SyntaxError: unexpected EOF while parsing (<ipython-input-70-cf67dc29ae88>, line 4)

In [42]:
tr_labels

array([[ 0.,  1.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  1.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]])

In [81]:
t[500:520]

NameError: name 't' is not defined

In [32]:
train_x.shape

(42000, 784)

In [33]:
train_x=train_x.T

In [34]:
train_y.shape

(42000, 10)

In [36]:
train_y=train_y.T

In [65]:
layers_dims = [train_x.shape[0], 5, 2, 10]

parameters = model(train_x, train_y, layers_dims)

42000
(array([[ -49.57701056,   35.68702283, -174.89322574, ...,  147.18622045,
          -3.08510095,  -54.89736408],
       [  32.55248259,  321.50454062,   14.45155067, ...,  134.79397836,
         191.58131382,  142.81672822],
       [-104.80275498, -256.08073169,   -3.3408231 , ..., -116.95198032,
        -201.49814271,  -56.91014992],
       [ 127.90784274,  -41.10446556,   30.40461665, ...,   42.35057771,
          24.67811221,   41.88663278],
       [ 220.24616769,  259.80575819,  104.42081127, ...,  247.00134759,
         256.54309844,  315.01751943]]), array([[   0.        ,   35.68702283,    0.        , ...,  147.18622045,
           0.        ,    0.        ],
       [  32.55248259,  321.50454062,   14.45155067, ...,  134.79397836,
         191.58131382,  142.81672822],
       [   0.        ,    0.        ,    0.        , ...,    0.        ,
           0.        ,    0.        ],
       [ 127.90784274,    0.        ,   30.40461665, ...,   42.35057771,
          24.67811221,

  after removing the cwd from sys.path.
  after removing the cwd from sys.path.


ValueError: operands could not be broadcast together with shapes (4,) (5,784) 