# CS-GY 9223-E: Deep Learning Homework 1
Due on Sunday, 11th February 2018, 11:55 PM

This homework can be done in pairs. Everyone must submit on NYU Classes individually.

Write down the UNIs (NetIDs) of your group (if applicable)

Member 1: John Martinez, jzm218

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import scipy.misc
import glob
import os
import sys
# you shouldn't need to make any more imports

In [2]:
class NeuralNetwork(object):
    """
    Abstraction of neural network.
    Stores parameters, activations, cached values. 
    Provides necessary functions for training and prediction. 
    """
    def __init__(self, layer_dimensions, drop_prob=0.0, reg_lambda=0.0):
        """
        Initializes the weights and biases for each layer
        :param layer_dimensions: (list) number of nodes in each layer
        :param drop_prob: drop probability for dropout layers. Only required in part 2 of the assignment
        :param reg_lambda: regularization parameter. Only required in part 2 of the assignment
        """
        np.random.seed(1)
        self.parameters = {} # To be a dictionary of random weights, zero biases 
        self.num_layers = len(layer_dimensions)
        self.drop_prob = drop_prob
        self.reg_lambda = reg_lambda
        
        # init parameters
        
        for l in range(1, self.num_layers):
            
            weights = np.random.randn(layer_dimensions[l], layer_dimensions[l-1]) * 0.01
            biases = np.zeros((layer_dimensions[l], 1)) 
            
            # Creating W1, b1, W2, b2...Wl, bl
            
            self.parameters['W'+str(l)] = weights
            self.parameters['b'+str(l)] = biases
        

    def affineForward(self, A, W, b):
        """
        Forward pass for the affine layer.
        :param A: input matrix, shape (L, S), where L is the number of hidden units in the previous layer and S is
        the number of samples
        :returns: the affine product WA + b, along with the cache required for the backward pass
        """
        cache = {'W':W, 
                 'Aprev':A, 
                 'b':b}
        
        return np.matrix(W) * np.matrix(A) + b, cache
        

    def activationForward(self, A, activation="relu"):
        """
        Common interface to access all activation functions.
        :param A: input to the activation function
        :param prob: activation funciton to apply to A. Just "relu" for this assignment.
        :returns: activation(A)
        """ 
        return self.relu(A)


    def relu(self, X):
        return np.maximum(0, X)
            
    def dropout(self, A, prob):
        """
        :param A: Activation
        :param prob: drop prob
        :returns: tuple (A, M) 
            WHERE
            A is matrix after applying dropout
            M is dropout mask, used in the backward pass
        """
        M = np.random.rand(A.shape[0], A.shape[1])
        M = (M > prob) * 1.0
        M /= 1 - prob
        A = np.multiply(A, M)
        
        return A, M

    def forwardPropagation(self, X):
        """
        Runs an input X through the neural network to compute activations
        for all layers. Returns the output computed at the last layer along
        with the cache required for backpropagation.
        :returns: (tuple) AL, cache
            WHERE 
            AL is activation of last layer
            cache is cached values for each layer that
                     are needed in further steps
        """
        
        L = len(self.parameters) // 2
        cache = [None] * (L+1)
        Aprev = X
        Z = None
        
        for layer in range(1,L):
            
            W = self.parameters['W'+str(layer)] 
            b = self.parameters['b'+str(layer)]
            Z, cacheItem = self.affineForward(Aprev, W, b) # W * Aprev + b 
            A = self.activationForward(Z)
            cacheItem['Z'] = Z
            cacheItem['A'] = A
            
            # Applying dropout
            if self.drop_prob > 0:
                A, M = self.dropout(A, self.drop_prob) 
                cacheItem['M'] = M
            cache[layer] = cacheItem
            
            Aprev = A
        
        
        # Applying softmax for last layer 
        WL = self.parameters['W'+str(L)]
        bL = self.parameters['b'+str(L)]
        
        ZL, cacheItem = self.affineForward(Aprev, WL, bL)
        AL = np.exp(ZL) / np.sum(np.exp(ZL), axis = 0)
        
        cacheItem['Z'] = ZL
        cacheItem['A'] = AL
        
        cache[L] = cacheItem
        
        return AL, cache
    
    def costFunction(self, AL, y):
        """
        :param AL: Activation of last layer, shape (num_classes, S)
        :param y: labels, shape (S)
        :param alpha: regularization parameter
        :returns cost, dAL: A scalar denoting cost and the gradient of cost
        """
        # compute loss
        m = y.shape[0]
        yhat = AL         
        y = one_hot(y)
        
        cost = -(1/m) * np.sum(np.multiply(y , np.log(yhat+1e-6)))
        
        if self.reg_lambda > 0:
            # add regularization
            RW = 0
            for l in range(1, self.num_layers):
                RW += np.sum(np.square(self.parameters['W'+str(l)]))
            cost += self.reg_lambda * RW
       
        
        # gradient of cost
        dAL = AL - y
        
        
        return cost, dAL

    def affineBackward(self, dZ, cache):
        """
        Backward pass for the affine layer.
        :param dA_prev: gradient from the next layer.
        :param cache: cache returned in affineForward
        :returns dA: gradient on the input to this layer
                 dW: gradient on the weights
                 db: gradient on the bias
        """  
        Aprev = cache['Aprev']
        
        
        dW = np.dot(dZ, Aprev.T)

        db = np.matrix(np.sum(dZ, axis=1))
        

        
        return dW, db

    def activationBackward(self, dA, cache, activation="relu"):
        """
        Interface to call backward on activation functions.
        In this case, it's just relu. 
        #### Potentially takes in linear activation backward cache ####
        """
        Z = cache['Z']
        return self.relu_derivative(Z)   

        
    def relu_derivative(self, Z):
        dA = Z
        dA[dA <= 0] = 0
        dA[dA > 0] = 1
        return dA

    def dropout_backward(self, dA, cache):
        M = cache['M']
        dA = np.multiply(dA,  M)
        return dA

    def backPropagation(self, dAL, Y, cache):
        """
        Run backpropagation to compute gradients on all paramters in the model
        :param dAL: gradient on the last layer of the network. Returned by the cost function.
        :param Y: labels
        :param cache: cached values during forwardprop
        :returns gradients: dW and db for each weight/bias
        """
        L = len(self.parameters) // 2
        gradients = {}
        
        
        cacheItem = cache[L]
       
        dZL = np.multiply(dAL, self.activationBackward(None, cacheItem))
        dWL, dbL = self.affineBackward(dZL, cacheItem)
        dA = dAL
        dZ = dZL
        
        gradients['dW'+str(L)] = dWL
        gradients['db'+str(L)] = dbL
        
        
        for layer in range(L-1,0,-1):
            
            cacheItem = cache[layer]
            
            
            
            
            dA = self.parameters['W'+str(layer+1)].T * dZ #dZprev
            
            dZ = np.multiply(dA, self.activationBackward(None, cacheItem)) # dZ = dA *  relu'(Z)
            
            if self.drop_prob > 0:
                dA = self.dropout_backward(dA, cacheItem)
            
            dW, db = self.affineBackward(dZ, cacheItem)
            
            gradients["dW" + str(layer)] = dW
            gradients["db" + str(layer)] = db
            
            
        if self.reg_lambda > 0:
            m = Y.shape[0]
            # add gradients from L2 regularization to each dW
            for layer in range(1, self.num_layers):
                gradients['dW'+str(layer)] += (self.reg_lambda * self.parameters['W'+str(layer)]) / m
                
        return gradients


    def updateParameters(self, gradients, alpha):
        """
        :param gradients: gradients for each weight/bias
        :param alpha: step size for gradient descent 
        """
        L = len(self.parameters) // 2        
        
        for layer in range(1, L+1):
            l = str(layer)
            
            #print("Weight shape:",self.parameters['W'+l].shape )
            #print("dWeight shape:",gradients['dW'+l].shape )
                
            self.parameters['W'+l] -= alpha * gradients['dW'+l]
            self.parameters['b'+l] -= alpha * gradients['db'+l]

    def train(self, X, y, iters=1000, alpha=0.00001, batch_size=100, print_every=100):
        """
        :param X: input samples, each column is a sample
        :param y: labels for input samples, y.shape[0] must equal X.shape[1]
        :param iters: number of training iterations
        :param alpha: step size for gradient descent
        :param batch_size: number of samples in a minibatch
        :param print_every: no. of iterations to print debug info after
        """
        
        L = len(self.parameters) // 2
        
        for i in range(0, iters):
            
            # get minibatch
            X_batch, y_batch = self.get_batch(X, y, batch_size)
            # forward prop
            AL, cache = self.forwardPropagation(X_batch)
            # compute loss
            cost, dAL = self.costFunction(AL, y_batch)
            # compute gradients
            gradients = self.backPropagation(dAL, y_batch, cache)
            # update weights and biases based on gradient
            self.updateParameters(gradients, alpha)
            
            if i % print_every == 0:
                # print cost, train and validation set accuracies
                print("Cost: ", cost)
                y_pred = self.predict(X_batch)
                yhat = np.array(np.argmax(y_pred, axis = 0))[0]
                acc = (yhat == y_batch).sum() / len(y_batch)
                print("Train Acc: ",acc)

                
                
                
    def predict(self, X):
        """
        Make predictions for each sample
        """
        oldProp = self.drop_prob
        self.drop_prob = 0
        y_pred, _ = self.forwardPropagation(X)
        self.drop_prob = oldProp
        #y_pred = np.array(np.argmax(AL, axis = 0))[0]
        return y_pred

    def get_batch(self, X, y, batch_size):
        """
        Return minibatch of samples and labels
        
        :param X, y: samples and corresponding labels
        :parma batch_size: minibatch size
        :returns: (tuple) X_batch, y_batch
        """
        # Random indices for the samples
        indices = np.random.randint(y.shape[0]-1, size= batch_size)
        
        X_batch = X[:, indices]
        y_batch = y[indices]
        
        return X_batch, y_batch

In [3]:
# Helper functions, DO NOT modify this

def get_img_array(path):
    """
    Given path of image, returns it's numpy array
    """
    return scipy.misc.imread(path)

def get_files(folder):
    """
    Given path to folder, returns list of files in it
    """
    filenames = [file for file in glob.glob(folder+'*/*')]
    filenames.sort()
    return filenames

def get_label(filepath, label2id):
    """
    Files are assumed to be labeled as: /path/to/file/999_frog.png
    Returns label for a filepath
    """
    tokens = filepath.split('/')
    label = tokens[-1].split('_')[1][:-4]
    if label in label2id:
        return label2id[label]
    else:
        sys.exit("Invalid label: " + label)

In [4]:
# Functions to load data, DO NOT change these

def get_labels(folder, label2id):
    """
    Returns vector of labels extracted from filenames of all files in folder
    :param folder: path to data folder
    :param label2id: mapping of text labels to numeric ids. (Eg: automobile -> 0)
    """
    files = get_files(folder)
    y = []
    for f in files:
        y.append(get_label(f,label2id))
    return np.array(y)

def one_hot(y, num_classes=10):
    """
    Converts each label index in y to vector with one_hot encoding
    One-hot encoding converts categorical labels to binary values
    """
    y_one_hot = np.zeros((num_classes, y.shape[0]))
    y_one_hot[y, range(y.shape[0])] = 1
    return y_one_hot

def get_label_mapping(label_file):
    """
    Returns mappings of label to index and index to label
    The input file has list of labels, each on a separate line.
    """
    print(os.listdir())
    with open(label_file, 'r') as f:
        id2label = f.readlines()
        id2label = [l.strip() for l in id2label]
    label2id = {}
    count = 0
    for label in id2label:
        label2id[label] = count
        count += 1
    return id2label, label2id

def get_images(folder):
    """
    returns numpy array of all samples in folder
    each column is a sample resized to 30x30 and flattened
    """
    files = get_files(folder)
    images = []
    count = 0
    
    for f in files:
        count += 1 
        if count % 10000 == 0:
            print("Loaded {}/{}".format(count,len(files)))
        img_arr = get_img_array(f)
        img_arr = img_arr.flatten() / 255.0
        images.append(img_arr)
    X = np.column_stack(images)

    return X

def get_train_data(data_root_path):
    """
    Return X and y
    """
    train_data_path = data_root_path + 'train'
    id2label, label2id = get_label_mapping(data_root_path+'labels.txt')
    print(label2id)
    X = get_images(train_data_path)
    y = get_labels(train_data_path, label2id)
    return X, y

def save_predictions(filename, y):
    """
    Dumps y into .npy file
    """
    np.save(filename, y)

In [5]:
# Load the data
data_root_path = 'cifar10-hw1/'
X_train, y_train = get_train_data(data_root_path) # this may take a few minutes
X_test = get_images(data_root_path + 'test')
print('Data loading done')

['.DS_Store', '.ipynb_checkpoints', 'ans1-jzm218.npy', 'ans2-jzm218.npy', 'cifar10-hw1', 'HW1-jzm218', 'HW1-jzm218.ipynb', 'HW1-jzm218.zip']
{'airplane': 0, 'automobile': 1, 'bird': 2, 'cat': 3, 'deer': 4, 'dog': 5, 'frog': 6, 'horse': 7, 'ship': 8, 'truck': 9}


`imread` is deprecated in SciPy 1.0.0, and will be removed in 1.2.0.
Use ``imageio.imread`` instead.
  import sys


Loaded 10000/50000
Loaded 20000/50000
Loaded 30000/50000
Loaded 40000/50000
Loaded 50000/50000
Loaded 10000/10000
Data loading done


## Part 1

#### Simple fully-connected deep neural network

In [22]:
layer_dimensions = [X_train.shape[0], 350, 10]
#print(len(X_train))
NN = NeuralNetwork(layer_dimensions)
NN.train(X_train, y_train, iters=3000, alpha=0.0001, batch_size=1000, print_every=50)
print("Training done.")

Cost:  2.30323901302
Train Acc:  0.101
Cost:  2.22883061247
Train Acc:  0.206
Cost:  1.96476805444
Train Acc:  0.3
Cost:  1.92057473983
Train Acc:  0.335
Cost:  1.94263085437
Train Acc:  0.329
Cost:  1.83765864936
Train Acc:  0.37
Cost:  1.85888278629
Train Acc:  0.324
Cost:  1.8005661853
Train Acc:  0.363
Cost:  1.77722494272
Train Acc:  0.371
Cost:  1.78980160478
Train Acc:  0.408
Cost:  1.73870857489
Train Acc:  0.38
Cost:  1.73654169065
Train Acc:  0.398
Cost:  1.68537355342
Train Acc:  0.428
Cost:  1.72725751551
Train Acc:  0.393
Cost:  1.68316603682
Train Acc:  0.431
Cost:  1.61213548572
Train Acc:  0.473
Cost:  1.64230054421
Train Acc:  0.429
Cost:  1.6167988685
Train Acc:  0.453
Cost:  1.57022090477
Train Acc:  0.464
Cost:  1.62137752004
Train Acc:  0.411
Cost:  1.59005735512
Train Acc:  0.466
Cost:  1.60876869769
Train Acc:  0.481
Cost:  1.68478307058
Train Acc:  0.397
Cost:  1.60598757271
Train Acc:  0.442
Cost:  1.60529744213
Train Acc:  0.456
Cost:  1.57848486719
Train Acc:

In [23]:
y_predicted = NN.predict(X_train)
save_predictions('ans1-jzm218', y_predicted)

In [24]:
# test if your numpy file has been saved correctly
loaded_y = np.load('ans1-jzm218.npy')
print(loaded_y.shape)
loaded_y[:10]

(10, 50000)


array([[  4.09969685e-03,   4.98947432e-02,   2.19850889e-04, ...,
          6.53461956e-04,   9.51099880e-04,   5.39207648e-02],
       [  1.93937161e-02,   2.59585061e-01,   1.70883498e-03, ...,
          1.88101614e-02,   4.33333949e-02,   1.58799188e-01],
       [  3.04691592e-02,   5.66266798e-02,   3.62050631e-02, ...,
          1.34537273e-03,   8.70547878e-03,   1.52529334e-02],
       ..., 
       [  9.08645197e-02,   2.20032236e-01,   8.59933775e-02, ...,
          4.89924282e-03,   2.98749227e-04,   2.34915702e-01],
       [  1.49307627e-04,   7.13461037e-02,   4.33978770e-03, ...,
          1.48357466e-04,   7.35182790e-03,   5.77879021e-03],
       [  1.99752972e-03,   1.65344568e-02,   7.44482797e-03, ...,
          2.19231205e-02,   7.56605056e-03,   2.66325274e-01]])

## Part 2: Regularizing the neural network
#### Add dropout and L2 regularization

In [25]:
NN2 = NeuralNetwork(layer_dimensions, drop_prob=0.1, reg_lambda=0.02)
NN2.train(X_train, y_train, iters=3000, alpha=0.0001, batch_size=2000, print_every=40)
print("Training done.")

Cost:  4.45478646887
Train Acc:  0.107
Cost:  4.40635340065
Train Acc:  0.169
Cost:  4.20289236924
Train Acc:  0.274
Cost:  4.32834138302
Train Acc:  0.3285
Cost:  4.29283857643
Train Acc:  0.308
Cost:  4.2890273611
Train Acc:  0.351
Cost:  4.22569517187
Train Acc:  0.36
Cost:  4.30956121708
Train Acc:  0.359
Cost:  4.31177515108
Train Acc:  0.3805
Cost:  4.42599489511
Train Acc:  0.3535
Cost:  4.51383189002
Train Acc:  0.376
Cost:  4.58800729961
Train Acc:  0.3405
Cost:  4.40465266441
Train Acc:  0.381
Cost:  4.37090855272
Train Acc:  0.418
Cost:  4.62740720479
Train Acc:  0.391
Cost:  4.44762507036
Train Acc:  0.427
Cost:  4.57821707814
Train Acc:  0.416
Cost:  4.62852427804
Train Acc:  0.3915
Cost:  4.53527502405
Train Acc:  0.466
Cost:  4.59153294856
Train Acc:  0.4595
Cost:  4.57273069995
Train Acc:  0.4555
Cost:  4.67727815841
Train Acc:  0.446
Cost:  4.68096783748
Train Acc:  0.434
Cost:  4.72973835597
Train Acc:  0.414
Cost:  4.71462527772
Train Acc:  0.4255
Cost:  4.9000228272

In [134]:
y_predicted2 = NN2.predict(X_test)
save_predictions('ans2-jzm218', y_predicted2)