# CS-GY 9223-D: Deep Learning Homework 1
Due on Friday, 15th February 2019, 11:55 PM

This homework can be done in pairs. Everyone must submit on NYU Classes individually.

Write down the UNIs (NetIDs) of your group (if applicable)

Member 1: Hupo Tang, ht1073

Member 2: Yuan Tang Lin, ytl329

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import scipy.misc
import glob
import sys
# you shouldn't need to make any more imports

## Data Loading

In [2]:
import torch
import torchvision
from torchvision.transforms import transforms

In [3]:
# Download the data
transform = transforms.Compose(
    [transforms.ToTensor(),
     transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])

trainset = torchvision.datasets.CIFAR10(root='./data', train=True,
                                        download=True, transform=transform)

testset = torchvision.datasets.CIFAR10(root='./data', train=False,
                                       download=True, transform=transform)


Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./data/cifar-10-python.tar.gz
Files already downloaded and verified


In [40]:
trainloader = torch.utils.data.DataLoader(trainset, batch_size=100,
                                          shuffle=True, num_workers=2)

testloader = torch.utils.data.DataLoader(testset, batch_size=10,
                                         shuffle=False, num_workers=2)
len(testloader)

1000

## Model Architecture

In [91]:
class NeuralNetwork(object):
    """
    Abstraction of neural network.
    Stores parameters, activations, cached values. 
    Provides necessary functions for training and prediction. 
    """
    def __init__(self, layer_dimensions, drop_prob=0.0, reg_lambda=0.0):
        """
        Initializes the weights and biases for each layer
        :param layer_dimensions: (list) number of nodes in each layer
        :param drop_prob: drop probability for dropout layers. Only required in part 2 of the assignment
        :param reg_lambda: regularization parameter. Only required in part 2 of the assignment
        """
        # here layer_dimensions is [3072, 256, 128, 10], 2 hidden layers
        # I set num_layers to 2, except input layer and output layer which are No.0,3 layer
        np.random.seed(1)
        
        self.batch_size = 100
        self.devide_ratio = 0.9
        self.alpha = 0.01
        self.eps = 0.0000001
        
        self.num_layers = len(layer_dimensions) - 2
        self.drop_prob = drop_prob
        self.reg_lambda = reg_lambda
        
        self.parameters = {} # save layer No.1,2,3 's parameters
        for i in range(1, len(layer_dimensions)):
            eps = np.sqrt(2.0 / (layer_dimensions[i] + layer_dimensions[i-1]))
            W = np.random.randn(layer_dimensions[i], layer_dimensions[i-1]) * eps
            b = np.zeros((layer_dimensions[i],))
            self.parameters.update({i: {'W': W, 'b': b}})
        
        # init parameters
        

    def affineForward(self, A, W, b):
        """
        Forward pass for the affine layer.
        :param A: input matrix, shape (L, S), where L is the number of hidden units in the previous layer and S is
        the number of samples
        :returns: the affine product WA + b, along with the cache required for the backward pass. W_i, A_(i-1), b_i
        """
        Z = W.dot(A) + np.tile(b, (A.shape[1], 1)).T
        cache = {'Z': Z, 'A': A, 'W': W, 'b': b}
        if self.drop_prob > 0:
            Z, m = self.dropout(Z, self.drop_prob)
            cache['m'] = m
        return Z, cache
        

    def activationForward(self, A, activation="relu"):
        """
        Common interface to access all activation functions.
        :param A: input to the activation function
        :param prob: activation funciton to apply to A. Just "relu" for this assignment.
        :returns: activation(A)
        """ 
        return self.relu(A)


    def relu(self, X):
        return np.maximum(X, 0)

            
    def dropout(self, A, prob):
        """
        :param A: Activation
        :param prob: drop prob
        :returns: tuple (A, M) 
            WHERE
            A is matrix after applying dropout
            M is dropout mask, used in the backward pass
        """
        M = np.random.rand(A.shape[0], A.shape[1])
        M = (M >= prob) * 1
        A = np.multiply(A, M)
        return A, M
    
    def softmax(self, X):
#         X = X - X.max()
        return np.exp(X) / np.sum(np.exp(X), axis=0)
    
    def forwardPropagation(self, X):
        """
        Runs an input X through the neural network to compute activations
        for all layers. Returns the output computed at the last layer along
        with the cache required for backpropagation.
        :returns: (tuple) AL, cache
            WHERE 
            AL is activation of last layer
            cache is cached values for each layer that
                     are needed in further steps
        """
        AL = X
        cache = {}
        for l in range(1, self.num_layers+1):
            W = self.parameters[l]['W']
            b = self.parameters[l]['b']
            Z, _cache = self.affineForward(AL, W, b)
            AL = self.activationForward(Z, activation="relu")
            cache.update({l:_cache})
        last_W, last_b = self.parameters[len(layer_dimensions)-1]['W'], self.parameters[len(layer_dimensions)-1]['b'] # W_3, b_3
        last_Z, _cache = self.affineForward(AL, last_W, last_b) # Z_3
        cache.update({self.num_layers+1: {'W': last_W, 'b': last_b, 'Z': last_Z, 'A': AL}})
        AL = last_Z
        return AL, cache
    
    def one_hot(self, y, num_classes=10):
        y_one_hot = np.zeros((y.shape[0], num_classes))
        y_one_hot[np.arange(y.shape[0]), y] = 1
        return np.array(y_one_hot)
    
    def costFunction(self, AL, y):
        """
        :param AL: Activation of last layer, shape (num_classes, S)
        :param y: labels, shape (S)
        :param alpha: regularization parameter
        :returns cost, dAL: A scalar denoting cost and the gradient of cost
        """
        # compute loss
        _y = self.softmax(AL) # size (num_classes, batch_size) here 10*50
        y = self.one_hot(y).T # size (num_classes, batch_size) here 10*50
#         _y_prob = AL[y, y.shape[0]] # size here 50
#         _y = np.argmax(AL, axis=0) # get 50 labels
        cost = - (y * (np.log(_y + self.eps)) + (1-y)*np.log(1 - _y + self.eps))
        
        reg = 0
        if self.reg_lambda > 0:
            # add regularization
            penalty = 0
            for i in range(1, self.num_layers+1):
                penalty += (self.parameters[i]['W'] ** 2).sum()
            reg = self.reg_lambda * penalty / 2
        
        cost = cost.mean() + reg
        
        # gradient of cost
        dAL = (_y - y) / y.shape[0] # dZ_3
        return cost, dAL

    def affineBackward(self, dA_prev, cache):
        """
        Backward pass for the affine layer.
        :param dA_prev: gradient from the next layer. dA_2, dA_1
        :param cache: cache returned in affineForward. cache[2], cache[1]
        :returns dA: gradient on the input to this layer
                 dW: gradient on the weights
                 db: gradient on the bias
        """
        
        dZ = self.activationBackward(dA_prev, cache, activation='relu') # dZ_2, dZ_1
        db = np.sum(dZ, axis=1) # db_2, db_1
        dW = dZ.dot(cache['A'].T) + self.reg_lambda * cache['W'] # dW_2, dW_1
        dA = cache['W'].T.dot(dZ) # dA_1, dx
        return dA, dW, db

    def activationBackward(self, dA, cache, activation="relu"):
        """
        Interface to call backward on activation functions.
        In this case, it's just relu. 
        """
        return np.multiply(dA, self.relu_derivative(dA, cache['Z']))

        
    def relu_derivative(self, dx, cached_x):
        dx = 1.0 * (cached_x>0)
        return dx

    def dropout_backward(self, dA, cache):
        dA = np.multiply(dA, cache['m'])
        return dA

    def backPropagation(self, dAL, Y, cache):
        """
        Run backpropagation to compute gradients on all paramters in the model
        :param dAL: gradient on the last layer of the network. Returned by the cost function.
        :param Y: labels
        :param cache: cached values during forwardprop
        :returns gradients: dW and db for each weight/bias
        """
        gradients = {}
        dZ = dAL # dZ_3, activation derivative is calculated in def costFunction
        dW = dZ.dot(cache[3]['A'].T) + self.reg_lambda * cache[3]['W'] # dW_3
        db = np.sum(dZ, axis=1) # db_3
        dA = cache[3]['W'].T.dot(dZ) # dA_2
        gradients[3] = {'dW': dW, 'db': db}
        for i in range(2, 0, -1):

            if self.drop_prob > 0:
                #call dropout_backward
                dA = self.dropout_backward(dA, cache[i])
                
            dA, dW, db = self.affineBackward(dA, cache[i])
            gradients[i] = {'dW': dW, 'db': db}
#         if self.reg_lambda > 0:  # already add during above codes
            # add gradients from L2 regularization to each dW
        
        return gradients


    def updateParameters(self, gradients, alpha):
        """
        :param gradients: gradients for each weight/bias
        :param alpha: step size for gradient descent 
        """
        for i in range(1, self.num_layers+1):
            self.parameters[i]['W'] -= alpha * gradients[i]['dW']
            self.parameters[i]['b'] -= alpha * gradients[i]['db']
            
    def makeTrainAndValidation(self, X, y, devide_ratio):
        p = int(X.shape[1] * devide_ratio)
        x_t, y_t = X[:, :p], y[:p]
        x_v, y_v = X[:, p:], y[p:]
        splited_set = {
            'train': (np.array(x_t), np.array(y_t)),
            'validation': (np.array(x_v), np.array(y_v))
        }
        return splited_set
    
    def train(self, X, y, iters=1000, alpha=0.0001, batch_size=100, print_every=100):
        """
        :param X: input samples, each column is a sample
        :param y: labels for input samples, y.shape[0] must equal X.shape[1]
        :param iters: number of training iterations
        :param alpha: step size for gradient descent
        :param batch_size: number of samples in a minibatch
        :param print_every: no. of iterations to print debug info after
        """
        data_set = self.makeTrainAndValidation(X, y, self.devide_ratio)
        for e in range(0, iters):
            # get minibatch
            for phase in ['train', 'validation']:
                phase_X, phase_y = data_set[phase]
                X_batch, y_batch = self.get_batch(phase_X, phase_y, batch_size)
                
                overall_loss, overall_corrects = 0.0, 0
                num_batches = phase_X.shape[1] // batch_size
#                 flag = num_batches // 100
                
                for i, data in enumerate(zip(X_batch, y_batch)):
                    x_minibatch, y_minibatch = data
                
            # forward prop
                    AL, cache = self.forwardPropagation(x_minibatch)
                
            # compute loss
                    batch_loss, dAL = self.costFunction(AL, y_minibatch)
                    _y = self.softmax(AL)
                    batch_corrects = np.sum(np.argmax(_y, axis=0)==y_minibatch)
                    
                    overall_loss += batch_loss
                    overall_corrects += batch_corrects
                    
            # compute gradients
                    if phase=='train':
                        g = self.backPropagation(dAL, y_batch, cache)
                    
            # update weights and biases based on gradient
                        self.updateParameters(g, alpha)
                        
            if e % print_every == 0:
                # print cost, train and validation set accuracies
                print("Epoch {:3d} , phase {:10} finished, global_loss:{:6.4f}, global_acc:{:6.4f}%".format(e+1, phase, overall_loss/i, 100*overall_corrects/(i*batch_size)))
                
    def predict(self, X):
        """
        Make predictions for each sample
        """
        AL, cache = self.forwardPropagation(X)
        y_pred = self.softmax(AL)
        y_pred = self.ont_hot(y_pred)
        return y_pred

    def get_batch(self, X, y, batch_size):
        """
        Return minibatch of samples and labels
        
        :param X, y: samples and corresponding labels
        :parma batch_size: minibatch size
        :returns: (tuple) X_batch, y_batch
        """
        # resort X,y
        n = X.shape[1]
        perm = np.random.permutation(n)
        X = X[:, perm]
        y = y[perm]
        
        X_b, y_b = [], []
        for i in range(0, n, batch_size):
            _x = X[:, i:i+batch_size]
            _y = y[i:i+batch_size]
            X_b.append(_x)
            y_b.append(_y)
        X_batch, y_batch = np.array(X_b), np.array(y_b)
        return X_batch, y_batch


def save_predictions(filename, y):
    """
    Dumps y into .npy file
    """
    np.save(filename, y)

In [92]:
traindataiter = iter(trainloader)
X_train, y_train = traindataiter.next()
X_train, y_train = X_train.numpy(), y_train.numpy()
X_train = X_train.reshape(X_train.shape[0], X_train.shape[1]*X_train.shape[2]*X_train.shape[3]).T

layer_dimensions = [X_train.shape[0], 256, 128, 10]
MyNN = NeuralNetwork(layer_dimensions, drop_prob=0.03, reg_lambda=0.000001)
MyNN.train(X_train, y_train)

# for _ in range(len(trainloader)):
#     MyNN.train(X_train, y_train)
#     X_train, y_train = traindataiter.next()
#     X_train, y_train = X_train.numpy(), y_train.numpy()
#     X_train = X_train.reshape(X_train.shape[0], X_train.shape[1]*X_train.shape[2]*X_train.shape[3]).T
    



Epoch   1 , phase validation finished, global_loss:   inf, global_acc:   inf%
Epoch 101 , phase validation finished, global_loss:   inf, global_acc:   inf%
Epoch 201 , phase validation finished, global_loss:   inf, global_acc:   inf%
Epoch 301 , phase validation finished, global_loss:   inf, global_acc:   inf%
Epoch 401 , phase validation finished, global_loss:   inf, global_acc:   inf%
Epoch 501 , phase validation finished, global_loss:   inf, global_acc:   inf%
Epoch 601 , phase validation finished, global_loss:   inf, global_acc:   inf%
Epoch 701 , phase validation finished, global_loss:   inf, global_acc:   inf%
Epoch 801 , phase validation finished, global_loss:   inf, global_acc:   inf%
Epoch 901 , phase validation finished, global_loss:   inf, global_acc:   inf%
