# Updated Lab 7 Code for Homework

**Harvard University**<br>
**Fall 2018**<br>
**Instructors: Rahul Dave**<br>


**Instructions:**

- Upload your final answers in the form of a Jupyter notebook containing all work to Canvas.

- Structure your notebook and your work to maximize readability.

In [1]:
import numpy as np
import scipy.stats
import scipy.special

import matplotlib
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
from matplotlib import cm
import pandas as pd
%matplotlib inline

## Standard boilerplate to import torch and torch related modules
import torch

import torchvision.datasets as datasets
import torchvision.transforms as transforms

from torch.autograd import Variable
import torch.nn

import torch.nn.functional as F
from torch.utils.data.sampler import SubsetRandomSampler

In [3]:
# Regression Parent Class
class Regression(object):
    
    def __init__(self):
        self.params = dict()
    
    def get_params(self, k):
        return self.params.get(k, None)
    
    def set_params(self, **kwargs):
        for k,v in kwargs.items():
            self.params[k] = v
        
                    
    def fit(self, X, y):
        raise NotImplementedError()
        
    def predict(self, X):
        raise NotImplementedError()
        
    def score(self, X, y):
        raise NotImplementedError()
        

In [4]:
## Our PyTorch implementation of Logistic Regression
class LRPyTorch(nn.Module):

    ## the constructor is where we'll define all our layers (input, hidden, and output)
    def __init__(self):

        ## this line creates an instance of our parent (or base) class which in this case
        ## is nn.Module.
        super().__init__()

        ## in the lines below we'll create instance variables and assign them torch.nn Models
        ## in order to create our layers.  You should ordinarily have one variable definition for each layer
        ## in your neural network except for the output layer.  The output layer is defined by the number of
        ## outputs in your last layer. Since we're dealing with simple Artificial Neural Networks, we should
        ## predominantly be using nn.Linear.  
        self.l1 = nn.Linear(784, 10)

 
    # forwards takes as a parameter x -- the batch of inputs that we want to feed into our neural network model
    # and returns the output of the model ... i.e. the results of the output layer of the model after forward
    # propagation through our model. practically this means you should call each layer you defined in the
    # constructor in sequence plus any activation functions on each layer.
    def forward(self, x):
     
        # call all our layers on our input (in this case we only need one)
        x = self.l1(x)

        # Since we're using Cross Entropy Loss
        # we can return our output directly
        return x


In [1]:

class Artificial_Neural_Network(Regression):
    
    def __init__(self, input_model, reg_rate = 0.01, learning_rate=0.1, batch_size=256, epochs=30, hidden=None):
        
        super().__init__()
        
        
        ## Load MNIST Data
        train_dataset, test_dataset, train_loader, test_loader, validation_loader, train_idx, validation_idx = self.load_data(batch_size=batch_size)
        
        ## Add Datasets and Data Loaders to our params
        self.set_params(train_dataset=train_dataset, 
                        train_loader=train_loader,
                        test_dataset=test_dataset,
                        test_loader=test_loader,
                        validation_loader=validation_loader,
                        train_idx=train_idx,
                        validation_idx=validation_idx
                       )
        
        
        ## Here we instantiate the PyTorch model that we so nicely defined previously
        if hidden == None:
            model = input_model()
        else:
            model = input_model(hidden=hidden)

        ## Here we define our loss function.  We're using CrossEntropyLoss but other options include
        ## NLLLoss (negative log likelihood loss for when the log_softmax activation is explicitly defined
        ## on the output layer), MSELoss for OLS Regression, KLLDivLoss for KL Divergence, BCELoss
        ## for binary cross entropy and many others
        criterion = nn.CrossEntropyLoss()

        ## Here we define our optimizer.  In class we've been using SGD although in practice one will often
        ## use other optimizers like Adam or RMSProp.  The primary parameter the optimizer takes is the
        ## set of parameters in your model.  Fortunately those are easily accessible via model.paramters()
        ## where model is the instance of the model you defined.  Other useful parameters include lr for the
        ## learning rate and weight_decay for the rate of l2 regularization.
        optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate, weight_decay=0.01)
        
        
        ## Set the rest of our parameters -- batch_size, learning_rate, epochs, optimizer,
        ## model and criterion
        
        ## Add Datasets and Data Loaders to our params
        self.set_params(optimizer=optimizer, 
                        learning_rate=learning_rate,
                        batch_size=batch_size,
                        model=model,
                        criterion=criterion,
                        epochs=epochs)   
        
    def load_data(self, validation_split=10000, batch_size=256):
        """load the MNIST training and test sets from MNIST"""
        
        
        ## We start by defining our training dataset
        ## --root-- a string pointing to the relative path of the directory where we'll store our MNIST data
        ## --train-- tells us whether to download the training set (True) or the test set (False)
        ## MNIST in torchvision only has train (60K) and test (10K) datasets.  Other datasets also have a validation set
        ## --transforms-- is a torchvision.transforms object that specifies what transforms to apply to each element
        ## in the dataset.  The required transform is transforms.ToTensor() that turns each element into a PyTorch floating
        ## point tensor object.  You could also add others like transforms.Normalize if you wished
        ## --download-- specifies whether to download the data from the online urls.   If set to false, then you should
        ## provide the data locally yourself
        train_dataset = datasets.MNIST(root='./hw3_data',
                                    train=True,
                                    transform=transforms.Compose([transforms.ToTensor(),
                                      transforms.Normalize((0.1307,), (0.3081,)),
                                     ]),
                                    download=True)

        ## similar to the above, the main difference is that we should set train=False since we want the

        ## test set data
        test_dataset = datasets.MNIST(root='./hw3_data',
                                   train=False,
                                   transform=transforms.Compose([transforms.ToTensor(),
                                      transforms.Normalize((0.1307,), (0.3081,)),
                                     ]),
                                   download=True)

        ## A DataLoader or (Dataset Loader) turns the specified data set into a sequence of data elements
        ## that you can access in your loops for training or evaluating accuracy, etc.

        ## First we need to further split our training dataset into training and validation sets.

        # Define the indices
        indices = list(range(len(train_dataset))) # start with all the indices in training set

        # Define your batch_size
        batch_size = batch_size

        # Random, non-contiguous split
        validation_idx = np.random.choice(indices, size=validation_split, replace=False)
        train_idx = list(set(indices) - set(validation_idx))

        # define our samplers -- we use a SubsetRandomSampler because it will return
        # a random subset of the split defined by the given indices without replacement
        train_sampler = SubsetRandomSampler(train_idx)
        validation_sampler = SubsetRandomSampler(validation_idx)

        # Create the train_loader -- use your real batch_size which you
        # I hope have defined somewhere above
        train_loader = torch.utils.data.DataLoader(dataset=train_dataset, 
                        batch_size=batch_size, sampler=train_sampler)

        # You can use your above batch_size or just set it to 1 here.  Your validation
        # operations shouldn't be computationally intensive or require batching.
        validation_loader = torch.utils.data.DataLoader(dataset=train_dataset, 
                        batch_size=validation_split, sampler=validation_sampler)

        test_loader = torch.utils.data.DataLoader(dataset=test_dataset,
                                                  batch_size=len(test_dataset),
                                                  shuffle=False)

        return (train_dataset, test_dataset, 
                train_loader, test_loader, 
                validation_loader,
                train_idx, validation_idx
               )
    
    def sample_training_images(self):
        """Create a set of sample images from the MNIST training images"""
        
        training_set = self.get_params('train_dataset')
        train_idx = self.get_params('train_idx')
        sample_indices = np.random.choice(train_idx, 10)
        
        sample_images = training_set.train_data[sample_indices,:,:].numpy()
        sample_labels = [training_set.train_labels[x] for x in sample_indices]
        
        self.set_params(sample_training_images=sample_images)
        self.set_params(sample_training_labels=sample_labels)
        
    def save_misclassified(self, predictions, images, labels):
        """Create and save a set of sample images misclassified images by the model"""
             
        sample_indices = np.random.choice(range(len(predictions)), 10)
        
        sample_images = [images[x].reshape(28,28) for x in sample_indices]
        sample_labels = [predictions[x] for x in sample_indices]
        true_labels = [labels[x] for x in sample_indices]

        ## save the random samples -- images, labels, ground_truth
        self.set_params(misclassified_images=sample_images)
        self.set_params(misclassified_labels=sample_labels)
        self.set_params(misclassified_true_labels=true_labels)
        
        ## save all the misclassified predictions and labels
        self.set_params(all_missed_labels=predictions)
        self.set_params(all_missed_true_labels=labels)
        
        
        
        
    def viz_training_images(self):
        """Visualize/Plot sample training images"""
        
        if not self.get_params('training_labels'):
            self.sample_training_images()
        
        # get the images and labels
        sample_images = self.get_params("sample_training_images")
        sample_labels = self.get_params("sample_training_labels")
        
        fig, (ax1, ax2) = plt.subplots(2, 5, figsize=(20, 10))
        plt.suptitle("Some Sample Images from MNIST", fontsize=20, weight='heavy')

        for i in range(5):
            ax1[i].imshow(sample_images[i])
            ax1[i].set_title("MNIST Label: {}".format(sample_labels[i]))
            ax2[i].imshow(sample_images[i+5])
            ax2[i].set_title("MNIST Label: {}".format(sample_labels[i+5]), weight='bold')
            
        plt.show()

    def viz_misclassified_images(self):
        """Visualize/Plot misclassified training images"""

        # get the images and labels
        sample_images = self.get_params("misclassified_images")
        sample_labels = self.get_params("misclassified_labels")
        true_labels = self.get_params("misclassified_true_labels")

        if not sample_labels:
            raise(Exception("Please run predict() or score() with save_misclassified=True"))

        fig, (ax1, ax2) = plt.subplots(2, 5, figsize=(20, 10))
        plt.suptitle("Some Sample Misclassified Images", fontsize=20, weight='heavy')

        for i in range(5):
            ax1[i].imshow(sample_images[i])
            ax1[i].set_title("MNIST Label: {} Classified: {}".format(true_labels[i], sample_labels[i]), weight='bold')
            ax2[i].imshow(sample_images[i+5])
            ax2[i].set_title("MNIST Label: {} Classified: {}".format(true_labels[i+5], sample_labels[i+5]), weight='bold')

        plt.show()
        
    
    ## Stolen from excellent visualization from submission from Madeleine Duran/Sarah Walker
    def viz_training_loss(self, epochs=30):
        """Visualize/Plot our training loss"""
        
        losses = self.get_params("training_losses")
        
        if type(losses) == type(None):
            raise("Please run fit() to train data")
        
        fig, axes = plt.subplots(nrows=1, ncols=epochs, figsize=(20,5), sharex=True, sharey=True)
        plt.suptitle("Loss Trajectory for MNIST LR Model", fontsize=20, weight='heavy')
        
        for i in range(epochs):
            axes[i].plot(range(len(losses[i])), losses[i])
            axes[i].set_title("epoch {}".format(i))
            if i % 2 == 1:
                axes[i].axvspan(0, len(losses[i]), facecolor='gray', alpha=0.2)
        plt.subplots_adjust(wspace=0)
        plt.show()
        
    def get_loader(self, dataset):
        """Retrieve dataloader, images, labels based upon dataset name"""
        
        if dataset == 'Test':
            loader = self.get_params('test_loader')
        elif dataset == 'Validation':
            loader = self.get_params('validation_loader')
        else:
            loader = self.get_params('train_loader')
            
        # Get Loader
        return loader
    
    def predict(self, dataset='Test', save_misclassified=True):
        """Classify images based on the fitted logistic regression model"""

        loader = self.get_loader(dataset)
        
        predictions = []
        all_labels = []
        misclassified = []
        misclassified_images = []
        misclassified_labels = []
        misclassified_preds = np.array([])
        correct = 0
        model = self.get_params('model')

        for inputs, labels in loader:

            ## get the inputs from the dataloader and turn into a variable for 
            ## feeding to the model
            inputs = Variable(inputs)

            ## Reshape so that batches work properly
            inputs = inputs.view(-1, 28*28)

            # run our model on the inputs
            outputs = model(inputs)

            # get the class of the max log-probability
            pred = outputs.data.max(1)[1]
            
            # get the correct predictions
            correct += (pred == labels).sum()

            # save current batch of predictions
            predictions += list(pred)
            
            # save all labels
            all_labels += list(labels)
            
            if save_misclassified:
                
                # keep track of the misclassified labels, images, and prediction
                missed = (pred != labels)
                missed_labels = labels[missed]
                images = inputs.data.numpy()
                missed_images = [images[index] for index,value in enumerate(missed) if value==True]

                misclassified_labels = np.append(misclassified_labels, missed_labels)
                misclassified_images += missed_images
                misclassified_preds = np.append(misclassified_preds, pred[missed])
                
                            
            
        self.set_params(predictions=predictions, 
                        correct_predictions=correct,
                        prediction_dataset_length=len(predictions),
                        all_labels=all_labels
                       )
        
        # Save misclassified images/predictions/labels for visualizing later
        if save_misclassified:
            self.save_misclassified(misclassified_preds, misclassified_images, misclassified_labels,)
            
        return np.array(predictions)
    
    
    def score(self, dataset='Test', print_score=True, save_misclassified=True ):
        """Calculate accuracy score based upon model classification"""
        
        self.predict(dataset=dataset, save_misclassified=save_misclassified)
        correct = self.get_params('correct_predictions')
        total = self.get_params('prediction_dataset_length')
        
        if print_score:
            print('Dataset: {} \nAccuracy: {}/{} ({:.1f}%)\n'.format(
                dataset, correct, total, 100.0 * correct / total))
        
        return(correct/total)
    
    
    def missed_number(self, number):
        """Calculate accuracy for a certain true label"""
        
        labels = np.array(self.get_params("all_labels"))
        predictions = np.array(self.get_params("predictions"))
        missed = np.array(self.get_params("all_missed_true_labels"))
        
        total_number = len(labels[labels == number])
        missed_number = len(missed[missed == number])
        
        return(1-missed_number/total_number)
    
    
    def generate_missed_percentages(self):    
        for i in range(10):
            print("Label: {} -- Accuracy: {}".format(i, 100*self.missed_number(i)))
               
        
    def fit(self, do_validation=True, show_validation=True):
        """Fit our logistic regression model on MNIST training set"""
        
        ## We defined a number of variables in our constructor -- let's reclaim them here
        optimizer=self.get_params("optimizer")
        model=self.get_params("model")
        epochs=self.get_params("epochs")
        criterion=self.get_params("criterion")
        train_loader=self.get_params("train_loader")
        
        ## Get the Total size of training set
        self.get_params('train_dataset')
        training_size = len(self.get_params('train_idx'))
        
        iterations = int(np.ceil(training_size/self.get_params("batch_size")))
        
        ## We need something to keep track of our losses
        losses = np.zeros((epochs, iterations))
        
        ## We need something ot keep track of our validation scores
        validation_scores = np.zeros(epochs)
  
        
        ## Our training loop.  We can loop over a fixed number of epochs or
        ## using a sensitivity parameter (i.e. until net change in loss is
        ## below a certain tolerance).  Here we iterate over a fixed number of
        ## epochs
        for epoch in range(epochs):

            ## We defined our train_loader DataLoader earlier.  The train_loader is a
            ## sequence of tuples with the first element of each tuple being
            ## the batched training inputs (the batch_size being defined in your DataLoader)
            ## and the second second element of each tuple being the corresponding labels
            ## more or less all the pytorch classes are built to handle batching transparently

            ## loop through the DataLoader.  Each loop is one iteration.  All the loops
            ## form one epoch
            for batch_index, (inputs, labels) in enumerate(train_loader):

                # Convert the inputs/labels passed from the DataLoader into
                # autograd Variables.  The dataloader provides them as PyTorch Tensors
                # per the transforms.ToTensor() operation.
                inputs, labels = Variable(inputs), Variable(labels)

                ## as mentioned above we receive the inputs as tensors of size (batch_size,1, 28, 28)
                ## which is effectively (batch_size, 28, 28) basically as a 3 dimensional tensor
                ## representing a stack of (28x28) matrices with each matrix element a floating point number
                ## representing the value of that pixel in the image.  Unfortunately our Neural Network model
                ## can't handle that representation and needs a pixel matrices to be flattened into a row vector
                ## of inputs.  The model takes a 2d tensor representing batch of such row vectors each row vector
                ## representing one set of inputs corresponding to one image.  In order to accomplish this
                ## flattening we use the .view method defined on autograd Variables.
                inputs = inputs.view(-1, 28*28)

                # we need to zero out our gradients after each pass
                optimizer.zero_grad()


                ## This is the optimize - forward step - backwards step part of our design pattern

                # this is the forward step --> we calculate the new outputs based upon the input data from
                # this batch and store the outputs in a variable
                outputs = model(inputs)

                # we compare the outputs to the ground truth labels in the batch to calculate the loss for this step
                loss = criterion(outputs, labels)
                
                ## count the loss
                losses[epoch,batch_index] = loss.data[0]

                # we run backpropagation on the loss variable which repopulates the gradients all the way
                # back through our model to the input layer
                loss.backward()

                # Use the gradients calculated in the backprop that took place in .backwards() to do a new
                # gradient descent step
                optimizer.step()
            
            ## After each epoch -- we should test validation accuracy
            if do_validation:
                
                ### Do your validation scoring here!!!!\
                pass
            
                
        ## Set Loss Matrix for visualizing
        self.set_params(training_losses=losses)
        self.set_params(validation_scores=validation_scores)
        
        return self
        

IndentationError: expected an indented block (<ipython-input-1-0dc4dfc40739>, line 417)