In [None]:
import vugrad as vg
import numpy as np
from vugrad.core import Module, TensorNode

from vugrad.ops import *
from vugrad.functions import *

import matplotlib.pyplot as plt
%matplotlib inline

# Question 8

In [None]:
class Linear(Module):
    """
    A linear operation. Applies a matrix transformation and a vector translation.

    Parameters held by the layer:
      * ``w`` -- TensorNode wrapping an (output_size, input_size) weight matrix.
      * ``b`` -- TensorNode wrapping a (1, output_size) bias row vector.
    """

    def __init__(self, input_size, output_size):
        """
        :param input_size: Number of features of each input vector.
        :param output_size: Number of features of each output vector.
        """
        super().__init__()

        # weights of the matrix transformation
        # -- Glorot (Xavier) normal init: the *variance* is 2 / (fan_in + fan_out), so the standard deviation we
        #    scale the unit normal samples by is the square root of that quantity.
        glorot_std = np.sqrt(2.0 / (input_size + output_size))
        w = np.random.randn(output_size, input_size) * glorot_std
        self.w = TensorNode(w)

        # weights of the bias (the translation)
        b = np.zeros((1, output_size))
        self.b = TensorNode(b)
        # -- We initialize the biases to zero for simplicity. This is a common approach, but with ReLU units it's
        #    sometimes best to add a little noise to avoid dead neurons.

    def forward(self, input):
        """
        Apply the layer to a batch of row vectors.

        :param input: TensorNode whose size is (n, input_size).
        :return: TensorNode of size (n, output_size).
        """

        # Check the rank first: unpacking the size into two names below would otherwise fail with an
        # unhelpful ValueError before the assertion is ever reached.
        assert len(input.size()) == 2, f'Expected a 2-dimensional input, got size {input.size()}.'

        outsize, insize = self.w.size()
        n, f = input.size()

        assert f == insize, f'Number of features in input ({f}) does not match input dimension ({insize}).'

        # Multiply all input vectors by the weight matrix.
        x = BatchMM.do_forward(self.w, input)

        assert x.size() == (n, outsize)

        exb = Expand.do_forward(self.b, dim=0, repeats=n)
        # -- We are broadcasting the (1, outsize) vector b over the (n, outsize) matrix x. Numpy normally does this
        #    automatically, if we just do `x + self.b`, but we wouldn't get a gradient over that operation. Expand
        #    is a minimal broadcasting op that is sufficient for our purposes.
        # -- In pytorch, full-featured broadcasting is implemented so there you would actually be able to do `x + self.b`.

        assert x.size() == exb.size()

        return x + exb

    def parameters(self):
        """Return the trainable TensorNodes of this layer: the weight matrix and the bias."""
        return [self.w, self.b]

class MLP(Module):
    """
    A simple MLP with one hidden layer, a configurable non-linearity (sigmoid or ReLU) on the hidden layer and
    a softmax on the output.
    """

    def __init__(self, input_size, output_size, hidden_act='sigmoid', hidden_mult=4):
        """

        :param input_size: Number of input features.
        :param output_size: Number of outputs (one per class; the outputs are passed through a softmax).
        :param hidden_act: Name of the hidden-layer activation: 'sigmoid' or 'ReLu'. The name is only checked
            when `forward` is called.
        :param hidden_mult: Multiplier that indicates how many times bigger the hidden layer is than the input layer.
        """
        super().__init__()

        hidden_size = hidden_mult * input_size
        self.hidden_act = hidden_act
        # -- There is no common wisdom on how big the hidden size should be, apart from the idea
        #    that it should be strictly _bigger_ than the input. If it isn't, the network won't be
        #    able to learn closed shapes like circles.

        self.layer1 = Linear(input_size, hidden_size)
        self.layer2 = Linear(hidden_size, output_size)

    def forward(self, input):
        """
        Compute the class probabilities for a batch.

        :param input: TensorNode of size (n, input_size).
        :return: TensorNode holding the softmax output of the second layer.
        :raises Exception: If `self.hidden_act` is not one of the recognized activation names.
        """

        assert len(input.size()) == 2

        # first layer
        hidden = self.layer1(input)

        # non-linearity
        if self.hidden_act == 'sigmoid':
            hidden = sigmoid(hidden)
        elif self.hidden_act == 'ReLu':
            hidden = relu(hidden)
        else:
            # The activation name lives on the instance; a bare `hidden_act` is not defined in this scope and
            # would turn this error into a NameError.
            raise Exception(f'Activation {self.hidden_act} not recognized.')

        # second layer
        output = self.layer2(hidden)
        output = softmax(output)

        return output

    def parameters(self):
        """Return the trainable TensorNodes of both layers, first layer first."""

        return self.layer1.parameters() + self.layer2.parameters()


In [None]:
def _accuracy(model, x, y):
    """
    Fraction of the rows of `x` whose arg-max model output equals the corresponding label in `y`.

    The forward pass builds a computation graph; it is cleared again before returning.
    """
    out = model(vg.TensorNode(x))

    predictions = np.argmax(out.value, axis=1)
    acc = (predictions == y).sum() / y.shape[0]

    out.clear() # gc the computation graph

    return acc


def train(data='synth', hidden_act='sigmoid', hidden_mult=4, batch_size=128, epochs=20, learning_rate=0.01):
    """
    Train an MLP with plain mini-batch gradient descent and record the accuracy per epoch.

    :param data: Which dataset to load: 'synth' or 'mnist'.
    :param hidden_act: Hidden-layer activation name, passed on to `MLP`.
    :param hidden_mult: Hidden-layer size multiplier, passed on to `MLP`.
    :param batch_size: Number of instances per gradient step (the last batch of an epoch may be smaller).
    :param epochs: Number of passes over the training data.
    :param learning_rate: Step size of the gradient descent update.
    :return: A pair of lists `(validation_acc, train_acc)` with one accuracy per epoch, each measured
        _before_ that epoch's parameter updates.
    :raises Exception: If `data` is not a recognized dataset name.
    """

    ## Load the data
    if data == 'synth':
        (xtrain, ytrain), (xval, yval), num_classes = vg.load_synth()
    elif data == 'mnist':
        (xtrain, ytrain), (xval, yval), num_classes = vg.load_mnist(final=False, flatten=True)
    else:
        # Report the function argument; there is no `args` object in this notebook.
        raise Exception(f'Dataset {data} not recognized.')

    print(f'## loaded data:')
    print(f'         number of instances: {xtrain.shape[0]} in training, {xval.shape[0]} in validation')
    print(f' training class distribution: {np.bincount(ytrain)}')
    print(f'     val. class distribution: {np.bincount(yval)}')

    n, num_features = xtrain.shape
    b = batch_size

    ## Create the model.
    # -- hidden_mult is forwarded explicitly; otherwise the argument of this function would be silently ignored.
    mlp = MLP(input_size=num_features, output_size=num_classes, hidden_act=hidden_act, hidden_mult=hidden_mult)

    print('\n## Starting training')

    validation_acc = list()
    train_acc = list()

    for epoch in range(epochs):

        print(f'epoch {epoch:03}')

        ## Compute validation and training accuracy at the start of every epoch
        validation_acc.append(_accuracy(mlp, xval, yval))

        acc = _accuracy(mlp, xtrain, ytrain)
        train_acc.append(acc)

        # -- The value printed here is the accuracy on the training data.
        print(f'       accuracy: {acc:.4}')

        cl = 0.0 # running sum of the training loss

        # We loop over the data in batches of size `b`
        for fr in range(0, n, b):

            # The end index of the batch
            to = min(fr + b, n)

            # Slice out the batch and its corresponding target values
            batch, targets = xtrain[fr:to, :], ytrain[fr:to]

            # Wrap the inputs in a Node
            batch = vg.TensorNode(value=batch)

            outputs = mlp(batch)
            loss = vg.celoss(outputs, targets)
            # -- The computation graph is now complete. It consists of the mlp, together with the computation of
            #    the scalar loss.
            # -- The variable `loss` is the TensorNode at the very top of our computation graph. This means we can
            #    call it to perform operations on the computation graph, like clearing the gradients, starting the
            #    backpropagation and clearing the graph.

            cl += loss.value
            # -- We must be careful here to extract the _raw_ value for the running loss. What would happen if we kept
            #    a running sum using the TensorNode?

            # Start the backpropagation
            loss.backward()

            # Apply gradient descent
            for parm in mlp.parameters():
                parm.value -= learning_rate * parm.grad
                # -- Note that we are directly manipulating the members of the parm TensorNode. This means that for this
                #    part, we are not building up a computation graph.

            # -- In Pytorch, the gradient descent is abstracted away into an Optimizer. This allows us to build slightly more
            #    complex optimizers than plain gradient descent.

            # Finally, we need to reset the gradients to zero ...
            loss.zero_grad()
            # ... and delete the parts of the computation graph we don't need to remember.
            loss.clear()

        print(f'   running loss: {cl:.4}')

    return validation_acc, train_acc


In [None]:
# Five independent training runs with the sigmoid activation; only the per-epoch
# validation accuracies of each run are kept.
sigs = list()
for i in range(5):
    sigval, sigtrain = train(hidden_act='sigmoid')
    sigs += [sigval]
    # two blank lines between the logs of consecutive runs
    print('\n')

In [None]:
# Five independent training runs with the ReLU activation; only the per-epoch
# validation accuracies of each run are kept.
rels = list()
for i in range(5):
    relval, reltrain = train(hidden_act='ReLu')
    rels += [relval]
    # two blank lines between the logs of consecutive runs
    print('\n')

In [None]:
# Stack the runs into (num_runs, num_epochs) arrays.
sigs = np.asarray(sigs)
rels = np.asarray(rels)

# Epoch axis, 1-based. Its length is taken from the stacked array itself, so this cell does not depend
# on a loop variable that happens to be left over from the training cells.
t = np.arange(1, sigs.shape[1] + 1)

# Mean validation accuracy per epoch with a +/- one standard deviation band across the runs.
for runs, label, color in ((sigs, 'Sigmoid', 'blue'), (rels, 'ReLu', 'red')):
    mean = runs.mean(axis=0)
    std = runs.std(axis=0)
    plt.plot(t, mean, lw=2, label=label, color=color)
    plt.fill_between(t, mean + std, mean - std, facecolor=color, alpha=0.5)


plt.legend(loc='lower right')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.grid()

plt.savefig('Question 8')
plt.show()

# Question 9

In [None]:
from __future__ import print_function
import torch
import torchvision

import torch.nn as nn
import torch.nn.functional as F
import torchvision.transforms as transforms
import torch.optim as optim

In [None]:
# Preprocessing: convert each image to a tensor, then normalize every one of the
# three channels with mean 0.5 and standard deviation 0.5.
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)),
])

# Training split: downloaded to ./data if needed, served in shuffled batches of 4.
trainset = torchvision.datasets.CIFAR10(
    root='./data', train=True, download=True, transform=transform)
trainloader = torch.utils.data.DataLoader(
    trainset, batch_size=4, shuffle=True, num_workers=2)

# Test split: same preprocessing, batches of 4 in a fixed order.
testset = torchvision.datasets.CIFAR10(
    root='./data', train=False, download=True, transform=transform)
testloader = torch.utils.data.DataLoader(
    testset, batch_size=4, shuffle=False, num_workers=2)

# Human-readable names of the ten classes; the per-class report indexes this tuple by label.
classes = (
    'plane', 'car', 'bird', 'cat', 'deer',
    'dog', 'frog', 'horse', 'ship', 'truck',
)

In [None]:
class Net(nn.Module):
    """
    Small convolutional classifier: three convolutions, two 2x2 max-pools and three
    fully connected layers that end in 10 raw class scores (no softmax is applied).

    The flatten step hard-codes 1024 values per image, i.e. 64 feature maps of 4x4,
    which is what a 3x32x32 input yields after the layers below.
    """

    def __init__(self):
        super().__init__()

        # Convolutional part.
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=16, kernel_size=3)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
        self.conv2 = nn.Conv2d(in_channels=16, out_channels=32, kernel_size=5)
        self.conv3 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=5)
        self.pool2 = nn.MaxPool2d(kernel_size=2, stride=2)

        # Fully connected part.
        self.fc1 = nn.Linear(in_features=1024, out_features=128)
        self.fc2 = nn.Linear(in_features=128, out_features=64)
        self.fc3 = nn.Linear(in_features=64, out_features=10)

        # One dropout module, applied after each of the two pooling steps.
        self.dropout = nn.Dropout(p=0.2)

    def forward(self, x):
        """Map a batch of images to a (batch, 10) tensor of class scores."""
        features = F.relu(self.conv1(x))
        features = self.dropout(self.pool(F.relu(self.conv2(features))))
        features = self.dropout(self.pool2(F.relu(self.conv3(features))))

        # Flatten to one row of 1024 values per image.
        flat = features.view(-1, 1024)

        hidden = F.relu(self.fc1(flat))
        hidden = F.relu(self.fc2(hidden))
        return self.fc3(hidden)


# Use the first CUDA device when one is available, otherwise stay on the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

# Build the network and move its parameters to the chosen device.
net = Net()
net.to(device)

In [None]:
# Cross-entropy loss on the scores produced by the network.
criterion = nn.CrossEntropyLoss()

# Stochastic gradient descent with momentum over all parameters of `net`.
optimizer = optim.SGD(
    params=net.parameters(),
    lr=0.001,
    momentum=0.9,
)

In [None]:
# Put the network in training mode explicitly, so that dropout is active no matter which mode an
# earlier (possibly re-run) cell left it in.
net.train()

log_every = 2000  # report the average loss once per this many mini-batches

for epoch in range(5):  # loop over the dataset multiple times

    running_loss = 0.0
    for i, data in enumerate(trainloader, 0):
        # get the inputs; data is a list of [inputs, labels]
        inputs, labels = data[0].to(device), data[1].to(device)

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = net(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # print statistics
        running_loss += loss.item()
        if i % log_every == log_every - 1:
            # average loss over the mini-batches seen since the previous report
            print('[%d, %5d] loss: %.3f' %
                  (epoch + 1, i + 1, running_loss / log_every))
            running_loss = 0.0

print('Finished Training')

In [None]:
# Overall accuracy on the test set.
correct = 0
total = 0

# Evaluate in eval mode: otherwise the dropout layers keep dropping activations at random and the
# measured accuracy is noisy and biased low. The previous mode is restored afterwards so that
# re-running the training cell behaves exactly as before.
was_training = net.training
net.eval()
try:
    with torch.no_grad():
        for data in testloader:
            images, labels = data[0].to(device), data[1].to(device)
            outputs = net(images)
            # index of the highest score per image = predicted class
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
finally:
    net.train(was_training)

print('Accuracy of the network on the 10000 test images: %d %%' % (
    100 * correct / total))

In [None]:
# Per-class accuracy on the test set.
class_correct = [0.0] * len(classes)
class_total = [0.0] * len(classes)

# Evaluate in eval mode so that dropout is switched off; the previous mode is restored afterwards.
was_training = net.training
net.eval()
try:
    with torch.no_grad():
        for data in testloader:
            images, labels = data[0].to(device), data[1].to(device)
            outputs = net(images)
            _, predicted = torch.max(outputs, 1)
            hits = (predicted == labels)
            # Walk over the actual contents of the batch instead of assuming exactly four samples,
            # so any batch size (including a shorter final batch) is handled.
            for label, hit in zip(labels.tolist(), hits.tolist()):
                class_correct[label] += hit
                class_total[label] += 1
finally:
    net.train(was_training)


for i in range(len(classes)):
    if class_total[i] == 0:
        # no test sample of this class was seen; avoid dividing by zero
        print('Accuracy of %5s : n/a (no samples)' % classes[i])
        continue
    print('Accuracy of %5s : %2d %%' % (
        classes[i], 100 * class_correct[i] / class_total[i]))