### Train Neural Network

In [1]:
#First imports
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

from collections import OrderedDict

import numpy as np
import time

import torch
from torch import nn
from torch import optim
import torch.nn.functional as F

import helper

In [None]:
# First create a random tensor
x = torch.rand(4,4, requires_grad=True)
x

In [None]:
y = x**2
y

To calculate the gradients, you need to run the `.backward` method on a Variable, `z` for example. This will calculate the gradient for `z` with respect to `x`

$$
\frac{\partial z}{\partial x} = \frac{\partial}{\partial x}\left[\frac{1}{n}\sum_i^n x_i^2\right] = \frac{x}{8}
$$

In this example we n = 16

In [None]:
z = y.mean()
z.backward()
print(x.grad)
print(x*0.125)

## Get the data and define the network

We'll load the MNIST dataset and define our network.

In [2]:
from torchvision import datasets, transforms

# Define a transform to normalize the data
transform = transforms.Compose([transforms.ToTensor(),
                              transforms.Normalize([0.5],[0.5], inplace=True)])
# Download and load the training data
trainset = datasets.MNIST('MNIST_data/', download=True, train=True, transform=transform)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=64, shuffle=True)

I'll build a network with `nn.Sequential` here. Only difference from the last part is I'm not actually using softmax on the output, but instead just using the raw output from the last layer. This is because the output from softmax is a probability distribution. Often, the output will have values really close to zero or really close to one. Due to [inaccuracies with representing numbers as floating points](https://docs.python.org/3/tutorial/floatingpoint.html), computations with a softmax output can lose accuracy and become unstable. To get around this, we'll use the raw output, called the **logits**, to calculate the loss.

In [3]:
# Hyperparameters for our network
input_size = 784
hidden_sizes = [128, 64]
output_size = 10

# Build a feed-forward network
model = nn.Sequential(OrderedDict([
                      ('fc1', nn.Linear(input_size, hidden_sizes[0])),
                      ('relu1', nn.ReLU()),
                      ('fc2', nn.Linear(hidden_sizes[0], hidden_sizes[1])),
                      ('relu2', nn.ReLU()),
                      ('logits', nn.Linear(hidden_sizes[1], output_size))]))

## Training the network!

The first thing we need to do for training is define our loss function. In PyTorch, you'll usually see this as `criterion`. Here we're using softmax output, so we want to use `criterion = nn.CrossEntropyLoss()` as our loss. Later when training, you use `loss = criterion(output, targets)` to calculate the actual loss.

We also need to define the optimizer we're using, SGD or Adam, or something along those lines. Here I'll just use SGD with `torch.optim.SGD`, passing in the network parameters and the learning rate.

In [4]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01) #lr is learnign rate

First, let's consider just one learning step before looping through all the data. The general process with PyTorch:

* Make a forward pass through the network to get the logits 
* Use the logits to calculate the loss
* Perform a backward pass through the network with `loss.backward()` to calculate the gradients
* Take a step with the optimizer to update the weights

Below I'll go through one training step and print out the weights and gradients so you can see how it changes.

In [5]:
# So this is basically one step of training
print("Before", model.fc1.weight)
images, labels = next(iter(trainloader))
images.resize_(64, 784)

# Here we zero out the gradients because in case we would not 
# gradient from every epoch would cumulate
optimizer.zero_grad()

# Forward pass
output = model.forward(images)
loss = criterion(output, labels)
loss.backward()
print("Gradient - ", model.fc1.weight.grad)
optimizer.step()

Before Parameter containing:
tensor([[-0.0315,  0.0113, -0.0030,  ..., -0.0283,  0.0346, -0.0038],
        [-0.0277,  0.0219, -0.0158,  ...,  0.0278, -0.0135, -0.0305],
        [ 0.0024, -0.0276,  0.0306,  ..., -0.0113,  0.0036, -0.0310],
        ...,
        [-0.0166, -0.0077,  0.0053,  ..., -0.0076, -0.0300,  0.0196],
        [-0.0027,  0.0167,  0.0286,  ...,  0.0203, -0.0054,  0.0245],
        [ 0.0258, -0.0265, -0.0287,  ..., -0.0203, -0.0078,  0.0091]],
       requires_grad=True)
Gradient -  tensor([[ 6.7617e-04,  6.7617e-04,  6.7617e-04,  ...,  6.7617e-04,
          6.7617e-04,  6.7617e-04],
        [-3.1951e-03, -3.1951e-03, -3.1951e-03,  ..., -3.1951e-03,
         -3.1951e-03, -3.1951e-03],
        [ 2.8656e-03,  2.8656e-03,  2.8656e-03,  ...,  2.8656e-03,
          2.8656e-03,  2.8656e-03],
        ...,
        [-5.9396e-04, -5.9396e-04, -5.9396e-04,  ..., -5.9396e-04,
         -5.9396e-04, -5.9396e-04],
        [-7.7773e-05, -7.7773e-05, -7.7773e-05,  ..., -7.7773e-05,
      

In [6]:
print("Updated weights - ", model.fc1.weight)

Updated weights -  Parameter containing:
tensor([[-0.0315,  0.0113, -0.0030,  ..., -0.0283,  0.0346, -0.0038],
        [-0.0277,  0.0219, -0.0157,  ...,  0.0278, -0.0135, -0.0305],
        [ 0.0024, -0.0277,  0.0305,  ..., -0.0113,  0.0036, -0.0311],
        ...,
        [-0.0166, -0.0077,  0.0053,  ..., -0.0076, -0.0299,  0.0196],
        [-0.0027,  0.0167,  0.0286,  ...,  0.0203, -0.0054,  0.0245],
        [ 0.0258, -0.0265, -0.0288,  ..., -0.0204, -0.0079,  0.0091]],
       requires_grad=True)


### Real training
Now we have to put that logic into a loop in order to train on every image in dataset

In [7]:
for images, labels in trainloader:
    print(images.shape)
    break

torch.Size([64, 1, 28, 28])


In [8]:
optimizer = optim.SGD(model.parameters(), lr=0.003)

In [9]:
epochs = 3
print_every = 40
steps = 0
for e in range(epochs):
    running_loss = 0
    for images, labels in iter(trainloader):
        steps += 1
        # Flatten MNIST images into a 784 long vector
        images.resize_(images.size()[0], 784)
        
        optimizer.zero_grad()
        
        # Forward and backward passes
        output = model.forward(images)
        loss = criterion(output, labels)
        loss.backward()
        optimizer.step()
        
        running_loss += loss.item()
        
        if steps % print_every == 0:
            print("Epoch: {}/{}... ".format(e+1, epochs),
                  "Loss: {:.4f}".format(running_loss/print_every))
            
            running_loss = 0

Epoch: 1/3...  Loss: 2.2932
Epoch: 1/3...  Loss: 2.2729
Epoch: 1/3...  Loss: 2.2529
Epoch: 1/3...  Loss: 2.2315
Epoch: 1/3...  Loss: 2.2118
Epoch: 1/3...  Loss: 2.1828
Epoch: 1/3...  Loss: 2.1491
Epoch: 1/3...  Loss: 2.1201
Epoch: 1/3...  Loss: 2.0831
Epoch: 1/3...  Loss: 2.0574
Epoch: 1/3...  Loss: 2.0092
Epoch: 1/3...  Loss: 1.9585
Epoch: 1/3...  Loss: 1.8938
Epoch: 1/3...  Loss: 1.8533
Epoch: 1/3...  Loss: 1.7820
Epoch: 1/3...  Loss: 1.7332
Epoch: 1/3...  Loss: 1.6605
Epoch: 1/3...  Loss: 1.5799
Epoch: 1/3...  Loss: 1.5208
Epoch: 1/3...  Loss: 1.4705
Epoch: 1/3...  Loss: 1.4004
Epoch: 1/3...  Loss: 1.3549
Epoch: 1/3...  Loss: 1.2748
Epoch: 2/3...  Loss: 0.6569
Epoch: 2/3...  Loss: 1.1701
Epoch: 2/3...  Loss: 1.1474
Epoch: 2/3...  Loss: 1.0847
Epoch: 2/3...  Loss: 1.0185
Epoch: 2/3...  Loss: 0.9996
Epoch: 2/3...  Loss: 0.9430
Epoch: 2/3...  Loss: 0.9111
Epoch: 2/3...  Loss: 0.8825
Epoch: 2/3...  Loss: 0.8574
Epoch: 2/3...  Loss: 0.8418
Epoch: 2/3...  Loss: 0.7937
Epoch: 2/3...  Loss:

In [None]:
# we have to declare number of epochs
epochs = 3
print_every = 40
steps = 0
for epoch in range(epochs):
    running_loss = 0
    for images, labels in iter(trainloader):
        steps += 1
        
        # here we put image into a vector 28 x 28 = 784
        images.resize_(images.shape[0], 784)
        
        # zero out gradient
        optimizer.zero_grad()
        
        output = model.forward(images)
        # here loss is just a number = scalar tensor
        loss = criterion(output, labels)
        loss.backward()
        optimizer.step()
        
        #we want to extract number from scalar tensor
        running_loss += loss.item()
        
        if steps & print_every == 0:
            print(f"Epoch: {epoch+1}/{epochs}",
                 f"Loss: {(running_loss/print_every)}")
            running_loss = 0
        
        
        
        