In [1]:
# Setting up the dataset

from pathlib import Path
import requests

# Downloading the dataset from the link

DATA_PATH = Path("data")
PATH = DATA_PATH / "mnist"
PATH.mkdir(parents=True, exist_ok=True)

URL = "https://github.com/pytorch/tutorials/raw/master/_static/"
FILENAME = "mnist.pkl.gz"

# Download the archive only once; later runs reuse the copy cached on disk.
if not (PATH / FILENAME).exists():
    response = requests.get(URL + FILENAME, timeout=60)
    # Fail loudly on HTTP errors so an error page is never cached as the dataset.
    response.raise_for_status()
    # write_bytes opens, writes and closes the file in one call (no leaked handle).
    (PATH / FILENAME).write_bytes(response.content)

In [2]:
# unpacking the data by unpickling
import pickle
import gzip

# NOTE(security): pickle.load can execute arbitrary code while deserializing.
# Only run this cell on an archive obtained from a source you trust.
# gzip.open accepts path-like objects, so the Path is passed directly.
with gzip.open(PATH / FILENAME, "rb") as f:
    # The pickle holds three (inputs, labels) pairs; the third one is discarded.
    ((x_train, y_train), (x_valid, y_valid), _) = pickle.load(f, encoding="latin-1")

In [3]:
import torch

# Transforming the x and y datasets into tensors for Pytorch usage
# Convert the four unpickled arrays into PyTorch tensors in one pass
x_train, y_train, x_valid, y_valid = (
    torch.tensor(array) for array in (x_train, y_train, x_valid, y_valid)
)

# n = number of training rows, c = number of columns per row
n, c = x_train.shape
print(x_train, y_train)
print(x_train.shape)
print(y_train.min(), y_train.max())

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]]) tensor([5, 0, 4,  ..., 8, 4, 8])
torch.Size([50000, 784])
tensor(0) tensor(9)


In [4]:
# Training a neural net without the nn module
import math

# Random weights scaled by 1/sqrt(784). Gradient tracking is switched on only
# after the scaling so that the division is not recorded in the autograd graph.
weights = (torch.randn(784, 10) / math.sqrt(784)).requires_grad_()
# Biases start at zero and are tracked by autograd from creation
bias = torch.zeros(10, requires_grad=True)

In [5]:
# We need an activation function
def log_softmax(x):
    return x - x.exp().sum(-1).log().unsqueeze(-1)

# We can return any function as a model because pytorch has automatic gradient calculation
def model(xb):
    """Linear layer followed by log-softmax, built from the global parameters.

    Args:
        xb: minibatch of inputs whose last dimension matches ``weights`` rows.

    Returns:
        Log-probabilities, one row per input example.
    """
    logits = xb @ weights + bias  # @ is matrix multiplication
    return log_softmax(logits)

In [6]:
# Create one forward pass
bs = 64  # minibatch size
xb = x_train[:bs]  # the first minibatch of training inputs
preds = model(xb)  # one row of model outputs per example in the minibatch
print(preds[0], preds.shape)

tensor([-2.6538, -2.4375, -2.5406, -2.0018, -2.1615, -1.7220, -2.7385, -2.6682,
        -2.0823, -2.5799], grad_fn=<SelectBackward0>) torch.Size([64, 10])


In [7]:
# Create a loss function to do backpropagation
def nll(input, target):
    """Negative log-likelihood averaged over the batch.

    Args:
        input: 2-D tensor of log-probabilities, one row per example.
        target: 1-D integer tensor giving the true class index of each row.

    Returns:
        Scalar tensor: minus the mean log-probability assigned to the true classes.
    """
    rows = range(target.shape[0])
    true_class_log_probs = input[rows, target]  # pick one entry per row
    return -true_class_log_probs.mean()


loss_func = nll

In [8]:
# Calculate the loss on the predictions vs the ground truth
yb = y_train[:bs]  # labels belonging to the first minibatch
print(loss_func(preds, yb))

tensor(2.3777, grad_fn=<NegBackward0>)


In [9]:
# Calculate the accuracy of our predictions
def accuracy(out, yb):
    """Fraction of rows in ``out`` whose highest-scoring class equals ``yb``.

    Args:
        out: 2-D tensor of per-class scores, one row per example.
        yb: 1-D tensor of true class indices.

    Returns:
        Scalar float tensor between 0 and 1.
    """
    predicted_class = out.argmax(dim=1)
    hits = (predicted_class == yb).float()
    return hits.mean()

In [10]:
# Accuracy of the still-untrained model on the first minibatch
print(accuracy(preds, yb))

tensor(0.0781)


In [11]:
# Basic training

lr = 0.5
epochs = 2

for epoch in range(epochs):
    # (n - 1) // bs + 1 is the number of minibatches, counting a short final one
    for i in range((n - 1) // bs + 1):
        batch = slice(i * bs, (i + 1) * bs)
        xb, yb = x_train[batch], y_train[batch]  # the next minibatch
        loss = loss_func(model(xb), yb)  # predictions vs ground truth

        loss.backward()  # populate weights.grad and bias.grad
        # The updates themselves must not be recorded by autograd
        with torch.no_grad():
            for param in (weights, bias):
                param.sub_(param.grad * lr)  # gradient-descent step, in place
                param.grad.zero_()  # otherwise gradients accumulate across minibatches

In [12]:
# xb, yb still hold the last training minibatch, so this is not a held-out score
last_out = model(xb)
print(loss_func(last_out, yb), accuracy(last_out, yb))

tensor(0.0814, grad_fn=<NegBackward0>) tensor(1.)


In [13]:
# Implementing loss function with nn.functional
import torch.nn.functional as F

# F.cross_entropy applies log-softmax and negative log-likelihood itself,
# so the model below returns raw scores instead of log-probabilities.
loss_func = F.cross_entropy


def model(xb):
    """Return raw class scores for a minibatch using the global parameters."""
    return torch.matmul(xb, weights) + bias

In [14]:
# Same minibatch as before, now scored with F.cross_entropy on raw scores
last_out = model(xb)
print(loss_func(last_out, yb), accuracy(last_out, yb))

tensor(0.0814, grad_fn=<NllLossBackward0>) tensor(1.)


In [15]:
# Implementing the weights and biases using built in nn.Module functionality
from torch import nn

class Mnist_Logistic(nn.Module):
    """Single linear layer (784 inputs, 10 outputs) with hand-declared parameters."""

    def __init__(self):
        # nn.Module.__init__ must run first so that parameters get registered
        super().__init__()
        initial_weights = torch.randn(784, 10).div(math.sqrt(784))
        # Wrapping tensors in nn.Parameter exposes them through .parameters()
        self.weights = nn.Parameter(initial_weights)
        self.bias = nn.Parameter(torch.zeros(10))

    def forward(self, xb):
        """Return raw class scores for the minibatch ``xb``."""
        return torch.matmul(xb, self.weights) + self.bias

In [16]:
# Rebinds `model`: from here on it is an nn.Module instance, not the earlier function
model = Mnist_Logistic()

In [17]:
# Loss of the freshly initialised module on the current module-level xb, yb
print(loss_func(model(xb), yb))

tensor(2.4315, grad_fn=<NllLossBackward0>)


In [18]:
# Redefine our training loop into a function
def fit():
    """Train the global ``model`` in place with plain minibatch gradient descent.

    Reads ``epochs``, ``n``, ``bs``, ``lr``, ``x_train``, ``y_train``, ``model``
    and ``loss_func`` from module scope. Returns nothing.
    """
    n_batches = (n - 1) // bs + 1  # counts a short final minibatch too
    for epoch in range(epochs):
        for i in range(n_batches):
            batch = slice(i * bs, (i + 1) * bs)
            xb, yb = x_train[batch], y_train[batch]
            loss = loss_func(model(xb), yb)
            loss.backward()

            # Update every registered parameter without recording the update
            with torch.no_grad():
                for p in model.parameters():
                    p.sub_(p.grad * lr)
                # One call resets the gradients of all parameters of the module
                model.zero_grad()

In [19]:
# Train the global model in place
fit()
# fit() keeps its own local xb, yb, so this uses the module-level minibatch
print(loss_func(model(xb), yb))

tensor(0.0831, grad_fn=<NllLossBackward0>)


In [20]:
# Refactor our model using Linear layers
class Mnist_Logistic(nn.Module):
    """Logistic-regression model backed by a single ``nn.Linear`` layer.

    Redefines the earlier class of the same name; ``nn.Linear`` now creates
    and initialises the weight and bias parameters.
    """

    def __init__(self):
        super().__init__()
        # 784 input features mapped to 10 output scores
        self.lin = nn.Linear(in_features=784, out_features=10)

    def forward(self, xb):
        """Return raw class scores for the minibatch ``xb``."""
        scores = self.lin(xb)
        return scores

In [21]:
# Check that everything works the same as before
# Fresh, untrained instance of the nn.Linear-based model
model = Mnist_Logistic()
# Loss before any training, on the current module-level xb, yb
print(loss_func(model(xb), yb))

tensor(2.3576, grad_fn=<NllLossBackward0>)


In [22]:
# fit() reads the global `model`, so it trains the instance created above
fit()

# Loss after training, on the current module-level xb, yb
print(loss_func(model(xb), yb))

tensor(0.0818, grad_fn=<NllLossBackward0>)


In [23]:
# We can further improve our training function by using torch.optim
from torch import optim

In [24]:
# We can get the model and optimizer in one function
def get_model():
    """Create a fresh ``Mnist_Logistic`` and an SGD optimizer over its parameters.

    The learning rate is read from the module-level ``lr`` when this is called.

    Returns:
        Tuple ``(model, optimizer)``.
    """
    net = Mnist_Logistic()
    sgd = optim.SGD(net.parameters(), lr=lr)
    return net, sgd


model, opt = get_model()


In [25]:
for epoch in range(epochs):
    for i in range((n - 1) // bs + 1):
        batch = slice(i * bs, i * bs + bs)
        xb, yb = x_train[batch], y_train[batch]
        loss = loss_func(model(xb), yb)

        loss.backward()
        opt.step()  # the optimizer updates every parameter it was given
        opt.zero_grad()  # clear gradients so they do not carry over to the next minibatch

# Loss on the last minibatch processed by the loop
print(loss_func(model(xb), yb))

tensor(0.0797, grad_fn=<NllLossBackward0>)


In [26]:
# Learning how to use Dataset

from torch.utils.data import TensorDataset

# TensorDataset pairs the inputs with their labels, so one index or slice
# yields the matching (x, y) together
train_ds = TensorDataset(x_train, y_train)

In [27]:
model, opt = get_model()

n_batches = (n - 1) // bs + 1
for epoch in range(epochs):
    for i in range(n_batches):
        start = i * bs
        # Slicing the dataset returns the x batch and the y batch together
        xb, yb = train_ds[start : start + bs]
        loss = loss_func(model(xb), yb)

        loss.backward()
        opt.step()
        opt.zero_grad()

# Loss on the last minibatch processed by the loop
print(loss_func(model(xb), yb))

tensor(0.0815, grad_fn=<NllLossBackward0>)


In [28]:
# Further improvements to minibatch via dataloader
from torch.utils.data import DataLoader

# Same dataset as before, re-created so this cell can run on its own
train_ds = TensorDataset(x_train, y_train)
# The DataLoader yields minibatches of `bs` examples when iterated
train_dl = DataLoader(train_ds, batch_size=bs)

In [29]:
model, opt = get_model()

for epoch in range(epochs):
    # Iterating the DataLoader yields one (inputs, labels) minibatch at a time
    for xb, yb in train_dl:
        loss = loss_func(model(xb), yb)
        loss.backward()
        opt.step()
        opt.zero_grad()

# Loss on the last minibatch yielded by the loader
print(loss_func(model(xb), yb))

tensor(0.0813, grad_fn=<NllLossBackward0>)


In [30]:
# Adding validation to our training
train_ds = TensorDataset(x_train, y_train)
# shuffle=True reorders the training examples every time the loader is iterated
train_dl = DataLoader(train_ds, batch_size=bs, shuffle=True)

valid_ds = TensorDataset(x_valid, y_valid)
# Validation uses double-sized batches and no shuffling; it needs no backward pass
valid_dl = DataLoader(valid_ds, batch_size=bs * 2)

In [31]:
model, opt = get_model()

for epoch in range(epochs):
    # train() switches the module to training mode; it does not select a dataset.
    # Some layer types behave differently in training and evaluation mode.
    model.train()
    for xb, yb in train_dl:
        pred = model(xb)
        loss = loss_func(pred, yb)

        loss.backward()
        opt.step()
        opt.zero_grad()

    # eval() switches the module to evaluation mode for the validation pass
    model.eval()
    with torch.no_grad():
        # loss_func returns a per-batch mean, so each batch is weighted by its
        # size; a plain average of batch means would over-weight a short last batch.
        valid_loss = sum(
            loss_func(model(vxb), vyb) * len(vxb) for vxb, vyb in valid_dl
        )

    # Divide by the number of validation examples, not the number of batches
    print(epoch, valid_loss / len(valid_dl.dataset))

0 tensor(0.4322)
1 tensor(0.2791)


In [32]:
# Move the per-batch loss calculation into its own function
def loss_batch(model, loss_func, xb, yb, opt=None):
    """Compute the loss for one minibatch and optionally take an optimizer step.

    Args:
        model: callable mapping ``xb`` to predictions.
        loss_func: callable taking ``(predictions, yb)`` and returning a scalar tensor.
        xb: minibatch of inputs.
        yb: minibatch of targets.
        opt: optimizer; when given, backpropagation and one update step are run.

    Returns:
        Tuple ``(loss as a Python float, number of examples in xb)``.
    """
    loss = loss_func(model(xb), yb)
    batch_size = len(xb)

    # Evaluation only: report the loss without touching the parameters
    if opt is None:
        return loss.item(), batch_size

    loss.backward()
    opt.step()
    opt.zero_grad()
    return loss.item(), batch_size

In [33]:
# Define the fit function for training/validation
import numpy as np

def fit(epochs, model, loss_func, opt, train_dl, valid_dl):
    """Train ``model`` for ``epochs`` epochs, printing the validation loss after each.

    Args:
        epochs: number of passes over ``train_dl``.
        model: module to train; updated in place.
        loss_func: callable taking ``(predictions, targets)``.
        opt: optimizer used for the training batches.
        train_dl: iterable of training ``(xb, yb)`` minibatches.
        valid_dl: iterable of validation ``(xb, yb)`` minibatches.
    """
    for epoch in range(epochs):
        # Training pass: passing the optimizer makes loss_batch update the model
        model.train()
        for xb, yb in train_dl:
            loss_batch(model, loss_func, xb, yb, opt)

        # Validation pass: no optimizer, so the parameters are left untouched
        model.eval()
        with torch.no_grad():
            results = [loss_batch(model, loss_func, xb, yb) for xb, yb in valid_dl]
        losses, nums = zip(*results)
        # Average weighted by batch size, so a short final batch counts less
        val_loss = np.sum(np.multiply(losses, nums)) / np.sum(nums)

        print(epoch, val_loss)

In [34]:
# Build the training and validation DataLoaders
# The training loader shuffles so that batches are not presented in the same order every epoch
def get_data(train_ds, valid_ds, bs):
    """Wrap the two datasets in DataLoaders.

    Args:
        train_ds: training dataset; loaded in shuffled batches of ``bs``.
        valid_ds: validation dataset; loaded unshuffled in batches of ``2 * bs``.
        bs: training batch size.

    Returns:
        Tuple ``(train_loader, valid_loader)``.
    """
    train_loader = DataLoader(train_ds, batch_size=bs, shuffle=True)
    valid_loader = DataLoader(valid_ds, batch_size=bs * 2)
    return train_loader, valid_loader

In [35]:
# We can use these three lines to perform our training
# 1) build the loaders, 2) build a fresh model and optimizer, 3) train and validate
train_dl, valid_dl = get_data(train_ds, valid_ds, bs)
model, opt = get_model()
fit(epochs, model, loss_func, opt, train_dl, valid_dl)

0 0.33284156425595285
1 0.2883918229401112


In [36]:
# Switching our model to a CNN
class Mnist_CNN(nn.Module):
    """Three stride-2 convolutions followed by average pooling, giving 10 scores."""

    def __init__(self):
        super().__init__()
        # Every layer uses a 3x3 kernel, stride 2 and padding 1
        self.conv1 = nn.Conv2d(1, 16, kernel_size=3, stride=2, padding=1)
        self.conv2 = nn.Conv2d(16, 16, kernel_size=3, stride=2, padding=1)
        self.conv3 = nn.Conv2d(16, 10, kernel_size=3, stride=2, padding=1)

    def forward(self, xb):
        """Return one row of 10 scores per image in the flat minibatch ``xb``."""
        # Reshape the flat rows into single-channel 28 x 28 images
        xb = xb.view(-1, 1, 28, 28)
        # ReLU follows each convolution
        for conv in (self.conv1, self.conv2, self.conv3):
            xb = F.relu(conv(xb))
        pooled = F.avg_pool2d(xb, 4)
        # Collapse to (batch, channels)
        return pooled.view(-1, pooled.size(1))

# Rebinds the module-level learning rate; get_model() reads `lr` at call time,
# so later calls to it will use this value as well
lr = 0.1

In [37]:
model = Mnist_CNN()
# SGD with momentum=0.9 in addition to the learning rate
opt = optim.SGD(model.parameters(), lr=lr, momentum=0.9) # include momentum into our optimizer to improve training

# Reuses the loaders created earlier; the model reshapes the flat rows itself
fit(epochs, model, loss_func, opt, train_dl, valid_dl)

0 0.315023463845253
1 0.24123036992549896


In [38]:
# Implement sequential to improve the model implementation

# We need to create a custom layer that will reshape our tensor first, since view is not a layer

class Lambda(nn.Module):
    """Module wrapper that lets an arbitrary function be used as a layer."""

    def __init__(self, func):
        """Store ``func``, the callable applied on every forward pass."""
        super().__init__()
        self.func = func

    def forward(self, x):
        """Apply the stored function to ``x`` and return its result."""
        result = self.func(x)
        return result
  
def preprocess(x):
    """View a batch of flat rows as single-channel 28 x 28 images."""
    image_shape = (-1, 1, 28, 28)  # -1 lets the batch size be inferred
    return x.view(*image_shape)

In [39]:
# The same CNN expressed as a flat list of layers run in order
model = nn.Sequential(
    # Reshape the flat rows into single-channel 28 x 28 images
    Lambda(preprocess),
    nn.Conv2d(1, 16, kernel_size=3, stride=2, padding=1),
    nn.ReLU(),
    nn.Conv2d(16, 16, kernel_size=3, stride=2, padding=1),
    nn.ReLU(),
    nn.Conv2d(16, 10, kernel_size=3, stride=2, padding=1),
    nn.ReLU(),
    nn.AvgPool2d(4),
    # Flatten everything after the batch dimension
    Lambda(lambda x: x.view(x.size(0), -1)),
)

opt = optim.SGD(model.parameters(), lr=lr, momentum=0.9)

fit(epochs, model, loss_func, opt, train_dl, valid_dl)

0 0.31321035504341127
1 0.2552851420402527


In [40]:
# Creating a dataloader that works with any sized image

def preprocess(x, y):
    """Reshape the inputs of a batch to 28 x 28 images and pass the labels through.

    Redefines the earlier one-argument ``preprocess`` so that it can be applied
    to the ``(x, y)`` pairs a DataLoader yields.
    """
    images = x.view(-1, 1, 28, 28)
    return images, y

class WrappedDataLoader:
    """Iterable that applies ``func`` to every batch produced by ``dl``."""

    def __init__(self, dl, func):
        """Keep the wrapped loader and the per-batch function."""
        self.func = func
        self.dl = dl

    def __len__(self):
        """Number of batches, taken from the wrapped loader."""
        return len(self.dl)

    def __iter__(self):
        """Yield ``func(*batch)`` for each batch of the wrapped loader."""
        for batch in self.dl:
            yield self.func(*batch)

      

In [41]:
train_dl, valid_dl = get_data(train_ds, valid_ds, bs)
# Each batch now passes through preprocess(x, y) before reaching the model
train_dl = WrappedDataLoader(train_dl, preprocess) # Preprocess our batches to be the correct size
valid_dl = WrappedDataLoader(valid_dl, preprocess)

In [42]:
# Use adaptive AvgPool2d so that the output tensor can be custom size
model = nn.Sequential(
    # No reshaping layer here: the wrapped loaders already deliver image-shaped batches
    nn.Conv2d(1, 16, kernel_size=3, stride=2, padding=1), # note that the initial preprocess is gone
    nn.ReLU(),
    nn.Conv2d(16, 16, kernel_size=3, stride=2, padding=1),
    nn.ReLU(),
    nn.Conv2d(16, 10, kernel_size=3, stride=2, padding=1),
    nn.ReLU(),
    # The argument is the desired output size rather than a pooling window
    nn.AdaptiveAvgPool2d(1), 
    # Flatten everything after the batch dimension
    Lambda(lambda x: x.view(x.size(0), -1)),
)

opt = optim.SGD(model.parameters(), lr=lr, momentum=0.9)

In [43]:
# Train the adaptive-pooling CNN on the wrapped (preprocessed) loaders
fit(epochs, model, loss_func, opt, train_dl, valid_dl)

0 0.3511363983631134
1 0.23028322924375533
