In [1]:
import numpy as np
import matplotlib.pyplot as plt

In [2]:
import torch
import torch.optim as optim
from torch.utils.data import DataLoader

In [3]:
import torchvision.datasets as datasets
import torchvision.transforms as transforms

In [4]:
from torch.nn.functional import conv2d, max_pool2d, cross_entropy

In [5]:
# Render figures at 100 dpi.
plt.rc("figure", dpi=100)

# Number of samples/images processed in each mini-batch; used by both data loaders.
batch_size = 100

# Seed PyTorch's global random number generator so that the random weight
# initialisation and the shuffling of the training data are repeatable when the
# notebook is restarted and run from the top.
seed = 0
torch.manual_seed(seed)

In [6]:
# transform images into normalized tensors
#   1. ToTensor  - converts the image from PIL (Python Imaging Library) or numpy
#                  array format into a tensor that can be fed to a neural network.
#   2. Normalize - subtracts the mean and divides by the standard deviation;
#                  both are set to 0.5 here.
preprocessing_steps = [
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.5,), std=(0.5,)),
]
transform = transforms.Compose(preprocessing_steps)


In [7]:
# MNIST training split, stored under the current directory (downloaded on demand);
# every image is passed through `transform` when it is accessed.
train_dataset = datasets.MNIST(root="./", train=True, transform=transform, download=True)

In [8]:
# MNIST test split, stored under the current directory (downloaded on demand);
# it uses the same preprocessing as the training split.
test_dataset = datasets.MNIST(root="./", train=False, transform=transform, download=True)


In [9]:
# Mini-batch iterator over the training set; shuffle=True reshuffles the samples
# at every epoch.
train_loader_options = dict(
    batch_size=batch_size,
    shuffle=True,
    num_workers=1,
    pin_memory=True,
)
train_dataloader = DataLoader(train_dataset, **train_loader_options)


In [10]:
# Mini-batch iterator over the test set; the sample order is kept fixed
# (shuffle=False).
test_loader_options = dict(
    batch_size=batch_size,
    shuffle=False,
    num_workers=1,
    pin_memory=True,
)
test_dataloader = DataLoader(test_dataset, **test_loader_options)


In [11]:
def init_weights(shape):
    """Create a trainable weight tensor using Kaiming He initialization.

    The entries are drawn from a standard normal distribution and scaled by
    ``sqrt(2 / shape[0])``, i.e. the first dimension of ``shape`` is used as the
    fan-in. A good initialization is important; see
    https://arxiv.org/abs/1502.01852

    Args:
        shape: sequence of ints giving the shape of the weight tensor.

    Returns:
        A tensor of the requested shape with ``requires_grad`` set to True.
    """
    fan_in = shape[0]
    scale = np.sqrt(2. / fan_in)
    # keep the tensor as the left operand so the product stays a torch tensor
    weights = torch.randn(size=shape) * scale
    return weights.requires_grad_(True)


In [12]:
def rectify(x):
    """Rectified Linear Unit (ReLU): element-wise maximum of ``x`` and zero.

    Args:
        x: input tensor.

    Returns:
        A tensor shaped like ``x`` in which negative entries are replaced by 0.
    """
    zeros = torch.zeros_like(x)
    return torch.max(zeros, x)

In [13]:
class RMSprop(optim.Optimizer):
    """
    This is a reduced version of the PyTorch internal RMSprop optimizer
    It serves here as an example

    For every parameter a running average of the squared gradient is kept; the
    parameter is then moved against the gradient divided by the square root of
    that average.

    Args:
        params: iterable of tensors (or parameter-group dicts) to optimize.
        lr: step size of each update.
        alpha: decay factor of the running average of squared gradients.
        eps: constant added to the denominator before dividing.
    """
    def __init__(self, params, lr=1e-3, alpha=0.5, eps=1e-8):
        defaults = dict(lr=lr, alpha=alpha, eps=eps)
        super(RMSprop, self).__init__(params, defaults)

    @torch.no_grad()
    def step(self, closure=None):
        """Perform a single optimization step.

        Args:
            closure: optional callable that re-evaluates the model and returns
                the loss. It is run with gradients enabled before the update.

        Returns:
            The value returned by ``closure``, or None if no closure was given.
        """
        loss = None
        if closure is not None:
            # the closure typically calls backward(), so it needs autograd enabled
            with torch.enable_grad():
                loss = closure()

        for group in self.param_groups:
            lr = group['lr']
            alpha = group['alpha']
            eps = group['eps']

            for p in group['params']:
                # parameters that took no part in the last backward pass have
                # no gradient; leave them untouched instead of crashing
                if p.grad is None:
                    continue
                grad = p.grad
                state = self.state[p]

                # state initialization
                if len(state) == 0:
                    state['square_avg'] = torch.zeros_like(p)

                square_avg = state['square_avg']

                # update running averages (in place, so the state is updated too)
                square_avg.mul_(alpha).addcmul_(grad, grad, value=1 - alpha)
                avg = square_avg.sqrt().add_(eps)

                # gradient update: p <- p - lr * grad / avg
                p.addcdiv_(grad, avg, value=-lr)

        return loss

In [14]:
# define the neural network
def model(x, w_h, w_h2, w_o):
    """Fully connected network with two rectified hidden layers.

    Args:
        x: input batch; it is matrix-multiplied with ``w_h``.
        w_h: weights of the first hidden layer.
        w_h2: weights of the second hidden layer.
        w_o: weights of the output layer.

    Returns:
        The output-layer activations before any softmax is applied.
    """
    activation = x
    for hidden_weights in (w_h, w_h2):
        activation = rectify(activation @ hidden_weights)
    return activation @ w_o

In [15]:
# initialize weights

# layer sizes: input (B, 784) -> 625 hidden neurons -> 625 hidden neurons -> output (B, 10)
layer_shapes = [(784, 625), (625, 625), (625, 10)]
w_h, w_h2, w_o = [init_weights(layer_shape) for layer_shape in layer_shapes]

optimizer = RMSprop(params=[w_h, w_h2, w_o])
n_epochs = 100

# loss histories, filled in by the training loop
train_loss, test_loss = [], []

In [None]:
# Training loop: runs epochs 0..n_epochs inclusive (i.e. n_epochs + 1 passes over
# the training data) and evaluates on the test set every `test_every` epochs.
test_every = 10

# Start from empty histories so that re-running this cell cannot leave entries
# from a previous run in the lists that are plotted below.
train_loss = []
test_loss = []
test_epochs = []  # epochs at which a test loss was recorded

for epoch in range(n_epochs + 1):
    train_loss_this_epoch = []
    for x, y in train_dataloader:
        # our model requires flattened input; the batch dimension is taken from
        # the data itself so a final batch smaller than batch_size still works
        x = x.reshape(x.shape[0], -1)
        # feed input through model
        noise_py_x = model(x, w_h, w_h2, w_o)

        # reset the gradient
        optimizer.zero_grad()

        # the cross-entropy loss function already contains the softmax
        loss = cross_entropy(noise_py_x, y, reduction="mean")

        train_loss_this_epoch.append(float(loss))

        # compute the gradient
        loss.backward()
        # update weights
        optimizer.step()

    train_loss.append(np.mean(train_loss_this_epoch))

    # test periodically
    if epoch % test_every == 0:
        print(f"Epoch: {epoch}")
        print(f"Mean Train Loss: {train_loss[-1]:.2e}")
        test_loss_this_epoch = []

        # no need to compute gradients for validation
        with torch.no_grad():
            for x, y in test_dataloader:
                x = x.reshape(x.shape[0], -1)
                noise_py_x = model(x, w_h, w_h2, w_o)

                loss = cross_entropy(noise_py_x, y, reduction="mean")
                test_loss_this_epoch.append(float(loss))

        test_loss.append(np.mean(test_loss_this_epoch))
        test_epochs.append(epoch)

        print(f"Mean Test Loss:  {test_loss[-1]:.2e}")

# The x-axes are derived from what was actually recorded, so every test point is
# drawn at the epoch in which it was measured (0, 10, 20, ...).
plt.plot(np.arange(len(train_loss)), train_loss, label="Train")
plt.plot(test_epochs, test_loss, label="Test")
plt.title("Train and Test Loss over Training")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.legend()

Epoch: 0
Mean Train Loss: 3.96e-01
Mean Test Loss:  2.39e-01
Epoch: 10
Mean Train Loss: 1.45e-01
Mean Test Loss:  3.67e-01
Epoch: 20
Mean Train Loss: 1.00e-01
Mean Test Loss:  4.36e-01
Epoch: 30
Mean Train Loss: 6.88e-02
Mean Test Loss:  5.47e-01
Epoch: 40
Mean Train Loss: 4.71e-02
Mean Test Loss:  7.53e-01
Epoch: 50
Mean Train Loss: 4.30e-02
Mean Test Loss:  6.80e-01
Epoch: 60
Mean Train Loss: 3.35e-02
Mean Test Loss:  7.38e-01
Epoch: 70
Mean Train Loss: 2.02e-02
Mean Test Loss:  8.58e-01
Epoch: 80
Mean Train Loss: 1.84e-02
Mean Test Loss:  8.45e-01
Epoch: 90
Mean Train Loss: 1.31e-02
Mean Test Loss:  8.68e-01
Epoch: 100
Mean Train Loss: 8.88e-03
Mean Test Loss:  1.01e+00


<matplotlib.legend.Legend at 0x2381497a5b0>