In [1]:
import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets
from torchvision.transforms import ToTensor

In [2]:
# Load the two Fashion-MNIST splits, storing the files under ./data.
# ToTensor() is applied to every image when a sample is read from the dataset.
training_data = datasets.FashionMNIST(root="data", train=True, download=True, transform=ToTensor())

test_data = datasets.FashionMNIST(root="data", train=False, download=True, transform=ToTensor())


In [3]:
batch_size = 64

# A DataLoader wraps a dataset in an iterable that yields (images, labels) mini-batches.
# The training loader is shuffled so every epoch presents the samples in a new order,
# which keeps SGD from seeing the exact same sequence of batches each epoch.
# The test loader keeps the dataset order: evaluation results do not depend on it.
train_dataloader = DataLoader(training_data, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(test_data, batch_size=batch_size)

# X is a batch of images stored as a rank-4 tensor with dimensions [N, C, H, W]:
#   N = batch size (number of images in the batch)
#   C = number of channels (1 for grayscale images, 3 for colour images)
#   H = height of the image
#   W = width of the image
# y holds the class label of each image in the batch.

# Look at the first test batch only, to confirm the shapes and dtypes.
for X, y in test_dataloader:
    print(f"Shape of X [N, C, H, W]: {X.shape}")
    print(f"Shape of y: {y.shape} {y.dtype}")
    print(y)
    break


Shape of X [N, C, H, W]: torch.Size([64, 1, 28, 28])
Shape of y: torch.Size([64]) torch.int64
tensor([9, 2, 1, 1, 6, 1, 4, 6, 5, 7, 4, 5, 7, 3, 4, 1, 2, 4, 8, 0, 2, 5, 7, 9,
        1, 4, 6, 0, 9, 3, 8, 8, 3, 3, 8, 0, 7, 5, 7, 9, 6, 1, 3, 7, 6, 7, 2, 1,
        2, 2, 4, 4, 5, 8, 2, 2, 8, 4, 8, 0, 7, 7, 8, 5])


In [4]:
# essentially you can choose a device where you want to run the model
# GPUs allow for parallelisation because of multi threading
# Therefore machine learning tasks can be given to GPUs to do usually
# However, if the GPU is not available we use CPU

device = torch.accelerator.current_accelerator().type if torch.accelerator.is_available() else "cpu"

# the model is defined by subclassing nn.Module

class NeuralNetwork(nn.Module):
    """Fully connected classifier: Flatten -> Linear/ReLU -> Linear/ReLU -> Linear.

    The default arguments reproduce the original Fashion-MNIST network
    (784 -> 512 -> 512 -> 10), so ``NeuralNetwork()`` behaves as before.

    Args:
        in_features: number of values in one flattened sample
            (28*28 for a single-channel 28x28 image).
        hidden_features: width of both hidden layers.
        num_classes: number of output logits, one per class.
    """

    def __init__(self, in_features=28 * 28, hidden_features=512, num_classes=10):
        super().__init__()
        # Flatten keeps the batch dimension and collapses all remaining ones.
        self.flatten = nn.Flatten()
        # Sequential stack of linear layers with ReLU activations in between;
        # the last layer has no activation because it outputs raw logits.
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(in_features, hidden_features),
            nn.ReLU(),
            nn.Linear(hidden_features, hidden_features),
            nn.ReLU(),
            nn.Linear(hidden_features, num_classes),
        )

    def forward(self, x):
        """Return the logits for a batch ``x`` whose first dimension is the batch.

        Logits are the raw, unnormalised outputs of the network. They are not
        probabilities, but the larger (more positive) a logit is, the more
        likely its class. For a 3-class problem, logits of [2.5, -1.6, 1]
        mean the first class is the most likely one.

        Probabilities are obtained by normalising with softmax:

            softmax(z)_i = exp(z_i) / sum_j exp(z_j)

        Exponentiation maps every logit (including negative ones) to a
        positive value, and dividing by the sum over all classes scales the
        results so they lie between 0 and 1 and add up to 1.

        The softmax is needed for training, but only the logits are returned
        here: nn.CrossEntropyLoss applies the normalisation internally.
        """
        x = self.flatten(x)
        logits = self.linear_relu_stack(x)
        return logits

# Build the network, move its parameters to the selected device, then show its layers.
model = NeuralNetwork()
model = model.to(device)
print(model)

NeuralNetwork(
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (linear_relu_stack): Sequential(
    (0): Linear(in_features=784, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=512, bias=True)
    (3): ReLU()
    (4): Linear(in_features=512, out_features=10, bias=True)
  )
)


In [5]:
# Cross-entropy is a way of comparing two probability distributions.
# Entropy measures uncertainty: low entropy means predictable, high entropy means close to random.
# Here the true distribution p is a one-hot vector such as [0, 0, 1]
# (1 at the correct class, 0 everywhere else), and the predicted
# distribution q is softmax(logits) for the image.
# Cross-entropy: H(p, q) = -sum_i p(i) * log(q(i)), summed over all classes i.
# log(q) is negative for any q between 0 and 1, so the leading minus sign makes the loss positive.
#
# Benefit: confidently wrong guesses are penalised heavily - a very small q for the
# true class gives a large loss, which in turn drives a large weight correction.

loss_fn = nn.CrossEntropyLoss()

# Stochastic gradient descent: steps the weights against the gradient of the loss.
optimizer = torch.optim.SGD(params=model.parameters(), lr=1e-3)

In [6]:
def train(model, dataloader, loss_fn, optimizer):
    """Run one pass over ``dataloader``, updating ``model`` after every batch.

    Batches are moved to the module-level ``device`` before the forward pass.
    The loss of every 100th batch is printed together with a running sample count.

    Args:
        model: network to optimise; it is switched to training mode.
        dataloader: iterable of (inputs, targets) batches; its ``dataset``
            length is used as the total in the progress line.
        loss_fn: callable taking (predictions, targets) and returning a loss tensor.
        optimizer: optimizer holding the parameters of ``model``.
    """
    num_samples = len(dataloader.dataset)
    model.train()

    for step, (inputs, targets) in enumerate(dataloader):
        inputs = inputs.to(device)
        targets = targets.to(device)

        # Forward pass: predictions, then the loss against the true labels.
        outputs = model(inputs)
        batch_loss = loss_fn(outputs, targets)

        batch_loss.backward()  # backpropagation: gradients w.r.t. all weights
        optimizer.step()       # apply the gradient update to the weights
        optimizer.zero_grad()  # clear gradients so they do not accumulate into the next batch

        if step % 100 != 0:
            continue

        loss_value = batch_loss.item()  # plain Python float of the loss tensor
        seen = (step + 1) * len(inputs)
        print(f"loss: {loss_value:>7f} [{seen:>5d}/{num_samples:>5d}]")

In [7]:
def test(dataloader, model, loss_fn):
    """Evaluate ``model`` on every batch of ``dataloader`` and print the results.

    Prints the accuracy (correct predictions / dataset size) and the loss
    averaged over the number of batches. Batches are moved to the
    module-level ``device``; no gradients are tracked.

    Args:
        dataloader: iterable of (inputs, targets) batches.
        model: network to evaluate; it is switched to evaluation mode.
        loss_fn: callable taking (predictions, targets) and returning a loss tensor.
    """
    num_samples = len(dataloader.dataset)
    batch_count = len(dataloader)
    model.eval()
    loss_total = 0
    hits = 0

    with torch.no_grad():
        for inputs, targets in dataloader:
            inputs, targets = inputs.to(device), targets.to(device)
            logits = model(inputs)

            # Accumulate the loss of every batch so it can be averaged afterwards.
            loss_total += loss_fn(logits, targets).item()

            # argmax over dimension 1 picks the highest-logit class for each sample.
            predicted_classes = logits.argmax(1)
            # Comparing with the labels gives a boolean tensor; casting to float
            # turns True into 1.0 and False into 0.0 so the matches can be summed.
            matches = (predicted_classes == targets).type(torch.float)
            # .item() converts the summed tensor into a plain Python number.
            hits += matches.sum().item()

    mean_loss = loss_total / batch_count  # average loss per batch
    accuracy = hits / num_samples         # fraction of correctly classified samples

    print(f"Test Error: \n Accuracy: {(100*accuracy):>0.1f}%, Avg loss: {mean_loss:>8f} \n")

In [None]:
epochs = 30
for epoch_number in range(1, epochs + 1):
    print(f"Epoch {epoch_number}\n--------------")
    # One full pass of weight updates, followed by an evaluation on the test split.
    train(model, train_dataloader, loss_fn, optimizer)
    test(test_dataloader, model, loss_fn)
print("Done!")

Epoch 1
--------------
loss: 1.152426 [   64/60000]
loss: 1.148739 [ 6464/60000]
loss: 0.963525 [12864/60000]
loss: 1.107605 [19264/60000]
loss: 0.975491 [25664/60000]
loss: 1.008418 [32064/60000]
loss: 1.046919 [38464/60000]
loss: 0.993784 [44864/60000]
loss: 1.031559 [51264/60000]
loss: 0.972891 [57664/60000]
Test Error: 
 Accuracy: 65.9%, Avg loss: 0.974498 

Epoch 2
--------------
loss: 1.037228 [   64/60000]
loss: 1.054946 [ 6464/60000]
loss: 0.853713 [12864/60000]
loss: 1.021217 [19264/60000]
loss: 0.894258 [25664/60000]
loss: 0.917217 [32064/60000]
loss: 0.973438 [38464/60000]
loss: 0.923854 [44864/60000]
loss: 0.956373 [51264/60000]
loss: 0.910694 [57664/60000]
Test Error: 
 Accuracy: 67.2%, Avg loss: 0.905968 

Epoch 3
--------------
loss: 0.953907 [   64/60000]
loss: 0.990146 [ 6464/60000]
loss: 0.775420 [12864/60000]
loss: 0.960174 [19264/60000]
loss: 0.839879 [25664/60000]
loss: 0.850401 [32064/60000]
loss: 0.921674 [38464/60000]
loss: 0.877234 [44864/60000]
loss: 0.902476 

In [30]:
# Human-readable name for each label index.
classes = [
    "T-shirt/top",
    "Trouser",
    "Pullover",
    "Dress",
    "Coat",
    "Sandal",
    "Shirt",
    "Sneaker",
    "Bag",
    "Ankle boot",
]

# Score the trained model on every test image, one sample at a time.
total = 0
correct_count = 0

model.eval()
with torch.no_grad():
    for i in range(len(test_data)):
        # Index the dataset once: every lookup re-applies the transform.
        image, label = test_data[i]

        # The model expects a batch, so add an explicit leading batch dimension
        # ([C, H, W] -> [1, C, H, W]) instead of relying on the single channel
        # being treated as the batch axis by nn.Flatten.
        logits = model(image.unsqueeze(0).to(device))

        # logits has one row per image in the batch; row 0 is this image.
        # argmax(0) returns the position of the largest logit, i.e. the
        # predicted class index, and .item() turns it into a Python int.
        predicted_index = logits[0].argmax(0).item()
        predicted, actual = classes[predicted_index], classes[label]

        total += 1
        if predicted == actual:
            correct_count += 1

# Guard against an empty dataset so the summary never divides by zero.
accuracy = correct_count / total * 100 if total else 0.0
print(f"Total Images: {total} / Right guesses: {correct_count} / Wrong guesses: {total-correct_count}\nAccuracy: {accuracy}%")


Total Images: 10000 / Right guesses: 7813 / Wrong guesses: 2187
Accuracy: 78.13%
