In [1]:
import sys
import torch
from torch import nn
from torchinfo import summary
from torch.utils.data import DataLoader
from torchvision import transforms, datasets

import os
from pathlib import Path
from tqdm.auto import tqdm
from timeit import default_timer as timer

In [4]:
BATCH_SIZE = 32
NUM_WORKERS = os.cpu_count()
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

In [2]:
simple_transform = transforms.Compose([
    transforms.Resize((64, 64)),
    transforms.ToTensor()
])

train_transform_trivial_augment = transforms.Compose([
    transforms.Resize((64, 64)),
    transforms.TrivialAugmentWide(num_magnitude_bins=31),
    transforms.ToTensor()
])

In [5]:
venv_dir = Path(sys.prefix)
project_root = venv_dir.parent
image_path = project_root/"data/pizza_steak_sushi"
train_dir = image_path / "train"
test_dir = image_path / "test"

train_data_simple = datasets.ImageFolder(root=train_dir, transform=simple_transform)
test_data_simple = datasets.ImageFolder(root=test_dir, transform=simple_transform)
train_data_augmented = datasets.ImageFolder(root=train_dir, transform=train_transform_trivial_augment)

train_dataloader_simple = DataLoader(
    dataset=train_data_simple,
    batch_size=BATCH_SIZE,
    num_workers=NUM_WORKERS,
    shuffle=True
)

test_dataloader_simple = DataLoader(
    dataset=test_data_simple,
    batch_size=BATCH_SIZE,
    num_workers=NUM_WORKERS,
    shuffle=False
)

train_dataloader_augmented = DataLoader(
    dataset=train_data_augmented,
    batch_size=BATCH_SIZE,
    num_workers=NUM_WORKERS,
    shuffle=True
)

In [24]:
class TinyVGG(nn.Module):
    def __init__(self, input_shape: int, hidden_units: int, output_shape: int) -> None:
        super().__init__()
        self.conv_block_1 = nn.Sequential(
            nn.Conv2d(
                in_channels=input_shape,
                out_channels=hidden_units,
                kernel_size=3,
                padding=1
            ),
            nn.ReLU(),
            nn.Conv2d(
                in_channels=hidden_units,
                out_channels=hidden_units,
                kernel_size=3,
                padding=1
            ),
            nn.ReLU(),
            nn.MaxPool2d(
                kernel_size=2
            )
        )
        self.conv_block_2 = nn.Sequential(
            nn.Conv2d(
                in_channels=hidden_units,
                out_channels=hidden_units,
                kernel_size=3,
                padding=1
            ),
            nn.ReLU(),
            nn.Conv2d(
                in_channels=hidden_units,
                out_channels=hidden_units,
                kernel_size=3,
                padding=1
            ),
            nn.ReLU(),
            nn.MaxPool2d(
                kernel_size=2
            )
        )
        self.classifier = nn.Sequential(
            nn.Flatten(),
            nn.Linear(in_features=hidden_units*16*16, out_features=output_shape)
        )
    
    def forward(self, x: torch.Tensor):
        # Apparently this leverages the benefits of operator fusion, just seemed smarter
        return self.classifier(self.conv_block_2(self.conv_block_1(x)))
    
modelV0 = TinyVGG(
    input_shape=3,
    hidden_units=10,
    output_shape=len(train_data_simple.classes)
).to(DEVICE)

summary(modelV0, input_size=[1, 3, 64, 64])

Layer (type:depth-idx)                   Output Shape              Param #
TinyVGG                                  [1, 3]                    --
├─Sequential: 1-1                        [1, 10, 32, 32]           --
│    └─Conv2d: 2-1                       [1, 10, 64, 64]           280
│    └─ReLU: 2-2                         [1, 10, 64, 64]           --
│    └─Conv2d: 2-3                       [1, 10, 64, 64]           910
│    └─ReLU: 2-4                         [1, 10, 64, 64]           --
│    └─MaxPool2d: 2-5                    [1, 10, 32, 32]           --
├─Sequential: 1-2                        [1, 10, 16, 16]           --
│    └─Conv2d: 2-6                       [1, 10, 32, 32]           910
│    └─ReLU: 2-7                         [1, 10, 32, 32]           --
│    └─Conv2d: 2-8                       [1, 10, 32, 32]           910
│    └─ReLU: 2-9                         [1, 10, 32, 32]           --
│    └─MaxPool2d: 2-10                   [1, 10, 16, 16]           --
├─Sequentia

In [25]:
def train_step(model: torch.nn.Module, dataloader: torch.utils.data.DataLoader, loss_fn: torch.nn.Module, optimizer: torch.optim.Optimizer):
    model.train()
    train_loss, train_acc = 0, 0

    for (X, y) in iter(dataloader):
        X, y = X.to(DEVICE), y.to(DEVICE)

        y_pred = model(X)

        loss = loss_fn(y_pred, y)
        train_loss += loss.item()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        y_pred_class = torch.argmax(torch.softmax(y_pred, dim=1), dim=1)
        train_acc += (y_pred_class == y).sum().item()/len(y_pred)

    train_acc /= len(dataloader)
    train_loss /= len(dataloader)
    return train_loss, train_acc

def test_step(model: torch.nn.Module, dataloader: torch.utils.data.DataLoader, loss_fn: torch.nn.Module):
    model.eval()
    test_loss, test_acc = 0, 0

    with torch.inference_mode():
        for (X, y) in iter(dataloader):
            X, y = X.to(DEVICE), y.to(DEVICE)

            y_pred = model(X)

            loss = loss_fn(y_pred, y)
            test_loss += loss.item()

            y_pred_class = torch.argmax(torch.softmax(y_pred, dim=1), dim=1)
            test_acc += (y_pred_class == y).sum().item()/len(y_pred)

    test_acc /= len(dataloader)
    test_loss /= len(dataloader)
    return test_loss, test_acc

def train(model: torch.nn.Module, 
          train_dataloader: torch.utils.data.DataLoader, test_dataloader: torch.utils.data.DataLoader, 
          loss_fn: torch.nn.Module, optimizer: torch.optim.Optimizer,
          epochs = 5):
    results = {
        "train_loss": [],
        "train_acc": [],
        "test_loss": [],
        "test_acc": [],
    }

    for epoch in tqdm(range(epochs)):
        train_loss, train_acc = train_step(model, train_dataloader, loss_fn, optimizer)
        test_loss, test_acc = test_step(model, test_dataloader, loss_fn)

        print(
            f"Epoch: {epoch+1} | "
            f"train_loss: {train_loss:.4f} | "
            f"train_acc: {train_acc:.4f} | "
            f"test_loss: {test_loss:.4f} | "
            f"test_acc: {test_acc:.4f}"
        )

        results["train_loss"].append(train_loss)
        results["train_acc"].append(train_acc)
        results["test_loss"].append(test_loss)
        results["test_acc"].append(test_acc)

    return results

In [26]:
modelV0 = TinyVGG(
    input_shape=3,
    hidden_units=10,
    output_shape=len(train_data_simple.classes)
).to(DEVICE)

NUM_EPOCHS = 6
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(params=modelV0.parameters(), lr=0.001)

start_time = timer()
modelV0_results = train(modelV0, train_dataloader_simple, test_dataloader_simple, loss_fn, optimizer, NUM_EPOCHS)
end_time = timer()
print(f"Total training time: {end_time-start_time:.3f} seconds")

  0%|          | 0/6 [00:00<?, ?it/s]

Epoch: 1 | train_loss: 1.1034 | train_acc: 0.2812 | test_loss: 1.0883 | test_acc: 0.5417
Epoch: 2 | train_loss: 1.0996 | train_acc: 0.2812 | test_loss: 1.0889 | test_acc: 0.5417
Epoch: 3 | train_loss: 1.1027 | train_acc: 0.2812 | test_loss: 1.0899 | test_acc: 0.5417
Epoch: 4 | train_loss: 1.0990 | train_acc: 0.2812 | test_loss: 1.0906 | test_acc: 0.5417
Epoch: 5 | train_loss: 1.0966 | train_acc: 0.4023 | test_loss: 1.0898 | test_acc: 0.5417
Epoch: 6 | train_loss: 1.1023 | train_acc: 0.2812 | test_loss: 1.0907 | test_acc: 0.5417
Total training time: 5.467 seconds


A model is overfitting when the training loss is far lower that the test loss, meaning that it's learning patterns that are too specific and not translating to the test data.
A common technique to prevent overfitting is regularization, we don't to make the model more regular, as in capable of dealing with more kinds of data.
There are many ways to go about this:
1) Get more data
2) Simplify the model
3) Use data augmentation to make the data harder to learn and also add variety artificially
4) Use transfer learning to start with a model already trained for a general task, like recognizing images, then tuning it for our specific task
5) Use dropout layers, that randomly remove connections between hidden layers, simplifying the network and making the remaining connections better
6) Use learning rate decay, slowly decreasing the learning rate the closer we get to convergence
7) Use early stopping to interrupt the training after a model's loss has stopped decreasing, effectively preventing overfitting

On the other hand, a model is underfitting if the training and test loss are too high. To increase the model predictive power:
1) Add more layers/units to the model
2) Tweak the learning rate
3) Use transfer learning
4) Train for longer
5) Use less regularization

There is a fine line between underfitting and overfitting, the model needs to learn the data but not too well. Transfer learning is one of the most powerful techniques when it comes to dealing with both underfitting and overfitting. Currently, our model seems to be underfitting, as the final training loss is really high.

In [28]:
modelV1 = TinyVGG(
    input_shape=3,
    hidden_units=10,
    output_shape=len(train_data_augmented.classes)
)

NUM_EPOCHS = 20
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(params=modelV1.parameters(), lr=0.001)

start_time = timer()
modelV1_results = train(modelV1, train_dataloader_augmented, test_dataloader_simple, loss_fn, optimizer, NUM_EPOCHS)
end_time = timer()
print(f"Total training time: {end_time-start_time:.3f} seconds")

  0%|          | 0/20 [00:00<?, ?it/s]

Epoch: 1 | train_loss: 1.0999 | train_acc: 0.2930 | test_loss: 1.1025 | test_acc: 0.1979
Epoch: 2 | train_loss: 1.1007 | train_acc: 0.2930 | test_loss: 1.1027 | test_acc: 0.1979
Epoch: 3 | train_loss: 1.1002 | train_acc: 0.2930 | test_loss: 1.1029 | test_acc: 0.1979
Epoch: 4 | train_loss: 1.0965 | train_acc: 0.4141 | test_loss: 1.1033 | test_acc: 0.1979
Epoch: 5 | train_loss: 1.0961 | train_acc: 0.4141 | test_loss: 1.1036 | test_acc: 0.1979
Epoch: 6 | train_loss: 1.1002 | train_acc: 0.2930 | test_loss: 1.1038 | test_acc: 0.1979
Epoch: 7 | train_loss: 1.0958 | train_acc: 0.4141 | test_loss: 1.1042 | test_acc: 0.1979
Epoch: 8 | train_loss: 1.0958 | train_acc: 0.4141 | test_loss: 1.1046 | test_acc: 0.1979
Epoch: 9 | train_loss: 1.1005 | train_acc: 0.2930 | test_loss: 1.1041 | test_acc: 0.1979
Epoch: 10 | train_loss: 1.1000 | train_acc: 0.2930 | test_loss: 1.1043 | test_acc: 0.1979
Epoch: 11 | train_loss: 1.1003 | train_acc: 0.2930 | test_loss: 1.1038 | test_acc: 0.1979
Epoch: 12 | train_l