In [19]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [20]:
! cd /kaggle/working/
! cp -r /kaggle/input/erav1-s17/S17 .

In [21]:
cd /kaggle/working/S17

/kaggle/working/S17


In [22]:
import matplotlib.pyplot as plt
import torch
import torchvision
import os
os.environ['KMP_DUPLICATE_LIB_OK']='True'
from torch import nn
from torchvision import transforms
import torch.nn.functional as F

# Try to get torchinfo, install it if it doesn't work
try:
    from torchinfo import summary
except:
    print("[INFO] Couldn't find torchinfo... installing it.")
    !pip install -q torchinfo
    from torchinfo import summary

from src.model import Transformer

In [23]:
# Dataset root (relative to the current working directory) and its two splits
image_path = "data/pizza_steak_sushi"
train_dir = f"{image_path}/train"
test_dir = f"{image_path}/test"

In [24]:
# Side length (in pixels) every image is resized to; the original notes
# attribute this value to Table 3 of the ViT paper.
IMG_SIZE = 224

# Hand-built preprocessing: resize to a square, then convert to a tensor
manual_transforms = transforms.Compose(
    [
        transforms.Resize(size=(IMG_SIZE, IMG_SIZE)),
        transforms.ToTensor(),
    ]
)
print(f"Manually created transforms: {manual_transforms}")

Manually created transforms: Compose(
    Resize(size=(224, 224), interpolation=bilinear, max_size=None, antialias=warn)
    ToTensor()
)


In [25]:
"""
Contains functionality for creating PyTorch DataLoaders for 
image classification data.
"""
import os

from torchvision import datasets, transforms
from torch.utils.data import DataLoader

# Default worker count: one per CPU core. os.cpu_count() returns None when the
# count cannot be determined, so fall back to 0 (load data in the main process).
NUM_WORKERS = os.cpu_count() or 0


def create_dataloaders(
    train_dir: str,
    test_dir: str,
    transform: transforms.Compose,
    batch_size: int,
    num_workers: int = NUM_WORKERS,
):
    """Creates training and testing DataLoaders.

    Takes in a training directory and testing directory path and turns
    them into PyTorch Datasets and then into PyTorch DataLoaders.

    Args:
        train_dir: Path to training directory.
        test_dir: Path to testing directory.
        transform: torchvision transforms to perform on training and testing
            data (the same pipeline is applied to both splits).
        batch_size: Number of samples per batch in each of the DataLoaders.
        num_workers: An integer for number of workers per DataLoader.
            ``None`` is treated as 0.

    Returns:
        A tuple of (train_dataloader, test_dataloader, class_names).
        Where class_names is a list of the target classes, taken from the
        training dataset.

    Example usage:
        train_dataloader, test_dataloader, class_names = create_dataloaders(
            train_dir="path/to/train_dir",
            test_dir="path/to/test_dir",
            transform=some_transform,
            batch_size=32,
            num_workers=4,
        )
    """
    # DataLoader needs an integer worker count; tolerate callers that forward
    # an undetermined os.cpu_count() result.
    if num_workers is None:
        num_workers = 0

    # Use ImageFolder to create dataset(s)
    train_data = datasets.ImageFolder(train_dir, transform=transform)
    test_data = datasets.ImageFolder(test_dir, transform=transform)

    # Get class names
    class_names = train_data.classes

    # Turn images into data loaders; only the training split is shuffled
    train_dataloader = DataLoader(
        train_data,
        batch_size=batch_size,
        shuffle=True,
        num_workers=num_workers,
        pin_memory=True,
    )
    test_dataloader = DataLoader(
        test_data,
        batch_size=batch_size,
        shuffle=False,
        num_workers=num_workers,
        pin_memory=True,
    )

    return train_dataloader, test_dataloader, class_names

In [26]:
# Batch size: lower than the ViT paper's because we're starting small
BATCH_SIZE = 32

# Build both loaders with the manually created transforms
train_dataloader, test_dataloader, class_names = create_dataloaders(
    train_dir,
    test_dir,
    transform=manual_transforms,
    batch_size=BATCH_SIZE,
)

# Show the two loaders and the discovered class names
train_dataloader, test_dataloader, class_names

(<torch.utils.data.dataloader.DataLoader at 0x78738c403a00>,
 <torch.utils.data.dataloader.DataLoader at 0x7872d8c84e50>,
 ['pizza', 'steak', 'sushi'])

In [27]:
# 1. Create a class which subclasses nn.Module
class PatchEmbedding(nn.Module):
    """Turns a 2D input image into a 1D sequence learnable embedding vector.

    The image is cut into non-overlapping ``patch_size`` x ``patch_size``
    patches by a strided convolution; each patch becomes one embedding vector.

    Args:
        in_channels (int): Number of color channels for the input images. Defaults to 3.
        patch_size (int): Size of patches to convert input image into. Defaults to 16.
        embedding_dim (int): Size of embedding to turn image into. Defaults to 768.
    """
    # 2. Initialize the class with appropriate variables
    def __init__(self,
                 in_channels:int=3,
                 patch_size:int=16,
                 embedding_dim:int=768):
        super().__init__()
        self.patch_size = patch_size

        # 3. Create a layer to turn an image into patches
        # (kernel == stride == patch_size, so patches do not overlap)
        self.patcher = nn.Conv2d(in_channels=in_channels,
                                 out_channels=embedding_dim,
                                 kernel_size=patch_size,
                                 stride=patch_size,
                                 padding=0)

        # 4. Create a layer to flatten the patch feature maps into a single dimension
        self.flatten = nn.Flatten(start_dim=2, # only flatten the feature map dimensions into a single vector
                                  end_dim=3)

    # 5. Define the forward method
    def forward(self, x):
        """Embeds a batch of images.

        Args:
            x: Tensor whose last two dimensions are (height, width); both must
                be multiples of ``patch_size``.

        Returns:
            Tensor of shape (batch_size, num_patches, embedding_dim).

        Raises:
            AssertionError: If height or width is not divisible by ``patch_size``.
        """
        # Check BOTH spatial dimensions: a non-divisible height would otherwise
        # be silently truncated by the strided convolution.
        height, width = x.shape[-2], x.shape[-1]
        assert height % self.patch_size == 0 and width % self.patch_size == 0, (
            f"Input image size must be divisible by patch size, "
            f"image shape: {height}x{width}, patch size: {self.patch_size}"
        )

        # Perform the forward pass
        x_patched = self.patcher(x)
        x_flattened = self.flatten(x_patched)
        # 6. Make sure the output shape has the right order:
        # [batch_size, embedding_dim, num_patches] -> [batch_size, num_patches, embedding_dim]
        return x_flattened.permute(0, 2, 1)

In [28]:
# Candidate input shapes for the summary
random_input_image_error = (1, 3, 250, 250)  # will error because image size is incompatible with patch_size
random_input_image = (1, 3, 224, 224)

# Summarise the inputs and outputs of a default PatchEmbedding
summary(
    PatchEmbedding(),
    input_size=random_input_image,  # try swapping this for "random_input_image_error"
    col_names=["input_size", "output_size", "num_params", "trainable"],
    col_width=20,
    row_settings=["var_names"],
)

Layer (type (var_name))                  Input Shape          Output Shape         Param #              Trainable
PatchEmbedding (PatchEmbedding)          [1, 3, 224, 224]     [1, 196, 768]        --                   True
├─Conv2d (patcher)                       [1, 3, 224, 224]     [1, 768, 14, 14]     590,592              True
├─Flatten (flatten)                      [1, 768, 14, 14]     [1, 768, 196]        --                   --
Total params: 590,592
Trainable params: 590,592
Non-trainable params: 0
Total mult-adds (M): 115.76
Input size (MB): 0.60
Forward/backward pass size (MB): 1.20
Params size (MB): 2.36
Estimated Total Size (MB): 4.17

In [29]:
# Transformer backbone from src.model. The keyword meanings are defined in that
# module, which is not shown in this notebook.
model = Transformer(
    # Architecture
    num_layers=12,
    embed_dim=768,
    num_heads=12,
    mlp_dim=3072,
    mlp_activation=nn.GELU(),
    # Dropout rates
    attn_dropout=0,
    mlp_dropout=0.1,
    # Sequence / embedding options
    # (224 // 16) ** 2 = 196, plus 1 — presumably the extra slot is for the
    # class token enabled by add_cls_token=True; confirm against src.model.
    max_seq_len=(224 // 16) ** 2 + 1,
    add_cls_token=True,
    pe_requires_grad=True,
    need_embedding=False,
    embed_dict_size=None,
    pad_idx=None,
)

In [30]:
# Echo the model so the notebook displays its module tree
model

Transformer(
  (mlp_activation): GELU(approximate='none')
  (pos_embed_layer): PositionalEmbedding()
  (cls_embed_layer): Embedding(1, 768)
  (transformer_blocks): Sequential(
    (0): TransformerBlock(
      (mha_block): MultiheadSelfAttentionBlock(
        (layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (multihead_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
        )
      )
      (mlp_block): MultiLayerPerceptronBlock(
        (layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU(approximate='none')
          (2): Dropout(p=0.1, inplace=False)
          (3): Linear(in_features=3072, out_features=768, bias=True)
          (4): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (1): TransformerBlock(
      (mha_block): MultiheadSelfAttention

In [31]:
# Per-layer parameter counts for the transformer backbone
summary(model)

Layer (type:depth-idx)                                            Param #
Transformer                                                       --
├─GELU: 1-1                                                       --
├─PositionalEmbedding: 1-2                                        --
├─Embedding: 1-3                                                  768
├─Sequential: 1-4                                                 --
│    └─TransformerBlock: 2-1                                      --
│    │    └─MultiheadSelfAttentionBlock: 3-1                      2,363,904
│    │    └─MultiLayerPerceptronBlock: 3-2                        4,723,968
│    └─TransformerBlock: 2-2                                      --
│    │    └─MultiheadSelfAttentionBlock: 3-3                      2,363,904
│    │    └─MultiLayerPerceptronBlock: 3-4                        4,723,968
│    └─TransformerBlock: 2-3                                      --
│    │    └─MultiheadSelfAttentionBlock: 3-5                      2,3

In [32]:
# NOTE(review): this cell repeats the PatchEmbedding summary already shown
# earlier in the notebook (same inputs, same output).
random_input_image = (1, 3, 224, 224)
# will error because image size is incompatible with patch_size
random_input_image_error = (1, 3, 250, 250)

# Get a summary of the input and outputs of PatchEmbedding
summary(PatchEmbedding(),
        # try swapping this for "random_input_image_error"
        input_size=random_input_image,
        row_settings=["var_names"],
        col_names=["input_size", "output_size", "num_params", "trainable"],
        col_width=20)

Layer (type (var_name))                  Input Shape          Output Shape         Param #              Trainable
PatchEmbedding (PatchEmbedding)          [1, 3, 224, 224]     [1, 196, 768]        --                   True
├─Conv2d (patcher)                       [1, 3, 224, 224]     [1, 768, 14, 14]     590,592              True
├─Flatten (flatten)                      [1, 768, 14, 14]     [1, 768, 196]        --                   --
Total params: 590,592
Trainable params: 590,592
Non-trainable params: 0
Total mult-adds (M): 115.76
Input size (MB): 0.60
Forward/backward pass size (MB): 1.20
Params size (MB): 2.36
Estimated Total Size (MB): 4.17

In [33]:
class VIT(nn.Module):
    """Image classifier: patch embedding -> sequence model -> linear head.

    Args:
        model: Module invoked as ``model(x, attn_mask=None)`` on the patch
            embeddings. Its output is indexed with ``[:, 0]``, so it must
            return a tensor with at least two dimensions.
        embed_dim (int): Embedding size produced by the patch embedding and
            consumed by the classification head.
        num_classes (int): Number of output logits.
        in_channels (int): Number of color channels of the input images.
            Defaults to 3.
        patch_size (int): Side length of each square patch. Defaults to 16.
    """

    def __init__(self, model, embed_dim, num_classes, in_channels=3, patch_size=16):
        super().__init__()
        self.patch_embedding = PatchEmbedding(in_channels=in_channels,
                                              patch_size=patch_size,
                                              embedding_dim=embed_dim)
        self.model = model
        self.linear = nn.Linear(embed_dim, num_classes)

    def forward(self, x):
        """Returns class logits of shape (batch_size, num_classes)."""
        # Images -> sequence of patch embeddings
        x = self.patch_embedding(x)

        # No attention mask is passed to the sequence model
        x = self.model(x, attn_mask=None)

        # Classify from sequence position 0.
        # NOTE(review): presumably this is the class token prepended by the
        # backbone — confirm against src.model.
        x = self.linear(x[:, 0])
        return x

In [34]:
# Wrap the transformer backbone in the classifier: one logit per class name
vit = VIT(
    model,
    embed_dim=768,
    num_classes=len(class_names),
)

In [35]:
# Per-layer parameter counts for the full classifier (patch embedding + backbone + head)
summary(vit)

Layer (type:depth-idx)                                                 Param #
VIT                                                                    --
├─PatchEmbedding: 1-1                                                  --
│    └─Conv2d: 2-1                                                     590,592
│    └─Flatten: 2-2                                                    --
├─Transformer: 1-2                                                     --
│    └─GELU: 2-3                                                       --
│    └─PositionalEmbedding: 2-4                                        --
│    └─Embedding: 2-5                                                  768
│    └─Sequential: 2-6                                                 --
│    │    └─TransformerBlock: 3-1                                      7,087,872
│    │    └─TransformerBlock: 3-2                                      7,087,872
│    │    └─TransformerBlock: 3-3                                      7,087,872
│    │

In [36]:
from super_repo import data_setup, engine, utils
# Optimizer over every parameter of the ViT classifier. The original notes
# attribute these hyperparameters to the ViT paper (Table 3 and section 4.1).
# NOTE(review): torch.optim.Adam treats weight_decay as an L2 penalty, not
# decoupled weight decay — confirm Adam (rather than AdamW) is intended.
# NOTE(review): confirm lr=3e-4 against the paper table the notes refer to.
optimizer = torch.optim.Adam(
    vit.parameters(),
    lr=3e-4,
    betas=(0.9, 0.999),  # the library defaults, stated explicitly
    weight_decay=0.3,
)

# Loss for multi-class classification
loss_fn = torch.nn.CrossEntropyLoss()

# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Run training and keep whatever engine.train returns
results = engine.train(
    model=vit,
    train_dataloader=train_dataloader,
    test_dataloader=test_dataloader,
    optimizer=optimizer,
    loss_fn=loss_fn,
    epochs=100,
    device=device,
)

  0%|          | 0/100 [00:00<?, ?it/s]

Epoch: 1 | train_loss: 3.4481 | train_acc: 0.2539 | test_loss: 1.2632 | test_acc: 0.2604
Epoch: 2 | train_loss: 1.5748 | train_acc: 0.2852 | test_loss: 1.0242 | test_acc: 0.5417
Epoch: 3 | train_loss: 1.5433 | train_acc: 0.2812 | test_loss: 1.0469 | test_acc: 0.5417
Epoch: 4 | train_loss: 1.5354 | train_acc: 0.2656 | test_loss: 1.1309 | test_acc: 0.2604
Epoch: 5 | train_loss: 1.2837 | train_acc: 0.2930 | test_loss: 1.5298 | test_acc: 0.1979
Epoch: 6 | train_loss: 1.2606 | train_acc: 0.4570 | test_loss: 1.1664 | test_acc: 0.5417
Epoch: 7 | train_loss: 1.2223 | train_acc: 0.4453 | test_loss: 1.2570 | test_acc: 0.2604
Epoch: 8 | train_loss: 1.3074 | train_acc: 0.2773 | test_loss: 1.3663 | test_acc: 0.1979
Epoch: 9 | train_loss: 1.1170 | train_acc: 0.3828 | test_loss: 1.0790 | test_acc: 0.5417
Epoch: 10 | train_loss: 1.2714 | train_acc: 0.4141 | test_loss: 1.8607 | test_acc: 0.2604
Epoch: 11 | train_loss: 1.4248 | train_acc: 0.2578 | test_loss: 1.3159 | test_acc: 0.1979
Epoch: 12 | train_l

KeyboardInterrupt: 