In [7]:
import math
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.cuda as cuda # This import is for if you have an Nvidia GPU and run on your pc
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
from torch.utils.data import DataLoader
import time
from torchvision import datasets, transforms
import torch.optim as optim
from torch.optim.lr_scheduler import StepLR
from tqdm.notebook import tqdm
# import timm # This import provides a bunch of prebuilt image models you can use for experimentation/comparisons
import os
import matplotlib.pyplot as plt
%matplotlib inline
from PIL import Image

In [8]:
# Training Settings
# How many images we will process concurrently, try to use powers of 2, you may want to lower if you're getting the "out of memory" error
batch_size = 32
# The number of times we iterate over the entire dataset
epochs = 5
# Learning Rate: We may later want to utilize learning rate decay
lr = 3e-5
gamma = 0.7
# For reproducible results
seed = 8743749123

In [16]:
# First let's get our dataset

# Define the transformations
# Need to add more and play around with different transformations
# RandomCrop is probably a good data augmentation technique we'll want to use
transform = transforms.Compose([
    transforms.Resize((224,224))
    transforms.ToTensor(),
])

# Need to troubleshoot INaturalist dataset -- Loads the whole dataset over 100GB and overflows the colab disk
# Taxonomy structure: Domain: Eukaryota, Kingdom: Animalia, Phylum: Arthropoda, Class: Insecta, Order: Hymenoptera, Suborder: Apocrita, Superfamily: Apoidea, Epifamily: Anthophila
#dataset = datasets.INaturalist(root='./data', version='2021_train_mini', target_type="genus", transform=transforms.ToTensor(), download=True)
#dset_size = len(dataset)



# Load Data [Locally Stored]
cwd = os.getcwd()
path = "%s/../data/train/" %(cwd)

dataset = datasets.ImageFolder(root=path, transform=transform) # Automatically assigns labels based on sub-directory name

# # Generate 2 splits: Train (80%), Test (20%)
# # (No hyperparameter validation this time around)
dataset_size = len(dataset)
train_size = int(0.8 * dataset_size)
test_size = dataset_size - train_size

train, test = torch.utils.data.random_split(dataset, [train_size, test_size])

# # Create Data Loaders for splits
train_dl = DataLoader(train, batch_size=batch_size, shuffle=True)
test_dl = DataLoader(test, batch_size=batch_size, shuffle=False)

SyntaxError: invalid syntax (1277130155.py, line 8)

In [17]:
next(iter(train_dl))

RuntimeError: stack expects each tensor to be equal size, but got [3, 265, 500] at entry 0 and [3, 500, 500] at entry 1

In [1]:
# Our classes will be the tribe names
classes = [""]

In [None]:
def train_model(model, name):
    # Loss Function
    criterion = nn.CrossEntropyLoss()
    # Optimizer
    optimizer = optim.Adam(model.parameters(), lr=lr)
    # Scheduler
    scheduler = StepLR(optimizer, step_size=1, gamma=gamma)

    print("")
    print("++++++++++++++++++++++++++++++++++++++++")
    print(f"Training Run [Model: {name}]")
    print("++++++++++++++++++++++++++++++++++++++++")

    # Training Time
    start_event = cuda.Event(enable_timing=True)
    end_event = cuda.Event(enable_timing=True)
    # Begin Clock
    start_event.record()

    # Training Loop
    for epoch in range(epochs):
        epoch_loss = 0
        epoch_accuracy = 0

        for data, label in tqdm(bg_train_dl if background else no_bg_train_dl):
            data = data.to(device)
            label = label.to(device)

            output = model(data)
            loss = criterion(output, label)
            #training_loss[f"{name}"].append(loss)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            acc = (output.argmax(dim=1) == label).float().mean()
            #training_accuracy[f"{name}"].append(acc)
            epoch_accuracy += acc / len(bg_train_dl if background else no_bg_train_dl)
            epoch_loss += loss / len(bg_train_dl if background else no_bg_train_dl)

        print(f"Epoch: {epoch+1} - loss: {epoch_loss:.4f} - acc: {epoch_accuracy:.4f}")

    # End Clock
    end_event.record()
    cuda.synchronize() # Wait for GPU operations to complete
    time = start_event.elapsed_time(end_event) / 1000 # Convert to seconds
    num_examples = batch_size * len(bg_train_dl if background else no_bg_train_dl)
    time_per_example = time / (num_examples * epochs)
    print(f"It took {time} seconds to train {name} on {num_examples} examples over {epochs} epochs.")
    print(f"That averages to {time_per_example} seconds per example")

    print("++++++++++++++++++++++++++++++++++++++++")
    print(f"Test Run [Model: {name}] ")
    print("++++++++++++++++++++++++++++++++++++++++")
    accuracies = []
    batch_acc = 0
    for data, label in tqdm(bg_test_dl if background else no_bg_test_dl):
        data = data.to(device)
        label = label.to(device)
        output = model(data)
        acc = (output.argmax(dim=1) == label).float().mean().cpu().detach().numpy()
        batch_acc += acc / len(bg_test_dl if background else no_bg_test_dl)
        accuracies.append(batch_acc)

    print(f"Test Accuracy: {accuracies[-1]} - Number of test cases: {len(bg_test_dl if background else no_bg_test_dl) * batch_size}")


In [2]:
# Model 1: VanillaViT
from vit_pytorch import ViT

VanillaViT = ViT(
    image_size = 128,
    patch_size = 8,
    num_classes = 6,
    dim = 1024,
    depth = 6,
    heads = 16,
    mlp_dim = 2048,
    dropout = 0.1,
    emb_dropout = 0.1
).to(device)
train_model(VanillaViT, "VanillaViT")

print("")
print("-------------------------------------------------------------")
print(f"Attention Visualization")
print(f"[Model: name]")
print("-------------------------------------------------------------")
print("")
bg_imgs, no_bg_imgs = load_jpgs()
from vit_pytorch.recorder import Recorder
v = Recorder(VanillaViT)
"""
# Experimenting with attention weight visualization

for index, img in enumerate(bg_imgs):
    img.unsqueeze(0)
    preds, attns = v(img)
    attns
    attns.shape()
    plot_mats(img, attns, 'test', categories[index], 'test_plt')

NameError: name 'train_dset' is not defined

In [None]:
# Model 2: SimpleViT
from vit_pytorch import SimpleViT

SimpleViT = SimpleViT(
    image_size = 128,
    patch_size = 8,
    num_classes = 6,
    dim = 1024,
    depth = 6,
    heads = 16,
    mlp_dim = 2048
).to(device)
train_model(SimpleViT, "SimpleViT")

In [None]:
# Model 4: T2TViT
from vit_pytorch.t2t import T2TViT

T2TViT = T2TViT(
    dim = 512,
    image_size = 128,
    depth = 5,
    heads = 8,
    mlp_dim = 512,
    num_classes = 6,
    t2t_layers = ((7, 4), (3, 2), (3, 2)) # tuples of the kernel size and stride of  each consecutive layers of the initial token to token module
).to(device)
train_model(T2TViT, "T2TViT")

In [None]:
# Model 6: CrossViT
from vit_pytorch.cross_vit import CrossViT

CrossViT = CrossViT(
    image_size = 128,
    num_classes = 6,
    depth = 4,               # number of multi-scale encoding blocks
    sm_dim = 192,            # high res dimension
    sm_patch_size = 16,      # high res patch size (should be smaller than lg_patch_size)
    sm_enc_depth = 2,        # high res depth
    sm_enc_heads = 8,        # high res heads
    sm_enc_mlp_dim = 2048,   # high res feedforward dimension
    lg_dim = 384,            # low res dimension
    lg_patch_size = 64,      # low res patch size
    lg_enc_depth = 3,        # low res depth
    lg_enc_heads = 8,        # low res heads
    lg_enc_mlp_dim = 2048,   # low res feedforward dimensions
    cross_attn_depth = 2,    # cross attention rounds
    cross_attn_heads = 8,    # cross attention heads
    dropout = 0.1,
    emb_dropout = 0.1
).to(device)
train_model(CrossViT, "CrossViT")

NameError: name 'epochs' is not defined

In [None]:
# Model 7: PiT
from vit_pytorch.pit import PiT

PiT = PiT(
    image_size = 128,
    patch_size = 16,
    dim = 256,
    num_classes = 6,
    depth = (3, 3, 3),     # list of depths, indicating the number of rounds of each stage before a downsample
    heads = 16,
    mlp_dim = 2048,
    dropout = 0.1,
    emb_dropout = 0.1
).to(device)
train_model(PiT, "PiT")

In [None]:
# Model 8: LeViT
from vit_pytorch.levit import LeViT

LeViT = LeViT(
    image_size = 128,
    num_classes = 6,
    stages = 3,             # number of stages
    dim = (256, 384, 512),  # dimensions at each stage
    depth = 4,              # transformer of depth 4 at each stage
    heads = (4, 6, 8),      # heads at each stage
    mlp_mult = 2,
    dropout = 0.1
).to(device)
train_model(LeViT, "LeViT")

In [None]:


# Model 9: CvT
from vit_pytorch.cvt import CvT

CvT = CvT(
    num_classes = 6,
    s1_emb_dim = 64,        # stage 1 - dimension
    s1_emb_kernel = 7,      # stage 1 - conv kernel
    s1_emb_stride = 4,      # stage 1 - conv stride
    s1_proj_kernel = 3,     # stage 1 - attention ds-conv kernel size
    s1_kv_proj_stride = 2,  # stage 1 - attention key / value projection stride
    s1_heads = 1,           # stage 1 - heads
    s1_depth = 1,           # stage 1 - depth
    s1_mlp_mult = 4,        # stage 1 - feedforward expansion factor
    s2_emb_dim = 192,       # stage 2 - (same as above)
    s2_emb_kernel = 3,
    s2_emb_stride = 2,
    s2_proj_kernel = 3,
    s2_kv_proj_stride = 2,
    s2_heads = 3,
    s2_depth = 2,
    s2_mlp_mult = 4,
    s3_emb_dim = 384,       # stage 3 - (same as above)
    s3_emb_kernel = 3,
    s3_emb_stride = 2,
    s3_proj_kernel = 3,
    s3_kv_proj_stride = 2,
    s3_heads = 4,
    s3_depth = 10,
    s3_mlp_mult = 4,
    dropout = 0.
).to(device)
train_model(CvT, "CvT")

In [None]:
# Model 13: ScalableViT
from vit_pytorch.scalable_vit import ScalableViT

ScalableViT = ScalableViT(
    num_classes = 6,
    dim = 64,                               # starting model dimension. at every stage, dimension is doubled
    heads = (2, 4, 8, 16),                  # number of attention heads at each stage
    depth = (2, 2, 20, 2),                  # number of transformer blocks at each stage
    ssa_dim_key = (40, 40, 40, 32),         # the dimension of the attention keys (and queries) for SSA. in the paper, they represented this as a scale factor on the base dimension per key (ssa_dim_key / dim_key)
    reduction_factor = (8, 4, 2, 1),        # downsampling of the key / values in SSA. in the paper, this was represented as (reduction_factor ** -2)
    window_size = (32, 16, None, None),     # window size of the IWSA at each stage. None means no windowing needed
    dropout = 0.1,                          # attention and feedforward dropout
).to(device)
train_model(ScalableViT, "ScalableViT")

In [None]:
# Model 15: MobileViT
from vit_pytorch.mobile_vit import MobileViT

MobileViT = MobileViT(
    image_size = (128, 128),
    dims = [96, 120, 144],
    channels = [16, 32, 48, 48, 64, 64, 80, 80, 96, 96, 384],
    num_classes = 6
).to(device)
train_model(MobileViT, "MobileViT")

In [None]:
# Model 16: SmallDataViT
from vit_pytorch.vit_for_small_dataset import ViT as SmallDataViT

SmallDataViT = SmallDataViT(
    image_size = 128,
    patch_size = 16,
    num_classes = 6,
    dim = 1024,
    depth = 6,
    heads = 16,
    mlp_dim = 2048,
    dropout = 0.1,
    emb_dropout = 0.1
).to(device)
train_model(SmallDataViT, "SmallDataViT")