In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
from torch.utils.data.dataset import Dataset
from torch.utils.data import DataLoader as dataloader
import torchvision.models as models

import time
import os
import random
import numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import trange, tqdm
from PIL import Image, ImageOps
import copy
import pandas as pd

from Trainer import ModelTrainer
from Datasets import CUB200

In [None]:
# torchvision versions >= 0.15 have v2 transforms that work on both the images and the bounding boxes
# https://pytorch.org/blog/extending-torchvisions-transforms-to-object-detection-segmentation-and-video-tasks/
# It's currently in beta so you'll get a warning if you try to use it!

# import torchvision.transforms.v2 as transforms

In [None]:
# Number of samples in each mini-batch
batch_size = 64

# Number of full passes (epochs) over the training set
num_epochs = 30

# Optimizer learning rate
learning_rate = 1e-4

# Root directory the dataset is loaded from.
# Set the CUB200_ROOT environment variable to point at your own copy of the data;
# the original author's local path is kept as the fallback so existing setups keep working.
# data_set_root = "../../datasets"
data_set_root = os.environ.get("CUB200_ROOT", "/media/luke/Quick Storage/Data/cub_200")

# Target image size (in pixels) handed to the resize transform and the dataset
image_size = 128

In [None]:
# Flag forwarded to ModelTrainer as start_from_checkpoint
# NOTE(review): presumably resumes from a previously saved checkpoint - confirm in Trainer.py
start_from_checkpoint = True

# Directory and base name forwarded to ModelTrainer as save_dir / model_name
save_dir = '../data/Models'
model_name = 'ResNet18_CUB'

In [None]:
# Pick the compute device: the GPU with index gpu_indx when CUDA is available,
# otherwise fall back to the CPU
gpu_indx = 0
if torch.cuda.is_available():
    device = torch.device(gpu_indx)
else:
    device = torch.device('cpu')

# Data Augmentation Transform
After training ResNet with no augmentations, record the results, then implement the data augmentation. <br>
With a small dataset, our large model will more than likely simply overfit to (or memorize) the training data, which will often lead to bad evaluation results.<br>
We can "create more" data from our limited dataset by applying random transformations as we sample images from our dataset instead of simply resizing them.<br>
By applying these transformations we are also forcing our model to generalise better to unseen images.<br>
You can also apply random affine transformations (shifts, scaling, rotations etc.) - see the <a href="https://pytorch.org/vision/0.12/">PyTorch documentation</a>.<br>
NOTE: you should only apply transforms that make sense, e.g. if at test time you'll never see an upside-down cat, don't flip your images vertically.


In [None]:
# Evaluation pipeline: resize, convert to a tensor, then normalise each channel
# with the given mean / std
transform = transforms.Compose([
    transforms.Resize(image_size),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Training pipeline: currently the same steps as the evaluation pipeline.
# The random augmentations stay disabled unless the v2 transforms are used, because they
# must move the bounding boxes together with the image (the dataset class needs changes too!)
train_transform = transforms.Compose([
    transforms.Resize(image_size),
    transforms.ToTensor(),
    # transforms.RandomHorizontalFlip(),
    # transforms.RandomRotation(10),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

In [None]:
class BboxIOU(nn.Module):
    """Intersection-over-Union (IoU) metric for boxes stored as (x, y, width, height).

    ``forward`` applies a sigmoid to the first four prediction columns and
    compares the result against the target boxes found in ``data[1]``.
    """

    # Lower bound applied to the union area so two zero-area boxes give 0 instead of NaN
    eps = 1e-7

    def xyhw_to_xyxy(self, bbox):
        """Convert boxes from (x, y, width, height) to corners (x1, y1, x2, y2).

        Column 2 is added to column 0 and column 3 to column 1. The conversion
        is done on a copy so the caller's tensor (e.g. the targets of the
        current batch) is never modified.

        Args:
            bbox: tensor of shape (N, >=4).

        Returns:
            A new tensor of the same shape with columns 2 and 3 converted.
        """
        bbox = bbox.clone()
        bbox[:, 2] += bbox[:, 0]
        bbox[:, 3] += bbox[:, 1]
        return bbox

    def bb_intersection_over_union(self, pred_bbox, target_bbox):
        """Compute the per-sample IoU of two batches of (x, y, width, height) boxes.

        Args:
            pred_bbox: tensor of shape (N, 4) with the predicted boxes.
            target_bbox: tensor of shape (N, 4) with the ground-truth boxes.

        Returns:
            Tensor of shape (N, 1) holding the IoU of each box pair.
        """
        pred_bbox = self.xyhw_to_xyxy(pred_bbox)
        target_bbox = self.xyhw_to_xyxy(target_bbox)

        # Corners of the intersection rectangle
        xA = torch.max(pred_bbox[:, 0:1], target_bbox[:, 0:1])
        yA = torch.max(pred_bbox[:, 1:2], target_bbox[:, 1:2])
        xB = torch.min(pred_bbox[:, 2:3], target_bbox[:, 2:3])
        yB = torch.min(pred_bbox[:, 3:4], target_bbox[:, 3:4])

        # When the boxes do not overlap along an axis, xB - xA (or yB - yA) is negative.
        # Clamp at zero so disjoint boxes contribute no intersection area
        # (taking the absolute value would invent a positive overlap).
        inter_w = (xB - xA).clamp(min=0)
        inter_h = (yB - yA).clamp(min=0)
        interArea = inter_w * inter_h

        w1 = (pred_bbox[:, 0:1] - pred_bbox[:, 2:3]).abs()
        h1 = (pred_bbox[:, 1:2] - pred_bbox[:, 3:4]).abs()

        w2 = (target_bbox[:, 0:1] - target_bbox[:, 2:3]).abs()
        h2 = (target_bbox[:, 1:2] - target_bbox[:, 3:4]).abs()

        area1 = w1 * h1
        area2 = w2 * h2

        # IoU = intersection / (area of prediction + area of target - intersection).
        # The union is bounded below to avoid a 0 / 0 division for degenerate boxes.
        union = (area1 + area2 - interArea).clamp(min=self.eps)
        return interArea / union

    def forward(self, predictions, data):
        """Return the IoU between sigmoid-activated predictions and the targets.

        Args:
            predictions: tensor whose first four columns are the box logits.
            data: indexable batch; ``data[1]`` must hold the target boxes.

        Returns:
            Tensor of shape (N, 1) with one IoU value per sample.
        """
        pred_bbox = torch.sigmoid(predictions[:, :4])
        target_bbox = data[1].to(pred_bbox.device)

        return self.bb_intersection_over_union(pred_bbox, target_bbox)

# Create the training, testing and validation data 

In [None]:
# Build the CUB200 datasets from data_set_root.
# test_train=0 is used for the training set and test_train=1 for the test set;
# the training set gets train_transform, the test set gets the plain transform.
train_data = CUB200(data_set_root, transform=train_transform,
                    image_size=image_size, test_train=0)
test_data = CUB200(data_set_root, transform=transform,
                   image_size=image_size, test_train=1)

# Fraction of the original training set kept for training;
# the remainder (10%) becomes the validation set
validation_split = 0.9

n_train_examples = int(len(train_data)*validation_split)
n_valid_examples = len(train_data) - n_train_examples

# A fixed seed keeps the train/validation split identical between runs.
# Note that the validation subset is carved out of train_data, so it is served
# through train_transform as well.
split_generator = torch.Generator().manual_seed(42)
train_data, valid_data = torch.utils.data.random_split(
    train_data,
    [n_train_examples, n_valid_examples],
    generator=split_generator,
)

# Create the Pretrained Network

In [None]:
# Create an instance of the ResNet18 model, requesting pretrained weights
# NOTE(review): newer torchvision releases deprecate `pretrained=True` in favour of the
# `weights=` argument - confirm against the installed torchvision version
res_net = models.resnet18(pretrained=True)

In [None]:
# Wrap the network in the project's ModelTrainer (defined in Trainer.py).
# output_size=4 matches the four bounding-box values that BboxIOU reads from the predictions.
# NOTE(review): presumably the trainer swaps the network's final layer for one with
# output_size outputs - confirm in Trainer.py
# NOTE(review): the keyword is spelled `elav_metric` here; it has to match the
# ModelTrainer signature - TODO confirm
model_trainer = ModelTrainer(model=res_net.to(device), output_size=4, device=device, 
                             loss_fun=nn.BCEWithLogitsLoss(), batch_size=batch_size, 
                             learning_rate=learning_rate, save_dir=save_dir, model_name=model_name,
                             elav_metric=BboxIOU(), start_from_checkpoint=start_from_checkpoint)

In [None]:
# Hand the train / test / validation datasets to the trainer
# NOTE(review): presumably this is where the data loaders (e.g. test_loader) are built - confirm in Trainer.py
model_trainer.set_data(train_set=train_data, test_set=test_data, val_set=valid_data)

# Set a Learning Rate Scheduler
We can dynamically change the <a href="https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate">learning rate</a> during training to help our model converge to a better minimum!

In [None]:
# StepLR with step_size=1 and gamma=0.95 multiplies the learning rate by 0.95
# on every scheduler step
# NOTE(review): presumably the trainer steps the scheduler once per epoch - confirm in Trainer.py
model_trainer.set_lr_schedule(optim.lr_scheduler.StepLR(model_trainer.optimizer, 
                                                        step_size=1, 
                                                        gamma=0.95))

# View Data

In [None]:
# Show the first 16 images of one test batch as a grid
plt.figure(figsize=(20, 10))

# Each batch from the test loader unpacks into images, bounding boxes and labels
images, bbox, labels = next(iter(model_trainer.test_loader))

# make_grid returns a channels-first image; move channels last for imshow
out = torchvision.utils.make_grid(images[0:16], normalize=True)
_ = plt.imshow(out.permute(1, 2, 0).numpy())

In [None]:
# Pick one image of the batch and its ground-truth box
example_indx = 0
ex_img = images[example_indx]

# Scale the box by the image size, then turn (x, y, w, h) into corner
# coordinates (x1, y1, x2, y2) by adding the origin to the extent
# (assumes the dataset stores boxes relative to the image size - TODO confirm)
ex_label = bbox[example_indx].unsqueeze(0) * image_size
ex_label[:, 2:4] += ex_label[:, 0:2]

# Min-max rescale the normalised image into 0..255 and cast to uint8 for drawing
img_min, img_max = ex_img.min(), ex_img.max()
img_out = (((ex_img - img_min) / (img_max - img_min)) * 255).to(torch.uint8)
img_box = torchvision.utils.draw_bounding_boxes(img_out, ex_label, colors=(0, 255, 0))

In [None]:
# Display the example image with its ground-truth box drawn on top
plt.figure(figsize=(5, 5))
out = torchvision.utils.make_grid(img_box.unsqueeze(0).float(), normalize=True)
_ = plt.imshow(out.permute(1, 2, 0).numpy())

In [None]:
# Count every element of every parameter tensor in the model
num_params = sum(param.numel() for param in model_trainer.model.parameters())
print("This model has %d (approximately %d Million) Parameters!" % (num_params, num_params//1e6))

# Train Model!
Our full training method is now fully contained within the trainer class! Simply run the run_training method and specify how many epochs it should train for!

In [None]:
# Train for num_epochs epochs; the training loop itself lives in ModelTrainer.run_training
model_trainer.run_training(num_epochs=num_epochs)

# View Results

In [None]:
# best_valid_acc is reported as an IoU here because BboxIOU was passed to the trainer as its metric
# NOTE(review): presumably the best per-epoch validation score - confirm in Trainer.py
print("The highest validation IoU was %.2f" %(model_trainer.best_valid_acc))

In [None]:
# Plot the logged training loss, spreading the logged values evenly over
# the 0..num_epochs range on the x axis
_ = plt.figure(figsize=(10, 5))
train_x = np.linspace(
    0,
    num_epochs,
    len(model_trainer.train_loss_logger),
)
_ = plt.plot(train_x, model_trainer.train_loss_logger)
_ = plt.title("Training Loss")

## Model Prediction!

In [None]:
# Choose the image to run the model on
example_indx = 0
ex_img = images[example_indx]

# Min-max rescale the normalised image into 0..255 and cast to uint8 for drawing
img_min, img_max = ex_img.min(), ex_img.max()
img_out = (((ex_img - img_min) / (img_max - img_min)) * 255).to(torch.uint8)

# Predict the bounding box for the image (no gradients needed at inference time)
model_trainer.eval()
with torch.no_grad():
    logits = model_trainer(ex_img.unsqueeze(0).to(device))
    pred_out = torch.sigmoid(logits)

    # Scale the sigmoid output by the image size, then convert
    # (x, y, w, h) to corner coordinates (x1, y1, x2, y2)
    ex_label = (pred_out * image_size).cpu()
    ex_label[:, 2:4] += ex_label[:, 0:2]

# Overlay the predicted box on the image
img_box = torchvision.utils.draw_bounding_boxes(img_out, ex_label, colors=(0, 255, 0))

plt.figure(figsize=(5, 5))
out = torchvision.utils.make_grid(img_box.unsqueeze(0).float(), normalize=True)
_ = plt.imshow(out.permute(1, 2, 0).numpy())

In [None]:
# Plot the logged training and validation metric (IoU) on a shared 0..num_epochs x axis
_ = plt.figure(figsize=(10, 5))

# x positions: spread each log evenly across the epochs
train_x = np.linspace(0, num_epochs, len(model_trainer.train_acc_logger))
valid_x = np.linspace(0, num_epochs, len(model_trainer.val_acc_logger))

# Training curve in yellow, validation curve in black
_ = plt.plot(train_x, model_trainer.train_acc_logger, c="y")
_ = plt.plot(valid_x, model_trainer.val_acc_logger, c="k")

_ = plt.title("Average IoU")
_ = plt.legend(["Training IoU", "Validation IoU"])

# Evaluate

In [None]:
# Call the evaluate function on the test split and report the result as the average IoU
# NOTE(review): presumably evaluate_model averages the BboxIOU metric over the test loader - confirm in Trainer.py
test_acc = model_trainer.evaluate_model(train_test_val="test")
print("The Test Average IoU is: %.2f" %(test_acc))