In [None]:
# Show the GPU assigned to this runtime. The author wants a P100; if a different
# card shows up, factory-reset the runtime and try again.
!nvidia-smi

In [None]:
from google.colab import drive
# Mount Google Drive so the project files under /content/gdrive/MyDrive are reachable.
drive.mount('/content/gdrive')

Mounted at /content/gdrive


The images currently on Google Drive are faulty (the annotation and image counts below do not match), so copy `original_data.zip` to the local disk instead.

In [None]:
# Count the annotation files stored on Drive (one name per line, piped to wc).
!ls "/content/gdrive/MyDrive/VLR_Project/data/PASCAL2012/Annotations/trainval" -1 | wc -l

16227


In [None]:
# Count the image files stored on Drive; this should equal the annotation count above.
!ls "/content/gdrive/MyDrive/VLR_Project/data/PASCAL2012/JPEGImages/trainval" -1 | wc -l

16237


It is probably best to copy the images to a local folder so that image loading is faster than reading from Drive.

In [None]:
# Copy the original dataset archive from Drive to the local /content disk.
!cp "/content/gdrive/MyDrive/VLR_Project/data/PASCAL2012/original_data.zip" /content/

In [None]:
!unzip original_data.zip

# Copy augmented images to disk

In [None]:
%cd /content

/content


In [None]:
!ls

gdrive	sample_data


In [None]:
# Copy the augmented-image archive from Drive to the local /content disk.
!cp "/content/gdrive/MyDrive/VLR_Project/data/PASCAL2012/FasterRCNN-Augmented.zip" /content/

In [None]:
!unzip FasterRCNN-Augmented.zip

# Copy Inpainted Images to disk

In [None]:
%cd /content
!ls

/content
gdrive	sample_data


In [None]:
# Copy the inpainted-image archive from Drive to the local /content disk.
!cp "/content/gdrive/MyDrive/VLR_Project/data/PASCAL2012/FasterRCNN-Inpainted.zip" /content/

In [None]:
!unzip FasterRCNN-Inpainted.zip

# Make sure the annotations and images have the same count

In [None]:
# Count the annotation files in the locally extracted dataset.
!ls "/content/original_data/Annotations/trainval" -1 | wc -l

16227


In [None]:
# Count the image files in the locally extracted dataset; this should equal the annotation count.
!ls "/content/original_data/JPEGImages/trainval" -1 | wc -l

16227


# test an xml file

In [None]:
# One sample annotation (read from the Drive copy) used for the parsing test below.
xml_path = "/content/gdrive/MyDrive/VLR_Project/data/PASCAL2012/Annotations/trainval/2007_000033.xml"

In [None]:
import os
# Confirm the sample annotation file is reachable before trying to parse it.
print(os.path.exists(xml_path))

True


In [None]:
import xml.etree.ElementTree as ET

# Sanity-check a single annotation: print every bounding box and verify that
# its corners are correctly ordered.
tree = ET.parse(xml_path)
root = tree.getroot()

# "./*/bndbox" selects the <bndbox> elements that sit exactly two levels below
# the root (i.e. directly inside each top-level child), in document order.
for bndbox in root.findall("./*/bndbox"):
    xmin, ymin, xmax, ymax = (
        float(bndbox.findall(tag)[0].text)
        for tag in ("xmin", "ymin", "xmax", "ymax")
    )
    print(xmin, ymin, xmax, ymax)
    assert xmin < xmax, "xmin not less than xmax"
    assert ymin < ymax, "ymin not less than ymax"

9.0 107.0 499.0 263.0
421.0 200.0 482.0 226.0
325.0 188.0 411.0 223.0


# Check that the names in trainval.txt are consistent with the annotation and image files. If they are not, just use the names found in the Annotations or JPEGImages folder (this is only a sanity check)

In [None]:
# Index file listing the image ids of the trainval split (one per line).
index_path = "/content/original_data/Indexes/trainval.txt"

In [None]:
# Read the whole index file; the number of lines should match the number of
# annotation / image files counted earlier.
with open(index_path, 'r') as fp:
    lines = list(fp)

print(len(lines))

16227


Check that every index has a corresponding image file.

In [None]:
# Local (extracted) annotation and image folders for the trainval split.
ann_dir = "/content/original_data/Annotations/trainval"
img_dir = "/content/original_data/JPEGImages/trainval"

In [None]:
import os

# Collect the image ids of the trainval split, one per line of the index file,
# with surrounding whitespace (including the newline) removed.
split_file = os.path.join("/content/original_data", 'Indexes', 'trainval.txt')
index_list = []
with open(split_file) as fp:
    for raw_line in fp:
        index_list.append(raw_line.strip())

In [None]:
# Every id listed in the index must have a matching .jpg; report the offenders
# (not just a bare AssertionError) when some are missing.
missing_images = [
    line for line in index_list
    if not os.path.exists(os.path.join(img_dir, line + ".jpg"))
]
assert not missing_images, (
    f"{len(missing_images)} indexed images are missing from {img_dir}, "
    f"e.g. {missing_images[:5]}"
)

# Create Custom Dataset (Pascal2012)

imports

In [None]:
import numpy as np
import os
import xml.etree.ElementTree as ET

import torch
import torch.nn
from PIL import Image
from torch.utils.data import Dataset, DataLoader

import random
import torchvision.transforms as transforms

import scipy.io
import pickle
from tqdm import tqdm

dataset

In [None]:
class PASCALDataset(Dataset):
    """PASCAL VOC detection dataset yielding torchvision-style ``(image, target)`` pairs.

    Expected layout under ``data_dir``::

        Indexes/<split>.txt            one image id per line
        JPEGImages/<split>/<id>.jpg
        Annotations/<split>/<id>.xml

    Class-index conventions (the two differ by one):
      * ``INV_CLASS`` / ``get_class_index`` use zero-based foreground indices
        (``'aeroplane' -> 0``).
      * ``CLASS_NAMES`` / ``get_class_name`` and the ``labels`` returned by
        ``__getitem__`` reserve index 0 for ``'__background__'``
        (``1 -> 'aeroplane'``).
    """

    CLASS_NAMES = ['aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus', 'car',
                   'cat', 'chair', 'cow', 'diningtable', 'dog', 'horse', 'motorbike',
                   'person', 'pottedplant', 'sheep', 'sofa', 'train', 'tvmonitor']

    # name -> zero-based foreground index (built BEFORE the background entry is added)
    INV_CLASS = {name: idx for idx, name in enumerate(CLASS_NAMES)}

    # Index 0 is the background class, so foreground names start at index 1 here.
    CLASS_NAMES = ['__background__'] + CLASS_NAMES

    def __init__(self, split='trainval', data_dir="/content/original_data", save_lbl_pth='/content/gdrive/MyDrive/VLR_Project/data/PASCAL2012'):
        """
        :param split: name of the split, e.g. 'trainval' or 'test'; selects the
            index file and the image / annotation sub-folders
        :param data_dir: root folder of the extracted dataset
        :param save_lbl_pth: folder searched for a cached ``<split>_label.pkl``
        """
        super().__init__()
        self.split      = split     # 'trainval' or 'test'
        self.data_dir   = data_dir
        self.label_pth = os.path.join(save_lbl_pth, split + "_label.pkl")

        self.img_dir = os.path.join(data_dir, 'JPEGImages', split)
        self.ann_dir = os.path.join(data_dir, 'Annotations', split)

        split_file = os.path.join(data_dir, 'Indexes', split + '.txt')
        with open(split_file) as fp:
            # skip blank lines so a trailing newline does not become an empty image id
            self.index_list = [line.strip() for line in fp if line.strip()]

        self.anno_list = self.preload_anno()

    @classmethod
    def get_class_name(cls, index):
        """Return the class name for a label index where 0 is the background."""
        return cls.CLASS_NAMES[index]

    @classmethod
    def get_class_index(cls, name):
        """Return the zero-based foreground index of ``name`` (background not counted)."""
        return cls.INV_CLASS[name]

    def __len__(self):
        return len(self.index_list)

    def preload_anno(self):
        """
        Load the labels of every image in the split, from the pickle cache at
        ``self.label_pth`` when it exists, otherwise by parsing the XML files.

        :return: a list of labels. each element is in the form of [class, weight, gt_class_list, gt_boxes],
         where class and weight are numpy arrays with one entry per foreground class:
           class[c]  is 1 if class c appears in the image, else 0
           weight[c] is 0 if class c appears only as 'difficult' instances, else 1
         gt_class_list is a list of the zero-based class ids (one per box)
         gt_boxes is a list of [xmin, ymin, xmax, ymax] values in pixel coordinates,
         exactly as stored in the annotation file
        """
        if os.path.exists(self.label_pth):
            print(f"preloading from path {self.label_pth}")
            # NOTE(review): pickle.load can execute arbitrary code; only keep
            # cache files at this path that this notebook itself produced.
            with open(self.label_pth, 'rb') as fp:
                return pickle.load(fp)

        num_fg = len(self.INV_CLASS)
        label_list = []

        for index in tqdm(self.index_list):
            fpath = os.path.join(self.ann_dir, index + '.xml')
            root = ET.parse(fpath).getroot()

            C = np.zeros(num_fg)
            # 2 is a sentinel meaning "class not seen yet"; resolved to 1 below
            W = np.full(num_fg, 2.0)

            # new lists for each image
            gt_class_list = []
            gt_boxes = []

            for obj in root.findall('object'):
                # Look tags up by name: the order of <name>, <difficult>, <bndbox>
                # inside <object> is not guaranteed to be the same in every file.
                cls_id = self.INV_CLASS[obj.findtext('name', default='').strip()]
                # a missing or empty <difficult> tag is treated as "not difficult"
                difficult = (obj.findtext('difficult') or '0').strip()

                C[cls_id] = 1
                if difficult == '1':
                    # only difficult instances so far -> ignore this class
                    if W[cls_id] == 2:
                        W[cls_id] = 0
                elif difficult == '0':
                    # at least one non-difficult instance -> full weight
                    W[cls_id] = 1

                # Only direct <bndbox> children are used (boxes nested deeper are
                # not visited). The class id is appended together with the
                # box so that gt_class_list and gt_boxes always stay aligned.
                for box in obj.findall('bndbox'):
                    xmin = float(box.findtext('xmin'))
                    ymin = float(box.findtext('ymin'))
                    xmax = float(box.findtext('xmax'))
                    ymax = float(box.findtext('ymax'))
                    assert xmin < xmax, f"xmin not less than xmax for {index}"
                    assert ymin < ymax, f"ymin not less than ymax for {index}"
                    gt_class_list.append(cls_id)
                    gt_boxes.append([xmin, ymin, xmax, ymax])

            # classes that never appeared keep the default weight of 1
            W[W == 2] = 1

            label_list.append([C, W, gt_class_list, gt_boxes])

        # uncomment if you want to save the pickled label list to preload next time
        # with open(self.label_pth, 'wb') as fp:
        #   pickle.dump(label_list, fp)

        return label_list

    def __getitem__(self, index):
        """
        :param index: an int generated by the DataLoader in range [0, __len__())
        :return: ``(img, target)`` where ``img`` is a float tensor [C, H, W] in the
            0-1 range and ``target`` is a dict with:

            boxes    (FloatTensor[N, 4]) boxes as [xmin, ymin, xmax, ymax] in pixels
            labels   (Int64Tensor[N])    class per box; 0 is reserved for the
                                         background, so stored ids are shifted by +1
            image_id (Int64Tensor[1])    position of the image in the index list
            area     (Tensor[N])         box areas (used by the COCO-style evaluation)
            iscrowd  (Int64Tensor[N])    all zeros: no instance is treated as crowd
        """
        findex = self.index_list[index]     # findex refers to the file number
        fpath = os.path.join(self.img_dir, findex + '.jpg')

        img = Image.open(fpath).convert("RGB")
        img = transforms.ToTensor()(img)

        labels, boxes = self.anno_list[index][2], self.anno_list[index][3]

        # reshape keeps the [N, 4] layout even when the image has no boxes (N == 0),
        # so the column indexing used for the area below still works
        boxes = torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4)

        # Add 1 since index 0 is the background class
        labels = torch.as_tensor(labels, dtype=torch.int64) + 1

        # must be unique across the dataset; used during evaluation
        image_id = torch.tensor([index])

        area = (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0])

        # Suppose all instances are not crowd
        iscrowd = torch.zeros((len(labels),), dtype=torch.int64)

        target = {
            "boxes": boxes,
            "labels": labels,
            "image_id": image_id,
            "area": area,
            "iscrowd": iscrowd,
        }

        return img, target

# Define Model (FASTER RCNN)

In [None]:
import argparse
import os
import shutil
import time
import sys
from tkinter import N  # NOTE(review): `N` looks unused in the visible cells — likely an accidental auto-import; confirm and drop
import sklearn
import sklearn.metrics
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler

import torch
import torch.nn as nn
import torch.nn.parallel
import torch.nn.functional as F
import torch.backends.cudnn as cudnn
import torch.distributed as dist
import torch.optim
import torch.utils.data
import torch.utils.data.distributed
import torchvision
import torchvision.transforms as transforms
import torchvision.datasets as datasets
import torchvision.models as models

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

In [None]:
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor

# load a model pre-trained on COCO
model = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=True)

# torchvision's num_classes INCLUDES the background. PASCALDataset.CLASS_NAMES
# already contains '__background__' at index 0, so its length (21) is the size
# of the head; adding 1 again would create an extra output that no label uses.
# NOTE: checkpoints saved with the previous 22-way head will not load into this
# 21-way head.
num_classes = len(PASCALDataset.CLASS_NAMES)

# get number of input features for the classifier
in_features = model.roi_heads.box_predictor.cls_score.in_features
# replace the pre-trained head with a new one sized for this dataset
model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)

model = model.to(device)
# trailing ';' keeps the cell from printing the whole (very long) model repr
model.train();

Downloading: "https://download.pytorch.org/models/fasterrcnn_resnet50_fpn_coco-258fb6c6.pth" to /root/.cache/torch/hub/checkpoints/fasterrcnn_resnet50_fpn_coco-258fb6c6.pth


  0%|          | 0.00/160M [00:00<?, ?B/s]

FasterRCNN(
  (transform): GeneralizedRCNNTransform(
      Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
      Resize(min_size=(800,), max_size=1333, mode='bilinear')
  )
  (backbone): BackboneWithFPN(
    (body): IntermediateLayerGetter(
      (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
      (bn1): FrozenBatchNorm2d(64, eps=0.0)
      (relu): ReLU(inplace=True)
      (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
      (layer1): Sequential(
        (0): Bottleneck(
          (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn1): FrozenBatchNorm2d(64, eps=0.0)
          (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn2): FrozenBatchNorm2d(64, eps=0.0)
          (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn3): FrozenBatchNorm2d(256, eps=0.0)
          (relu): ReLU(

In [None]:
# Debug aid: print every sub-module together with its qualified name.
# NOTE(review): likely very long output for this model; consider clearing it before sharing.
for name, layer in model.named_modules():
    print(name, layer)

In [None]:
print(model.backbone.body.layer4)

## some more hyperparams

In [None]:
lr = 0.005            # initial learning rate passed to SGD (decayed by the StepLR scheduler below)
momentum = 0.9
weight_decay = 5e-4
# NOTE(review): the inline comment asks for 10 epochs but the value is 5 — confirm which is intended.
epochs = 5 # Keep it as 10 because we can use batch_size=4 instead of 2

## define optimizer, scheduler

In [None]:
# SGD over all model parameters, using the hyper-parameters defined above.
sgd_kwargs = dict(lr=lr, momentum=momentum, weight_decay=weight_decay)
optimizer = torch.optim.SGD(model.parameters(), **sgd_kwargs)

# Multiply the learning rate by 0.1 after every 2 scheduler steps.
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, gamma=0.1, step_size=2)

# The training loop comes from the torchvision detection reference scripts (we use their `train_one_epoch` function)

In [None]:
!ls

gdrive	__MACOSX  original_data  original_data.zip  sample_data


In [None]:
!git clone https://github.com/pytorch/vision.git

Cloning into 'vision'...
remote: Enumerating objects: 132578, done.[K
remote: Counting objects: 100% (1866/1866), done.[K
remote: Compressing objects: 100% (190/190), done.[K
remote: Total 132578 (delta 1678), reused 1841 (delta 1668), pack-reused 130712[K
Receiving objects: 100% (132578/132578), 258.37 MiB | 12.66 MiB/s, done.
Resolving deltas: 100% (116317/116317), done.


In [None]:
import os
print(os.getcwd())

/content


In [None]:
import os
import sys

detection_ref_dir = "/content/vision/references/detection"

print(os.environ.get('PYTHONPATH', ''))

# The running kernel resolves imports through sys.path; changing the PYTHONPATH
# environment variable after start-up does not affect it. Guarded so re-running
# the cell does not add duplicates.
if detection_ref_dir not in sys.path:
    sys.path.append(detection_ref_dir)

# Also export the folder for child processes (e.g. DataLoader workers or `!python`),
# without assuming PYTHONPATH is already set.
pythonpath_entries = [p for p in os.environ.get('PYTHONPATH', '').split(os.pathsep) if p]
if detection_ref_dir not in pythonpath_entries:
    pythonpath_entries.append(detection_ref_dir)
os.environ['PYTHONPATH'] = os.pathsep.join(pythonpath_entries)

print(os.environ['PYTHONPATH'])

/env/python
/env/python:/content/vision/references/detection


In [None]:
%cd "/content/vision/references/detection"

/content/vision/references/detection


# Train Function

In [None]:
# Import functions from inside the /content/vision/references/detection path
from engine import train_one_epoch, evaluate
import utils

# Define dataset and dataloader

In [None]:
# Release cached GPU memory before building the loaders.
torch.cuda.empty_cache()

batch_size = 4
num_workers = 2

train_dataset = PASCALDataset('trainval', data_dir="/content/original_data")
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=num_workers,
    pin_memory=True,
    # utils (from the detection references) is needed for the collate function
    collate_fn=utils.collate_fn,
)
# number of batches per epoch
print('\n', len(train_loader))

16227it [00:01, 8645.36it/s]


 4057





In [None]:
# Evaluation split: same loader settings as for training, but without shuffling.
test_dataset = PASCALDataset('test', data_dir="/content/original_data")
test_loader = DataLoader(
    test_dataset,
    batch_size=batch_size,
    shuffle=False,
    num_workers=num_workers,
    pin_memory=True,
    collate_fn=utils.collate_fn,
)
# number of evaluation batches
print('\n', len(test_loader))

898it [00:00, 7956.96it/s]


 225





The `train` function below is deprecated; see the Runner section, which uses `train_one_epoch` instead.

In [None]:
# DEPRECATED
def train(train_loader, model, criterion, optimizer, epoch):
    """Run one training epoch of a detection model and return the mean batch loss.

    DEPRECATED: the runner uses ``train_one_epoch`` from the torchvision
    detection references; this is kept only as a minimal fallback.

    :param train_loader: iterable of batches. The last two items of each batch
        must be ``(images, targets)``; any leading items (e.g. image names)
        are ignored. ``targets`` is a sequence of dicts of tensors.
    :param model: detection model that, in training mode, returns a dict of
        loss tensors when called as ``model(images, targets)``
    :param criterion: unused (the model computes its own losses); kept so the
        signature stays compatible with existing callers
    :param optimizer: optimizer stepping the model parameters
    :param epoch: epoch number, used for logging only
    :return: average of the per-batch total loss, or 0.0 if no batch was seen
    """
    # switch to train mode
    model.train()

    # move the data to wherever the model lives instead of relying on a global
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device("cpu")

    try:
        batches_per_epoch = len(train_loader)
    except TypeError:   # plain iterables / generators have no length
        batches_per_epoch = '?'

    total_loss = 0.0
    num_batches = 0

    for i, batch in enumerate(train_loader):
        # accept both (images, targets) and (names, images, targets)
        *_, image_list, gt_box_classes = batch

        images = [img.to(model_device) for img in image_list]
        targets = [
            {key: (val.to(model_device) if torch.is_tensor(val) else val)
             for key, val in tgt.items()}
            for tgt in gt_box_classes
        ]

        # the individual loss terms are summed into one scalar for backprop
        loss_dict = model(images, targets)
        loss = sum(loss_dict.values())

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        num_batches += 1

        if i % 50 == 0:
            print(f"Epoch {epoch} [{i}/{batches_per_epoch}] ====> Train Loss: {loss.item()}")

    # guard against an empty loader
    return total_loss / num_batches if num_batches else 0.0

# Runner

In [None]:
save_dir = "/content/gdrive/MyDrive/VLR_Project/pascal-training/saved_models"
# Create the checkpoint folder up front so a missing directory cannot make the
# save fail after hours of training.
os.makedirs(save_dir, exist_ok=True)

for epoch in range(epochs):
    # train for one epoch, logging every 100 iterations
    train_one_epoch(model, optimizer, train_loader, device, epoch, print_freq=100)

    # evaluate on the test split after every epoch
    evaluate(model, test_loader, device=device)

    # advance the learning-rate schedule once per epoch
    scheduler.step()

# save the whole model object
save_path = os.path.join(save_dir, f"faster-rcnn_original_epochs{epochs}.pt")
torch.save(model, save_path)

Epoch: [0]  [   0/4057]  eta: 2:32:12  lr: 0.000010  loss: 3.4648 (3.4648)  loss_classifier: 3.3328 (3.3328)  loss_box_reg: 0.1194 (0.1194)  loss_objectness: 0.0003 (0.0003)  loss_rpn_box_reg: 0.0122 (0.0122)  time: 2.2510  data: 0.2944  max mem: 5953
Epoch: [0]  [ 100/4057]  eta: 1:29:58  lr: 0.000509  loss: 0.7045 (1.2083)  loss_classifier: 0.3604 (0.9016)  loss_box_reg: 0.3203 (0.2784)  loss_objectness: 0.0093 (0.0128)  loss_rpn_box_reg: 0.0086 (0.0155)  time: 1.3703  data: 0.0003  max mem: 7549
Epoch: [0]  [ 200/4057]  eta: 1:28:38  lr: 0.001009  loss: 0.5739 (0.8947)  loss_classifier: 0.2775 (0.5901)  loss_box_reg: 0.2652 (0.2759)  loss_objectness: 0.0089 (0.0129)  loss_rpn_box_reg: 0.0120 (0.0157)  time: 1.3982  data: 0.0002  max mem: 7549
Epoch: [0]  [ 300/4057]  eta: 1:27:42  lr: 0.001508  loss: 0.4606 (0.7563)  loss_classifier: 0.1823 (0.4655)  loss_box_reg: 0.2542 (0.2616)  loss_objectness: 0.0108 (0.0132)  loss_rpn_box_reg: 0.0121 (0.0161)  time: 1.4671  data: 0.0002  max me

In [None]:
# Additionally store only the weights (state dict), just in case.
# Note that this re-points `save_path` at the state-dict file.
state_dict_name = f"faster-rcnn-statedict_original_epochs{epochs}.pt"
save_path = os.path.join(
    "/content/gdrive/MyDrive/VLR_Project/pascal-training/saved_models",
    state_dict_name,
)
torch.save(model.state_dict(), save_path)

In [None]:
# to load model again
# `save_path` may point either at a pickled model object or at a state dict
# (the previous cell re-points it to the state-dict file). A state dict is a
# plain mapping and has no .eval(), so it has to be loaded INTO a model.
checkpoint = torch.load(save_path, map_location=device)
if isinstance(checkpoint, dict):
    # weights only: needs `model` to be built already (see the model cell)
    model.load_state_dict(checkpoint)
else:
    model = checkpoint
model.eval()

# Get MAE/IOU for FasterRCNN on PASCAL2012 test set