### Imports

In [2]:
# importing the required libraries
import os
import numpy as np
import torch
import json
from PIL import Image
import torchvision
import torch.utils.data
import torchsummary

### Config

In [None]:
import torch


# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# Dataset roots. The dataset class in this file reads an "imgs" folder and an
# "annotations" folder underneath each root.
TRAIN_DATASET_ROOT = "data/0408_take2_new/train/"
TEST_DATASET_ROOT = "data/0408_take2_new/test/"

# Hyperparameters
RANDOM_SEED = 1  # passed to torch.manual_seed before the data loaders are built
LEARNING_RATE = 0.0025  # SGD learning rate
MOMENTUM = 0.9  # SGD momentum
WEIGHT_DECAY = 0.0001  # SGD weight decay
STEP_SIZE = 3  # StepLR: number of epochs between learning-rate updates
# StepLR multiplies the learning rate by GAMMA every STEP_SIZE epochs, so a
# value of 1 keeps the learning rate constant.
# NOTE(review): the comment next to the scheduler mentions a 10x decrease,
# which would correspond to 0.1 -- confirm which one is intended.
GAMMA = 1
NUM_EPOCHS = 20  # number of train/evaluate rounds in training_function

### Box Conversion

In [None]:
def box_convert(boxes):
    '''
    Function : Turn (xmin, ymin, width, height) boxes into corner format
    Input : Tensor whose last dimension holds xmin, ymin, width, height
    Output : Tensor of the same shape whose last dimension holds
             xmin, ymin, xmax, ymax (top-left and bottom-right corners)
    '''
    left, top, width, height = boxes.unbind(-1)
    # The far corner is the near corner shifted by the box extent.
    right = left + width
    bottom = top + height
    return torch.stack((left, top, right, bottom), dim=-1)


#### Dataset

In [None]:
class EmberDataset_new(torch.utils.data.Dataset):
    '''
    Single-class detection dataset: one bounding box per image.

    Layout read from ``root``:
        imgs/         image files, used in sorted filename order
        annotations/  a JSON file with an "annotations" list; each entry
                      provides "bbox" (xmin, ymin, width, height) and "image_id"

    NOTE(review): annotation ``idx`` is paired with the ``idx``-th sorted image
    filename. This assumes both sequences are in the same order -- confirm
    against the data (string sorting puts "10.jpg" before "2.jpg").
    '''

    # (width, height) every image is resized to before it is returned.
    IMAGE_SIZE = (224, 224)

    def __init__(self, root, transforms=None):
        '''
        Input: root folder of the train or test split and an optional
               callable ``transforms(img, target) -> (img, target)``
        '''
        self.root = root
        self.transforms = transforms
        # load all image files, sorting them to
        # ensure that they are aligned
        self.imgs = list(sorted(os.listdir(os.path.join(root, "imgs"))))
        # Filled on first access so constructing the dataset stays cheap.
        self._annotations = None

    def _load_annotations(self):
        '''Return the "annotations" list, parsing the JSON file only once.'''
        cached = getattr(self, "_annotations", None)
        if cached is None:
            annotation_dir = os.path.join(self.root, "annotations")
            # Sorted so the chosen file is deterministic if several exist.
            first_file = sorted(os.listdir(annotation_dir))[0]
            with open(os.path.join(annotation_dir, first_file), "r") as file:
                cached = json.load(file)["annotations"]
            self._annotations = cached
        return cached

    @staticmethod
    def _scale_boxes(boxes, original_size, resized_size):
        '''
        Rescale xyxy boxes from the original image size to the resized one.

        Both sizes are (width, height) pairs.
        '''
        orig_w, orig_h = original_size
        new_w, new_h = resized_size
        x_ratio = new_w / orig_w
        y_ratio = new_h / orig_h
        factors = torch.tensor([x_ratio, y_ratio, x_ratio, y_ratio],
                               dtype=boxes.dtype)
        return boxes * factors

    def __getitem__(self, idx):
        '''
        Function : Fetches the image information along with the target given an index
        Input : Index of the image
        Output : Returns the resized image and its target dict with keys
                 "boxes", "labels", "area", "image_id" and "iscrowd"
        Raises : Whatever error the image lookup/decoding raised; the failing
                 index is printed first
        '''
        # load image
        try:
            img_path = os.path.join(self.root, "imgs", self.imgs[idx])
            # The context manager closes the file; convert() returns a
            # fully loaded copy that outlives it.
            with Image.open(img_path) as raw:
                img = raw.convert("RGB")
        except Exception:
            # Report the index, then surface the real error instead of
            # failing later on an unbound ``img``.
            print(f"Idx number for which it failed: {idx}")
            raise

        original_size = img.size  # PIL reports (width, height)
        img = img.resize(self.IMAGE_SIZE)

        # load annotations
        record = self._load_annotations()[idx]
        xywh = torch.as_tensor(record["bbox"], dtype=torch.float32).unsqueeze(0)
        # The annotation is in original-image pixels, so it has to follow
        # the image through the resize.
        boxes = self._scale_boxes(box_convert(xywh), original_size,
                                  self.IMAGE_SIZE)
        area = (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0])

        target = {
            "boxes": boxes,
            # there is only one class
            "labels": torch.ones((1,), dtype=torch.int64),
            "area": area,
            "image_id": torch.tensor([record["image_id"]]),
            "iscrowd": torch.zeros((1,), dtype=torch.int64),
        }

        if self.transforms is not None:
            img, target = self.transforms(img, target)

        return img, target

    def __len__(self):
        '''Number of image files found under ``root``/imgs.'''
        return len(self.imgs)

In [None]:
from engine import train_one_epoch, evaluate
import utils
import transforms as T

def get_transform(train):
    '''
    Function : Build the preprocessing pipeline applied to each (image, target) pair
    Input : train - truthy to include the training-time augmentation
    Output : T.Compose of T.ToTensor, followed by
             T.RandomHorizontalFlip(0.5) when train is truthy
    '''
    # Tensor conversion always comes first.
    pipeline = [T.ToTensor()]
    if train:
        # Augmentation is only added for training.
        pipeline.append(T.RandomHorizontalFlip(0.5))
    return T.Compose(pipeline)

## Visualize bbox

In [None]:
from torchvision.utils import draw_bounding_boxes
from pathlib import Path
from torchvision.io import read_image
import torch
import numpy as np
import matplotlib.pyplot as plt
import torchvision.transforms.functional as F

# Matplotlib setting: use a tight bounding box when figures are saved.
plt.rcParams["savefig.bbox"] = 'tight'

def show(imgs):
    '''
    Function : Draw one or more image tensors side by side without axis ticks
    Input : A single image tensor or a list of image tensors
    Output : None; the images are drawn on a new matplotlib figure
    '''
    # Imported locally on purpose: the module-level alias ``F`` is rebound to
    # torch.nn.functional further down in this file, which has no
    # ``to_pil_image``.
    from torchvision.transforms.functional import to_pil_image

    if not isinstance(imgs, list):
        imgs = [imgs]
    # squeeze=False keeps ``axs`` two-dimensional even for a single image.
    _, axs = plt.subplots(ncols=len(imgs), squeeze=False)
    for i, img in enumerate(imgs):
        pil_img = to_pil_image(img.detach())
        axs[0, i].imshow(np.asarray(pil_img))
        axs[0, i].set(xticklabels=[], yticklabels=[], xticks=[], yticks=[])
# Hard-coded example box in (xmin, ymin, xmax, ymax) pixel coordinates.
boxes = torch.tensor([[1948.,  554., 2034.,  640.]], dtype=torch.float)

colors = ["yellow"]
# Read one training image from disk and overlay the box on it.
# NOTE(review): the box is presumably the annotation of 156.jpg -- confirm.
ember_img = read_image(os.path.join(TRAIN_DATASET_ROOT,"imgs",'156.jpg'))
result = draw_bounding_boxes(ember_img, boxes, colors=colors, width=5)
show(result)

### Approach 3 - Having two streams for preprocessing

In [None]:
from torchsummary import summary
import torch.nn as nn
import torch.nn.functional as F
import torchvision
from torchvision.models.detection import FasterRCNN

# ResNet-50 with pretrained weights. ``model`` is only a temporary name here;
# it is rebound to the detector once the backbone has been assembled.
# NOTE(review): newer torchvision releases replace ``pretrained=`` with
# ``weights=`` -- confirm against the installed version.
model = torchvision.models.resnet50(pretrained=True)
# Keep every top-level child module except the last four as the feature
# extractor.
# NOTE(review): for torchvision's ResNet-50 this presumably drops layer3,
# layer4, avgpool and fc, leaving a 512-channel feature map -- confirm.
res50_con = nn.Sequential(*list(model.children())[:-4])


class upstream(nn.Module):
    '''
    Feature stream: truncated ResNet-50 followed by 4x nearest-neighbour upsampling.
    '''

    def __init__(self):
        '''
        Function :
        Use the module-level truncated ResNet (res50_con) as the backbone
        Create the 4x nearest-neighbour Upsample layer
        '''
        super().__init__()
        # Every instance refers to the same module-level ``res50_con`` object.
        self.resnet_modified_backbone = res50_con
        self.upsample = nn.Upsample(scale_factor=(4, 4), mode='nearest')

    def forward(self, x):
        '''
        Function : Extract backbone features and enlarge them spatially
        Input : Image batch to run through the backbone
        Output : Backbone feature map upsampled by a factor of 4 in height and width
        '''
        features = self.resnet_modified_backbone(x)
        return self.upsample(features)

class downstream(nn.Module):
    '''
    Shallow stream: 1x1 convolution (3 -> 256 channels), 2x2 max-pool, ReLU.
    '''

    def __init__(self):
        '''
        Create the 1x1 convolution, the stride-2 max-pool and the ReLU
        '''
        super().__init__()
        self.conv1 = nn.Conv2d(3, 256, 1)
        self.pool = nn.MaxPool2d(2, stride=2)
        self.relu = nn.ReLU()

    def forward(self, x):
        '''
        Function : Project the image to 256 channels, halve its resolution, apply ReLU
        Input : Image batch with 3 channels
        Output : 256-channel feature map after the 1x1 convolution, max-pool and ReLU
        '''
        projected = self.conv1(x)
        pooled = self.pool(projected)
        return self.relu(pooled)

class Ensemble(nn.Module):
    '''
    Backbone that runs two streams on the same input and stacks their
    outputs along the channel dimension.
    '''

    def __init__(self, upstream, downstream, num_classes):
        '''
        Store both streams, the ReLU activation and the number of output channels.
        num_classes is accepted but not used by this module.
        '''
        super().__init__()
        self.upstream = upstream
        self.downstream = downstream
        self.relu = nn.ReLU()
        # Channel count of the concatenated feature map, read by the detector.
        self.out_channels = 768

    def forward(self, x):
        '''
        Function : Run both streams on x and merge their feature maps
        Input : Image batch fed to the upstream and to the downstream
        Output : ReLU of the channel-wise concatenation (upstream first)
        '''
        merged = torch.cat([self.upstream(x), self.downstream(x)], dim=1)
        return self.relu(merged)
    

from torchvision.models.detection.rpn import AnchorGenerator

# Assemble the two-stream backbone and wrap it in a Faster R-CNN detector
# with two classes.
background_supression_stream = upstream()
small_object_stream = downstream()
backbone = Ensemble(background_supression_stream, small_object_stream, num_classes = 2)
# The backbone returns a single feature map, so the RPN needs exactly one
# tuple of anchor sizes and one tuple of aspect ratios. FasterRCNN's default
# generator is laid out for five feature maps and does not match a
# single-tensor backbone. The sizes and ratios below are the same values as
# the defaults, grouped onto one level.
anchor_generator = AnchorGenerator(
    sizes=((32, 64, 128, 256, 512),),
    aspect_ratios=((0.5, 1.0, 2.0),),
)
model = FasterRCNN(backbone, num_classes=2,
                   rpn_anchor_generator=anchor_generator)

### Training

In [None]:
def _collate_detection_batch(batch):
    '''
    Function : Regroup a list of (image, target) samples into (images, targets)
    Input : List of samples produced by the dataset
    Output : Tuple (images, targets), each a tuple with one entry per sample
    '''
    # Detection models take a list of images and a list of target dicts; the
    # default collate would instead merge the target dicts into one dict of
    # batched tensors.
    return tuple(zip(*batch))


# DATALOADER
def data_loader_func():
    '''
    Function : Build the datasets for the train and test roots and wrap them in data loaders
    Output : (data_loader, data_loader_test) - the shuffled training loader
             and the unshuffled test loader, both with batch size 1
    '''
    # use our dataset and defined transformations
    dataset = EmberDataset_new(TRAIN_DATASET_ROOT, get_transform(train=True))
    dataset_test = EmberDataset_new(TEST_DATASET_ROOT, get_transform(train=False))

    # Seed before the loaders are created.
    torch.manual_seed(RANDOM_SEED)
    # define training and validation data loaders
    data_loader = torch.utils.data.DataLoader(
        dataset, batch_size=1, shuffle=True, num_workers=4,
        collate_fn=_collate_detection_batch)

    data_loader_test = torch.utils.data.DataLoader(
        dataset_test, batch_size=1, shuffle=False, num_workers=4,
        collate_fn=_collate_detection_batch)

    return data_loader, data_loader_test



# OPTIMIZER
def optimizer(model):
    '''
    Function : Move the model to DEVICE and set up SGD with a StepLR schedule
    Input : Model whose trainable parameters should be optimised
    Output : The StepLR scheduler; the SGD optimizer it wraps is not
             returned separately
    '''
    model.to(DEVICE)

    # Only parameters that require gradients are handed to SGD.
    trainable = list(filter(lambda p: p.requires_grad, model.parameters()))
    sgd = torch.optim.SGD(
        trainable,
        lr=LEARNING_RATE,
        momentum=MOMENTUM,
        weight_decay=WEIGHT_DECAY,
    )

    # StepLR multiplies the learning rate by GAMMA every STEP_SIZE epochs.
    return torch.optim.lr_scheduler.StepLR(
        sgd,
        step_size=STEP_SIZE,
        gamma=GAMMA,
    )



# TRAINING
def training_function(model):
    '''
    Function : Build the data loaders and the optimiser/scheduler, then train and
               evaluate the model once per epoch for NUM_EPOCHS epochs
    Input : Model is given as the input
    Output : None; progress and evaluation results come from train_one_epoch and evaluate
    '''
    data_loader, data_loader_test = data_loader_func()
    lr_scheduler = optimizer(model)
    # ``optimizer`` is the module-level factory function, not an optimizer
    # instance. The SGD object it created is held by the scheduler it returns,
    # so take it from there.
    sgd_optimizer = lr_scheduler.optimizer

    for epoch in range(NUM_EPOCHS):
        # train for one epoch, printing every 10 iterations
        train_one_epoch(model, sgd_optimizer, data_loader, DEVICE, epoch, print_freq=10)
        # update the learning rate
        lr_scheduler.step()
        # evaluate on the test dataset
        evaluate(model, data_loader_test, device=DEVICE)

# Start training and per-epoch evaluation of the detector built above.
training_function(model)