In [None]:
pip install pycocotools

In [2]:
import os
import torch
import numpy as np
import matplotlib.pyplot as plt
import torchvision
from torchvision import transforms
from torch.utils.data import DataLoader
from torchvision.datasets import CocoDetection
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from torchvision.models.detection import FasterRCNN
from torchvision.datasets import ImageFolder
from torch.utils.data import Dataset
from torchvision import transforms as T
from torch.utils.data.dataloader import default_collate
from PIL import Image
import json


In [3]:
TRAIN_IMG_FOLDER_PATH = '/kaggle/input/icpr-vistac-dataset-for-object-detection/Combined-VISTAC-Challenge-Dataset/Combined-VISTAC-Challenge-Dataset/train' #Training image path.
VALID_IMG_FOLDER_PATH = '/kaggle/input/icpr-vistac-dataset-for-object-detection/Combined-VISTAC-Challenge-Dataset/Combined-VISTAC-Challenge-Dataset/validation' # Valid image path for checking.
TRAIN_ANNOTATION_FILE = '/kaggle/input/icpr-vistac-dataset-for-object-detection/train.json'
VAL_ANNOTATION_FILE = '/kaggle/input/icpr-vistac-dataset-for-object-detection/validation.json'
LR = 0.001 #LEARNING RATE
BATCH_SIZE = 8
EPOCHS = 10

DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
MODEL_NAME = 'efficient_b0'
print(DEVICE)

cuda


In [4]:
import json

# Load the JSON files
with open(TRAIN_ANNOTATION_FILE, 'r') as f:
    train_annotations = json.load(f)

with open(VAL_ANNOTATION_FILE, 'r') as f:
    validation_annotations = json.load(f)

# Print out some examples to understand the structure
print("Train Annotations Example:", list(train_annotations.items())[:5])
print("Validation Annotations Example:", list(validation_annotations.items())[:5])


Train Annotations Example: [('bag-1', {'video_dir': 'bag-1', 'init_rect': [860, 236, 397, 939, 0.5891802874498578], 'img_names': ['bag-1/img/00000001.jpg', 'bag-1/img/00000002.jpg', 'bag-1/img/00000003.jpg', 'bag-1/img/00000004.jpg', 'bag-1/img/00000005.jpg', 'bag-1/img/00000006.jpg', 'bag-1/img/00000007.jpg', 'bag-1/img/00000008.jpg', 'bag-1/img/00000009.jpg', 'bag-1/img/00000010.jpg', 'bag-1/img/00000011.jpg', 'bag-1/img/00000012.jpg', 'bag-1/img/00000013.jpg', 'bag-1/img/00000014.jpg', 'bag-1/img/00000015.jpg', 'bag-1/img/00000016.jpg', 'bag-1/img/00000017.jpg', 'bag-1/img/00000018.jpg', 'bag-1/img/00000019.jpg', 'bag-1/img/00000020.jpg', 'bag-1/img/00000021.jpg', 'bag-1/img/00000022.jpg', 'bag-1/img/00000023.jpg', 'bag-1/img/00000024.jpg', 'bag-1/img/00000025.jpg', 'bag-1/img/00000026.jpg', 'bag-1/img/00000027.jpg', 'bag-1/img/00000028.jpg', 'bag-1/img/00000029.jpg', 'bag-1/img/00000030.jpg', 'bag-1/img/00000031.jpg', 'bag-1/img/00000032.jpg', 'bag-1/img/00000033.jpg', 'bag-1/img/0

In [5]:
# Data transformations
train_augs = T.Compose([
          T.RandomHorizontalFlip(p = 0.5),
          T.RandomRotation(degrees=(-20, +20)),
          T.ToTensor()
          #PIL/numpy array -> torch tensor -> (height, width, channel) -> (channel, height, width)
    ])
valid_augs = T.Compose([
    T.ToTensor()
])

In [6]:
# Load annotations from a JSON file
def load_annotations(file_path):
    with open(file_path, 'r') as f:
        annotations = json.load(f)
    return annotations

class VistaDataset(Dataset):
    def __init__(self, img_folder, annotations, transforms=None):
        self.img_folder = img_folder
        self.annotations = annotations
        self.transforms = transforms
        self.imgs = []
        for video_dir, data in annotations.items():
            for img_name in data['img_names']:
                self.imgs.append((video_dir, img_name))

    def __len__(self):
        return len(self.imgs)

    def __getitem__(self, idx):
        video_dir, img_name = self.imgs[idx]
        img_path = os.path.join(self.img_folder, img_name)

        # Debugging print statements
        #print(f"video_dir: {video_dir}")
        #print(f"img_name: {img_name}")
        #print(f"img_path: {img_path}")

        # Check if the file exists before opening it
        if not os.path.exists(img_path):
            print(f"File not found: {img_path}")
            raise FileNotFoundError(f"File not found: {img_path}")

        img = Image.open(img_path).convert("RGB")

        # Find the corresponding ground truth box
        boxes = []
        labels = []

        for i, name in enumerate(self.annotations[video_dir]['img_names']):
            if name == img_name:
                box = self.annotations[video_dir]['gt_rect'][i]
                boxes.append([box[0], box[1], box[0] + box[2], box[1] + box[3]])
                labels.append(1)  # Assuming label 1 for now; adjust according to class mapping

        boxes = torch.as_tensor(boxes, dtype=torch.float32)
        labels = torch.as_tensor(labels, dtype=torch.int64)

        target = {}
        target["boxes"] = boxes
        target["labels"] = labels

        if self.transforms:
            img = self.transforms(img)

        return img, target

In [7]:
def collate_fn(batch):
    return tuple(zip(*batch))

In [8]:
# Load annotations
train_annotations = load_annotations(TRAIN_ANNOTATION_FILE)
valid_annotations = load_annotations(VAL_ANNOTATION_FILE)

trainset = VistaDataset(TRAIN_IMG_FOLDER_PATH, train_annotations, transforms=train_augs)
validset = VistaDataset(VALID_IMG_FOLDER_PATH, valid_annotations, transforms=valid_augs)

print(f"Total no. of examples in trainset : {len(trainset)}")
print(f"Total no. of examples in validset : {len(validset)}")

Total no. of examples in trainset : 25464
Total no. of examples in validset : 4253


In [9]:
trainloader = DataLoader(trainset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
validloader = DataLoader(validset, batch_size=BATCH_SIZE, collate_fn=collate_fn)

print(f"Total no. of batches in trainloader : {len(trainloader)}")
print(f"Total no. of batches in validloader : {len(validloader)}")

Total no. of batches in trainloader : 3183
Total no. of batches in validloader : 532


In [10]:
# Define the model
def get_instance_segmentation_model(num_classes):
    model = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=True)
    in_features = model.roi_heads.box_predictor.cls_score.in_features
    model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)
    return model

In [11]:
# Initialize the model
model = get_instance_segmentation_model(num_classes=71)
model.to(DEVICE)

Downloading: "https://download.pytorch.org/models/fasterrcnn_resnet50_fpn_coco-258fb6c6.pth" to /root/.cache/torch/hub/checkpoints/fasterrcnn_resnet50_fpn_coco-258fb6c6.pth
100%|██████████| 160M/160M [00:01<00:00, 145MB/s]  


FasterRCNN(
  (transform): GeneralizedRCNNTransform(
      Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
      Resize(min_size=(800,), max_size=1333, mode='bilinear')
  )
  (backbone): BackboneWithFPN(
    (body): IntermediateLayerGetter(
      (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
      (bn1): FrozenBatchNorm2d(64, eps=0.0)
      (relu): ReLU(inplace=True)
      (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
      (layer1): Sequential(
        (0): Bottleneck(
          (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn1): FrozenBatchNorm2d(64, eps=0.0)
          (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn2): FrozenBatchNorm2d(64, eps=0.0)
          (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn3): FrozenBatchNorm2d(256, eps=0.0)
          (relu): ReLU(

In [12]:
params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.SGD(params, lr=LR, momentum=0.9, weight_decay=0.0005)

In [13]:
# Define the function to train the model

def train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq):
    model.train()
    running_loss = 0.0
    for i, (images, targets) in enumerate(data_loader):
        images = list(image.to(device) for image in images)
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

        loss_dict = model(images, targets)
        losses = sum(loss for loss in loss_dict.values())

        optimizer.zero_grad()
        losses.backward()
        optimizer.step()

        running_loss += losses.item()
        if i % print_freq == 0:
            print(f"Epoch [{epoch}/{EPOCHS}], Step [{i}/{len(data_loader)}], Loss: {losses.item():.4f}")

    print(f"Epoch [{epoch}] Loss: {running_loss/len(data_loader):.4f}")


In [14]:
# Define the function to evaluate the model
def evaluate(model, data_loader, device):
    model.eval()
    running_loss = 0.0
    with torch.no_grad():
        for images, targets in data_loader:
            images = list(image.to(device) for image in images)
            targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

            loss_dict = model(images, targets)
            losses = sum(loss for loss in loss_dict.values())

            running_loss += losses.item()

    print(f"Validation Loss: {running_loss/len(data_loader):.4f}")


In [19]:
import matplotlib.patches as patches

def visualize_results(image, results):
    fig, ax = plt.subplots(figsize=(12, 12))
    ax.imshow(image)

    for box, label in zip(results['boxes'].detach().numpy(), results['labels']):
        x1, y1, x2, y2 = box
        rect = patches.Rectangle((x1, y1), x2 - x1, y2 - y1, linewidth=2, edgecolor='green', facecolor="none")
        ax.add_patch(rect)
        plt.text(x1, y1, label, bbox=dict(facecolor='white', alpha=0.5), fontsize=12)

    plt.show()

In [16]:
for epoch in range(EPOCHS):
    train_one_epoch(model, optimizer, trainloader, device=DEVICE, epoch=epoch, print_freq=10)
    evaluate(model, validloader, device=DEVICE)

torch.save(model.state_dict(), 'faster_rcnn_model.pth')

Epoch [0/10], Step [0/3183], Loss: 4.5709
Epoch [0/10], Step [10/3183], Loss: 0.4117
Epoch [0/10], Step [20/3183], Loss: 0.4959
Epoch [0/10], Step [30/3183], Loss: 0.3931
Epoch [0/10], Step [40/3183], Loss: 0.2899
Epoch [0/10], Step [50/3183], Loss: 0.2923
Epoch [0/10], Step [60/3183], Loss: 0.2730
Epoch [0/10], Step [70/3183], Loss: 0.1803
Epoch [0/10], Step [80/3183], Loss: 0.2011
Epoch [0/10], Step [90/3183], Loss: 0.1688
Epoch [0/10], Step [100/3183], Loss: 0.1859
Epoch [0/10], Step [110/3183], Loss: 0.2492
Epoch [0/10], Step [120/3183], Loss: 0.1730
Epoch [0/10], Step [130/3183], Loss: 0.2251
Epoch [0/10], Step [140/3183], Loss: 0.1782
Epoch [0/10], Step [150/3183], Loss: 0.1833
Epoch [0/10], Step [160/3183], Loss: 0.2088
Epoch [0/10], Step [170/3183], Loss: 0.1848
Epoch [0/10], Step [180/3183], Loss: 0.1725
Epoch [0/10], Step [190/3183], Loss: 0.1495
Epoch [0/10], Step [200/3183], Loss: 0.1541
Epoch [0/10], Step [210/3183], Loss: 0.1857
Epoch [0/10], Step [220/3183], Loss: 0.1532

KeyboardInterrupt: 