In [1]:
#  Setup
# !pip install torchvision --upgrade --quiet  (if required)


import os, json, torch
from PIL import Image
from torch.utils.data import DataLoader
import torchvision
from torchvision.transforms import functional as F
from torchvision.models.detection import FasterRCNN
from torchvision.models.detection.backbone_utils import resnet_fpn_backbone
from collections import Counter
from tqdm import tqdm

In [2]:
#   Config
CLASS_MAP = {
    "car": 1,
    "person": 2,
    "bike": 3,
    "traffic light": 4,
    "train": 5,
    "rider": 6,
    "bus": 7,
    "truck": 8,
    "traffic sign": 9,
}


NUM_CLASSES = len(CLASS_MAP) + 1


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

The BDDDataset class is a custom PyTorch Dataset implementation for loading and processing the BDD100K dataset, specifically for object detection tasks. This dataset class is designed to work seamlessly with models like Faster R-CNN.

In [3]:
#   Dataset
class BDDDataset(torch.utils.data.Dataset):
    def __init__(self, root, annotation_file, transforms=None):
        self.root = root
        self.transforms = transforms
        with open(annotation_file) as f:
            self.annotations = json.load(f)

    def __getitem__(self, idx):
        ann = self.annotations[idx]
        img_path = os.path.join(self.root, ann["name"])
        img = Image.open(img_path).convert("RGB")
        boxes, labels = [], []
        for obj in ann["labels"]:
            if "box2d" not in obj or obj["category"] not in CLASS_MAP:
                continue
            b = obj["box2d"]
            boxes.append([b["x1"], b["y1"], b["x2"], b["y2"]])
            labels.append(CLASS_MAP[obj["category"]])
        target = {
            "boxes": torch.tensor(boxes, dtype=torch.float32),
            "labels": torch.tensor(labels, dtype=torch.int64),
            "image_id": torch.tensor([idx]),
        }
        if self.transforms:
            img = self.transforms(img)
        return img, target

    def __len__(self):
        return len(self.annotations)

This function calculates class weights based on the frequency of each class in the BDD100K annotations. These weights can be used to handle class imbalance during training, especially for object detection tasks.

This approach ensures that rare classes get higher loss weight, improving model performance on under-represented categories.

In [4]:
#  Class Weights
def compute_class_weights(annotation_file):
    with open(annotation_file) as f:
        data = json.load(f)
    counter = Counter()
    for item in data:
        for obj in item["labels"]:
            if "box2d" in obj and obj["category"] in CLASS_MAP:
                counter[CLASS_MAP[obj["category"]]] += 1
    max_count = max(counter.values())
    weights = {
        cls: max_count / count for cls, count in counter.items()
    }  # for count max
    return torch.tensor([1.0] + [weights.get(i, 1.0) for i in range(1, NUM_CLASSES)])

This class extends PyTorch’s built-in FasterRCNN model to incorporate class-weighted loss, helping tackle class imbalance during object detection training.

This implementation modifies only the classification loss. we could extend it to include weighting in other loss components (like bbox regression) if needeed

In [5]:
# Model with weighted loss
class WeightedFasterRCNN(FasterRCNN):
    def __init__(self, backbone, num_classes, class_weights=None):
        super().__init__(backbone, num_classes)
        self.class_weights = class_weights

    def forward(self, images, targets=None):
        if self.training and self.class_weights is not None:
            losses = super().forward(images, targets)
            for t in targets:
                weights = self.class_weights.to(images[0].device)[t["labels"]]
                losses["loss_classifier"] *= weights.mean()

            return losses
        return super().forward(images)

In [6]:
from torchvision.ops import box_iou


def evaluate(model, data_loader, device):
    model.eval()
    from collections import defaultdict
    import numpy as np

    aps = []
    with torch.no_grad():
        for images, targets in tqdm(data_loader, desc="Evaluating"):
            images = list(i.to(device) for i in images)
            outputs = model(images)
            for pred, tgt in zip(outputs, targets):
                if len(pred["boxes"]) == 0 or len(tgt["boxes"]) == 0:
                    continue
                ious = box_iou(pred["boxes"].cpu(), tgt["boxes"].cpu())
                matches = (ious > 0.5).sum().item()
                aps.append(matches / max(len(tgt["boxes"]), 1))
    return np.mean(aps)

In [7]:
#  Paths (update for your dataset)
TRAIN_IMG_DIR = (
    "/nfs/interns/kuldeepk/Assignment/bdd100k_images_100k/bdd100k/images/100k/train/"
)
VAL_IMG_DIR = (
    "/nfs/interns/kuldeepk/Assignment/bdd100k_images_100k/bdd100k/images/100k/val/"
)
TRAIN_JSON = "/nfs/interns/kuldeepk/Assignment/bdd100k_labels_release/bdd100k/labels/bdd100k_labels_images_train.json"
VAL_JSON = "/nfs/interns/kuldeepk/Assignment/bdd100k_labels_release/bdd100k/labels/bdd100k_labels_images_val.json"

This section sets up image preprocessing, dataset loading, and dataloaders for training and validation using the BDD100K dataset.

In [None]:
#  Loaders
transform = torchvision.transforms.Compose(
    [
        torchvision.transforms.ToTensor(),
        torchvision.transforms.Normalize(
            mean=(0.5, 0.5, 0.5),
            std=(0.5, 0.5, 0.5),
        ),
    ]
)
train_ds = BDDDataset(TRAIN_IMG_DIR, TRAIN_JSON, transform)
val_ds = BDDDataset(VAL_IMG_DIR, VAL_JSON, transform)
train_dl = DataLoader(
    train_ds,
    batch_size=512,
    shuffle=True,
    drop_last=True,
    collate_fn=lambda x: tuple(zip(*x)),
)
val_dl = DataLoader(
    val_ds, batch_size=2, shuffle=False, collate_fn=lambda x: tuple(zip(*x))
)

This section sets up a Faster R-CNN model with a ResNet-50 FPN backbone and integrates class-weighted loss to improve performance on imbalanced datasets like BDD100K.

Computes inverse-frequency class weights based on training annotations.

Used to penalize under-represented classes more heavily during training.

Uses ResNet-50 pretrained on ImageNet (v2) as the feature extractor.

Combines it with a Feature Pyramid Network (FPN) to handle multi-scale object detection.

Instantiates the custom WeightedFasterRCNN model.

Injects class_weights into the model to modify classification loss based on class imbalance.

In [9]:
from torchvision.models import ResNet50_Weights

weights = compute_class_weights(TRAIN_JSON)
backbone = resnet_fpn_backbone("resnet50", weights=ResNet50_Weights.IMAGENET1K_V2)
model = WeightedFasterRCNN(backbone, NUM_CLASSES, weights)
model.to(device)



WeightedFasterRCNN(
  (transform): GeneralizedRCNNTransform(
      Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
      Resize(min_size=(800,), max_size=1333, mode='bilinear')
  )
  (backbone): BackboneWithFPN(
    (body): IntermediateLayerGetter(
      (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
      (bn1): FrozenBatchNorm2d(64, eps=1e-05)
      (relu): ReLU(inplace=True)
      (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
      (layer1): Sequential(
        (0): Bottleneck(
          (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn1): FrozenBatchNorm2d(64, eps=1e-05)
          (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn2): FrozenBatchNorm2d(64, eps=1e-05)
          (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn3): FrozenBatchNorm2d(256, eps=1e-05)
       

Sets up the AdamW optimizer to train the WeightedFasterRCNN model.
A variant of Adam that decouples weight decay from the gradient update.

Helps prevent overfitting and improves generalization compared to regular Adam.

In [10]:
# Optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-4)

This section implements manual microbatching to simulate larger batch sizes than the GPU memory allows. It’s especially useful when using large detection models like Faster R-CNN.

In [11]:
def make_microbatches(x_batch, y_batch, micro_batch_size):
    batch_size = len(x_batch)
    assert (
        batch_size % micro_batch_size == 0
    ), f"batch_size % mini_batch_size != 0, {batch_size = }, {micro_batch_size = }"
    for idx in range(0, batch_size, micro_batch_size):
        x_micro_batch = x_batch[idx : idx + micro_batch_size]
        y_micro_batch = y_batch[idx : idx + micro_batch_size]
        yield x_micro_batch, y_micro_batch


def process_microbatches(model, x_batch, y_batch, micro_batch_size, autocast):
    batch_size = len(x_batch)
    grad_accum_step = batch_size // micro_batch_size
    loss_accum = 0
    for x, y in make_microbatches(x_batch, y_batch, micro_batch_size):
        x = [x_.to(device) for x_ in x]
        # [{k: v.to(device) for k, v in t.items()} for t in targets]
        y = [{k: v.to(device) for k, v in t.items()} for t in y]

        with autocast:
            loss_dict = model(x, y)
            loss = sum(l for l in loss_dict.values())
            loss /= grad_accum_step
        # loss.backward()
        loss_accum += loss.item()
    return loss_accum

Training Loop with Microbatching, mAP Evaluation, and Checkpointing
This section defines the training loop for a WeightedFasterRCNN model using microbatching, mixed-precision support, and periodic evaluation + checkpoint saving.

In [12]:
#  Training Loop + Checkpoints
EPOCHS = 5
micro_batch_size = 8
use_float16 = False
autocast = torch.autocast("cuda", torch.float16, enabled=use_float16)

SAVE_PATH = "./content/checkpoints"
os.makedirs(SAVE_PATH, exist_ok=True)
step = 0
for epoch in range(EPOCHS):
    model.train()
    total_loss = 0.0
    for images, targets in train_dl:
        optimizer.zero_grad()

        loss_accum = process_microbatches(
            model, images, targets, micro_batch_size, autocast
        )
        optimizer.step()
        # total_loss += loss.item()
        total_loss += loss_accum
        print(f"{step = } : {loss_accum = :.4f}")
        step += 1

    avg_loss = total_loss / len(train_dl)
    print(f"[Epoch {epoch+1}] 🔧 Avg Train Loss: {avg_loss:.4f}")

    # mAP evaluation
    mAP = evaluate(model, val_dl, device)
    print(f"[Epoch {epoch+1}] 📈 Validation mAP@0.5: {mAP:.4f}")

    # Save checkpoint
    torch.save(
        model.state_dict(), os.path.join(SAVE_PATH, f"fasterrcnn_epoch{epoch+1}.pth")
    )

step = 0 : loss_accum = 6584471.5396
step = 1 : loss_accum = 4458453.7973
step = 2 : loss_accum = 10326817.8061
step = 3 : loss_accum = 2498142.3282
step = 4 : loss_accum = 847703.3582
step = 5 : loss_accum = 5839784.9164
step = 6 : loss_accum = 3598979.6368
step = 7 : loss_accum = 6223663.0996
step = 8 : loss_accum = 3499921.3249
step = 9 : loss_accum = 2469233.9170
step = 10 : loss_accum = 2428314.3585
step = 11 : loss_accum = 7135705.3408
step = 12 : loss_accum = 503034919.0657
step = 13 : loss_accum = 8630432.8462
step = 14 : loss_accum = 5695378.5226
step = 15 : loss_accum = 7354514.4047
step = 16 : loss_accum = 1878560.8758
step = 17 : loss_accum = 2605182.8343
step = 18 : loss_accum = 462440946.1577
[Epoch 1] 🔧 Avg Train Loss: 55134269.7963


Evaluating: 100%|████████████████████████████████████████████████████| 5000/5000 [09:34<00:00,  8.70it/s]


[Epoch 1] 📈 Validation mAP@0.5: 0.0055
step = 19 : loss_accum = 7100458.8388
step = 20 : loss_accum = 20736922.2513
step = 21 : loss_accum = 20583103.8011
step = 22 : loss_accum = 9266637.1589
step = 23 : loss_accum = 2781443.0003
step = 24 : loss_accum = 2068906.3684
step = 25 : loss_accum = 4030338.4642
step = 26 : loss_accum = 2396036.1604
step = 27 : loss_accum = 11079258.2587
step = 28 : loss_accum = 10723873.8278
step = 29 : loss_accum = 3318067.6622
step = 30 : loss_accum = 18032337.3308
step = 31 : loss_accum = 3814144.2119
step = 32 : loss_accum = 8070504.5404
step = 33 : loss_accum = 7006673.7757
step = 34 : loss_accum = 9167528.6405
step = 35 : loss_accum = 5917772.3482
step = 36 : loss_accum = 8348695.7622
step = 37 : loss_accum = 2355892.6817
[Epoch 2] 🔧 Avg Train Loss: 8252557.6360


Evaluating: 100%|████████████████████████████████████████████████████| 5000/5000 [09:37<00:00,  8.66it/s]


[Epoch 2] 📈 Validation mAP@0.5: 0.0055
step = 38 : loss_accum = 16004068.0690
step = 39 : loss_accum = 2715678.5806
step = 40 : loss_accum = 3417676.2224
step = 41 : loss_accum = 2447831.3285
step = 42 : loss_accum = 4724502.9034
step = 43 : loss_accum = 3946771.2948
step = 44 : loss_accum = 5100091.2750
step = 45 : loss_accum = 17370000.5526
step = 46 : loss_accum = 3454095.6666
step = 47 : loss_accum = 3830411.2036
step = 48 : loss_accum = 4918444.9487
step = 49 : loss_accum = 6054920.4000
step = 50 : loss_accum = 2301932.4552
step = 51 : loss_accum = 2282747.7560
step = 52 : loss_accum = 28408975.0327
step = 53 : loss_accum = 7909977.1039
step = 54 : loss_accum = 11120736.3941
step = 55 : loss_accum = 2141946.1736
step = 56 : loss_accum = 345790924.7464
[Epoch 3] 🔧 Avg Train Loss: 24944301.6898


Evaluating: 100%|████████████████████████████████████████████████████| 5000/5000 [09:33<00:00,  8.72it/s]


[Epoch 3] 📈 Validation mAP@0.5: 0.0055
step = 57 : loss_accum = 4348856.0245
step = 58 : loss_accum = 1768828.1896
step = 59 : loss_accum = 11977422.8573
step = 60 : loss_accum = 1944127.8754
step = 61 : loss_accum = 1877966.0294
step = 62 : loss_accum = 8943424.2892
step = 63 : loss_accum = 2878496.5831
step = 64 : loss_accum = 8646509.2423
step = 65 : loss_accum = 4835932.8317
step = 66 : loss_accum = 74346228.5115
step = 67 : loss_accum = 6131043.1694
step = 68 : loss_accum = 4610674.9167
step = 69 : loss_accum = 11333281.8082
step = 70 : loss_accum = 142327072.7335
step = 71 : loss_accum = 4572388.5061
step = 72 : loss_accum = 2376486.9674
step = 73 : loss_accum = 1842075.3290
step = 74 : loss_accum = 2268675.6728
step = 75 : loss_accum = 13185004.2200
[Epoch 4] 🔧 Avg Train Loss: 16327078.7241


Evaluating: 100%|████████████████████████████████████████████████████| 5000/5000 [09:36<00:00,  8.68it/s]


[Epoch 4] 📈 Validation mAP@0.5: 0.0055
step = 76 : loss_accum = 14802608.2071
step = 77 : loss_accum = 2733709.8220
step = 78 : loss_accum = 2178649.2544
step = 79 : loss_accum = 8883601.7321
step = 80 : loss_accum = 1749944.7011
step = 81 : loss_accum = 29959706.6156
step = 82 : loss_accum = 3504663.9195
step = 83 : loss_accum = 2929445.5255
step = 84 : loss_accum = 6860004.8596
step = 85 : loss_accum = 4549882.4912
step = 86 : loss_accum = 17518728.2002
step = 87 : loss_accum = 14057708.3387
step = 88 : loss_accum = 6464720.5658
step = 89 : loss_accum = 9989210.2901
step = 90 : loss_accum = 4272660.8139
step = 91 : loss_accum = 1627533.8971
step = 92 : loss_accum = 8604618.3347
step = 93 : loss_accum = 4265390.4308
step = 94 : loss_accum = 1976839.2299
[Epoch 5] 🔧 Avg Train Loss: 7733138.2752


Evaluating: 100%|████████████████████████████████████████████████████| 5000/5000 [09:30<00:00,  8.76it/s]


[Epoch 5] 📈 Validation mAP@0.5: 0.0055
