Step 1： Install & Imports

In [2]:
# Install required packages if needed (uncomment if not installed)
# !pip install torchvision
# !pip install matplotlib

import os
import numpy as np
import torch
import torchvision
from PIL import Image
from torchvision.models.detection import fasterrcnn_resnet50_fpn
from torchvision.transforms import functional as F
from torch.utils.data import Dataset, DataLoader
import matplotlib.pyplot as plt
import json
from tqdm import tqdm


Step 2：Prepare Dataset

In [3]:
class COCODetectionDataset(Dataset):
    def __init__(self, img_dir, ann_file, transforms=None):
        self.img_dir = img_dir
        self.transforms = transforms
        with open(ann_file, 'r') as f:
            coco_json = json.load(f)
        self.images = coco_json['images']
        self.annotations = coco_json['annotations']
        self.categories = coco_json['categories']

        # Map from image_id to annotations
        self.img_id_to_anns = {}
        for ann in self.annotations:
            self.img_id_to_anns.setdefault(ann['image_id'], []).append(ann)

        self.img_id_to_filename = {img['id']: img['file_name'] for img in self.images}
        self.img_id_to_size = {img['id']: (img['width'], img['height']) for img in self.images}
        self.cat_id_to_idx = {cat['id']: idx for idx, cat in enumerate(self.categories)}

    def __len__(self):
        return len(self.images)

    def __getitem__(self, idx):
        img_info = self.images[idx]
        img_id = img_info['id']
        file_name = img_info['file_name']
        img_path = os.path.join(self.img_dir, file_name)

        image = Image.open(img_path).convert("RGB")
        width, height = self.img_id_to_size[img_id]

        boxes = []
        labels = []
        for ann in self.img_id_to_anns.get(img_id, []):
            x, y, w, h = ann['bbox']
            boxes.append([x, y, x + w, y + h])
            labels.append(self.cat_id_to_idx[ann['category_id']] + 1)

        boxes = torch.as_tensor(boxes, dtype=torch.float32)
        labels = torch.as_tensor(labels, dtype=torch.int64)
        target = {
            "boxes": boxes,
            "labels": labels,
            "image_id": torch.tensor([img_id])
        }

        if self.transforms:
            image = self.transforms(image)

        return image, target


Directory setup:

In [4]:
train_dir = r"C:/Users/admin/Downloads/Code/ObjectDetection/coco_subset/train2017"
val_dir = r"C:/Users/admin/Downloads/Code/ObjectDetection/coco_subset/val2017"
train_ann = r"C:/Users/admin/Downloads/Code/ObjectDetection/coco_subset/annotations/instances_train2017.json"
val_ann = r"C:/Users/admin/Downloads/Code/ObjectDetection/coco_subset/annotations/instances_val2017.json"


Step 3: DataLoader

In [5]:
def collate_fn(batch):
    return tuple(zip(*batch))

train_dataset = COCODetectionDataset(train_dir, train_ann, transforms=F.to_tensor)
val_dataset = COCODetectionDataset(val_dir, val_ann, transforms=F.to_tensor)

train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=2, shuffle=False, collate_fn=collate_fn)


Step 4: Initialize Faster R-CNN Model

In [6]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Load pretrained Faster R-CNN
model = fasterrcnn_resnet50_fpn(pretrained=True)
num_classes = len(train_dataset.cat_id_to_idx) + 1  # include background class
in_features = model.roi_heads.box_predictor.cls_score.in_features
model.roi_heads.box_predictor = torchvision.models.detection.faster_rcnn.FastRCNNPredictor(in_features, num_classes)

model.to(device)




FasterRCNN(
  (transform): GeneralizedRCNNTransform(
      Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
      Resize(min_size=(800,), max_size=1333, mode='bilinear')
  )
  (backbone): BackboneWithFPN(
    (body): IntermediateLayerGetter(
      (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
      (bn1): FrozenBatchNorm2d(64, eps=0.0)
      (relu): ReLU(inplace=True)
      (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
      (layer1): Sequential(
        (0): Bottleneck(
          (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn1): FrozenBatchNorm2d(64, eps=0.0)
          (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn2): FrozenBatchNorm2d(64, eps=0.0)
          (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn3): FrozenBatchNorm2d(256, eps=0.0)
          (relu): ReLU(

Step 5: Training Loop (5 epochs)

In [None]:
import torch.optim as optim

# Optimizer
params = [p for p in model.parameters() if p.requires_grad]
optimizer = optim.SGD(params, lr=0.005, momentum=0.9, weight_decay=0.0005)

# Learning rate scheduler
lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.1)

# Training
model.train()
num_epochs = 5

for epoch in range(num_epochs):
    print(f"Epoch [{epoch+1}/{num_epochs}]")

    epoch_loss = 0.0
    for images, targets in tqdm(train_loader):
        images = list(image.to(device) for image in images)
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

        loss_dict = model(images, targets)
        losses = sum(loss for loss in loss_dict.values())

        optimizer.zero_grad()
        losses.backward()
        optimizer.step()

        epoch_loss += losses.item()

    lr_scheduler.step()
    print(f"Epoch {epoch+1} Loss: {epoch_loss:.4f}")


Epoch [1/5]


  2%|▏         | 61/2500 [00:17<09:39,  4.21it/s]

Step 6: Save Model

In [None]:
output_dir = r"C:/Users/admin/Downloads/Code/ObjectDetection/fasterOutput"
os.makedirs(output_dir, exist_ok=True)

model_path = os.path.join(output_dir, "fasterrcnn_model.pth")
torch.save(model.state_dict(), model_path)
print(f"Model saved to {model_path}")
