Step 1： Install & Imports

In [2]:
# Install required packages if needed (uncomment if not installed)
# !pip install torchvision
# !pip install matplotlib

import os
import numpy as np
import torch
import torchvision
from PIL import Image
from torchvision.models.detection import fasterrcnn_resnet50_fpn
from torchvision.transforms import functional as F
from torch.utils.data import Dataset, DataLoader
import matplotlib.pyplot as plt
import json
from tqdm import tqdm


Step 2：Prepare Dataset

In [3]:
class COCODetectionDataset(Dataset):
    def __init__(self, img_dir, ann_file, transforms=None):
        self.img_dir = img_dir
        self.transforms = transforms
        with open(ann_file, 'r') as f:
            coco_json = json.load(f)
        self.images = coco_json['images']
        self.annotations = coco_json['annotations']
        self.categories = coco_json['categories']

        # Map from image_id to annotations
        self.img_id_to_anns = {}
        for ann in self.annotations:
            self.img_id_to_anns.setdefault(ann['image_id'], []).append(ann)

        self.img_id_to_filename = {img['id']: img['file_name'] for img in self.images}
        self.img_id_to_size = {img['id']: (img['width'], img['height']) for img in self.images}
        self.cat_id_to_idx = {cat['id']: idx for idx, cat in enumerate(self.categories)}

    def __len__(self):
        return len(self.images)

    def __getitem__(self, idx):
        img_info = self.images[idx]
        img_id = img_info['id']
        file_name = img_info['file_name']
        img_path = os.path.join(self.img_dir, file_name)

        image = Image.open(img_path).convert("RGB")
        width, height = self.img_id_to_size[img_id]

        boxes = []
        labels = []
        for ann in self.img_id_to_anns.get(img_id, []):
            x, y, w, h = ann['bbox']
            boxes.append([x, y, x + w, y + h])
            labels.append(self.cat_id_to_idx[ann['category_id']] + 1)

        boxes = torch.as_tensor(boxes, dtype=torch.float32)
        labels = torch.as_tensor(labels, dtype=torch.int64)
        target = {
            "boxes": boxes,
            "labels": labels,
            "image_id": torch.tensor([img_id])
        }

        if self.transforms:
            image = self.transforms(image)

        return image, target


Directory setup:

In [4]:
train_dir = r"C:/Users/admin/Downloads/Code/ObjectDetection/coco_subset/train2017"
val_dir = r"C:/Users/admin/Downloads/Code/ObjectDetection/coco_subset/val2017"
train_ann = r"C:/Users/admin/Downloads/Code/ObjectDetection/coco_subset/annotations/instances_train2017.json"
val_ann = r"C:/Users/admin/Downloads/Code/ObjectDetection/coco_subset/annotations/instances_val2017.json"


Step 3: DataLoader

In [5]:
def collate_fn(batch):
    return tuple(zip(*batch))

train_dataset = COCODetectionDataset(train_dir, train_ann, transforms=F.to_tensor)
val_dataset = COCODetectionDataset(val_dir, val_ann, transforms=F.to_tensor)

train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=2, shuffle=False, collate_fn=collate_fn)


Step 4: Initialize Faster R-CNN Model

In [6]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Load pretrained Faster R-CNN
model = fasterrcnn_resnet50_fpn(pretrained=True)
num_classes = len(train_dataset.cat_id_to_idx) + 1  # include background class
in_features = model.roi_heads.box_predictor.cls_score.in_features
model.roi_heads.box_predictor = torchvision.models.detection.faster_rcnn.FastRCNNPredictor(in_features, num_classes)

model.to(device)




FasterRCNN(
  (transform): GeneralizedRCNNTransform(
      Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
      Resize(min_size=(800,), max_size=1333, mode='bilinear')
  )
  (backbone): BackboneWithFPN(
    (body): IntermediateLayerGetter(
      (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
      (bn1): FrozenBatchNorm2d(64, eps=0.0)
      (relu): ReLU(inplace=True)
      (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
      (layer1): Sequential(
        (0): Bottleneck(
          (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn1): FrozenBatchNorm2d(64, eps=0.0)
          (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn2): FrozenBatchNorm2d(64, eps=0.0)
          (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn3): FrozenBatchNorm2d(256, eps=0.0)
          (relu): ReLU(

Step 5: Training Loop (5 epochs)

In [7]:
import torch.optim as optim

# Optimizer
params = [p for p in model.parameters() if p.requires_grad]
optimizer = optim.SGD(params, lr=0.005, momentum=0.9, weight_decay=0.0005)

# Learning rate scheduler
lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.1)

# Training
model.train()
num_epochs = 5

for epoch in range(num_epochs):
    print(f"Epoch [{epoch+1}/{num_epochs}]")

    epoch_loss = 0.0
    for images, targets in tqdm(train_loader):
        images = list(image.to(device) for image in images)
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

        loss_dict = model(images, targets)
        losses = sum(loss for loss in loss_dict.values())

        optimizer.zero_grad()
        losses.backward()
        optimizer.step()

        epoch_loss += losses.item()

    lr_scheduler.step()
    print(f"Epoch {epoch+1} Loss: {epoch_loss:.4f}")


Epoch [1/5]


100%|██████████| 2500/2500 [11:47<00:00,  3.53it/s]


Epoch 1 Loss: 1154.8393
Epoch [2/5]


100%|██████████| 2500/2500 [10:41<00:00,  3.90it/s]


Epoch 2 Loss: 1030.9990
Epoch [3/5]


100%|██████████| 2500/2500 [07:49<00:00,  5.32it/s]


Epoch 3 Loss: 951.8513
Epoch [4/5]


100%|██████████| 2500/2500 [08:07<00:00,  5.12it/s]


Epoch 4 Loss: 732.8110
Epoch [5/5]


100%|██████████| 2500/2500 [11:44<00:00,  3.55it/s]

Epoch 5 Loss: 667.2724





Step 6: Save Model

In [8]:
output_dir = r"C:/Users/admin/Downloads/Code/ObjectDetection/fasterOutput"
os.makedirs(output_dir, exist_ok=True)

model_path = os.path.join(output_dir, "fasterrcnn_model.pth")
torch.save(model.state_dict(), model_path)
print(f"Model saved to {model_path}")


Model saved to C:/Users/admin/Downloads/Code/ObjectDetection/fasterOutput\fasterrcnn_model.pth


Step 7: Video Inference and Visualization with Faster R-CNN

In [9]:
import cv2
from torchvision.transforms import functional as F
from collections import Counter

# Reload the trained model
model.eval()
model.load_state_dict(torch.load(model_path))
model.to(device)

# Video directory and output
video_dir = r"C:/Users/admin/Downloads/Code/ObjectDetection/videos"
video_files = [f for f in os.listdir(video_dir) if f.lower().endswith(".mp4")]

# Category mapping
category_names = [cat['name'] for cat in train_dataset.categories]
idx_to_name = {idx + 1: name for idx, name in enumerate(category_names)}  # class 0 is background

# Colors
def get_color(cls_id):
    np.random.seed(cls_id)
    return tuple(np.random.randint(0, 255, size=3).tolist())

for video_file in video_files:
    video_path = os.path.join(video_dir, video_file)
    print(f"Processing: {video_path}")

    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        print(f"Cannot open video: {video_file}")
        continue

    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps = cap.get(cv2.CAP_PROP_FPS)

    output_path = os.path.join(output_dir, f"annotated_{video_file}")
    fourcc = cv2.VideoWriter_fourcc(*"mp4v")
    out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))

    all_class_ids = []

    while True:
        ret, frame = cap.read()
        if not ret:
            break

        img = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        img_tensor = F.to_tensor(img).to(device).unsqueeze(0)

        with torch.no_grad():
            outputs = model(img_tensor)[0]

        boxes = outputs["boxes"].cpu().numpy()
        scores = outputs["scores"].cpu().numpy()
        labels = outputs["labels"].cpu().numpy()

        for box, score, label in zip(boxes, scores, labels):
            if score < 0.25:
                continue
            x1, y1, x2, y2 = box.astype(int)
            cls_id = int(label)
            color = get_color(cls_id)
            name = idx_to_name.get(cls_id, "Unknown")
            cv2.rectangle(frame, (x1, y1), (x2, y2), color, 2)
            text = f"{name} {score:.2f}"
            cv2.putText(frame, text, (x1, y1 - 5), cv2.FONT_HERSHEY_SIMPLEX, 0.6, color, 2)
            all_class_ids.append(cls_id)

        out.write(frame)

    cap.release()
    out.release()
    print(f"Saved annotated video to {output_path}")

    # Summary
    summary = Counter(all_class_ids)
    print("Detection Summary:")
    for cls_id, count in summary.items():
        print(f"  - {idx_to_name.get(cls_id, 'Unknown')}: {count} times")


  model.load_state_dict(torch.load(model_path))


Processing: C:/Users/admin/Downloads/Code/ObjectDetection/videos\videoplayback.mp4
Saved annotated video to C:/Users/admin/Downloads/Code/ObjectDetection/fasterOutput\annotated_videoplayback.mp4
Detection Summary:
  - car: 5974 times
  - traffic light: 5582 times
  - person: 1118 times
  - stop sign: 398 times
  - truck: 65 times
  - bicycle: 2 times
  - bus: 87 times


Step 8: Validation Set Evaluation Code

In [None]:
import os
import json
import torch
import torchvision
import numpy as np
from PIL import Image
from collections import defaultdict
from torchvision import transforms
from pycocotools.coco import COCO
from torchvision.ops import box_iou
from tqdm import tqdm

# ==== 1. Load model ====
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = torchvision.models.detection.fasterrcnn_resnet50_fpn(num_classes=9)  # 8 + background
model.load_state_dict(torch.load("C:/Users/admin/Downloads/Code/ObjectDetection/fasterOutput/fasterrcnn_model.pth"))
model.to(device)
model.eval()

# ==== 2. Load COCO val annotations ====
ann_path = "C:/Users/admin/Downloads/Code/ObjectDetection/coco_subset/annotations/instances_val2017.json"
img_dir = "C:/Users/admin/Downloads/Code/ObjectDetection/coco_subset/val2017"
coco = COCO(ann_path)
cat_id_to_name = {cat['id']: cat['name'] for cat in coco.loadCats(coco.getCatIds())}
cat_name_to_id = {v: k for k, v in cat_id_to_name.items()}
cat_ids = sorted(cat_id_to_name.keys())
cat_names = [cat_id_to_name[cid] for cid in cat_ids]

# ==== 3. Evaluation storage ====
TP = defaultdict(int)
FP = defaultdict(int)
FN = defaultdict(int)

transform = transforms.Compose([
    transforms.ToTensor()
])

# ==== 4. Loop through images ====
for img_id in tqdm(coco.getImgIds()):
    img_info = coco.loadImgs(img_id)[0]
    img_path = os.path.join(img_dir, img_info['file_name'])
    image = Image.open(img_path).convert("RGB")
    tensor_img = transform(image).to(device).unsqueeze(0)

    with torch.no_grad():
        outputs = model(tensor_img)[0]

    pred_boxes = outputs['boxes'].cpu()
    pred_labels = outputs['labels'].cpu()
    pred_scores = outputs['scores'].cpu()

    gt_ann = coco.loadAnns(coco.getAnnIds(imgIds=img_id))
    gt_boxes = torch.tensor([ann['bbox'] for ann in gt_ann], dtype=torch.float32)
    gt_boxes[:, 2:] += gt_boxes[:, :2]  # convert [x, y, w, h] -> [x1, y1, x2, y2]
    gt_labels = torch.tensor([ann['category_id'] for ann in gt_ann])

    for cat_id in cat_ids:
        pred_mask = pred_labels == cat_id
        gt_mask = gt_labels == cat_id

        preds = pred_boxes[pred_mask]
        gts = gt_boxes[gt_mask]

        matched_gt = set()

        for pb in preds:
            if len(gts) == 0:
                FP[cat_id] += 1
                continue
            ious = box_iou(pb.unsqueeze(0), gts)[0]
            max_iou, idx = ious.max(0)
            if max_iou > 0.5 and idx.item() not in matched_gt:
                TP[cat_id] += 1
                matched_gt.add(idx.item())
            else:
                FP[cat_id] += 1

        FN[cat_id] += len(gts) - len(matched_gt)

# ==== 5. Compute Precision / Recall / F1 ====
results = []
for cid in cat_ids:
    tp = TP[cid]
    fp = FP[cid]
    fn = FN[cid]
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0
    results.append((cat_id_to_name[cid], precision, recall, f1))

# ==== 6. Display Results ====
import pandas as pd
df = pd.DataFrame(results, columns=["Class", "Precision", "Recall", "F1 Score"])
df.sort_values("F1 Score", ascending=False, inplace=True)

from IPython.display import display
display(df)

# Optional: save for plotting
df.to_csv("C:/Users/admin/Downloads/Code/ObjectDetection/fasterOutput/faster_f1_scores.csv", index=False)
