Copyright (C) 2025 Advanced Micro Devices, Inc. All rights reserved.
Portions of this notebook consist of AI-generated content.

Permission is hereby granted, free of charge, to any person obtaining a copy

of this software and associated documentation files (the "Software"), to deal

in the Software without restriction, including without limitation the rights

to use, copy, modify, merge, publish, distribute, sublicense, and/or sell

copies of the Software, and to permit persons to whom the Software is

furnished to do so, subject to the following conditions:



The above copyright notice and this permission notice shall be included in all

copies or substantial portions of the Software.



THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR

IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,

FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE

AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER

LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,

OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE

SOFTWARE.

# CV05-1 Mask R-CNN

### Lab Description
This lab introduces **Mask R-CNN**, a deep learning model for **instance segmentation**.  
Unlike image classification or object detection, instance segmentation predicts **both bounding boxes and pixel-wise masks** for each object.

You will:
- Fine-tune a **Mask R-CNN (ResNet-50 FPN backbone)** on a small COCO dataset subset.
- Monitor **per-epoch training time, batch size, and peak VRAM usage**.
- Evaluate the trained model and **visualize masks & bounding boxes**.

### What you can expect to learn
- Understand the difference between **object detection** and **instance segmentation**.  
- Use **torchvision’s Mask R-CNN** for fine-tuning.  
- Record and analyze **training performance metrics**.  
- Visualize predicted **object masks and bounding boxes**.  


In [8]:
# ⚙️ Setup & Config
# Install the packages imported by this notebook (shell commands run through the notebook's "!" escape).
# --quiet suppresses most of pip's progress output.
!pip install torch torchvision --quiet
!pip install pycocotools matplotlib pandas --quiet

import os
import random
import time
from dataclasses import dataclass

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.optim as optim
import torchvision
import torchvision.transforms as T
from torch.utils.data import DataLoader
from torchvision.models.detection import maskrcnn_resnet50_fpn

In [9]:
# Config
@dataclass
class CFG:
    """Hyperparameters and runtime settings shared by the notebook cells."""

    # Value passed to set_seed() to seed the random number generators.
    seed: int = 42
    # Number of images per batch produced by the training DataLoader.
    batch_size: int = 2
    # Number of passes over the training set.
    epochs: int = 3
    # Learning rate handed to the optimizer.
    lr: float = 1e-4
    # Chosen once, when the class body is evaluated: CUDA if present, otherwise CPU.
    device: str = "cpu" if not torch.cuda.is_available() else "cuda"


# Single shared configuration instance read by the cells below.
cfg = CFG()


# Seed
def set_seed(seed=42):
    """Seed Python's ``random``, NumPy and PyTorch with the same value.

    The CUDA generators are seeded too, but only when a CUDA device is available.
    """
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)


# Seed all RNGs with the configured value before any data loading or model creation.
set_seed(cfg.seed)

# Output dir
# expanduser() only rewrites a leading "~", so this relative path is used as written.
# exist_ok=True lets the cell be re-run without failing on an existing directory.
OUTPUT_DIR = os.path.expanduser("./output_maskrcnn")
os.makedirs(OUTPUT_DIR, exist_ok=True)

## 2. Dataset: COCO128 (mini COCO)
We use `COCO128`, a lightweight subset of COCO2017 with bounding boxes and segmentation masks.  
This allows us to train Mask R-CNN quickly inside Colab.  


In [14]:
# Remove any previous copy first; presumably this keeps unzip from prompting about
# overwriting existing files when the cell is re-run.
!rm -rf ./data/coco128
!rm -f coco128.zip
# -L follows HTTP redirects; -o writes the download to coco128.zip.
!curl -L "https://github.com/ultralytics/yolov5/releases/download/v1.0/coco128.zip" -o coco128.zip
# -q: quiet extraction; -d: extract into ./data.
!unzip -q coco128.zip -d ./data

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100 6819k  100 6819k    0     0  9565k      0 --:--:-- --:--:-- --:--:-- 9565k


In [12]:
import os

import torch
from pycocotools import mask as coco_mask

# ---- Transform ----
# The pipeline contains a single step, ToTensor; no augmentation is applied.
train_tfms = T.Compose([T.ToTensor()])


# ---- Custom dataset for Mask R-CNN (COCO format -> tensors with masks) ----
class COCOMask(torchvision.datasets.CocoDetection):
    """COCO detection dataset that yields ``(image, target)`` pairs for Mask R-CNN.

    ``target`` is a dict with the keys ``boxes`` (float32, ``[N, 4]`` in xyxy
    order), ``labels`` (int64 COCO ``category_id``), ``masks`` (uint8,
    ``[N, H, W]``), ``image_id``, ``area`` and ``iscrowd``. Annotations whose
    box has non-positive width or height are dropped.
    """

    def __init__(self, img_folder, annFile, transform=None):
        """Open the dataset.

        Args:
            img_folder: directory containing the images.
            annFile: path to the COCO-format annotation JSON.
            transform: optional callable applied to the image only; when
                omitted the image is converted with ``T.ToTensor()``.
        """
        super().__init__(img_folder, annFile)
        self.transform = transform

    @staticmethod
    def _decode_segmentation(seg, h, w):
        """Return a 2-D uint8 mask for ``seg``, or ``None`` if it cannot be decoded.

        Handles polygon lists, compressed RLE dicts and uncompressed RLE dicts
        (``counts`` given as a list of ints, as used by COCO crowd annotations).
        """
        if not seg:
            return None
        if isinstance(seg, list):
            # polygon(s) -> one RLE per polygon -> single merged RLE
            rle = coco_mask.merge(coco_mask.frPyObjects(seg, h, w))
        elif isinstance(seg, dict):
            # decode() only understands compressed RLE; an uncompressed RLE
            # (list-valued "counts") has to be converted first.
            if isinstance(seg.get("counts"), list):
                rle = coco_mask.frPyObjects(seg, h, w)
            else:
                rle = seg
        else:
            return None
        m = coco_mask.decode(rle)
        if m.ndim == 3:  # several RLEs decoded at once -> union over the last axis
            m = np.any(m, axis=2).astype(np.uint8)
        return m

    def __getitem__(self, idx):
        """Return the ``idx``-th image and its Mask R-CNN target dict."""
        img, anns = super().__getitem__(idx)  # img: PIL.Image, anns: list of dicts
        img = img.convert("RGB")
        w, h = img.size

        boxes, labels, masks, iscrowd, areas = [], [], [], [], []

        for obj in anns:
            # 1) boxes (xywh -> xyxy); skip degenerate boxes entirely
            x, y, bw, bh = obj["bbox"]
            if bw <= 0 or bh <= 0:
                continue
            x1, y1, x2, y2 = x, y, x + bw, y + bh
            boxes.append([x1, y1, x2, y2])

            # 2) label (use COCO category_id; our head = 91 classes)
            labels.append(int(obj["category_id"]))

            # 3) mask (polygon list, compressed RLE or uncompressed RLE)
            m = self._decode_segmentation(obj.get("segmentation", None), h, w)
            if m is None:
                # fall back to bbox mask if missing seg (rare in coco128)
                m = np.zeros((h, w), dtype=np.uint8)
                m[int(y1) : int(y2), int(x1) : int(x2)] = 1
            masks.append(torch.as_tensor(m, dtype=torch.uint8))

            # 4) area / iscrowd (area defaults to the box area when absent)
            areas.append(float(obj.get("area", bw * bh)))
            iscrowd.append(int(obj.get("iscrowd", 0)))

        if len(boxes) == 0:
            # No usable annotation: emit correctly shaped empty tensors.
            boxes = torch.zeros((0, 4), dtype=torch.float32)
            labels = torch.zeros((0,), dtype=torch.int64)
            masks = torch.zeros((0, h, w), dtype=torch.uint8)
            iscrowd = torch.zeros((0,), dtype=torch.int64)
            areas = torch.zeros((0,), dtype=torch.float32)
        else:
            boxes = torch.tensor(boxes, dtype=torch.float32)
            labels = torch.tensor(labels, dtype=torch.int64)
            masks = torch.stack(masks, dim=0)  # [N, H, W]
            iscrowd = torch.tensor(iscrowd, dtype=torch.int64)
            areas = torch.tensor(areas, dtype=torch.float32)

        target = {
            "boxes": boxes,
            "labels": labels,
            "masks": masks,
            # Dataset index, not the COCO image id from the annotation file.
            "image_id": torch.tensor([idx]),
            "area": areas,
            "iscrowd": iscrowd,
        }

        if self.transform is not None:
            img = self.transform(img)
        else:
            img = T.ToTensor()(img)

        return img, target


# ---- Build dataset & loader ----
# Both paths live under ./data/coco128.
# NOTE(review): confirm the downloaded archive actually contains
# annotations/instances_train2017.json — a COCO-format JSON is required at this path.
train_dataset = COCOMask(
    img_folder="./data/coco128/images/train2017",
    annFile="./data/coco128/annotations/instances_train2017.json",
    transform=train_tfms,
)


# Important: custom collate_fn to keep lists (Mask R-CNN expects list[Tensor]/list[Dict])
def collate_fn(batch):
    """Turn a batch of ``(image, target)`` pairs into ``(list of images, list of targets)``.

    Nothing is stacked, so the items in a batch may differ in size.
    """
    imgs, targets = zip(*batch)
    return [*imgs], [*targets]


# Training loader: reshuffled each epoch, batches assembled by collate_fn above.
train_loader = DataLoader(
    train_dataset,
    batch_size=cfg.batch_size,
    shuffle=True,
    num_workers=2,
    # Pinned host memory is only requested when a CUDA device is present.
    pin_memory=torch.cuda.is_available(),
    collate_fn=collate_fn,
)

# Cell output: number of samples in the dataset.
len(train_dataset)

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100 6819k  100 6819k    0     0   9.8M      0 --:--:-- --:--:-- --:--:--  9.8M
replace ./data/coco128/LICENSE? [y]es, [n]o, [A]ll, [N]one, [r]ename: all
error:  invalid response [all]
replace ./data/coco128/LICENSE? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
replace ./data/coco128/images/train2017/000000000612.jpg? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
replace ./data/coco128/images/train2017/000000000404.jpg? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
replace ./data/coco128/images/train2017/000000000438.jpg? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
replace ./data/coco128/images/train2017/000000000389.jpg? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
replace ./data/coco128/images/train2017/000000000564.jpg? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
replace ./data/co

KeyboardInterrupt: 

## 3. Model
We use **Mask R-CNN with ResNet-50 FPN backbone**, pretrained on COCO.  
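We replace the **box predictor** (classification and box-regression head) and the **mask predictor** with freshly initialised heads that have 91 outputs: background plus COCO category IDs 1–90, which the dataset uses directly as labels.  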


In [None]:
# Model
# weights="DEFAULT" asks torchvision for its default pretrained weights for this architecture.
model = maskrcnn_resnet50_fpn(weights="DEFAULT")

# Replace heads for fine-tuning
# Both predictors are rebuilt with 91 outputs, matching the raw COCO category_id labels
# produced by the dataset; the new layers start from fresh initialisation.
in_features = model.roi_heads.box_predictor.cls_score.in_features
model.roi_heads.box_predictor = torchvision.models.detection.faster_rcnn.FastRCNNPredictor(in_features, 91)
in_features_mask = model.roi_heads.mask_predictor.conv5_mask.in_channels
# 256 is the hidden channel width of the new mask predictor.
model.roi_heads.mask_predictor = torchvision.models.detection.mask_rcnn.MaskRCNNPredictor(in_features_mask, 256, 91)

model = model.to(cfg.device)

# Optimizer
# All model parameters are optimised (backbone included), not only the new heads.
optimizer = optim.Adam(model.parameters(), lr=cfg.lr)

## 4. Training
We train for a few epochs and log:
- **Loss**
- **Time per epoch**
- **Peak VRAM usage**
- **Batch size**


In [None]:
def get_vram_mb():
    """Return the peak CUDA memory allocated on ``cfg.device`` in MiB, or 0 without CUDA."""
    if not torch.cuda.is_available():
        return 0
    peak_bytes = torch.cuda.max_memory_allocated(cfg.device)
    return peak_bytes / (1024**2)


# Per-epoch log: one value is appended to every list at the end of each epoch.
history = {"epoch": [], "loss": [], "time": [], "vram_MB": [], "batch_size": []}

for epoch in range(cfg.epochs):
    t0 = time.time()
    model.train()
    # Reset the peak-memory counter so the value logged below covers this epoch only.
    # Guarded because cfg.device falls back to "cpu", where this CUDA call would fail.
    if torch.cuda.is_available():
        torch.cuda.reset_peak_memory_stats()

    total_loss = 0.0
    for imgs, targets in train_loader:
        imgs = [img.to(cfg.device) for img in imgs]
        targets = [{k: v.to(cfg.device) for k, v in t.items()} for t in targets]

        # Called with targets, the model returns a dict of loss terms; optimise their sum.
        loss_dict = model(imgs, targets)
        loss = sum(loss_dict.values())
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    t1 = time.time()
    epoch_time = t1 - t0
    vram = get_vram_mb()
    # Mean of the per-batch losses for this epoch.
    avg_loss = total_loss / len(train_loader)

    history["epoch"].append(epoch + 1)
    history["loss"].append(avg_loss)
    history["time"].append(epoch_time)
    history["vram_MB"].append(vram)
    history["batch_size"].append(cfg.batch_size)

    print(
        f"Epoch {epoch + 1}/{cfg.epochs} | Loss {avg_loss:.4f} | "
        f"Time {epoch_time:.1f}s | VRAM {vram:.1f}MB"
    )

# Persist the log next to the other outputs.
df = pd.DataFrame(history)
df.to_csv(os.path.join(OUTPUT_DIR, "maskrcnn_log.csv"), index=False)

## 5. Evaluation & Visualization
We run inference on one batch drawn from the training loader (this lab has no held-out split) and overlay:
- **Predicted masks**
- **Bounding boxes**


In [None]:
# Switch to evaluation mode; called without targets below, the model returns predictions.
model.eval()
# Reuse one batch from the training loader as test input; its targets are discarded.
test_imgs, _ = next(iter(train_loader))
test_imgs = [img.to(cfg.device) for img in test_imgs]

# No gradients are needed for inference.
with torch.no_grad():
    outputs = model(test_imgs)


# Visualization
def show_masks(img, masks, boxes, scores, threshold=0.5, mask_threshold=0.5):
    """Plot ``img`` with the masks and boxes of every detection scoring at least ``threshold``.

    Args:
        img: image tensor, channels first (permuted to H x W x C for plotting).
        masks: per-detection masks; element 0 of each entry is used as the 2-D mask.
        boxes: per-detection ``(x1, y1, x2, y2)`` boxes.
        scores: per-detection confidence scores.
        threshold: detections scoring below this value are skipped.
        mask_threshold: mask values at or below this are treated as background
            and left transparent.
    """
    img = img.permute(1, 2, 0).cpu().numpy()
    plt.figure(figsize=(8, 8))
    plt.imshow(img)
    ax = plt.gca()
    for m, box, score in zip(masks, boxes, scores):
        if score < threshold:
            continue
        mask = m[0].cpu().numpy()
        # Drawing the raw mask would tint the whole image once per detection.
        # Masked (background) pixels are rendered transparent, so only the object is coloured.
        overlay = np.ma.masked_where(mask <= mask_threshold, np.ones_like(mask))
        plt.imshow(overlay, alpha=0.4, vmin=0, vmax=1)
        x1, y1, x2, y2 = box.cpu().numpy()
        ax.add_patch(plt.Rectangle((x1, y1), x2 - x1, y2 - y1, fill=False, color="red", linewidth=2))
    plt.axis("off")
    plt.show()


# Show the first two images of the batch (fewer if the batch is smaller) with their predictions.
for img, out in zip(test_imgs[:2], outputs[:2]):
    show_masks(img, out["masks"], out["boxes"], out["scores"])