<a href="https://colab.research.google.com/github/KeerthanaRachuri12/Mask-R-CNN-for-Object-Detection-and-Instance-Segmentation/blob/main/Image_Segmentation_using_mask_RCNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Data Preparation and Input Modalities

In [None]:
!pip install kaggle

# Make directory for Kaggle configuration
!mkdir -p ~/.kaggle

# Upload kaggle.json manually via Colab file upload and move it to the required directory
from google.colab import files
files.upload()  # Upload your kaggle.json here

!mv kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

# Download the NYU Depth V2 dataset
!kaggle datasets download -d gopalbhattrai/pascal-voc-2012-dataset





Saving kaggle.json to kaggle.json
Dataset URL: https://www.kaggle.com/datasets/gopalbhattrai/pascal-voc-2012-dataset
License(s): unknown
Downloading pascal-voc-2012-dataset.zip to /content
100% 3.51G/3.52G [00:37<00:00, 186MB/s]
100% 3.52G/3.52G [00:37<00:00, 99.9MB/s]


In [None]:
!unzip -q pascal-voc-2012-dataset.zip

voc dataset.py

In [None]:
import torch
from torch.utils.data import DataLoader
from torchvision.models.detection.mask_rcnn import MaskRCNNPredictor
from torchvision.transforms import functional as F
from pathlib import Path
from PIL import Image, ImageDraw
import xml.etree.ElementTree as ET
import numpy as np

# Dataset class for Pascal VOC downloaded and uploaded to Colab
class VOCDatasetFromFolder(torch.utils.data.Dataset):
    def __init__(self, root_folder, image_set_file, transforms=None):
        self.root = Path(root_folder)
        self.transforms = transforms

        # Read image IDs list from train/val text files
        with open(image_set_file) as f:
            self.image_ids = [line.strip() for line in f.readlines()]

        self.img_folder = self.root / "JPEGImages"
        self.anno_folder = self.root / "Annotations"
        self.class_names = ['__background__','aeroplane','bicycle','bird','boat','bottle',
                            'bus','car','cat','chair','cow','diningtable','dog','horse',
                            'motorbike','person','pottedplant','sheep','sofa','train','tvmonitor']

    def __len__(self):
        return len(self.image_ids)

    def __getitem__(self, idx):
        img_id = self.image_ids[idx]
        img_path = self.img_folder / f"{img_id}.jpg"
        anno_path = self.anno_folder / f"{img_id}.xml"

        img = Image.open(img_path).convert("RGB")
        tree = ET.parse(anno_path)
        root = tree.getroot()

        boxes = []
        labels = []
        masks = []

        for obj in root.findall('object'):
            bbox = obj.find('bndbox')
            xmin = int(bbox.find('xmin').text)
            ymin = int(bbox.find('ymin').text)
            xmax = int(bbox.find('xmax').text)
            ymax = int(bbox.find('ymax').text)
            boxes.append([xmin, ymin, xmax, ymax])

            cls_name = obj.find('name').text
            labels.append(self.class_names.index(cls_name))

            # Create box mask from bbox
            mask = Image.new('1', img.size)
            draw = ImageDraw.Draw(mask)
            draw.rectangle([xmin, ymin, xmax, ymax], fill=1)
            masks.append(torch.as_tensor(np.array(mask), dtype=torch.uint8))

        boxes = torch.tensor(boxes, dtype=torch.float32)
        labels = torch.tensor(labels, dtype=torch.int64)
        masks = torch.stack(masks) if masks else torch.zeros((0, img.height, img.width), dtype=torch.uint8)

        image_id = torch.tensor([idx])
        area = (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0])
        iscrowd = torch.zeros((len(labels),), dtype=torch.int64)

        target = {
            "boxes": boxes,
            "labels": labels,
            "masks": masks,
            "image_id": image_id,
            "area": area,
            "iscrowd": iscrowd
        }

        if self.transforms:
            img, target = self.transforms(img, target)

        return img, target

# Simple transform: PIL to Tensor
def get_transform():
    def transform(img, target):
        img = F.to_tensor(img)
        return img, target
    return transform

In [None]:
# pip install --upgrade torchvision


Collecting torchvision
  Downloading torchvision-0.24.1-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (5.9 kB)
Collecting torch==2.9.1 (from torchvision)
  Downloading torch-2.9.1-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (30 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.8.93 (from torch==2.9.1->torchvision)
  Downloading nvidia_cuda_nvrtc_cu12-12.8.93-py3-none-manylinux2010_x86_64.manylinux_2_12_x86_64.whl.metadata (1.7 kB)
Collecting nvidia-cuda-runtime-cu12==12.8.90 (from torch==2.9.1->torchvision)
  Downloading nvidia_cuda_runtime_cu12-12.8.90-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (1.7 kB)
Collecting nvidia-cuda-cupti-cu12==12.8.90 (from torch==2.9.1->torchvision)
  Downloading nvidia_cuda_cupti_cu12-12.8.90-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (1.7 kB)
Collecting nvidia-cublas-cu12==12.8.4.1 (from torch==2.9.1->torchvision)
  Downloading nvidia_cublas_cu12-12.8.4.1-py3-none-manylinux_2_27_x86_64.whl.metadata (1.7 kB)
Collect

Setting up the model

In [None]:
import torchvision
from torchvision.models.detection import maskrcnn_resnet50_fpn, MaskRCNN_ResNet50_FPN_Weights

def get_model_instance_segmentation(num_classes):
    # Load pretrained Mask R-CNN with proper weights argument
    weights = MaskRCNN_ResNet50_FPN_Weights.DEFAULT
    model = maskrcnn_resnet50_fpn(weights=weights)

    # Replace the box predictor to match the number of classes (if needed)
    in_features = model.roi_heads.box_predictor.cls_score.in_features
    model.roi_heads.box_predictor = torchvision.models.detection.faster_rcnn.FastRCNNPredictor(in_features, num_classes)

    # Replace the mask predictor similarly
    in_channels = model.roi_heads.mask_predictor.conv5_mask.in_channels
    hidden_layer = 256
    model.roi_heads.mask_predictor = torchvision.models.detection.mask_rcnn.MaskRCNNPredictor(in_channels, hidden_layer, num_classes)

    return model




Training Model

In [None]:
def train_one_epoch(model, optimizer, data_loader, device, epoch):
    model.train()
    for images, targets in data_loader:
        # Ensure inputs are list of images (each tensor [C,H,W])
        images = list(image.to(device) for image in images)
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

        loss_dict = model(images, targets)
        losses = sum(loss for loss in loss_dict.values())

        optimizer.zero_grad()
        losses.backward()
        optimizer.step()
    print(f"Epoch {epoch} complete.")

In [None]:
import torch
from torch.utils.data import DataLoader

def collate_fn(batch):
    return tuple(zip(*batch))

def main():
    # Set paths for your uploaded data in Colab
    root_folder = "/content/VOC2012_train_val/VOC2012_train_val"
    train_txt = "/content/VOC2012_train_val/VOC2012_train_val/ImageSets/Segmentation/train.txt"
    val_txt = "/content/VOC2012_train_val/VOC2012_train_val/ImageSets/Segmentation/train.txt"

    num_classes = 21  # 20 classes + background
    batch_size = 2
    num_epochs = 20  # Reduced training epochs for faster run
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

    # Prepare datasets and dataloaders
    train_dataset = VOCDatasetFromFolder(root_folder, train_txt, transforms=get_transform())

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

    model = get_model_instance_segmentation(num_classes)
    model.to(device)

    optimizer = torch.optim.SGD([p for p in model.parameters() if p.requires_grad], lr=0.005, momentum=0.9, weight_decay=0.0005)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.1)

    model.train()
    for epoch in range(num_epochs):
        train_one_epoch(model, optimizer, train_loader, device, epoch+1)
        lr_scheduler.step()
        print(f"Epoch {epoch+1} completed.")

    print("Training complete.")

if __name__ == "__main__":
    main()

Downloading: "https://download.pytorch.org/models/maskrcnn_resnet50_fpn_coco-bf2d0c1e.pth" to /root/.cache/torch/hub/checkpoints/maskrcnn_resnet50_fpn_coco-bf2d0c1e.pth


100%|██████████| 170M/170M [00:00<00:00, 202MB/s]


Epoch 1 complete.
Epoch 1 completed.
Epoch 2 complete.
Epoch 2 completed.
Epoch 3 complete.
Epoch 3 completed.
Epoch 4 complete.
Epoch 4 completed.
Epoch 5 complete.
Epoch 5 completed.
Epoch 6 complete.
Epoch 6 completed.
Epoch 7 complete.
Epoch 7 completed.
Epoch 8 complete.
Epoch 8 completed.
Epoch 9 complete.
Epoch 9 completed.
Epoch 10 complete.
Epoch 10 completed.
Epoch 11 complete.
Epoch 11 completed.
Epoch 12 complete.
Epoch 12 completed.
Epoch 13 complete.
Epoch 13 completed.
Epoch 14 complete.
Epoch 14 completed.
Epoch 15 complete.
Epoch 15 completed.
Epoch 16 complete.
Epoch 16 completed.
Epoch 17 complete.
Epoch 17 completed.
Epoch 18 complete.
Epoch 18 completed.
Epoch 19 complete.
Epoch 19 completed.
Epoch 20 complete.
Epoch 20 completed.
Training complete.


In [4]:
import os
import torch

# Assuming 'model' is your trained Mask R-CNN model instance

# Specify directory to save weights
save_dir = "./saved_models"
os.makedirs(save_dir, exist_ok=True)  # Create directory if needed

# Define full path including filename for saved weights
save_path = os.path.join(save_dir, "maskrcnn_trained.pth")

# Save only the model parameters (state_dict)
torch.save(model.state_dict(), save_path)

print(f"Model weights saved to: {save_path}")


NameError: name 'model' is not defined

Validation


In [15]:
import os
import json
import xml.etree.ElementTree as ET
from PIL import Image, ImageDraw
import numpy as np

# Modify these paths and classes as needed
xml_folder = "/content/VOC2012_train_val/VOC2012_train_val/Annotations"  # Folder with all XML annotation files (train + val)
img_folder = "/content/VOC2012_train_val/VOC2012_train_val/JPEGImages"   # Folder with all images
val_txt_file = "/content/VOC2012_train_val/VOC2012_train_val/ImageSets/Segmentation/val.txt"  # Text file listing val image basenames
output_json_path = "./val_annotations_coco.json"

VOC_CLASSES = [
    "aeroplane","bicycle","bird","boat","bottle",
    "bus","car","cat","chair","cow",
    "diningtable","dog","horse","motorbike","person",
    "pottedplant","sheep","sofa","train","tvmonitor"
]
class_name_to_id = {cls_name: i + 1 for i, cls_name in enumerate(VOC_CLASSES)}

def mask_to_rle_for_coco_json(mask_np):
    from pycocotools import mask as maskUtils
    # Ensure mask_np is binary (0 or 1) and of type uint8
    mask = mask_np.astype(np.uint8)
    # pycocotools expects a Fortran-contiguous array
    rle = maskUtils.encode(np.asfortranarray(mask))
    # 'counts' needs to be a string for JSON serialization
    rle['counts'] = rle['counts'].decode('utf-8')
    return rle

def xml_to_coco(xml_folder, img_folder, val_image_ids_ordered_list, output_json_path):
    images = []
    annotations = []
    categories = [{'id': i + 1, 'name': name} for i, name in enumerate(VOC_CLASSES)]
    info = {
        "description": "Pascal VOC 2012 Dataset converted to COCO format for validation",
        "version": "1.0",
        "year": 2012,
        "contributor": "",
        "date_created": ""
    }

    ann_id = 1
    # Iterate over the provided ordered list of validation image IDs
    for image_idx, image_id_str in enumerate(val_image_ids_ordered_list): # image_idx will be 0, 1, 2...
        xml_file_name = f"{image_id_str}.xml"
        xml_path = os.path.join(xml_folder, xml_file_name)

        # Ensure the XML file exists before parsing
        if not os.path.exists(xml_path):
            print(f"Warning: XML file not found for {image_id_str}. Skipping.")
            continue

        tree = ET.parse(xml_path)
        root = tree.getroot()

        filename = root.find('filename').text # Original filename, e.g., '2007_000032.jpg'
        img_path = os.path.join(img_folder, filename)

        # Ensure the image file exists before reading size
        if not os.path.exists(img_path):
            print(f"Warning: Image file not found for {filename}. Skipping.")
            continue

        with Image.open(img_path) as img:
            width, height = img.size

        images.append({
            "id": image_idx, # Use the 0-based index as the COCO image_id, consistent with DataLoader
            "file_name": filename,
            "width": width,
            "height": height
        })

        for obj in root.findall('object'):
            cls_name = obj.find('name').text
            if cls_name not in class_name_to_id:
                # Optionally handle unknown classes or skip
                continue

            category_id = class_name_to_id[cls_name]
            bndbox = obj.find('bndbox')
            xmin = int(float(bndbox.find('xmin').text))
            ymin = int(float(bndbox.find('ymin').text))
            xmax = int(float(bndbox.find('xmax').text))
            ymax = int(float(bndbox.find('ymax').text))
            o_width = xmax - xmin
            o_height = ymax - ymin

            # Create a binary mask from the bounding box for segmentation
            bbox_mask = Image.new('1', (width, height)) # Use image's full width and height
            draw = ImageDraw.Draw(bbox_mask)
            draw.rectangle([xmin, ymin, xmax, ymax], fill=1)
            bbox_mask_np = np.array(bbox_mask)

            # Convert the mask to RLE format for COCO segmentation
            segmentation_rle = mask_to_rle_for_coco_json(bbox_mask_np)

            annotations.append({
                "id": ann_id,
                "image_id": image_idx,
                "category_id": category_id,
                "bbox": [xmin, ymin, o_width, o_height],
                "area": o_width * o_height,
                "iscrowd": 0,
                "segmentation": segmentation_rle # Now it's an RLE dict
            })
            ann_id += 1

    coco_format = {
        "info": info,
        "images": images,
        "annotations": annotations,
        "categories": categories
    }

    with open(output_json_path, 'w') as f:
        json.dump(coco_format, f, indent=4)
    print(f"COCO JSON annotation saved to {output_json_path}")

if __name__ == "__main__":
    # Read validation image IDs as an ordered list from the file
    with open(val_txt_file, 'r') as f:
        val_image_ids_ordered = [line.strip() for line in f.readlines()]
    xml_to_coco(xml_folder, img_folder, val_image_ids_ordered, output_json_path)


COCO JSON annotation saved to ./val_annotations_coco.json


In [9]:
import torch
import gc

# Delete any existing model or tensors if assigned
# del model  # Uncomment if you want to delete a model variable

# Run garbage collector to free up unreferenced memory
gc.collect()

# Clear unused cached memory in PyTorch CUDA allocator
torch.cuda.empty_cache()


In [16]:
import torch
from torch.utils.data import DataLoader
from torchvision.models.detection.mask_rcnn import MaskRCNNPredictor
from torchvision.transforms import functional as F
from pathlib import Path
from PIL import Image, ImageDraw
import xml.etree.ElementTree as ET
import numpy as np
import json
from pycocotools.coco import COCO
from pycocotools.cocoeval import COCOeval

# For get_model_instance_segmentation
import torchvision
from torchvision.models.detection import maskrcnn_resnet50_fpn, MaskRCNN_ResNet50_FPN_Weights

# Dataset class for Pascal VOC downloaded and uploaded to Colab
class VOCDatasetFromFolder(torch.utils.data.Dataset):
    def __init__(self, root_folder, image_set_file, transforms=None):
        self.root = Path(root_folder)
        self.transforms = transforms

        # Read image IDs list from train/val text files
        with open(image_set_file) as f:
            self.image_ids = [line.strip() for line in f.readlines()]

        self.img_folder = self.root / "JPEGImages"
        self.anno_folder = self.root / "Annotations"
        self.class_names = ['__background__','aeroplane','bicycle','bird','boat','bottle',
                            'bus','car','cat','chair','cow','diningtable','dog','horse',
                            'motorbike','person','pottedplant','sheep','sofa','train','tvmonitor']

    def __len__(self):
        return len(self.image_ids)

    def __getitem__(self, idx):
        img_id = self.image_ids[idx]
        img_path = self.img_folder / f"{img_id}.jpg"
        anno_path = self.anno_folder / f"{img_id}.xml"

        img = Image.open(img_path).convert("RGB")
        tree = ET.parse(anno_path)
        root = tree.getroot()

        boxes = []
        labels = []
        masks = []

        for obj in root.findall('object'):
            bbox = obj.find('bndbox')
            xmin = int(bbox.find('xmin').text)
            ymin = int(bbox.find('ymin').text)
            xmax = int(bbox.find('xmax').text)
            ymax = int(bbox.find('ymax').text)
            boxes.append([xmin, ymin, xmax, ymax])

            cls_name = obj.find('name').text
            labels.append(self.class_names.index(cls_name))

            # Create box mask from bbox
            mask = Image.new('1', img.size)
            draw = ImageDraw.Draw(mask)
            draw.rectangle([xmin, ymin, xmax, ymax], fill=1)
            masks.append(torch.as_tensor(np.array(mask), dtype=torch.uint8))

        boxes = torch.tensor(boxes, dtype=torch.float32)
        labels = torch.tensor(labels, dtype=torch.int64)
        masks = torch.stack(masks) if masks else torch.zeros((0, img.height, img.width), dtype=torch.uint8)

        image_id = torch.tensor([idx])
        area = (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0])
        iscrowd = torch.zeros((len(labels),), dtype=torch.int64)

        target = {
            "boxes": boxes,
            "labels": labels,
            "masks": masks,
            "image_id": image_id,
            "area": area,
            "iscrowd": iscrowd
        }

        if self.transforms:
            img, target = self.transforms(img, target)

        return img, target

def collate_fn(batch):
    return tuple(zip(*batch))

# Simple transform: PIL to Tensor
def get_transform():
    def transform(img, target):
        img = F.to_tensor(img)
        return img, target
    return transform

# Definition for get_model_instance_segmentation
def get_model_instance_segmentation(num_classes):
    # Load pretrained Mask R-CNN with proper weights argument
    weights = MaskRCNN_ResNet50_FPN_Weights.DEFAULT
    model = maskrcnn_resnet50_fpn(weights=weights)

    # Replace the box predictor to match the number of classes (if needed)
    in_features = model.roi_heads.box_predictor.cls_score.in_features
    model.roi_heads.box_predictor = torchvision.models.detection.faster_rcnn.FastRCNNPredictor(in_features, num_classes)

    # Replace the mask predictor similarly
    in_channels = model.roi_heads.mask_predictor.conv5_mask.in_channels
    hidden_layer = 256
    model.roi_heads.mask_predictor = torchvision.models.detection.mask_rcnn.MaskRCNNPredictor(in_channels, hidden_layer, num_classes)

    return model

# Definition for collate_fn
def collate_fn(batch):
    return tuple(zip(*batch))

def mask_to_rle(mask):
    from pycocotools import mask as maskUtils
    mask = mask.squeeze(0).cpu().numpy()
    mask = mask.astype(np.uint8)
    rle = maskUtils.encode(np.asfortranarray(mask))
    rle['counts'] = rle['counts'].decode('utf-8')  # Needed for json
    return rle


def evaluate_model(model, data_loader, device, gt_json_path):
    model.eval()
    results = []

    with torch.no_grad():
        for images, targets in data_loader:
            images = list(img.to(device) for img in images)
            outputs = model(images)

            for target, output in zip(targets, outputs):
                image_id = target["image_id"].item()
                boxes = output['boxes'].cpu().numpy()
                scores = output['scores'].cpu().numpy()
                labels = output['labels'].cpu().numpy()
                masks = output['masks']

                for box, score, label, mask in zip(boxes, scores, labels, masks):
                    x1, y1, x2, y2 = box
                    width = x2 - x1
                    height = y2 - y1
                    result = {
                        "info": {
                          "description": "Pascal VOC converted dataset",
                          "version": "1.0",
                          "year": 2025,
                          "contributor": "",
                          "date_created": "2025-11-14"
                          },
                        "image_id": image_id,
                        "category_id": int(label),  # Ensure matches dataset category IDs
                        "bbox": [float(x1), float(y1), float(width), float(height)],
                        "score": float(score),
                        "segmentation": mask_to_rle(mask)
                    }
                    results.append(result)

    results_file = "results.json"
    with open(results_file, "w") as f:
        json.dump(results, f)

    coco_gt = COCO(gt_json_path)
    coco_dt = coco_gt.loadRes(results_file)
    coco_eval = COCOeval(coco_gt, coco_dt, iouType='segm')  # 'bbox' for bounding box mAP
    coco_eval.evaluate()
    coco_eval.accumulate()
    coco_eval.summarize()  # Prints mAP and other metrics


def main():
    root_folder = "/content/VOC2012_train_val/VOC2012_train_val"
    val_txt = "/content/VOC2012_train_val/VOC2012_train_val/ImageSets/Segmentation/val.txt"
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    val_dataset = VOCDatasetFromFolder(root_folder, val_txt, transforms=get_transform())

    val_loader = DataLoader(val_dataset, batch_size=1, shuffle=False, collate_fn=collate_fn)

    model = get_model_instance_segmentation(num_classes=21)
    model.load_state_dict(torch.load("/content/maskrcnn_trained.pth"))
    model.to(device)

    gt_json_path = "/content/val_annotations_coco.json"

    evaluate_model(model, val_loader, device, gt_json_path)


if __name__ == "__main__":
    main()

loading annotations into memory...
Done (t=0.02s)
creating index...
index created!
Loading and preparing results...
DONE (t=1.53s)
creating index...
index created!
Running per image evaluation...
Evaluate annotation type *segm*
DONE (t=5.06s).
Accumulating evaluation results...
DONE (t=1.70s).
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.000
 Average Precision  (AP) @[ IoU=0.50      | area=   all | maxDets=100 ] = 0.000
 Average Precision  (AP) @[ IoU=0.75      | area=   all | maxDets=100 ] = 0.000
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.000
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.000
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.000
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=  1 ] = 0.000
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets= 10 ] = 0.000
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets