In [52]:
!pip install xmltodict torchmetrics

Collecting torchmetrics
  Downloading torchmetrics-1.6.1-py3-none-any.whl.metadata (21 kB)
Collecting lightning-utilities>=0.8.0 (from torchmetrics)
  Downloading lightning_utilities-0.12.0-py3-none-any.whl.metadata (5.6 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0.0->torchmetrics)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2.0.0->torchmetrics)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=2.0.0->torchmetrics)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=2.0.0->torchmetrics)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=2.0.0->torchmetrics)
  D

In [3]:
!unzip FilteredData.zip

Archive:  FilteredData.zip
   creating: Filtered Data/
   creating: Filtered Data/filtered_images/
   creating: Filtered Data/filtered_images/train/
  inflating: Filtered Data/filtered_images/train/1_Renew_Licenced_Small_1_2022.png  
  inflating: Filtered Data/filtered_images/train/1010_Expired_Notified_Building_1_2023.png  
  inflating: Filtered Data/filtered_images/train/1011_Expired_Notified_Building_1_2016.png  
  inflating: Filtered Data/filtered_images/train/1032_Expired_Notified_Building_1_2018.png  
  inflating: Filtered Data/filtered_images/train/1033_Expired_Notified_Building_1_2020.png  
  inflating: Filtered Data/filtered_images/train/1039_Expired_Notified_Building_1_2014.png  
  inflating: Filtered Data/filtered_images/train/106_Renew_Licenced_Building_1_2021.png  
  inflating: Filtered Data/filtered_images/train/1060_Expired_Notified_Building_1_2022.png  
  inflating: Filtered Data/filtered_images/train/1072_licenced_Building_1_2020.png  
  inflating: Filtered Data/filter

In [53]:
import torch
import torchvision
import torchvision.models as models
from torchvision.models.detection import FasterRCNN
from torchvision.models.detection.rpn import AnchorGenerator
from torchvision.models.detection.backbone_utils import resnet_fpn_backbone
from torchvision.models.detection import fasterrcnn_resnet50_fpn
import os
import xmltodict
import numpy as np
import cv2
import torchvision.transforms as transforms
from torch.utils.data import DataLoader, Dataset
import albumentations as A
from albumentations.pytorch import ToTensorV2
import torch.optim as optim
from sklearn.metrics import precision_score, recall_score, f1_score
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import torchmetrics
from tqdm import tqdm


import warnings
warnings.filterwarnings("ignore")

In [41]:
# Load pre-trained Faster R-CNN with FPN
model = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=True)

# Modify the classifier to detect only 1 class (building) +   background
num_classes = 2  # 1 (building) + 1 (background)
in_features = model.roi_heads.box_predictor.cls_score.in_features
model.roi_heads.box_predictor = torchvision.models.detection.faster_rcnn.FastRCNNPredictor(in_features, num_classes)

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

FasterRCNN(
  (transform): GeneralizedRCNNTransform(
      Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
      Resize(min_size=(800,), max_size=1333, mode='bilinear')
  )
  (backbone): BackboneWithFPN(
    (body): IntermediateLayerGetter(
      (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
      (bn1): FrozenBatchNorm2d(64, eps=0.0)
      (relu): ReLU(inplace=True)
      (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
      (layer1): Sequential(
        (0): Bottleneck(
          (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn1): FrozenBatchNorm2d(64, eps=0.0)
          (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn2): FrozenBatchNorm2d(64, eps=0.0)
          (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn3): FrozenBatchNorm2d(256, eps=0.0)
          (relu): ReLU(

In [42]:
class PascalVOCDataset(Dataset):
    def __init__(self, images_dir, annotations_dir, transform=None):
        self.images_dir = images_dir
        self.annotations_dir = annotations_dir
        self.transform = transform
        self.image_files = [f for f in os.listdir(images_dir) if f.endswith('.png')]

        # Filter images with 'building' annotations
        self.image_files = [f for f in self.image_files if self._has_building_annotation(f)]
        print(f"Filtered dataset! Found {len(self.image_files)} images with 'building' annotations.")

    def _has_building_annotation(self, img_name):
        xml_path = os.path.join(self.annotations_dir, img_name.replace('.png', '.xml'))

        # Read the XML annotation file
        try:
            with open(xml_path, "r") as f:
                annotation = xmltodict.parse(f.read())
            # Check if "building" is in the annotation
            objects = annotation["annotation"].get("object", [])
            if isinstance(objects, dict):  # If only one object, wrap it in a list
                objects = [objects]
            for obj in objects:
                if obj["name"].lower() == "building":
                    return True
        except Exception as e:
            print(f"Error reading {xml_path}: {e}")
        return False

    def __len__(self):
        return len(self.image_files)

    def __getitem__(self, idx):
        img_name = self.image_files[idx]
        img_path = os.path.join(self.images_dir, img_name)
        xml_path = os.path.join(self.annotations_dir, img_name.replace('.png', '.xml'))

        # Load image
        image = cv2.imread(img_path)
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        # Load XML annotation
        with open(xml_path, "r") as f:
            annotation = xmltodict.parse(f.read())

        # Extract bounding boxes and labels
        boxes = []
        labels = []
        objects = annotation["annotation"].get("object", [])
        if isinstance(objects, dict):
            objects = [objects]

        for obj in objects:
            if obj["name"].lower() != "building":
                continue
            bbox = obj["bndbox"]
            x_min = int(bbox["xmin"])
            y_min = int(bbox["ymin"])
            x_max = int(bbox["xmax"])
            y_max = int(bbox["ymax"])
            boxes.append([x_min, y_min, x_max, y_max])
            labels.append(1)  # Class label for 'building'

        # Initialize target
        target = {"boxes": boxes, "labels": labels}

        # Apply transformations
        if self.transform:
            transformed = self.transform(image=image, bboxes=boxes, labels=labels)
            image = transformed["image"]
            # Convert transformed bboxes and labels to tensors
            target["boxes"] = torch.tensor(transformed["bboxes"], dtype=torch.float32)
            target["labels"] = torch.tensor(transformed["labels"], dtype=torch.int64)
        else:
            # Convert boxes and labels to tensors if no transform applied
            target["boxes"] = torch.tensor(boxes, dtype=torch.float32)
            target["labels"] = torch.tensor(labels, dtype=torch.int64)

        return image, target


In [43]:
# Define transformations
transform = A.Compose([
    A.Resize(640, 640),
    A.HorizontalFlip(p=0.5),
    A.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)),
    ToTensorV2(),
], bbox_params=A.BboxParams(format='pascal_voc', label_fields=['labels']))

In [44]:
TRAIN_IMG_DIR = "/content/Filtered Data/filtered_images/train"
TRAIN_ANNOT_DIR = "/content/Filtered Data/filtered_labels/train/xml_annotations"

TEST_IMG_DIR = "/content/Filtered Data/filtered_images/test"
TEST_ANNOT_DIR = "/content/Filtered Data/filtered_labels/test/xml_annotations"

VAL_IMG_DIR = "/content/Filtered Data/filtered_images/val"
VAL_ANNOT_DIR = "/content/Filtered Data/filtered_labels/val/xml_annotations"

In [45]:
# Create datasets for train, val, and test
train_dataset = PascalVOCDataset(TRAIN_IMG_DIR, TRAIN_ANNOT_DIR, transform=transform)
val_dataset = PascalVOCDataset(VAL_IMG_DIR, VAL_ANNOT_DIR, transform=transform)
test_dataset = PascalVOCDataset(TEST_IMG_DIR, TEST_ANNOT_DIR, transform=transform)

# Create DataLoader for each dataset
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True, collate_fn=lambda x: tuple(zip(*x)))
val_loader = DataLoader(val_dataset, batch_size=4, shuffle=False, collate_fn=lambda x: tuple(zip(*x)))
test_loader = DataLoader(test_dataset, batch_size=4, shuffle=False, collate_fn=lambda x: tuple(zip(*x)))

Filtered dataset! Found 208 images with 'building' annotations.
Filtered dataset! Found 30 images with 'building' annotations.
Filtered dataset! Found 33 images with 'building' annotations.


In [46]:
# Define optimizer for Faster R-CNN
params = [p for p in model.parameters() if p.requires_grad]
# Using a common choice of learning rate and momentum for Faster R-CNN
optimizer = optim.SGD(params, lr=0.005, momentum=0.9, weight_decay=0.0005)

# Optional: Define a learning rate scheduler (for better performance)
lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.1)

In [47]:
# Training loop with validation loss
num_epochs = 10
best_val_loss = float('inf')  # Track best validation loss for saving models

for epoch in range(num_epochs):
    # --- Training Phase ---
    model.train()
    train_loss = 0

    for images, targets in train_loader:
        # Move data to device
        images = [img.to(device) for img in images]
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

        # Forward pass
        loss_dict = model(images, targets)
        losses = sum(loss for loss in loss_dict.values())

        # Backward pass
        optimizer.zero_grad()
        losses.backward()
        optimizer.step()

        train_loss += losses.item()

    # --- Validation Phase ---
    model.eval()  # Switch to evaluation mode (affects layers like dropout)
    val_loss = 0

    with torch.no_grad():  # Disable gradient calculation
        for images, targets in val_loader:
            images = [img.to(device) for img in images]
            targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

            # Forward pass (returns losses if in train mode, but we use eval mode)
            # To get validation loss, temporarily switch to train mode:
            model.train()
            loss_dict = model(images, targets)
            model.eval()

            val_loss += sum(loss for loss in loss_dict.values()).item()

    # Calculate epoch metrics
    avg_train_loss = train_loss / len(train_loader)
    avg_val_loss = val_loss / len(val_loader)

    # Update learning rate scheduler
    lr_scheduler.step()  # Optional but recommended

    # Print progress
    print(f"Epoch {epoch+1}/{num_epochs}")
    print(f"  Train Loss: {avg_train_loss:.4f}")
    print(f"  Val Loss: {avg_val_loss:.4f}")
    print(f"  Learning Rate: {optimizer.param_groups[0]['lr']:.6f}\n")

    # Optional: Save best model
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        torch.save(model.state_dict(), "best_model.pth")

Epoch 1/10
  Train Loss: 1.1176
  Val Loss: 1.1405
  Learning Rate: 0.005000

Epoch 2/10
  Train Loss: 0.8353
  Val Loss: 0.9352
  Learning Rate: 0.005000

Epoch 3/10
  Train Loss: 0.7567
  Val Loss: 0.8997
  Learning Rate: 0.000500

Epoch 4/10
  Train Loss: 0.7083
  Val Loss: 0.8666
  Learning Rate: 0.000500

Epoch 5/10
  Train Loss: 0.6833
  Val Loss: 0.8666
  Learning Rate: 0.000500

Epoch 6/10
  Train Loss: 0.6795
  Val Loss: 0.8508
  Learning Rate: 0.000050

Epoch 7/10
  Train Loss: 0.6671
  Val Loss: 0.8494
  Learning Rate: 0.000050

Epoch 8/10
  Train Loss: 0.6649
  Val Loss: 0.8730
  Learning Rate: 0.000050

Epoch 9/10
  Train Loss: 0.6693
  Val Loss: 0.8702
  Learning Rate: 0.000005

Epoch 10/10
  Train Loss: 0.6554
  Val Loss: 0.8551
  Learning Rate: 0.000005



In [57]:
model.eval()

# Initialize mAP metric and loss tracking
map_metric = torchmetrics.detection.MeanAveragePrecision()
test_loss = 0
num_batches = 0

with torch.no_grad():
    for images, targets in tqdm(test_loader):
        images = [img.to(device) for img in images]
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

        # --- Step 1: Calculate Test Loss (Optional) ---
        # Temporarily switch to train mode to compute loss
        model.train()
        loss_dict = model(images, targets)  # Returns loss dictionary
        losses = sum(loss for loss in loss_dict.values())
        test_loss += losses.item()
        num_batches += 1

        # --- Step 2: Generate Predictions for mAP ---
        # Switch back to eval mode for inference
        model.eval()
        predictions = model(images)  # Returns list of prediction dicts

        # Update mAP metric
        for pred, true in zip(predictions, targets):
            map_metric.update(
                preds=[{
                    "boxes": pred["boxes"],
                    "scores": pred["scores"],
                    "labels": pred["labels"]
                }],
                target=[{
                    "boxes": true["boxes"],
                    "labels": true["labels"]
                }]
            )

# Compute metrics
avg_test_loss = test_loss / num_batches
map_score = map_metric.compute()

print(f"Test Loss: {avg_test_loss:.4f}")
print(f"mAP: {map_score['map']:.4f}")

100%|██████████| 9/9 [00:07<00:00,  1.16it/s]


Test Loss: 0.8839
mAP: 0.0782
