# Imports

In [1]:
import os
import re
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, models
from torchvision.datasets.utils import download_and_extract_archive
from torchvision.utils import draw_bounding_boxes
import torchvision.transforms.functional as F
from sklearn.model_selection import train_test_split
from PIL import Image, ImageDraw, ImageFont
import matplotlib.pyplot as plt

### Step 1: Download & Prepare Dataset

In [2]:
url = "https://www.cis.upenn.edu/~jshi/ped_html/PennFudanPed.zip"
root = "./images"
download_and_extract_archive(url, download_root=root)
root_dir = os.path.join(root, "PennFudanPed")

**The dataset subdivides into two folders: annotations (which contains txt files of bounding box start and end points) and images**

### custom Dataset Class

In [3]:
class PennFudanDataset(Dataset):
    """
    Custom dataset for PennFudanPed.

    Each sample contains:
        - image (RGB)
        - bounding boxes (normalized)
        - labels (1 = person)
    """

    def __init__(self, root_dir, transform=None):
        self.img_dir = os.path.join(root_dir, "PNGImages")
        self.ann_dir = os.path.join(root_dir, "Annotation")
        self.transform = transform

        # Sort file lists to align images and annotations
        self.imgs = sorted([f for f in os.listdir(self.img_dir) if f.endswith(".png")])
        self.anns = sorted([f for f in os.listdir(self.ann_dir) if f.endswith(".txt")])

    def __len__(self):
        return len(self.imgs)

    def __getitem__(self, idx):
        img_path = os.path.join(self.img_dir, self.imgs[idx])
        ann_path = os.path.join(self.ann_dir, self.anns[idx])

        # Load image
        img = Image.open(img_path).convert("RGB")
        w, h = img.size

        # Parse annotation bounding boxes
        with open(ann_path, "r") as f:
            text = f.read()

        boxes = re.findall(r"\((\d+), (\d+)\) - \((\d+), (\d+)\)", text)
        boxes = np.array([[int(x1), int(y1), int(x2), int(y2)] for x1, y1, x2, y2 in boxes], dtype=np.float32)

        # Normalize bounding boxes (for training stability)
        boxes[:, [0, 2]] /= w
        boxes[:, [1, 3]] /= h

        # Label = person (single class)
        labels = np.ones((len(boxes),), dtype=np.int64)

        if self.transform:
            img = self.transform(img)

        return img, torch.tensor(boxes, dtype=torch.float32), torch.tensor(labels, dtype=torch.int64)


### Data Preparation

In [4]:
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225])
])

dataset = PennFudanDataset(root_dir, transform=transform)
train_idx, test_idx = train_test_split(range(len(dataset)), test_size=0.2, random_state=42)
train_subset = torch.utils.data.Subset(dataset, train_idx)
test_subset = torch.utils.data.Subset(dataset, test_idx)

def collate_fn(batch):
    imgs, boxes, labels = zip(*batch)
    return list(imgs), list(boxes), list(labels)

train_loader = DataLoader(train_subset, batch_size=4, shuffle=True, collate_fn=collate_fn)
test_loader = DataLoader(test_subset, batch_size=4, collate_fn=collate_fn)

Coming to the model we’ll be needing for this project, we need to keep two things in mind. First, to avoid additional hassle and for efficient feature extraction, we’ll use a pre-trained model to act as the base model. Second, the base model will then be split into two parts; the box regressor and the label classifier. Both of these will be individual model entities.

The second thing to remember is that only the box regressor and the label classifier will have trainable weights. The weights of the pre-trained model will be left untouched

### Define Object Detection Model

In [10]:
class DeepObjectDetector(nn.Module):
    def __init__(self, baseModel, numClasses):
        super(DeepObjectDetector, self).__init__()
        self.baseModel = baseModel
        self.baseModel.fc = nn.Identity()  # remove final classifier
        feature_dim = 2048

        # 🔸 Deeper regressor
        self.regressor = nn.Sequential(
            nn.Linear(feature_dim, 512),
            nn.ReLU(),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Linear(128, 4),
            nn.Sigmoid()
        )

        # 🔸 Deeper classifier
        self.classifier = nn.Sequential(
            nn.Linear(feature_dim, 512),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(256, numClasses)
        )

    def forward(self, x):
        features = self.baseModel(x)
        boxes = self.regressor(features)
        classes = self.classifier(features)
        return boxes, classes

###  Training

In [12]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = DeepObjectDetector(models.resnet50(weights="IMAGENET1K_V1"), numClasses=2).to(device)

class_loss_fn = nn.CrossEntropyLoss()
bbox_loss_fn = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=1e-4)

In [None]:
num_epochs = 5
train_losses = []

print("\n🔁 Starting Training...\n")

for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    for imgs, boxes, labels in train_loader:
        for img, box, label in zip(imgs, boxes, labels):
            img, box, label = img.to(device), box.to(device), label.to(device)

            pred_box, pred_class = model(img.unsqueeze(0))

            # losses
            bbox_loss = bbox_loss_fn(pred_box, box.mean(dim=0, keepdim=True))
            class_loss = class_loss_fn(pred_class, label[0].unsqueeze(0))
            loss = bbox_loss + class_loss

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

    avg_loss = total_loss / len(train_loader)
    train_losses.append(avg_loss)
    print(f"Epoch [{epoch+1}/{num_epochs}] ➜ Loss: {avg_loss:.4f}")

plt.figure()
plt.plot(train_losses, label="Train Loss")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.title("Training Curve")
plt.legend()
plt.show()



🔁 Starting Training...

Epoch [1/5] ➜ Loss: 0.4205
Epoch [2/5] ➜ Loss: 0.0645
Epoch [3/5] ➜ Loss: 0.0621
Epoch [4/5] ➜ Loss: 0.0601


### Visualization

In [None]:
def unnormalize(img):
    mean = torch.tensor([0.485, 0.456, 0.406]).view(3, 1, 1)
    std = torch.tensor([0.229, 0.224, 0.225]).view(3, 1, 1)
    return img * std + mean


def denormalize_box(box, width, height):
    box = box.clone()
    box[0::2] *= width
    box[1::2] *= height
    return box


def draw_label(pil_img, box, label):
    draw = ImageDraw.Draw(pil_img)
    font = ImageFont.load_default()
    text_w, text_h = draw.textlength(label, font=font), 12
    x1, y1, x2, y2 = box
    draw.rectangle([x1, max(0, y1 - text_h - 2), x1 + text_w + 4, y1], fill="white")
    draw.text((x1 + 2, y1 - text_h - 2), label, fill="black", font=font)
    return pil_img

In [None]:
def visualize_predictions(model, data_loader, device, num_images=5):
    model.eval()
    shown = 0
    correct, total = 0, 0

    with torch.no_grad():
        for imgs, boxes, labels in data_loader:
            for img, box, label in zip(imgs, boxes, labels):
                img, label = img.to(device), label.to(device)

                pred_box, pred_class = model(img.unsqueeze(0))
                pred_box = pred_box[0].cpu()
                class_id = pred_class.argmax(1).item()
                conf = torch.softmax(pred_class, dim=1)[0, class_id].item()

                # Accuracy
                total += 1
                if class_id == label[0].item():
                    correct += 1

                # Unnormalize + visualize
                img_disp = unnormalize(img.cpu()).clamp(0, 1)
                img_disp = (img_disp * 255).byte()
                _, H, W = img_disp.shape

                pred_box_abs = denormalize_box(pred_box, W, H).unsqueeze(0)
                gt_boxes = torch.stack([denormalize_box(b, W, H) for b in box])

                drawn = draw_bounding_boxes(img_disp, gt_boxes.to(torch.int32), colors="green", width=2)
                drawn = draw_bounding_boxes(drawn, pred_box_abs.to(torch.int32), colors="red", width=3)

                pil_img = F.to_pil_image(drawn)
                label_text = f"Pred: person ({conf:.2f})"
                pil_img = draw_label(pil_img, pred_box_abs[0].tolist(), label_text)

                plt.figure(figsize=(6, 6))
                plt.imshow(pil_img)
                plt.title("🟩 Ground Truth | 🟥 Prediction")
                plt.axis("off")
                plt.show()

                shown += 1
                if shown >= num_images:
                    acc = correct / total * 100
                    print(f"✅ Shown Accuracy: {acc:.2f}%")
                    return

    acc = correct / total * 100
    print(f"✅ Overall Accuracy: {acc:.2f}%")

In [None]:
visualize_predictions(model, test_loader, device, num_images=5)