# Single Shot Multibox Detection

In [2]:
import os, torch
from torch.utils.data import Dataset, DataLoader
from torchvision.io import read_image
from torchvision.ops import masks_to_boxes
from torchvision import tv_tensors
from torchvision.transforms import v2 as T

class PennFudan(Dataset):
    def __init__(self, root, train=True):
        self.root = root
        self.imgs  = sorted(os.listdir(os.path.join(root, "PNGImages")))
        self.masks = sorted(os.listdir(os.path.join(root, "PedMasks")))
        # SSD pipeline: photometric --> geometric --> bbox cleanup --> float
        if train:
            self.tf = T.Compose([
                T.ToImage(),
                T.RandomPhotometricDistort(),               # color jitter
                T.RandomIoUCrop(min_scale=0.3, max_scale=1.0, max_aspect_ratio=2.0,
                                sampler_options=(0.1, 0.3, 0.5, 0.7, 0.9)),
                T.RandomHorizontalFlip(p=0.5),
                T.Resize((300, 300), antialias=True),
                T.SanitizeBoundingBoxes(min_size=1),
                T.ToDtype(torch.float32, scale=True),
            ])
        else:
            self.tf = T.Compose([
                T.ToImage(),
                T.Resize((300, 300), antialias=True),
                T.SanitizeBoundingBoxes(min_size=1),
                T.ToDtype(torch.float32, scale=True),
            ])

    def __len__(self): 
        return len(self.imgs)

    def __getitem__(self, idx):
        ip = os.path.join(self.root, "PNGImages", self.imgs[idx])
        mp = os.path.join(self.root, "PedMasks",  self.masks[idx])

        img  = read_image(ip)                  # (3,H,W), uint8
        mask = read_image(mp)[0]               # (H,W), instance ids
        ids  = torch.unique(mask)[1:]          # drop background=0
        masks = (mask[None] == ids[:, None, None]).to(torch.uint8)  # (N,H,W)
        boxes = masks_to_boxes(masks)          # (N,4) xyxy on original size

        labels = torch.ones((boxes.shape[0],), dtype=torch.int64)   # 1 = person

        img = tv_tensors.Image(img)
        boxes = tv_tensors.BoundingBoxes(
            boxes, format=tv_tensors.BoundingBoxFormat.XYXY,
            canvas_size=img.shape[-2:]
        )

        sample = {"image": img, "boxes": boxes, "labels": labels}
        sample = self.tf(sample)

        out = {
            "boxes":  torch.as_tensor(sample["boxes"], dtype=torch.float32),
            "labels": sample["labels"]
        }
        return sample["image"], out

def collate(batch):
    imgs, targets = list(zip(*batch))
    return torch.stack(imgs, 0), list(targets)

root = "PennFudanPed"
train_ds = PennFudan(root, train=True)
train_loader = DataLoader(train_ds, batch_size=4, shuffle=True,
                          num_workers=0, collate_fn=collate)


In [9]:
from torchvision.utils import draw_bounding_boxes
import torchvision.transforms.functional as TF

for i in range(10, 13, 1):
    img, label = train_ds[i]

    img_uint8 = (img * 255).to(torch.uint8)  

    out = draw_bounding_boxes(img_uint8, label["boxes"], colors="red", width=2)
    pil_out = TF.to_pil_image(out)
    pil_out.show()

![Img1](figures/img1.png)
![Img2](figures/img2.png)
![Img3](figures/img3.png)
![Img4](figures/img4.png)
![Img5](figures/img5.png)
![Img6](figures/img6.png)
![Img7](figures/img7.png)

In [4]:
from torchvision.models import resnet34, ResNet34_Weights

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = resnet34(weights="IMAGENET1K_V1").to(device).eval()

In [5]:
model.parameters

<bound method Module.parameters of ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)


### Original Architecture

| Stage    | Operation                        | Output Shape (C × H × W) |
| -------- | -------------------------------- | ------------------------ |
| Input    | —                                | 3 × 300 × 300            |
| Conv1    | 7×7, stride 2, pad 3             | 64 × 150 × 150           |
| MaxPool  | 3×3, stride 2, pad 1             | 64 × 75 × 75             |
| Layer 1  | 3 blocks, stride 1               | 64 × 75 × 75             |
| Layer 2  | 4 blocks, stride 2 (first block) | 128 × 38 × 38            |
| Layer 3  | 6 blocks, stride 2 (first block) | 256 × 19 × 19            |
| Layer 4  | 3 blocks, stride 2 (first block) | 512 × 10 × 10            |
| AvgPool  | global (kernel = 10×10)          | 512 × 1 × 1              |
| FC       | 1000 (ImageNet)                  | —                        |

### Adjusted Architecture

| Stage    | Operation                        | Output Shape (C × H × W) | Anchors |
| -------- | -------------------------------- | ------------------------ |---------|
| Input    | —                                | 3 × 300 × 300            | No      |
| Conv1    | 7×7, stride 2, pad 3             | 64 × 150 × 150           | No      |
| MaxPool  | 3×3, stride 2, pad 1             | 64 × 75 × 75             | No      |
| Layer 1  | 3 blocks, stride 1               | 64 × 75 × 75             | No      |
| Layer 2  | 4 blocks, stride 2 (first block) | 128 × 38 × 38            | Yes     |
| Layer 3  | 6 blocks, stride 2 (first block) | 256 × 19 × 19            | Yes     |
| Layer 4  | 3 blocks, stride 2 (first block) | 512 × 10 × 10            | Yes     |
| C5       | Conv2d(512, 256, ks=3, s=2, p=1) | 256 × 5 × 5              | Yes     |
| C6       | Conv2d(256, 256, ks=3, s=2, p=1) | 256 × 3 × 3              | Yes     |
| C7       | Conv2d(256, 256, ks=3, s=2, p=1) | 256 × 1 × 1              | Yes     |



| Map         | in\_ch | mid\_ch  | out\_ch | Why       |
| ------------| -----| ----| ----| -------------------- |
| C5 (10-->5) | 512  | 256 | 256 |-                     |
| C6 (5-->3)  | 256  | 128 | 256 |-                     |
| C7 (3-->1)  | 256  | -   | 256 | Single 3×3, s=1, p=0 |


In [6]:
from torch import nn, optim

def ds_block(in_ch, mid_ch, out_ch):
    return nn.Sequential(
        nn.Conv2d(in_ch, mid_ch, 1, 1, 0, bias=False),
        nn.BatchNorm2d(mid_ch),
        nn.ReLU(inplace=True),
        nn.Conv2d(mid_ch, out_ch, 3, 2, 1, bias=False),
        nn.BatchNorm2d(out_ch),
        nn.ReLU(inplace=True),
    )

In [7]:
class ResNet34Backbone(nn.Module):
    def __init__(self):
        super().__init__()
        bone = resnet34(weights="IMAGENET1K_V1")

        self.stem = nn.Sequential(bone.conv1, 
                                  bone.bn1,
                                  bone.relu, 
                                  bone.maxpool
                                  )
        
        self.layer1 = bone.layer1
        self.layer2 = bone.layer2
        self.layer3 = bone.layer3
        self.layer4 = bone.layer4

        self.c5 = ds_block(512, 256, 256)
        self.c6 = ds_block(256, 128, 256)
        self.c7 = nn.Sequential(
            nn.Conv2d(256, 256, 3, 1, 0, bias=False),
            nn.BatchNorm2d(256),
            nn.ReLU(inplace=True)
        )

        self.out_l2 = nn.LazyConv2d(256, 1, bias=False)
        self.out_l3 = nn.LazyConv2d(256, 1, bias=False)
        self.out_l4 = nn.LazyConv2d(256, 1, bias=False)

    def forward(self, x):
        x = self.stem(x)          # 64 x 75 x 75
        o2 = self.layer1(x)       # 64 x 75 x 75
        o3 = self.layer2(o2)      # 128 x 38 x 38
        o4 = self.layer3(o3)      # 256 x 19 x 19
        o5 = self.layer4(o4)      # 512 x 10 x 10

        f3 = self.out_l2(o3)      # 256 x 38 x 38
        f4 = self.out_l3(o4)      # 256 x 19 x 19
        f5 = self.out_l4(o5)      # 256 x 10 x 10

        f6 = self.c5(o5)          # 256 x 5 x 5
        f7 = self.c6(f6)          # 256 x 3 x 3
        f8 = self.c7(f7)          # 256 x 1 x 1

        return [f3, f4, f5, f6, f7, f8]


In [8]:
import matplotlib.pyplot as plt

img, label = train_ds[0]
img = img.unsqueeze(0)
model = ResNet34Backbone()
maps = model(img)

for i, f in enumerate(maps, 3):
    print(f"f{i}: {tuple(f.shape)}")

fig, axs = plt.subplots(1, len(maps), figsize=(20, 4))
for i, f in enumerate(maps):
    fmap = f[0, 0].detach().cpu()
    axs[i].imshow(fmap, cmap="viridis")
    axs[i].set_title(f"f{i+3} {f.shape[2]}x{f.shape[3]}")
    axs[i].axis("off")
plt.show()



ValueError: Expected more than 1 value per channel when training, got input size torch.Size([1, 256, 1, 1])