In [None]:
# Some more magic so that the notebook will reload external python modules;
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2

In [None]:
import torch
from torch import nn

import numpy as np

In [None]:
from common.utils import seed_everything
from train import get_dataset
from draw_utils import visualize_dataset

In [None]:
from common.logger import logger
import logging

# Verbosity of the project logger. Standard levels, most to least verbose:
# DEBUG, INFO, WARNING, ERROR, CRITICAL.
logger.setLevel(logging.INFO)

In [None]:
# Fix the random seed through the project helper so runs are repeatable.
# Which RNGs it covers is defined in common.utils (presumably Python, NumPy
# and torch — confirm).
seed_everything(1024)

# Backbone

In [None]:
from torchvision.models.detection.backbone_utils import resnet_fpn_backbone

In [None]:
# ResNet-50 feature extractor wrapped in an FPN, built by torchvision's helper
# with pretrained weights requested.
backbone = resnet_fpn_backbone(
    'resnet50', # resnet18, resnet50
    pretrained=True,
    trainable_layers=5, # all layers
    # trainable_layers=3,
    returned_layers=[2,3,4]  # ResNet stages whose outputs are returned through the FPN
)

In [None]:
# Display the backbone's `out_channels` attribute (per torchvision, the channel
# count shared by every FPN output map).
backbone.out_channels

In [None]:
# Smoke test: push one random 1x3x384x384 tensor through the backbone and list
# the keys of the mapping it returns.
test_tensor = torch.randn(1,3,384,384)
feature_dict = backbone(test_tensor)
feature_dict.keys()

In [None]:
# Drop the keys and keep the feature maps as a plain list, in iteration order.
features = list(feature_dict.values())

In [None]:
# Print the shape of each returned feature map, one per line.
for f in features:
    print(f.shape)

# PennFudanPed

In [None]:
from dataset.penn_fudan_dataset import PennFudanDataset

In [None]:
# Penn-Fudan pedestrian dataset loaded through the project's dataset class,
# with train=True and boxes requested in 'xyxy' format.
# NOTE(review): "FUDAT" in the constant name looks like a typo for "FUDAN";
# left unchanged in case other cells reference it.
PENN_FUDAT_ROOT_PATH = './data/PennFudanPed'
dataset_train = PennFudanDataset(
    PENN_FUDAT_ROOT_PATH,
    train=True,
    stride=8,  # presumably the output stride of the target mask — confirm in PennFudanDataset
    format='xyxy' # xyxy, cxcywh
)

In [None]:
# Fetch the first sample, which unpacks into (image, mask, boxes), and show the
# shape of each part.
img, mask, bboxs = dataset_train[0]
img.shape, mask.shape, bboxs.shape

In [None]:
# Render samples with the project's plotting helper. `count` and `size` are
# defined in draw_utils (presumably number of samples and figure size — confirm).
visualize_dataset(dataset_train, count=5, size=3)

# Pascal VOC

In [None]:
from dataset.pascal_voc_dataset import PascalVOCDataset

In [None]:
# Pascal VOC loaded through the project's dataset class, using the 'TRAIN'
# split argument and no transforms.
dataset_voc = PascalVOCDataset('./data/VOCdevkit/', 'TRAIN', transforms=None)

In [None]:
# Number of samples the VOC dataset object reports.
len(dataset_voc)

In [None]:
# Same plotting helper as for Penn-Fudan, applied to the VOC dataset.
visualize_dataset(dataset_voc, count=5, size=3)

# Model

In [None]:
from centernet import create_model

In [None]:
# Build the model with the project factory's default arguments (no weights are
# loaded in this cell).
model = create_model()

In [None]:
# Re-fetch the first sample so this section has `img` without relying on the
# dataset cells above having been the last to set it.
img, mask, bboxs = dataset_train[0]
img.shape, mask.shape, bboxs.shape

In [None]:
# Forward pass on a single dataset sample.
# NOTE(review): `img` comes straight from the dataset, with no explicit batch
# dimension added and no torch.no_grad() — confirm the model accepts that.
out = model(img)

In [None]:
# Shape of the model output for that sample.
out.shape

# Test

In [None]:
from draw_utils import make_prediction
from centernet import create_model

In [None]:
# Load the train/test pair through the training script's helper.
# Note: this rebinds `dataset_train`, which earlier held the PennFudanDataset
# built by hand in the PennFudanPed section.
dataset_name = 'penn_fud' # voc, penn_fud
dataset_train, dataset_test = get_dataset(dataset_name)

In [None]:
# Restore trained weights from ./ckpts/<model_name>.pth into a fresh model.
model_name = 'centernet_v2'
model_save_name = f'./ckpts/{model_name}.pth'

model = create_model()
# map_location='cpu' lets a checkpoint that was saved from a GPU load on a
# CPU-only machine; load_state_dict then copies the values into the model's
# own parameters, wherever those live.
state_dict = torch.load(model_save_name, map_location='cpu')
model.load_state_dict(state_dict)
# The cells below only run predictions, so switch dropout / batch-norm layers to
# inference behaviour (harmless if make_prediction already does this itself).
model.eval()
logger.info(f'Model loaded from {model_save_name}')

In [None]:
# Run the project's prediction helper on test sample 2, leaving its score
# threshold at the helper's default.
make_prediction(model, dataset_test, index=2)

In [None]:
# Same helper on training sample 0, with an explicit threshold of 0.5.
make_prediction(model, dataset_train, index=0, threshold=0.5)

# Gaussian Kernel

In [None]:
# 3x3 smoothing kernel: the outer product of the binomial taps [1/4, 1/2, 1/4]
# with themselves. Every entry is an exact power of two, so the result equals
# the literal table [[1, 2, 1], [2, 4, 2], [1, 2, 1]] / 16 bit for bit.
_taps = np.array([0.25, 0.5, 0.25])
g_kernel = np.outer(_taps, _taps)

In [None]:
# Kernel side length and its half-width (radius): the kernel covers
# [centre - w, centre + w] on each axis.
g_kernel_size = g_kernel.shape[0]
w = g_kernel_size // 2

In [None]:
import matplotlib.pyplot as plt


def paste_kernel(mask, kernel, cx, cy):
    """Write a square, odd-sized `kernel` into `mask` centred on column `cx`, row `cy`.

    The part of the kernel that would fall outside `mask` is clipped, so centres
    on or near the border are handled instead of producing a wrapped or empty
    slice. `mask` is modified in place and also returned.
    """
    radius = kernel.shape[0] // 2
    height, width = mask.shape
    top, bottom = max(cy - radius, 0), min(cy + radius + 1, height)
    left, right = max(cx - radius, 0), min(cx + radius + 1, width)
    if bottom <= top or right <= left:
        # Kernel lies entirely outside the mask: nothing to write.
        return mask
    # Offset into the kernel of the first row / column that survived clipping.
    k_top, k_left = top - (cy - radius), left - (cx - radius)
    mask[top:bottom, left:right] = kernel[
        k_top:k_top + (bottom - top),
        k_left:k_left + (right - left),
    ]
    return mask


# Kernel centre (column x, row y) on a 40x40 mask.
x = 1
y = 1

center_mask = np.zeros((40, 40), dtype='float32')
paste_kernel(center_mask, g_kernel, x, y)

# `show_images` is not imported anywhere in this notebook, so the mask is drawn
# with matplotlib directly; swap the helper back in once it is imported.
fig, ax = plt.subplots(figsize=(4, 4))
ax.imshow(center_mask)
ax.set_title("mask")
plt.show()

# FCOS

In [None]:
from torchvision.models.detection.backbone_utils import resnet_fpn_backbone
from torchvision.models.detection.anchor_utils import AnchorGenerator
from torchvision.models.detection.image_list import ImageList
from torchvision.models.detection.transform import GeneralizedRCNNTransform

import matplotlib.pyplot as plt
import cv2

In [None]:
# ResNet-18 + FPN backbone for the FCOS experiments. This rebinds `backbone`,
# which held the ResNet-50 variant built in the Backbone section.
backbone = resnet_fpn_backbone(
    'resnet18', # resnet18, resnet50
    pretrained=True,
    trainable_layers=5, # all layers
    # trainable_layers=3,
    returned_layers=[2,3,4]  # ResNet stages whose outputs are returned through the FPN
)

In [None]:
# One square anchor (single size, aspect ratio 1.0) per location on each of four
# feature levels.
# Note: `anchor_sizes` is rebound to a tensor later, in the FCOS matching cells,
# so re-run this cell before rebuilding the generator.
anchor_sizes = ((8,), (16,), (32,), (64,))  # equal to strides of multi-level feature map
aspect_ratios = ((1.0,),) * len(anchor_sizes)  # set only one anchor
anchor_generator = AnchorGenerator(anchor_sizes, aspect_ratios)

In [None]:
# torchvision's detection pre-processing module, given the resize bounds
# (min_size / max_size) and the per-channel normalisation statistics. The mean
# and std values are the usual ImageNet ones that go with torchvision's
# pretrained models.
min_size = 300
max_size = 1333
image_mean = [0.485, 0.456, 0.406]
image_std = [0.229, 0.224, 0.225]
transform = GeneralizedRCNNTransform(min_size, max_size, image_mean, image_std)

In [None]:
# Spatial size of the synthetic input image; rebound further down to the size
# of the transformed batch.
image_size = (384, 384)

In [None]:
# Synthetic detection sample: one random image with values in [0, 1) and three
# hand-written ground-truth boxes, used to exercise the transform and the
# anchor-matching cells below.
images = torch.rand(1, 3, *image_size)

# Boxes are corner-format (x0, y0, x1, y1) in pixels. They are created as
# float32 because torchvision's detection code documents `boxes` as a float
# tensor; integer boxes only work through implicit type promotion.
bboxs = torch.tensor(
    [
        [5, 10, 20, 30],
        [50, 50, 150, 300],
        [200, 200, 220, 250],
    ],
    dtype=torch.float32,
)

# One integer class id per box.
labels = torch.tensor([1, 2, 3], dtype=torch.int64)
targets = dict(
    boxes=bboxs,
    labels=labels,
)

In [None]:
# Run the detection transform on the batch and its targets (one dict per image).
# This rebinds both `images` and `targets` to the transform's outputs, so the
# cell is not re-runnable on its own: re-run the cell that builds the synthetic
# batch first.
images, targets = transform(images, [targets])

In [None]:
# Spatial size (last two dims) of the transformed batch tensor; replaces the
# (384, 384) value set earlier.
image_size = images.tensors.shape[-2:]
image_size

In [None]:
# Feed the transformed batch tensor through the backbone, keep only the feature
# maps (the mapping's values) as a list, then report each map's shape.
features = list(backbone(images.tensors).values())
for level_map in features:
    print(level_map.shape)

In [None]:
# Generate anchors for the batch from the image list and its feature maps; the
# result is indexed per image below.
anchors = anchor_generator(images, features)

In [None]:
# The batch holds a single image, so take entry 0 of both anchors and targets,
# then show the anchor count next to that image's target dict.
anchors_per_image = anchors[0]
targets_per_image = targets[0]
len(anchors_per_image), targets_per_image

In [None]:
# Locations per pyramid level: height * width of that level's feature map.
# This equals the anchor count only because the generator is configured with a
# single anchor per location.
num_anchors_per_level = [fmap.size(2) * fmap.size(3) for fmap in features]

# Running offsets: level k's anchors sit at
# anchors_per_image[anchor_idx[k]:anchor_idx[k + 1]].
anchor_idx = np.cumsum([0, *num_anchors_per_level])
anchor_idx

In [None]:
# FCOS.compute_loss

In [None]:
# Inputs for anchor/GT matching. N = number of anchors, M = number of GT boxes;
# both anchors and GT boxes are treated as corner-format (x0, y0, x1, y1).
center_sampling_radius = 1.5

gt_boxes = targets_per_image["boxes"]
gt_centers = (gt_boxes[:, :2] + gt_boxes[:, 2:]) / 2  # (M, 2)
anchor_centers = (anchors_per_image[:, :2] + anchors_per_image[:, 2:]) / 2  # (N, 2)
# Anchor width, (N,). Note: this rebinds `anchor_sizes`, which previously held
# the tuple-of-tuples passed to AnchorGenerator.
anchor_sizes = anchors_per_image[:, 2] - anchors_per_image[:, 0]

In [None]:
# Centre sampling: anchor i matches GT box j when the anchor centre lies within
# center_sampling_radius * anchor_size of the GT centre on both axes (the max of
# |dx| and |dy| is compared). The result is a boolean (N, M) matrix.
pairwise_match = (anchor_centers[:, None, :] - gt_centers[None, :, :]).abs_().max(
    dim=2
).values < center_sampling_radius * anchor_sizes[:, None]
pairwise_match.shape

In [None]:
# Signed distances from each of the N anchor centres to the four sides of each
# of the M GT boxes, ordered (left, top, right, bottom). All four are positive
# exactly when the centre lies strictly inside the box.
x, y = anchor_centers.unsqueeze(dim=2).unbind(dim=1)  # each (N, 1)
x0, y0, x1, y1 = gt_boxes.unsqueeze(dim=0).unbind(dim=2)  # each (1, M)
pairwise_dist = torch.stack([x - x0, y - y0, x1 - x, y1 - y], dim=2)  # (N, M, 4)
pairwise_dist.shape

In [None]:
# The anchor point must lie strictly inside the GT box: the smallest of the
# four side distances has to be positive. Updates `pairwise_match` in place.
pairwise_match &= pairwise_dist.min(dim=2).values > 0

In [None]:
# Each anchor is only responsible for a certain scale range: the largest side
# distance must fall strictly between 4x and 8x the anchor size.
lower_bound = anchor_sizes * 4
lower_bound[: num_anchors_per_level[0]] = 0  # first level: no lower limit
upper_bound = anchor_sizes * 8
upper_bound[-num_anchors_per_level[-1] :] = float("inf")  # last level: no upper limit

# Note: `pairwise_dist` is rebound from (N, M, 4) to its per-pair maximum (N, M),
# so this cell and the ones above that reduce over dim=2 cannot be re-run
# without first recomputing pairwise_dist.
pairwise_dist = pairwise_dist.max(dim=2).values
pairwise_match &= (pairwise_dist > lower_bound[:, None]) & (pairwise_dist < upper_bound[:, None])

In [None]:
# If several GT boxes match an anchor, keep the one with the smallest area.
# Every match is weighted by (1e8 - area), so the row-wise max selects the
# smallest matching box; rows with no match stay all-zero.
gt_areas = (gt_boxes[:, 2] - gt_boxes[:, 0]) * (gt_boxes[:, 3] - gt_boxes[:, 1])  # (M,)
# From here on `pairwise_match` holds float weights, not a boolean mask.
pairwise_match = pairwise_match.to(torch.float32) * (1e8 - gt_areas[None, :])
# Per-anchor best weight and the index of the GT box that produced it
# (despite its name, `min_values` holds row maxima).
min_values, matched_idx = pairwise_match.max(dim=1)
matched_idx[min_values < 1e-5] = -1  # unmatched anchors are assigned -1

matched_idxs_per_image = matched_idx

In [None]:
# FCOSHead.compute_loss

In [None]:
# Gather the class and box of the matched GT for every anchor. Indices are
# clipped to >= 0 so that unmatched anchors (-1) still index a valid row; for
# those anchors the gathered box is a placeholder (GT 0).
gt_classes_targets = targets_per_image["labels"][matched_idxs_per_image.clip(min=0)]
gt_boxes_targets = targets_per_image["boxes"][matched_idxs_per_image.clip(min=0)]

gt_classes_targets[matched_idxs_per_image < 0] = -1  # background

In [None]:
# Boolean mask over anchors that were matched to some GT box (index != -1).
# Alternative formulation via the class targets:
# foregroud_mask = gt_classes_targets >= 0
foregroud_mask = matched_idxs_per_image != -1

In [None]:
# gt_boxes_targets[foregroud_mask]
# anchors_per_image[foregroud_mask]

In [None]:
image_to_draw = np.ones((*image_size, 3)) * 255

fm_idx = 3
# anchors_to_show = anchors_per_image[anchor_idx[fm_idx]:anchor_idx[fm_idx+1]]
anchors_to_show = anchors_per_image[foregroud_mask]

for bbox in anchors_to_show:
    bbox = np.int32(bbox)
    # print(bbox)
    cv2.rectangle(image_to_draw, (bbox[0],bbox[1]), (bbox[2],bbox[3]), (255, 0, 0), 1)

for bbox in targets_per_image['boxes']:
    bbox = np.int32(bbox)
    # print(bbox)
    cv2.rectangle(image_to_draw, (bbox[0],bbox[1]), (bbox[2],bbox[3]), (0, 255, 0), 1)
    
plt.imshow(image_to_draw)

In [None]:
image_to_draw = np.ones((*image_size, 3)) * 255

fm_idx = 3
# anchors_to_show = anchors_per_image[anchor_idx[fm_idx]:anchor_idx[fm_idx+1]]
anchors_to_show = anchors_per_image[foregroud_mask]

for bbox in anchors_to_show:
    bbox = np.int32(bbox)
    # print(bbox)
    cv2.rectangle(image_to_draw, (bbox[0],bbox[1]), (bbox[2],bbox[3]), (255, 0, 0), 1)

for bbox in targets_per_image['boxes']:
    bbox = np.int32(bbox)
    # print(bbox)
    cv2.rectangle(image_to_draw, (bbox[0],bbox[1]), (bbox[2],bbox[3]), (0, 255, 0), 1)
    
plt.imshow(image_to_draw)