In [2]:
from torch import BoolTensor, IntTensor, Tensor
from torchmetrics.detection.mean_ap import MeanAveragePrecision

# Predictions: one dict per image. Each dict carries `boxes`, `scores` and
# `labels`; `masks` is added as well, though it is only required when the
# metric is created with iou_type `segm`.
mask_pred = [
    [0, 0, 0, 0, 0],
    [0, 0, 1, 1, 0],
    [0, 0, 1, 1, 0],
    [0, 0, 0, 0, 0],
    [0, 0, 0, 0, 0],
]
preds = [
    dict(
        # [N, 4] tensor with one row per detected box, laid out as
        # [xmin, ymin, xmax, ymax] in absolute image coordinates.
        boxes=Tensor([[258.0, 41.0, 606.0, 285.0]]),
        # [N] tensor of confidence scores, each between 0 and 1.
        scores=Tensor([0.536]),
        # [N] tensor of integer predicted class ids.
        labels=IntTensor([0]),
        # [N, H, W] boolean tensor (H, W = image height and width);
        # wrapping the single 5x5 grid in a list supplies the N axis.
        masks=BoolTensor([mask_pred]),
    )
]

# Ground truth: one dict per image, holding `boxes` and `labels` (plus
# `masks` when iou_type is `segm`). Every entry follows the same layout as
# the matching entry in the predictions, and the list must have exactly as
# many elements as the predictions list.
mask_tgt = [
    [0, 0, 0, 0, 0],
    [0, 0, 1, 0, 0],
    [0, 0, 1, 1, 0],
    [0, 0, 1, 0, 0],
    [0, 0, 0, 0, 0],
]
target = [
    dict(
        boxes=Tensor([[214.0, 41.0, 562.0, 285.0]]),
        labels=IntTensor([0]),
        # Wrapping the single 5x5 grid in a list supplies the leading N axis.
        masks=BoolTensor([mask_tgt]),
    )
]

if __name__ == "__main__":
    # Mean average precision evaluated on bounding boxes.
    metric = MeanAveragePrecision(iou_type="bbox")

    # Feed the predictions together with their ground truth into the metric.
    metric.update(preds=preds, target=target)

    # Reduce everything accumulated so far into the final scores and show them.
    result = metric.compute()
    print(result)

{'map': tensor(0.6000), 'map_50': tensor(1.), 'map_75': tensor(1.), 'map_small': tensor(-1.), 'map_medium': tensor(-1.), 'map_large': tensor(0.6000), 'mar_1': tensor(0.6000), 'mar_10': tensor(0.6000), 'mar_100': tensor(0.6000), 'mar_small': tensor(-1.), 'mar_medium': tensor(-1.), 'mar_large': tensor(0.6000), 'map_per_class': tensor(-1.), 'mar_100_per_class': tensor(-1.), 'classes': tensor(0, dtype=torch.int32)}


In [None]:
# Image processor belonging to the pretrained DETR checkpoint.
processor = DetrImageProcessor.from_pretrained("facebook/detr-resnet-50")

# Stack the "orig_size" entry of every annotation dict into one batch tensor.
orig_target_sizes = torch.stack(
    [annotation["orig_size"] for annotation in labels],
    dim=0,
)

# Inference only: run the forward pass without tracking gradients.
with torch.no_grad():
    outputs = model(pixel_values=pixel_values, pixel_mask=pixel_mask)

# NOTE(review): threshold=0 presumably keeps every predicted box regardless
# of its score -- confirm against the processor documentation.
results = processor.post_process_object_detection(
    outputs,
    target_sizes=orig_target_sizes,
    threshold=0,
)

In [None]:
class Detr(pl.LightningModule):
    """Lightning module that fine-tunes ``DetrForObjectDetection``.

    The ``facebook/detr-resnet-50`` checkpoint is loaded with a
    classification head sized to ``len(id2label)``. Training and validation
    data come from the notebook-level ``train_dataloader`` and
    ``val_dataloader`` variables.
    """

    def __init__(self, lr, lr_backbone, weight_decay):
        """Load the pretrained model and store the optimizer settings.

        Args:
            lr: Learning rate for parameters whose name does not contain
                "backbone".
            lr_backbone: Learning rate for parameters whose name contains
                "backbone".
            weight_decay: Weight decay handed to AdamW.
        """
        super().__init__()
        # replace COCO classification head with custom head
        # we specify the "no_timm" variant here to not rely on the timm library
        # for the convolutional backbone
        self.model = DetrForObjectDetection.from_pretrained(
            "facebook/detr-resnet-50",
            revision="no_timm",
            num_labels=len(id2label),
            ignore_mismatched_sizes=True,
        )
        # see https://github.com/PyTorchLightning/pytorch-lightning/pull/1896
        self.save_hyperparameters()
        self.lr = lr
        self.lr_backbone = lr_backbone
        self.weight_decay = weight_decay

    def forward(self, pixel_values, pixel_mask):
        """Run the wrapped DETR model without labels and return its outputs."""
        outputs = self.model(pixel_values=pixel_values, pixel_mask=pixel_mask)

        return outputs

    def common_step(self, batch, batch_idx):
        """Compute the loss for one batch (shared by training and validation).

        Args:
            batch: Mapping with "pixel_values", "pixel_mask" and "labels";
                "labels" is an iterable of dicts whose values are moved to
                this module's device.
            batch_idx: Index of the batch (unused).

        Returns:
            Tuple ``(loss, loss_dict)`` taken from the model outputs.
        """
        pixel_values = batch["pixel_values"]
        pixel_mask = batch["pixel_mask"]
        labels = [{k: v.to(self.device) for k, v in t.items()} for t in batch["labels"]]

        # No debug printing here: this runs on every training and validation
        # step, and the losses are already reported through self.log.
        outputs = self.model(pixel_values=pixel_values, pixel_mask=pixel_mask, labels=labels)
        loss = outputs.loss
        loss_dict = outputs.loss_dict

        return loss, loss_dict

    def training_step(self, batch, batch_idx):
        """Compute and log the training loss and its components; return the loss."""
        loss, loss_dict = self.common_step(batch, batch_idx)
        # logs metrics for each training_step,
        # and the average across the epoch
        self.log("training_loss", loss)
        for k, v in loss_dict.items():
            self.log("train_" + k, v.item())

        return loss

    def validation_step(self, batch, batch_idx):
        """Compute and log the validation loss and its components; return the loss."""
        loss, loss_dict = self.common_step(batch, batch_idx)
        self.log("validation_loss", loss)
        for k, v in loss_dict.items():
            self.log("validation_" + k, v.item())

        return loss

    def configure_optimizers(self):
        """Build AdamW with a separate learning rate for backbone parameters.

        Trainable parameters are split by whether "backbone" occurs in their
        name: the backbone group uses ``lr_backbone``, everything else falls
        back to the optimizer-wide ``lr``.
        """
        param_dicts = [
            {"params": [p for n, p in self.named_parameters() if "backbone" not in n and p.requires_grad]},
            {
                "params": [p for n, p in self.named_parameters() if "backbone" in n and p.requires_grad],
                "lr": self.lr_backbone,
            },
        ]
        optimizer = torch.optim.AdamW(param_dicts, lr=self.lr,
                                      weight_decay=self.weight_decay)

        return optimizer

    def train_dataloader(self):
        """Return the notebook-level ``train_dataloader`` object."""
        return train_dataloader

    def val_dataloader(self):
        """Return the notebook-level ``val_dataloader`` object."""
        return val_dataloader