In [30]:
from transformers import AutoImageProcessor, DeformableDetrConfig, DeformableDetrForObjectDetection

ID2LABEL = {
    0: 'Ball', 
    1: 'Goalkeeper', 
    2: 'Player', 
    3: 'Referee'
}

config = DeformableDetrConfig.from_pretrained('SenseTime/deformable-detr')
# config.id2label = ID2LABEL
# config.label2id = {v: k for k, v in ID2LABEL.items()}
# config.num_labels = len(ID2LABEL)
model = DeformableDetrForObjectDetection.from_pretrained('SenseTime/deformable-detr', config=config, ignore_mismatched_sizes=True)

Some weights of the model checkpoint at SenseTime/deformable-detr were not used when initializing DeformableDetrForObjectDetection: ['model.backbone.conv_encoder.model.layer1.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer2.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer3.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer4.0.downsample.1.num_batches_tracked']
- This IS expected if you are initializing DeformableDetrForObjectDetection from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DeformableDetrForObjectDetection from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [1]:
from transformers import DeformableDetrImageProcessor

processor = DeformableDetrImageProcessor.from_pretrained("SenseTime/deformable-detr")
print(processor)


  from .autonotebook import tqdm as notebook_tqdm


DeformableDetrImageProcessor {
  "do_convert_annotations": true,
  "do_normalize": true,
  "do_pad": true,
  "do_rescale": true,
  "do_resize": true,
  "format": "coco_detection",
  "image_mean": [
    0.485,
    0.456,
    0.406
  ],
  "image_processor_type": "DeformableDetrImageProcessor",
  "image_std": [
    0.229,
    0.224,
    0.225
  ],
  "pad_size": null,
  "resample": 2,
  "rescale_factor": 0.00392156862745098,
  "size": {
    "longest_edge": 1333,
    "shortest_edge": 800
  }
}



In [3]:
import torch
logits = torch.randn(2, 100, 6)
logits

tensor([[[ 1.2191, -0.9071, -0.1581, -0.6062, -1.7461,  0.1671],
         [ 2.0704, -0.2686,  0.5438, -1.3978, -0.1865, -0.9449],
         [ 0.0557,  0.6647, -0.1101, -0.0270, -0.8548,  0.2706],
         ...,
         [-0.5965,  1.1909, -0.4583,  1.3302,  0.4806, -1.2204],
         [-0.0643, -0.4726, -0.0065,  0.6351, -0.5586,  1.4894],
         [ 1.0889,  0.0977,  0.3557, -0.8123, -1.3941, -0.8619]],

        [[ 0.2691,  0.8176,  1.8594, -0.4778, -0.5652,  0.2593],
         [ 0.6761, -0.6594,  2.4742, -0.5044,  0.2283,  0.1378],
         [-0.0998, -0.2896, -0.6326, -0.2417,  1.7162, -1.3902],
         ...,
         [ 1.6152, -0.2041,  0.0033,  1.2212, -1.0327, -0.9970],
         [-0.3772,  1.4883, -1.9305,  2.5030, -1.1029,  1.2630],
         [-1.0457,  1.2286,  0.3054, -0.2148, -0.1844,  0.4441]]])

In [2]:
probs = logits.softmax(-1)
probs

tensor([[[0.1319, 0.0442, 0.5377, 0.0772, 0.0181, 0.1909],
         [0.0977, 0.0838, 0.1497, 0.1667, 0.1401, 0.3620],
         [0.0660, 0.1172, 0.1888, 0.5353, 0.0459, 0.0467],
         ...,
         [0.3705, 0.0611, 0.0786, 0.2150, 0.2621, 0.0127],
         [0.0377, 0.0569, 0.1957, 0.0360, 0.0366, 0.6370],
         [0.2867, 0.0586, 0.1892, 0.1794, 0.2047, 0.0814]],

        [[0.2542, 0.0595, 0.0645, 0.3672, 0.1637, 0.0910],
         [0.1877, 0.0120, 0.5219, 0.0032, 0.0553, 0.2199],
         [0.2405, 0.3818, 0.0926, 0.0771, 0.0678, 0.1402],
         ...,
         [0.2660, 0.2890, 0.0522, 0.1696, 0.0868, 0.1365],
         [0.1517, 0.0761, 0.0751, 0.2512, 0.3417, 0.1042],
         [0.3049, 0.6126, 0.0272, 0.0096, 0.0325, 0.0132]]])

In [3]:
scores, cls_id = probs.max(-1)

In [4]:
scores

tensor([[0.5377, 0.3620, 0.5353, 0.5057, 0.2737, 0.3301, 0.4148, 0.3355, 0.6182,
         0.5342, 0.4441, 0.6727, 0.3657, 0.2345, 0.2877, 0.5606, 0.2644, 0.3522,
         0.4762, 0.6648, 0.5521, 0.3044, 0.6456, 0.2903, 0.4493, 0.2281, 0.2713,
         0.2784, 0.2522, 0.4520, 0.2442, 0.3426, 0.3818, 0.4286, 0.5002, 0.3136,
         0.2816, 0.4171, 0.3720, 0.5021, 0.3652, 0.3085, 0.5686, 0.3988, 0.3660,
         0.3379, 0.4495, 0.4424, 0.2579, 0.3405, 0.3807, 0.2980, 0.3302, 0.4291,
         0.7127, 0.6927, 0.4805, 0.2729, 0.2309, 0.4086, 0.3295, 0.3031, 0.4382,
         0.3601, 0.3670, 0.5883, 0.4451, 0.2592, 0.4050, 0.3844, 0.4446, 0.3076,
         0.2687, 0.3817, 0.4591, 0.2880, 0.3845, 0.2576, 0.3608, 0.5022, 0.2832,
         0.4842, 0.3685, 0.4293, 0.7574, 0.3336, 0.2499, 0.3401, 0.5283, 0.3563,
         0.4223, 0.7517, 0.2501, 0.3588, 0.4908, 0.3433, 0.4504, 0.3705, 0.6370,
         0.2867],
        [0.3672, 0.5219, 0.3818, 0.3336, 0.4206, 0.3463, 0.4525, 0.3434, 0.2939,
         0

In [70]:
logits = torch.randn(4, 6)
probs = logits.softmax(-1)
scores, cls_ids = probs.max(-1)
pred_boxes = torch.randn(4, 4)

scores     = scores.unsqueeze(0) if scores.ndim == 1 else scores
# cls_ids    = cls_ids.unsqueeze(0) if cls_ids.ndim == 1 else cls_ids
pred_boxes = pred_boxes.unsqueeze(0) if pred_boxes.ndim == 2 else pred_boxes

In [71]:
for img_pred_boxes, img_scores, img_cls_ids in zip(pred_boxes, scores, cls_ids):
    print(img_cls_ids)

tensor(3)


In [None]:
from transformers import AutoImageProcessor, DeformableDetrForObjectDetection
import torch
from PIL import Image
import requests
import cv2

img

processor = AutoImageProcessor.from_pretrained("SenseTime/deformable-detr")
model = DeformableDetrForObjectDetection.from_pretrained("SenseTime/deformable-detr")

# Preprocess: converts input for model (cx, cy, w, h internally used)
inputs = processor(images=image, return_tensors="pt")

# Forward pass
outputs = model(**inputs)

# Post-process: converts back to [x_min, y_min, x_max, y_max]
target_sizes = torch.tensor([image.size[::-1]])  # (height, width)
results = processor.post_process_object_detection(outputs, target_sizes=target_sizes, threshold=0.5)

# Print bounding boxes
for result in results:
    for box in result["boxes"]:
        print([round(coord.item(), 2) for coord in box])  # Output: [x1, y1, x2, y2]   

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
Some weights of the model checkpoint at SenseTime/deformable-detr were not used when initializing DeformableDetrForObjectDetection: ['model.backbone.conv_encoder.model.layer1.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer2.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer3.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer4.0.downsample.1.num_batches_tracked']
- This IS expected if you are initializing DeformableDetrForObjectDetection from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a Ber

[16.5, 52.84, 318.25, 470.78]
[342.19, 24.3, 640.02, 372.25]
[40.79, 72.78, 176.76, 117.25]
