# Defining Project Path

In [1]:
import sys
from pathlib import Path
import os
# Project root: the parent of the directory this notebook is launched from.
PROJECT_PATH = Path(os.getcwd()).resolve().parent

# Expose the project's packages (`models`, `utils`) to the import system.
# The membership guard keeps the cell idempotent: re-running it no longer
# stacks duplicate entries onto sys.path.
if str(PROJECT_PATH) not in sys.path:
    sys.path.append(str(PROJECT_PATH))

# Loading Model from the Hugging Face Hub

In [2]:
import importlib
import models.def_detr_model
# NOTE(review): this hot-reloads `models.def_detr_model`, yet the class used
# below is imported from `models.rt_detr_v2_model` -- confirm which module the
# reload was meant to refresh.
importlib.reload(models.def_detr_model)

import torch

from models.rt_detr_v2_model import RTDetrV2Model

# Path of a local training checkpoint. It is not consumed in this cell: the
# model below is created with `from_pretrained` from a repository id.
CHECKPOINT_PATH = PROJECT_PATH / 'checkpoints' / 'rt_detr_checkpoints' / 'checkpoint_e30_17112025_172724.pth'

# Use the GPU when one is visible and fall back to the CPU otherwise, so the
# cell does not depend on CUDA being present.
# Assumes RTDetrV2Model accepts any torch device string -- TODO confirm.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

MODEL = RTDetrV2Model.from_pretrained(
    'Mikolaj1234/rt-detr-v2-football-ai',
    device=DEVICE
)

  from .autonotebook import tqdm as notebook_tqdm
Fetching 5 files: 100%|██████████| 5/5 [00:00<00:00, 14266.34it/s]


In [20]:
# Sanity check: show the wrapper's label map and the one stored in the
# underlying model config on consecutive lines so they can be compared by eye.
print(MODEL.id2label, MODEL.model.config.id2label, sep='\n')

{0: 'Ball', 1: 'Goalkeeper', 2: 'Player', 3: 'Referee'}
{0: 'Ball', 1: 'Goalkeeper', 2: 'Player', 3: 'Referee'}


# Process Video

In [4]:
from utils.box_ops import cxcywh2xyxy, denormalize_bboxes
import supervision as sv
import torch
from tqdm import tqdm

# --- Tunable parameters ------------------------------------------------------
CONFIDENCE_THRESHOLD = 0.6  # detections must score strictly above this to be kept
NMS_IOU_THRESHOLD = 0.3     # threshold handed to `Detections.with_nms`
BALL_BOX_PADDING_PX = 10    # pixels added around every ball box before drawing

# --- Input / output locations ------------------------------------------------
# supervision documents its video path arguments as `str`, so the Path objects
# are converted explicitly instead of relying on the installed OpenCV build
# accepting path-like objects.
VIDEO_PATH = PROJECT_PATH / 'data' / 'origin_videos' / 'football_06.mp4'
video_info = sv.VideoInfo.from_video_path(str(VIDEO_PATH))

PROCESSED_VIDEO_DIR = PROJECT_PATH / 'data' / 'processed_video'
PROCESSED_VIDEO_DIR.mkdir(parents=True, exist_ok=True)
TARGET_VIDEO_PATH = PROCESSED_VIDEO_DIR / f'{VIDEO_PATH.stem}_rt_detr_processed.mp4'
video_sink = sv.VideoSink(str(TARGET_VIDEO_PATH), video_info=video_info)

# --- Annotators --------------------------------------------------------------
# One palette shared by both annotators so box and label colours stay in sync.
CLASS_PALETTE = sv.ColorPalette.from_hex(['#FF8C00', '#00BFFF', '#FF1493', '#FFD700'])
box_annotator = sv.BoxAnnotator(
    color=CLASS_PALETTE,
    thickness=2
)
label_annotator = sv.LabelAnnotator(
    color=CLASS_PALETTE,
    text_color=sv.Color.from_hex('#000000')
)

# Class-id <-> class-name lookup tables used for filtering and labelling.
id2label = {
    0: 'Ball',
    1: 'Goalkeeper',
    2: 'Player',
    3: 'Referee'
}
label2id = {v: k for k, v in id2label.items()}


def _frame_to_detections(frame) -> sv.Detections:
    """Run MODEL on a single video frame and wrap its raw predictions.

    Args:
        frame: one image as yielded by `sv.get_video_frames_generator`
            (assumed to be an (H, W, C) uint8 array -- TODO confirm).

    Returns:
        An `sv.Detections` holding every prediction of the model, unfiltered:
        pixel-space xyxy boxes, the best class probability as confidence and
        the matching class id.
    """
    # NOTE(review): frames decoded through OpenCV are BGR; confirm whether
    # MODEL expects RGB input or converts internally.
    image = torch.as_tensor(frame, dtype=torch.uint8).permute(2, 0, 1)
    outputs = MODEL([image])  # the wrapper is called with a list of images

    # NOTE(review): Hugging Face's RT-DETR post-processing scores logits with
    # a sigmoid; confirm that softmax over the class axis is intended here.
    class_probs = outputs.logits[0].cpu().softmax(-1)
    scores, cls_ids = class_probs.max(-1)

    # Boxes come out as normalised (cx, cy, w, h); convert them to absolute
    # (x1, y1, x2, y2) using the original image size reported by the model.
    img_height = int(outputs.orig_size[0][0])
    img_width = int(outputs.orig_size[0][1])
    bboxes = cxcywh2xyxy(outputs.pred_boxes[0].cpu())
    bboxes = denormalize_bboxes(bboxes, img_height, img_width)

    return sv.Detections(
        xyxy=bboxes.numpy(),
        confidence=scores.numpy(),
        class_id=cls_ids.numpy()
    )


tracker = sv.ByteTrack()
# Explicit reset kept from the original cell; presumably it guards against
# tracker state surviving a re-run of the cell -- TODO confirm it is needed.
tracker.reset()

frame_generator = sv.get_video_frames_generator(str(VIDEO_PATH))
with video_sink:
    for frame in tqdm(frame_generator, total=video_info.total_frames, desc=f'Processing video: {VIDEO_PATH.stem}'):
        detections = _frame_to_detections(frame)
        detections = detections[detections.confidence > CONFIDENCE_THRESHOLD]
        detections = detections.with_nms(threshold=NMS_IOU_THRESHOLD, class_agnostic=True)

        # The ball is drawn with a padded box and is not fed to the tracker.
        is_ball = detections.class_id == label2id['Ball']
        ball_detections = detections[is_ball]
        ball_detections.xyxy = sv.pad_boxes(xyxy=ball_detections.xyxy, px=BALL_BOX_PADDING_PX)

        # Everything that is not the ball goes through ByteTrack.
        tracked_detections = tracker.update_with_detections(detections=detections[~is_ball])

        goalkeepers_detections = tracked_detections[tracked_detections.class_id == label2id['Goalkeeper']]
        players_detections = tracked_detections[tracked_detections.class_id == label2id['Player']]
        referees_detections = tracked_detections[tracked_detections.class_id == label2id['Referee']]

        all_detections = sv.Detections.merge([players_detections, goalkeepers_detections, referees_detections])

        # NOTE(review): tracker ids are computed above but not shown in the
        # labels; confirm whether `#` was meant to prefix the tracker id.
        labels = [f"#{id2label[cls_id]}" for cls_id in all_detections.class_id]

        # Draw on a copy of the decoded frame; the earlier detour through the
        # model input tensor produced exactly the same pixels.
        annotated_img = frame.copy()
        annotated_img = box_annotator.annotate(scene=annotated_img, detections=all_detections)
        annotated_img = label_annotator.annotate(scene=annotated_img, detections=all_detections, labels=labels)
        annotated_img = box_annotator.annotate(scene=annotated_img, detections=ball_detections)

        video_sink.write_frame(annotated_img)

Processing video: football_06: 100%|██████████| 750/750 [00:45<00:00, 16.66it/s]


In [7]:
from models.rt_detr_v2_model import RTDetrV2Model

import torch

# Local training checkpoint that is loaded into the freshly built model below.
CHECKPOINT_PATH = PROJECT_PATH / 'checkpoints' / 'rt_detr_checkpoints' / 'checkpoint_e30_17112025_172724.pth'
if not CHECKPOINT_PATH.is_file():
    # Fail before the base model is constructed rather than afterwards.
    raise FileNotFoundError(f'Checkpoint not found: {CHECKPOINT_PATH}')

# Use the GPU when one is visible and fall back to the CPU otherwise.
# Assumes RTDetrV2Model and load_model_checkpoint work with a CPU device
# string -- TODO confirm.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

# Rebinds MODEL: any model created earlier in the notebook is replaced by the
# base architecture with the local checkpoint loaded into it.
MODEL = RTDetrV2Model(
    'PekingU/rtdetr_v2_r18vd',
    device=DEVICE
)
MODEL.load_model_checkpoint(CHECKPOINT_PATH)

Some weights of RTDetrV2ForObjectDetection were not initialized from the model checkpoint at PekingU/rtdetr_v2_r18vd and are newly initialized because the shapes did not match:
- model.decoder.class_embed.0.bias: found shape torch.Size([80]) in the checkpoint and torch.Size([4]) in the model instantiated
- model.decoder.class_embed.0.weight: found shape torch.Size([80, 256]) in the checkpoint and torch.Size([4, 256]) in the model instantiated
- model.decoder.class_embed.1.bias: found shape torch.Size([80]) in the checkpoint and torch.Size([4]) in the model instantiated
- model.decoder.class_embed.1.weight: found shape torch.Size([80, 256]) in the checkpoint and torch.Size([4, 256]) in the model instantiated
- model.decoder.class_embed.2.bias: found shape torch.Size([80]) in the checkpoint and torch.Size([4]) in the model instantiated
- model.decoder.class_embed.2.weight: found shape torch.Size([80, 256]) in the checkpoint and torch.Size([4, 256]) in the model instantiated
- model.denoi

In [18]:
from safetensors.torch import save_file

# Directory that receives the config, the processor files and the weights.
export_dir = PROJECT_PATH / "huggingface_export"
export_dir.mkdir(parents=True, exist_ok=True)

MODEL.model.config.save_pretrained(export_dir)
MODEL.processor.save_pretrained(export_dir)

state_dict = MODEL.model.state_dict()

# Split shared tensors into unique copies: safetensors' `save_file` refuses
# entries that share memory. It also refuses non-contiguous tensors, and a
# plain `.clone()` preserves the source strides, so each tensor is packed with
# `.contiguous()` before it is copied.
clean_state_dict = {k: v.contiguous().clone() for k, v in state_dict.items()}

save_file(clean_state_dict, export_dir / "model.safetensors", metadata={"format": "pt"})