## load the images with dependencies

In [327]:
import pickle

# Load the saved batch dictionary from disk (binary read mode, 'rb').
# NOTE(review): pickle.load can execute arbitrary code stored in the file —
# only load pickles that were produced by your own runs.
with open('my_dict.pkl', 'rb') as file:
    batch = pickle.load(file)

# `batch` now holds the unpickled object; list its keys to see what is available
print(batch.keys())

dict_keys(['batch_idx', 'bboxes', 'cls', 'im_file', 'img', 'ori_shape', 'resized_shape'])


In [328]:
# Working sample: element 3 of the batch's "img" entry. Later cells also use
# index 3 when looking at the t_list / s_list outputs and the batched NMS result.
img_img = batch["img"][3]

In [329]:
# batch["img"]

In [330]:
# from ultralytics import YOLO
# model = YOLO("yolo11n.pt")

In [331]:
# result = model(batch["img"])

In [332]:
# result[3].show()

In [333]:
# output_list = model.model(batch["img"])

In [334]:
# type(output_list)

In [335]:
# type(output_list[1]), output_list[0].shape, len(output_list[1])

## setup predictor

In [336]:
from ultralytics import YOLO
from ultralytics.engine.predictor import BasePredictor

In [337]:
# Build a BasePredictor configured for the yolo11n.pt weights; its preprocess()
# method is reused further down to prepare the model input.
predictor = BasePredictor(overrides={"model": "yolo11n.pt"})

In [338]:
# Manually set up the model inside the predictor; this prints the
# environment / model summary shown in the output below.
predictor.setup_model("yolo11n.pt")

Ultralytics 8.3.184  Python-3.13.5 torch-2.9.0+cu130 CUDA:0 (NVIDIA GeForce RTX 3050 Laptop GPU, 4096MiB)
YOLO11n summary (fused): 100 layers, 2,616,248 parameters, 0 gradients


## setting up the dataset

In [339]:
# Register the sample as the predictor's source; unsqueeze(0) adds a leading batch axis.
# NOTE(review): `dataset` is not used afterwards — the next cell reads
# predictor.dataset instead; confirm whether setup_source returns anything useful.
dataset = predictor.setup_source(img_img.unsqueeze(0))

In [340]:
# Pull the first item from the predictor's dataset: the path(s), the image(s)
# as the predictor receives them, and a third field that is ignored here.
paths, im0s, _ = next(iter(predictor.dataset))

In [341]:
# Apply the predictor's own preprocess step so the model input is prepared
# exactly the way YOLO prepares it.
img_tensor = predictor.preprocess(im0s)

In [342]:
print(img_tensor.shape)

torch.Size([1, 3, 640, 640])


## now to predict the model output

In [343]:
# Raw forward pass through the predictor's model (no NMS yet). As the cells
# below show, this returns a 2-item list: a (1, 84, 8400) tensor and a list of
# three per-level feature maps.
preds = predictor.model(img_tensor)

In [344]:
type(preds)

list

In [345]:
len(preds), type(preds[0]), type(preds[1])

(2, torch.Tensor, list)

In [346]:
len(preds[1]), preds[1][0].shape, preds[1][1].shape, preds[1][2].shape

(3,
 torch.Size([1, 144, 80, 80]),
 torch.Size([1, 144, 40, 40]),
 torch.Size([1, 144, 20, 20]))

In [347]:
preds[0].shape

torch.Size([1, 84, 8400])

In [348]:
# Keep only the first of the 8400 candidate columns (the 0:1 slice preserves the last axis)
item_1 = preds[0][:, :, 0:1]

In [349]:
item_1

tensor([[[4.9693e+00],
         [1.1561e+01],
         [1.1712e+01],
         [2.4632e+01],
         [5.6590e-07],
         [1.0759e-07],
         [4.4967e-07],
         [1.3837e-07],
         [2.0762e-07],
         [2.1978e-07],
         [1.6447e-07],
         [1.0940e-07],
         [1.5621e-07],
         [2.0935e-07],
         [4.5591e-08],
         [1.1319e-07],
         [4.5540e-08],
         [2.4337e-07],
         [1.9762e-07],
         [1.5301e-07],
         [9.7529e-08],
         [1.1021e-07],
         [1.0657e-07],
         [1.4638e-07],
         [1.1024e-07],
         [1.2934e-07],
         [1.7624e-07],
         [1.0721e-07],
         [1.6403e-07],
         [2.3042e-07],
         [1.1476e-07],
         [2.5156e-07],
         [1.0982e-07],
         [2.0944e-07],
         [1.3973e-07],
         [1.1207e-07],
         [2.0283e-07],
         [1.6196e-07],
         [2.0644e-07],
         [1.4514e-07],
         [9.3364e-08],
         [1.3915e-07],
         [1.1797e-07],
         [2

## now applying non-maximum suppression

In [350]:
from ultralytics.utils.ops import non_max_suppression
from ultralytics.engine.results import Results

# 1) Run Ultralytics' non_max_suppression on the raw predictions. The confidence
#    and IoU thresholds, the detection cap and the class-agnostic flag are all
#    read from the predictor's args; no class filter is applied.
#    NOTE(review): the earlier note said this is the same NMS used inside
#    AutoBackend — that cannot be confirmed from this notebook; verify.
nms_output = non_max_suppression(
    preds,
    conf_thres=predictor.args.conf,
    iou_thres=predictor.args.iou,
    max_det=predictor.args.max_det,
    classes=None,
    agnostic=predictor.args.agnostic_nms
)

In [351]:
nms_output

[tensor([[372.1011, 173.0616, 599.6474, 464.7625,   0.8958,  23.0000]], device='cuda:0')]

In [352]:
# 2) Wrap each image's NMS detections in a Results object, pairing them by
#    position with the matching source image and path.
results = [
    Results(orig_img=im0s[idx], path=paths[idx], names=predictor.model.names, boxes=image_dets)
    for idx, image_dets in enumerate(nms_output)
]

## plotting the NMS results

In [353]:
import cv2
import torch
import numpy as np

# Raw NMS output: a list with one [num_dets, 6] tensor per image,
# each row being x1, y1, x2, y2, conf, cls.
detections = nms_output

# Convert the sample image to a NumPy array for OpenCV. This is deliberately not
# routed through a variable called `img_tensor`: that name holds the preprocessed
# model input from the predictor cell, and rebinding it here would make a re-run
# of the forward-pass cell use the wrong tensor.
img = img_img.detach().cpu().numpy()

# CHW (channels first) -> HWC, the layout OpenCV works with
if img.shape[0] == 3:
    img = img.transpose(1, 2, 0)

# Scale to 0-255 if the image looks normalised to 0-1, then store as uint8
if img.max() <= 1.0:
    img = (img * 255).astype(np.uint8)
else:
    img = img.astype(np.uint8)

# OpenCV draws and displays in BGR order (the source is assumed to be RGB)
img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)

# Draw every detection of the first (only) image
for det in detections[0].cpu().numpy():   # move to CPU and numpy
    x1, y1, x2, y2, conf, cls = det

    # Pixel coordinates must be ints for the drawing calls
    x1, y1, x2, y2 = map(int, [x1, y1, x2, y2])

    # Bounding box: green, 2 px thick
    cv2.rectangle(
        img,
        (x1, y1),
        (x2, y2),
        (0, 255, 0),
        2
    )

    # Label text: class index and confidence. putText's origin is the text's
    # bottom-left corner, so for boxes touching the top edge `y1 - 10` would put
    # the label off the image; clamp the baseline so it always stays visible.
    label = f"{int(cls)} {conf:.2f}"
    label_y = max(y1 - 10, 20)
    cv2.putText(
        img,
        label,
        (x1, label_y),
        cv2.FONT_HERSHEY_SIMPLEX,
        0.7,            # font scale
        (0, 255, 0),    # text color
        2               # thickness
    )

# Show the result in a native window; waitKey(0) blocks until a key is pressed
cv2.imshow("result", img)
cv2.waitKey(0)
cv2.destroyAllWindows()

## now I'm applying the NMS to the prediction output

In [354]:
# Load the pickled t_list from disk (binary read mode, 'rb').
# NOTE(review): presumably "t" stands for teacher, going by the later section on
# teacher predictions — confirm. pickle.load should only be used on trusted files.
with open('t_list.pkl', 'rb') as file:
    t_list = pickle.load(file)

# `t_list` now holds the unpickled object; show how many top-level items it has
print(len(t_list))

2


In [355]:
# Load the pickled s_list from disk (binary read mode, 'rb').
# NOTE(review): presumably the student counterpart of t_list — confirm.
# pickle.load should only be used on trusted files.
with open('s_list.pkl', 'rb') as file:
    s_list = pickle.load(file)

# `s_list` now holds the unpickled object; show how many top-level items it has
print(len(s_list))

2


`t_list[0][2][-1]` gives us the TAL masks, which may be useful when we try masked distillation.

In [356]:
type(t_list[0]), type(t_list[1])

(tuple, list)

In [357]:
print(t_list[0][0], t_list[0][1], sep= "\n")

tensor([5.2045, 8.7647, 6.4827], device='cuda:0')
tensor([1.3011, 2.1912, 1.6207], device='cuda:0')


In [358]:
print(t_list[0][2][-1].shape)

torch.Size([8400])


## plotting the TAL-generated masks

In [359]:
import cv2
import torch
import numpy as np

# Flat mask taken from t_list (8400 entries, see the shape printed above)
tensor_8400 = t_list[0][2][-1]  # expected to hold boolean values

# Split the flat mask into three square grids: the first 6400 entries as 80x80,
# the next 1600 as 40x40 and the final 400 as 20x20.
# NOTE(review): assumes the 8400 entries are ordered largest grid first — confirm.
tensor_80x80 = tensor_8400[:6400].reshape(80, 80)
tensor_40x40 = tensor_8400[6400:8000].reshape(40, 40)
tensor_20x20 = tensor_8400[8000:].reshape(20, 20)

# Convert boolean tensors to uint8 for OpenCV
def bool_to_image(bool_tensor):
    """Convert a boolean mask to a uint8 image with 0 for False and 255 for True.

    Args:
        bool_tensor: a torch tensor (detached and moved to the CPU first) or
            any array-like that NumPy can convert.

    Returns:
        A NumPy uint8 array with the same shape as the input.
    """
    if isinstance(bool_tensor, torch.Tensor):
        # detach() keeps this working for tensors that are part of an autograd graph
        mask = bool_tensor.detach().cpu().numpy()
    else:
        mask = np.asarray(bool_tensor)
    return mask.astype(np.uint8) * 255

# Convert the three boolean grids to 0/255 uint8 images
img_80x80 = bool_to_image(tensor_80x80)
img_40x40 = bool_to_image(tensor_40x40)
img_20x20 = bool_to_image(tensor_20x20)

# Upscale the two coarser grids to 160x160; nearest-neighbour keeps cell edges crisp
img_40x40_resized = cv2.resize(img_40x40, (160, 160), interpolation=cv2.INTER_NEAREST)
img_20x20_resized = cv2.resize(img_20x20, (160, 160), interpolation=cv2.INTER_NEAREST)

# Canvas dimensions
padding = 20
total_width = 80 + 160 + 160 + 4 * padding  # 3 images + 4 paddings
# 220 px rather than 200: the statistics under the two 160 px panels are written
# on a baseline of y=205 (putText's origin is the text's bottom-left corner),
# which a 200-row canvas would cut off.
canvas_height = 220

# White single-channel canvas
canvas = np.ones((canvas_height, total_width), dtype=np.uint8) * 255

# Left edge of each panel
x_positions = [
    padding,  # 80x80 start
    padding + 80 + padding,  # 40x40 start
    padding + 80 + padding + 160 + padding  # 20x20 start
]

# Paste the panels (80x80 at native size, the other two upscaled)
canvas[10:90, x_positions[0]:x_positions[0]+80] = img_80x80
canvas[20:180, x_positions[1]:x_positions[1]+160] = img_40x40_resized
canvas[20:180, x_positions[2]:x_positions[2]+160] = img_20x20_resized

# Grid-size labels just below each panel (colour 0 = black)
cv2.putText(canvas, "80x80", (x_positions[0], 100), cv2.FONT_HERSHEY_SIMPLEX, 0.5, 0, 1)
cv2.putText(canvas, "40x40", (x_positions[1], 190), cv2.FONT_HERSHEY_SIMPLEX, 0.5, 0, 1)
cv2.putText(canvas, "20x20", (x_positions[2], 190), cv2.FONT_HERSHEY_SIMPLEX, 0.5, 0, 1)

# Number of True cells per grid
true_count_80x80 = tensor_80x80.sum().item()
true_count_40x40 = tensor_40x40.sum().item()
true_count_20x20 = tensor_20x20.sum().item()

stats_80x80 = f"True: {true_count_80x80}/6400"
stats_40x40 = f"True: {true_count_40x40}/1600"
stats_20x20 = f"True: {true_count_20x20}/400"

# Statistics one line below each label
cv2.putText(canvas, stats_80x80, (x_positions[0], 115), cv2.FONT_HERSHEY_SIMPLEX, 0.4, 0, 1)
cv2.putText(canvas, stats_40x40, (x_positions[1], 205), cv2.FONT_HERSHEY_SIMPLEX, 0.4, 0, 1)
cv2.putText(canvas, stats_20x20, (x_positions[2], 205), cv2.FONT_HERSHEY_SIMPLEX, 0.4, 0, 1)

# Display the result in a native window; waitKey(0) blocks until a key is pressed
cv2.imshow("Boolean Tensors Visualization", canvas)
cv2.waitKey(0)
cv2.destroyAllWindows()

## plotting images with upscaled masks

In [360]:
import cv2
import torch
import numpy as np

# -----------------------
# Helpers
# -----------------------

def tensor_to_numpy_image(img_tensor):
    """
    Convert an image tensor/array to a 2D uint8 grayscale numpy image.

    Accepted layouts:
      - (H, W)
      - (1, H, W)
      - (C, H, W) with C in {1, 3}
      - (B, C, H, W) with B=1
      - (H, W, C) with C == 1 or C >= 3 (channels beyond the first three are dropped)

    Layouts are told apart by the size of the first axis only, so a
    channels-last image whose height is 1 or 3 is read as channels-first.

    Values are scaled by 255 when the maximum is <= 1.0 (treated as a 0-1
    image); otherwise they are taken as already being on a 0-255 scale. The
    result is clipped to [0, 255] before the cast to uint8.

    Args:
        img_tensor: torch tensor (detached and moved to CPU) or array-like.

    Returns:
        2D (H, W) uint8 numpy array.

    Raises:
        ValueError: if the input is empty, has a batch size other than 1, has
            an unsupported channel count, or is not 2D after processing.
    """
    if isinstance(img_tensor, torch.Tensor):
        arr = img_tensor.detach().cpu().numpy()
    else:
        arr = np.array(img_tensor)

    # arr.max() below would fail with an opaque numpy message on empty input
    if arr.size == 0:
        raise ValueError(f"Cannot convert an empty image. Got shape: {arr.shape}")

    # Remove batch dimension if present; expecting (B, C, H, W)
    if arr.ndim == 4:
        if arr.shape[0] != 1:
            raise ValueError(f"Unexpected batch size in image: {arr.shape}")
        arr = arr[0]

    # Handle (C, H, W) or (1, H, W)
    if arr.ndim == 3:
        if arr.shape[0] == 1:
            # Single channel -> (H, W)
            arr = arr[0]
        elif arr.shape[0] == 3:
            # (3, H, W) -> (H, W, 3)
            arr = arr.transpose(1, 2, 0)
        # Anything else is assumed to be channels-last (H, W, C) already

    # Normalization and dtype
    arr = arr.astype(np.float32)
    if arr.max() <= 1.0:
        arr = arr * 255.0
    arr = np.clip(arr, 0, 255).astype(np.uint8)

    # Collapse channels-last images to grayscale
    if arr.ndim == 3:
        channels = arr.shape[2]
        if channels == 1:
            # Already a single plane; RGB2GRAY cannot take a 1-channel input
            arr = arr[:, :, 0]
        elif channels >= 3:
            if channels > 3:
                arr = arr[:, :, :3]
            arr = cv2.cvtColor(arr, cv2.COLOR_RGB2GRAY)
        else:
            raise ValueError(f"Unsupported channel count {channels} in image of shape: {arr.shape}")

    if arr.ndim != 2:
        raise ValueError(f"Image is not 2D after processing. Got shape: {arr.shape}")

    return arr


def bool_to_image(bool_tensor):
    """Turn a boolean mask (torch tensor or array-like) into a 0/255 uint8 numpy image."""
    is_tensor = isinstance(bool_tensor, torch.Tensor)
    # Tensors are detached and brought to the CPU; everything else goes through np.array
    source = bool_tensor.detach().cpu().numpy() if is_tensor else np.array(bool_tensor)
    return source.astype(np.uint8) * 255


def create_overlay(original_gray, mask, color):
    """
    Blend a coloured mask over a grayscale image.

    original_gray: 2D (H, W) grayscale image
    mask: 2D (H, W) mask; every pixel > 0 is tinted
    color: one of 'red', 'green', 'blue'

    Returns a 3-channel BGR image: 70% base image, 30% tint layer.
    Raises ValueError for a non-2D base image, a mask whose shape differs
    from the base image, or an unknown colour name.
    """
    if original_gray.ndim != 2:
        raise ValueError(f"Expected 2D grayscale image. Got shape: {original_gray.shape}")
    if mask.shape != original_gray.shape:
        raise ValueError(f"Mask shape {mask.shape} does not match image shape {original_gray.shape}")

    # Promote the base image to 3 channels so it can be blended with a colour layer
    base_bgr = cv2.cvtColor(original_gray, cv2.COLOR_GRAY2BGR)

    tint_layer = np.zeros_like(base_bgr)
    selected = mask > 0  # (H, W)

    # Colour name -> BGR triple (OpenCV channel order)
    palette = (
        ('red', [0, 0, 255]),
        ('green', [0, 255, 0]),
        ('blue', [255, 0, 0]),
    )
    for colour_name, bgr in palette:
        if color == colour_name:
            tint_layer[selected] = bgr
            break
    else:
        raise ValueError(f"Unknown color: {color}")

    return cv2.addWeighted(base_bgr, 0.7, tint_layer, 0.3, 0)


# -----------------------
# Your data
# -----------------------

# Inputs used by this cell:
# t_list[0][2][-1]  -> flat mask, expected to have 8400 entries (checked below)
# img_img           -> sample image tensor picked from `batch` in an earlier cell
tensor_8400 = t_list[0][2][-1]
# Note: t_list[0][0] is not an image — it printed as a 3-element tensor above.

# Flatten tensor_8400 to 1-D (converting to a torch tensor first if needed)
if isinstance(tensor_8400, torch.Tensor):
    tensor_8400 = tensor_8400.view(-1)
else:
    tensor_8400 = torch.tensor(tensor_8400).view(-1)

# Fail early: the fixed 6400/1600/400 split below only makes sense for 8400 entries
if tensor_8400.numel() != 8400:
    raise ValueError(f"Expected 8400 elements in tensor_8400, got {tensor_8400.numel()}")

# Reshape to 80x80, 40x40, 20x20
# NOTE(review): assumes the entries are ordered largest grid first — confirm.
tensor_80x80 = tensor_8400[:6400].reshape(80, 80)
tensor_40x40 = tensor_8400[6400:8000].reshape(40, 40)
tensor_20x20 = tensor_8400[8000:].reshape(20, 20)

# Convert masks to 0/255 uint8 images
img_80x80 = bool_to_image(tensor_80x80)
img_40x40 = bool_to_image(tensor_40x40)
img_20x20 = bool_to_image(tensor_20x20)

# Convert original image to 2D grayscale numpy
img_gray = tensor_to_numpy_image(img_img)

# For safety, resize grayscale base to 640x640
img_gray = cv2.resize(img_gray, (640, 640), interpolation=cv2.INTER_LINEAR)

# Resize masks to match 640x640 (nearest-neighbour so the masks stay binary)
img_80x80_resized = cv2.resize(img_80x80, (640, 640), interpolation=cv2.INTER_NEAREST)
img_40x40_resized = cv2.resize(img_40x40, (640, 640), interpolation=cv2.INTER_NEAREST)
img_20x20_resized = cv2.resize(img_20x20, (640, 640), interpolation=cv2.INTER_NEAREST)

# -----------------------
# Create overlays (one tint colour per grid size)
# -----------------------
overlay_80x80 = create_overlay(img_gray, img_80x80_resized, 'red')
overlay_40x40 = create_overlay(img_gray, img_40x40_resized, 'green')
overlay_20x20 = create_overlay(img_gray, img_20x20_resized, 'blue')

# -----------------------
# Main visualization canvas
# -----------------------
# Layout: four square panels of `image_size` px in a row, `padding` px apart,
# on a white 3-channel canvas with room underneath for captions.
padding = 20
image_size = 200  # display size
total_width = image_size * 4 + 5 * padding
canvas_height = image_size + 150

canvas = np.full((canvas_height, total_width, 3), 255, dtype=np.uint8)

# Left edge of each of the four panels
x_positions = [padding + slot * (image_size + padding) for slot in range(4)]

# Shrink the base image (promoted to BGR) and the three overlays to panel size
img_640x640_small = cv2.cvtColor(
    cv2.resize(img_gray, (image_size, image_size)), cv2.COLOR_GRAY2BGR
)
overlay_80x80_small = cv2.resize(overlay_80x80, (image_size, image_size))
overlay_40x40_small = cv2.resize(overlay_40x40, (image_size, image_size))
overlay_20x20_small = cv2.resize(overlay_20x20, (image_size, image_size))

# Paste the panels left to right, 10 px below the top edge
panels = (img_640x640_small, overlay_80x80_small, overlay_40x40_small, overlay_20x20_small)
for left, panel in zip(x_positions, panels):
    canvas[10:10 + image_size, left:left + image_size] = panel

# Captions, coloured (BGR) to match the tint of the panel above them
captions = (
    ("Original", (0, 0, 0)),
    ("80x80 Mask", (0, 0, 255)),
    ("40x40 Mask", (0, 255, 0)),
    ("20x20 Mask", (255, 0, 0)),
)
for left, (caption, caption_bgr) in zip(x_positions, captions):
    cv2.putText(canvas, caption, (left, image_size + 30),
                cv2.FONT_HERSHEY_SIMPLEX, 0.5, caption_bgr, 1)

# Number of True cells per grid
true_count_80x80 = int(tensor_80x80.sum().item())
true_count_40x40 = int(tensor_40x40.sum().item())
true_count_20x20 = int(tensor_20x20.sum().item())

stats_80x80 = f"True: {true_count_80x80}/6400"
stats_40x40 = f"True: {true_count_40x40}/1600"
stats_20x20 = f"True: {true_count_20x20}/400"

# Statistics go under the three mask panels only (the first panel is the plain image)
for left, stats_text in zip(x_positions[1:], (stats_80x80, stats_40x40, stats_20x20)):
    cv2.putText(canvas, stats_text, (left, image_size + 50),
                cv2.FONT_HERSHEY_SIMPLEX, 0.4, (0, 0, 0), 1)

# -----------------------
# Mask visualization canvas
# -----------------------
# 220 px rather than 200: the statistics under the two 160 px panels are written
# on a baseline of y=205 (putText's origin is the text's bottom-left corner),
# which a 200-row canvas would cut off.
mask_canvas_height = 220
mask_total_width = 80 + 160 + 160 + 4 * padding
mask_canvas = np.ones((mask_canvas_height, mask_total_width), dtype=np.uint8) * 255

# Left edge of each of the three mask panels
mask_x_positions = [
    padding,
    padding + 80 + padding,
    padding + 80 + padding + 160 + padding
]

# Upscale the two coarser grids; nearest-neighbour keeps cell edges crisp
img_40x40_small = cv2.resize(img_40x40, (160, 160), interpolation=cv2.INTER_NEAREST)
img_20x20_small = cv2.resize(img_20x20, (160, 160), interpolation=cv2.INTER_NEAREST)

# Paste the panels (80x80 at native size, the other two upscaled to 160x160)
mask_canvas[10:90, mask_x_positions[0]:mask_x_positions[0] + 80] = img_80x80
mask_canvas[20:180, mask_x_positions[1]:mask_x_positions[1] + 160] = img_40x40_small
mask_canvas[20:180, mask_x_positions[2]:mask_x_positions[2] + 160] = img_20x20_small

# Grid-size labels just below each panel (colour 0 = black)
cv2.putText(mask_canvas, "80x80", (mask_x_positions[0], 100),
            cv2.FONT_HERSHEY_SIMPLEX, 0.5, 0, 1)
cv2.putText(mask_canvas, "40x40", (mask_x_positions[1], 190),
            cv2.FONT_HERSHEY_SIMPLEX, 0.5, 0, 1)
cv2.putText(mask_canvas, "20x20", (mask_x_positions[2], 190),
            cv2.FONT_HERSHEY_SIMPLEX, 0.5, 0, 1)

# Statistics one line below each label
cv2.putText(mask_canvas, stats_80x80, (mask_x_positions[0], 115),
            cv2.FONT_HERSHEY_SIMPLEX, 0.4, 0, 1)
cv2.putText(mask_canvas, stats_40x40, (mask_x_positions[1], 205),
            cv2.FONT_HERSHEY_SIMPLEX, 0.4, 0, 1)
cv2.putText(mask_canvas, stats_20x20, (mask_x_positions[2], 205),
            cv2.FONT_HERSHEY_SIMPLEX, 0.4, 0, 1)

# -----------------------
# Show windows (waitKey(0) blocks until a key is pressed)
# -----------------------
cv2.imshow("Image with Boolean Masks Overlay", canvas)
cv2.imshow("Boolean Masks Visualization", mask_canvas)
cv2.waitKey(0)
cv2.destroyAllWindows()

## now I want to feed the teacher predictions into the NMS process

In [361]:
print(t_list[1][0].shape, t_list[1][1].shape, t_list[1][2].shape)

torch.Size([4, 144, 80, 80]) torch.Size([4, 144, 40, 40]) torch.Size([4, 144, 20, 20])


In [362]:
def yolo11_decode(outputs, strides=(8, 16, 32), num_bins=16):
    """
    Decode raw per-level detection-head feature maps into one prediction tensor.

    Each level is a (B, C, H, W) map. Its first ``4 * num_bins`` channels are
    handed to ``dfl_expectation`` to obtain four box distances per cell; the
    remaining ``C - 4 * num_bins`` channels are class logits, passed through a
    sigmoid. With the defaults and C == 144 this is the 64 / 80 split.

    Args:
        outputs: sequence of feature maps, one per pyramid level.
        strides: stride of each level; must have the same length as ``outputs``.
        num_bins: number of distribution bins per box side.

    Returns:
        Tensor of shape (B, 4 + num_classes, N): decoded boxes scaled by the
        level stride, followed by class scores, concatenated over all levels.

    Raises:
        ValueError: if ``outputs`` is empty, its length differs from
            ``strides``, or a level has no channels left for class scores.

    NOTE(review): ``make_anchors``, ``dfl_expectation`` and ``dist2bbox`` are
    neither defined nor imported in the visible notebook cells — they must
    already exist in the kernel. Confirm where they come from and add the
    import/definition so the notebook survives Restart & Run All.
    """
    if len(outputs) == 0:
        raise ValueError("yolo11_decode needs at least one feature map")
    if len(outputs) != len(strides):
        raise ValueError(
            f"Got {len(outputs)} feature maps but {len(strides)} strides"
        )

    # Channels that hold the box-distance distribution (4 sides x num_bins)
    reg_channels = 4 * num_bins

    # 1) MAKE ANCHORS
    anchors, stride_tensor = make_anchors(outputs, strides)
    # Expected (1, 2, N) / (1, 1, N) if make_anchors returns (N, 2) points and
    # (N, 1) strides — TODO confirm against the helper actually in use.
    anchors = anchors.t().unsqueeze(0)
    stride_tensor = stride_tensor.t().unsqueeze(0)

    all_boxes, all_scores = [], []
    start = 0

    for feat in outputs:
        batch, channels, height, width = feat.shape
        num_classes = channels - reg_channels
        if num_classes <= 0:
            raise ValueError(
                f"Feature map with {channels} channels leaves no class channels "
                f"after {reg_channels} box channels"
            )
        end = start + height * width

        # Per-level slice of the anchors / strides computed for all levels
        anchors_slice = anchors[:, :, start:end]
        stride_slice = stride_tensor[:, :, start:end]

        # 2) DFL distances -> (B, 4, H*W)
        dist = dfl_expectation(feat[:, :reg_channels], num_bins).view(batch, 4, -1)

        # 3) class scores -> (B, num_classes, H*W)
        cls = feat[:, reg_channels:].sigmoid().view(batch, num_classes, -1)

        # 4) decode boxes in grid units, then scale by the level stride
        boxes = dist2bbox(dist, anchors_slice) * stride_slice

        all_boxes.append(boxes)
        all_scores.append(cls)

        start = end

    # Concatenate all pyramid levels along the cell axis
    boxes = torch.cat(all_boxes, dim=2)
    scores = torch.cat(all_scores, dim=2)

    return torch.cat([boxes, scores], dim=1)  # (B, 4 + num_classes, N)

In [363]:
result_ = yolo11_decode(t_list[1], strides=[8, 16, 32], num_bins=16)

In [364]:
result_.shape

torch.Size([4, 84, 8400])

In [365]:
preds[1][0].shape , t_list[1][0][3].unsqueeze(0).shape

(torch.Size([1, 144, 80, 80]), torch.Size([1, 144, 80, 80]))

In [366]:
# First feature level from three sources, to compare raw head outputs:
#   test_1 — the predictor's forward pass (keeps its leading batch axis of 1)
#   test_2 — t_list, item 3 of the batch
#   test_3 — s_list, item 3 of the batch
test_1 = preds[1][0]
test_2 = t_list[1][0][3]
test_3 = s_list[1][0][3]

In [367]:
print(test_1[0,:,0,0][:4], sep= "\n")
print(test_2[:,0,0][:4])
print(test_3[:,0,0][:4])

tensor([6.3727, 6.3577, 3.0764, 1.3201], device='cuda:0')
tensor([7.6250, 7.5312, 3.5957, 1.2539], device='cuda:0', dtype=torch.float16)
tensor([7.6250, 7.5312, 3.5957, 1.2539], device='cuda:0', dtype=torch.float16, grad_fn=<SliceBackward0>)


In [368]:
print(test_1[0,:,0,0][5:9], sep= "\n")
print(test_2[:,0,0][5:9], sep= "\n")
print(test_3[:,0,0][5:9], sep= "\n")

tensor([0.9528, 0.3598, 0.2781, 0.1830], device='cuda:0')
tensor([ 0.6533,  0.0303, -0.0391, -0.1104], device='cuda:0', dtype=torch.float16)
tensor([ 0.6533,  0.0303, -0.0391, -0.1104], device='cuda:0', dtype=torch.float16, grad_fn=<SliceBackward0>)


In [369]:
# item_2 = result[3][:, 0:1]

In [370]:
# item_2 == item_1

In [371]:
from ultralytics.utils.ops import non_max_suppression
from ultralytics.engine.results import Results

# 1) Run Ultralytics' non_max_suppression on the decoded predictions (result_).
#    The confidence and IoU thresholds are hard-coded here (0.5 / 0.9) instead of
#    taken from predictor.args; max_det and the agnostic flag still come from it.
#    NOTE: this rebinds `nms_output`, replacing the single-image result from the
#    earlier NMS cell with one entry per image of the batch.
nms_output = non_max_suppression(
    result_,
    conf_thres= 0.5, #predictor.args.conf,
    iou_thres= 0.9, #predictor.args.iou,
    max_det=predictor.args.max_det,
    classes=None,
    agnostic=predictor.args.agnostic_nms
)

In [372]:
nms_output[3]

tensor([[4.0100e+02, 1.8100e+02, 5.5350e+02, 4.6100e+02, 8.7305e-01, 2.3000e+01],
        [1.2425e+02, 1.0800e+02, 1.8625e+02, 1.5825e+02, 5.2441e-01, 0.0000e+00]], device='cuda:0')

In [373]:
import cv2
import torch
import numpy as np

# NMS output for one image (index 3 in the batch): tensor [num_dets, 6]
detections = nms_output[3]

# Convert the sample image to a NumPy array for OpenCV. This is deliberately not
# routed through a variable called `img_tensor`: that name holds the preprocessed
# model input from the predictor cell, and rebinding it here would make a re-run
# of the forward-pass cell use the wrong tensor.
img = img_img.detach().cpu().numpy()

# Handle tensor format conversion (CHW -> HWC)
if img.ndim == 3 and img.shape[0] in (1, 3):  # typical CHW
    img = img.transpose(1, 2, 0)

# Scale to 0-255 if the image looks normalised to 0-1, then store as uint8
if img.max() <= 1.0:
    img = (img * 255).astype(np.uint8)
else:
    img = img.astype(np.uint8)

# OpenCV draws and displays in BGR order (the source is assumed to be RGB)
img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)

# Draw detections (skipped entirely when there are none)
if detections is not None and len(detections):
    for det in detections.cpu().numpy():   # each det: [x1, y1, x2, y2, conf, cls]
        x1, y1, x2, y2, conf, cls = det

        # Pixel coordinates must be ints for the drawing calls
        x1, y1, x2, y2 = map(int, [x1, y1, x2, y2])

        cv2.rectangle(
            img,
            (x1, y1),
            (x2, y2),
            (0, 255, 0),
            2
        )

        # putText's origin is the text's bottom-left corner, so for boxes touching
        # the top edge `y1 - 10` would put the label off the image; clamp the
        # baseline so the label always stays visible.
        label = f"{int(cls)} {conf:.2f}"
        label_y = max(y1 - 10, 20)
        cv2.putText(
            img,
            label,
            (x1, label_y),
            cv2.FONT_HERSHEY_SIMPLEX,
            0.7,
            (0, 255, 0),
            2
        )

# Show the result in a native window; waitKey(0) blocks until a key is pressed
cv2.imshow("result", img)
cv2.waitKey(0)
cv2.destroyAllWindows()