In [64]:
from networks import  yolo_v11_n
import torch
import numpy as np
from skimage import io
from sahi.slicing import slice_image
from time import time
import torchvision
import cv2
import random
import os
from tqdm import tqdm
import onnxruntime as ort

In [26]:
def wh2xy(x):
    """Convert boxes from centre format (cx, cy, w, h) to corner format (x1, y1, x2, y2).

    ``x`` is indexed as a 2-D array/tensor whose first four columns hold the box.
    The input is left untouched; a copy (tensor clone or NumPy copy) is returned.
    """
    corners = x.clone() if isinstance(x, torch.Tensor) else np.copy(x)
    half_w = x[:, 2] / 2
    half_h = x[:, 3] / 2
    corners[:, 0] = x[:, 0] - half_w  # left edge
    corners[:, 1] = x[:, 1] - half_h  # top edge
    corners[:, 2] = x[:, 0] + half_w  # right edge
    corners[:, 3] = x[:, 1] + half_h  # bottom edge
    return corners

In [27]:
def resample():
    """Return one OpenCV interpolation flag picked at random from a fixed set of five."""
    interpolation_flags = (
        cv2.INTER_AREA,
        cv2.INTER_CUBIC,
        cv2.INTER_LINEAR,
        cv2.INTER_NEAREST,
        cv2.INTER_LANCZOS4,
    )
    return random.choice(interpolation_flags)


In [38]:
def preprocess_image(image_path, input_size=640):
    """Load an image and letterbox it into a square ``input_size`` x ``input_size`` tensor.

    Intended to mirror the preprocessing of the training pipeline (``Yolo_Dataset``,
    which is defined outside this notebook).

    Args:
        image_path: path handed to ``cv2.imread``.
        input_size: side length of the square network input.

    Returns:
        A tuple ``(tensor, (r, r), (left, top), (h, w))``: the float32 CHW RGB tensor
        scaled to [0, 1], the resize ratio (repeated for x and y), the left/top padding
        in pixels, and the original image height/width.

    Raises:
        ValueError: if OpenCV cannot read the file.
    """
    bgr = cv2.imread(image_path)
    if bgr is None:
        raise ValueError(f"Failed to load image: {image_path}")

    h, w = bgr.shape[:2]

    # A single ratio for both axes keeps the aspect ratio; the longer side fills the input.
    r = min(input_size / h, input_size / w)
    new_w, new_h = int(w * r), int(h * r)
    if (h, w) != (new_h, new_w):
        # cv2.resize takes dsize as (width, height).
        bgr = cv2.resize(bgr, dsize=(new_w, new_h), interpolation=cv2.INTER_LINEAR)

    # Split the leftover space between the two sides; an odd pixel goes to bottom/right.
    gap_h = input_size - new_h
    gap_w = input_size - new_w
    top = int(gap_h / 2)
    bottom = int(gap_h / 2 + gap_h % 2)
    left = int(gap_w / 2)
    right = int(gap_w / 2 + gap_w % 2)
    padded = cv2.copyMakeBorder(bgr, top, bottom, left, right, cv2.BORDER_CONSTANT, value=0)

    # HWC -> CHW, then reverse the channel axis (BGR -> RGB).
    chw_rgb = padded.transpose((2, 0, 1))[::-1]
    scaled = np.ascontiguousarray(chw_rgb) / 255.0  # values in [0, 1]
    tensor = torch.from_numpy(scaled).float()

    return tensor, (r, r), (left, top), (h, w)

In [29]:

def non_max_suppression(outputs, confidence_threshold=0.001, iou_threshold=0.65):
    """Filter raw predictions by confidence and apply class-aware NMS per image.

    Args:
        outputs: tensor indexed as (batch, 4 + num_classes, num_candidates). The first
            four channels are treated as a (cx, cy, w, h) box, the remaining channels
            as per-class scores.
        confidence_threshold: minimum class score for a candidate to be kept.
        iou_threshold: IoU threshold passed to ``torchvision.ops.nms``.

    Returns:
        A list with one tensor per image, each of shape (n, 6) with rows
        (x1, y1, x2, y2, confidence, class). Images without detections, and images
        skipped because the time limit was hit, get an empty (0, 6) tensor.
    """
    max_wh = 7680    # per-class coordinate offset that keeps classes apart during NMS
    max_det = 300    # detections kept per image after NMS
    max_nms = 30000  # boxes handed to torchvision.ops.nms at most

    bs = outputs.shape[0]  # batch size
    nc = outputs.shape[1] - 4  # number of classes
    xc = outputs[:, 4:4 + nc].amax(1) > confidence_threshold  # candidates

    start = time()
    limit = 0.5 + 0.05 * bs  # seconds to quit after
    # One independent placeholder per image. `[tensor] * bs` would put the *same*
    # tensor object in every slot, so callers could not treat the entries as separate.
    output = [torch.zeros((0, 6), device=outputs.device) for _ in range(bs)]
    for index, x in enumerate(outputs):  # image index, image inference
        x = x.transpose(0, -1)[xc[index]]  # keep candidate rows only

        # If none remain process next image
        if not x.shape[0]:
            continue

        # matrix nx6 (box, confidence, cls)
        box, cls = x.split((4, nc), 1)
        box = wh2xy(box)  # (cx, cy, w, h) to (x1, y1, x2, y2)
        if nc > 1:
            # Multi-label: one row per (box, class) pair whose score passes the threshold.
            i, j = (cls > confidence_threshold).nonzero(as_tuple=False).T
            x = torch.cat((box[i], x[i, 4 + j, None], j[:, None].float()), 1)
        else:  # best class only
            conf, j = cls.max(1, keepdim=True)
            x = torch.cat((box, conf, j.float()), 1)[conf.view(-1) > confidence_threshold]

        if not x.shape[0]:  # no boxes
            continue
        x = x[x[:, 4].argsort(descending=True)[:max_nms]]  # sort by confidence and remove excess boxes

        # Batched NMS: shifting boxes by class id * max_wh prevents cross-class suppression.
        c = x[:, 5:6] * max_wh  # classes
        boxes, scores = x[:, :4] + c, x[:, 4]  # boxes, scores
        indices = torchvision.ops.nms(boxes, scores, iou_threshold)  # NMS
        indices = indices[:max_det]  # limit detections

        output[index] = x[indices]
        if (time() - start) > limit:
            skipped = bs - index - 1
            if skipped:
                # Make the truncation visible instead of silently returning empty results.
                print(f"NMS time limit {limit:.3f}s exceeded; {skipped} image(s) left without detections")
            break  # time limit exceeded

    return output

In [65]:
# Open the exported ONNX model with ONNX Runtime, requesting the CUDA execution provider.
onnx = ort.InferenceSession(
    "/mnt/DATA/gits/yolov11_scratch/weights/my_model.onnx",
    providers=['CUDAExecutionProvider'],
)

In [56]:
# Build the network; the argument is presumably the number of classes (TODO confirm
# against networks.yolo_v11_n).
model = yolo_v11_n(3)
# NOTE(review): weights_only=False lets torch.load unpickle arbitrary objects, so only use
# it on checkpoints you trust. If the file is a plain state_dict, prefer weights_only=True.
# map_location='cpu' lets a checkpoint saved from GPU tensors load on a host without CUDA;
# load_state_dict copies the values into the model's existing parameters either way.
model.load_state_dict(
    torch.load(
        '/mnt/DATA/gits/yolov11_scratch/weights/last_model.pt',
        map_location='cpu',
        weights_only=False,
    )
)
# Switch to inference mode (as the last expression, this also displays the module repr).
model.eval()

YOLO(
  (net): DarkNet(
    (p1): Sequential(
      (0): Conv(
        (conv): Conv2d(3, 16, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
        (norm): BatchNorm2d(16, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
        (relu): SiLU()
      )
    )
    (p2): Sequential(
      (0): Conv(
        (conv): Conv2d(16, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
        (norm): BatchNorm2d(32, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
        (relu): SiLU()
      )
      (1): CSP(
        (conv1): Conv(
          (conv): Conv2d(32, 32, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(32, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
          (relu): SiLU()
        )
        (conv2): Conv(
          (conv): Conv2d(48, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(64, eps=0.001, momentum=0.03, affine=True, track_running_stats=

In [42]:
# Letterbox one training image and unpack the tensor, resize ratio (same value for x and y),
# left/top padding and original height/width.
img, (r, r), (left, top), (h, w) = preprocess_image(
    '/mnt/DATA/DATASETS/data/dlpj/slp/train/images/DJI_20240113121622_0064_T_-_JPG.rf.3aeb1701897dea0a9760b34c966d5873.jpg'
)
# Disabled experiment: tiling the image with SAHI's slice_image.
# slices = slice_image(img,slice_height=640, slice_width=640)

In [46]:
# Sanity check: the preprocessed tensor should be (channels, input_size, input_size).
img.shape

torch.Size([3, 640, 640])

In [70]:
# Run the ONNX Runtime session on the same image (batch axis added, converted to NumPy),
# feeding it to the session's first declared input and requesting all outputs.
out_onnx = onnx.run(
    None,
    {onnx.get_inputs()[0].name: img.unsqueeze(0).numpy()},
)

In [57]:
# Forward pass of the PyTorch model on a single-image batch. Autograd is switched off
# because this is inference only; without it the predictions (and everything derived
# from them, e.g. the NMS result) keep a gradient graph alive.
with torch.no_grad():
    out = model(img.unsqueeze(0))

In [58]:
# Count candidates whose best score over channels 4..6 exceeds 0.25. This mirrors the
# candidate mask in non_max_suppression; the 3 is presumably the class count passed to
# yolo_v11_n(3) — TODO confirm.
(out[:, 4:4 + 3].amax(1) > 0.25).sum()

tensor(6)

In [59]:
# Apply confidence filtering and NMS to the raw predictions; the cell displays the
# resulting per-image list of (x1, y1, x2, y2, confidence, class) rows.
non_max_suppression(out, confidence_threshold=0.25, iou_threshold=0.65)

[tensor([[338.7303, 234.4133, 352.5556, 247.7118,   0.6577,   0.0000],
         [326.9105, 233.5493, 344.1747, 246.1192,   0.5875,   0.0000],
         [326.6710, 232.8117, 353.6649, 248.0241,   0.3790,   0.0000]],
        grad_fn=<IndexBackward0>)]

In [62]:
def draw_detections(image, detections, class_names=None):
    """Draw bounding boxes and labels on the image.

    Args:
        image: image array OpenCV can draw on; it is annotated in place.
        detections: iterable of rows ``(x1, y1, x2, y2, conf, cls)``.
        class_names: optional sequence/mapping indexed by ``int(cls)``; when ``None``
            the numeric class id is used as the label text.

    Returns:
        The same ``image`` object that was passed in.
    """
    for det in detections:
        x1, y1, x2, y2, conf, cls = det
        x1, y1, x2, y2 = map(int, [x1, y1, x2, y2])
        label = f"{int(cls)} {conf:.2f}" if class_names is None else f"{class_names[int(cls)]} {conf:.2f}"
        cv2.rectangle(image, (x1, y1), (x2, y2), (0, 255, 0), 2)

        # putText's origin is the bottom-left corner of the text. For boxes near the top
        # edge, y1 - 10 would place the label (partly) above the image, so keep the
        # baseline at least one text height below the top border.
        (_, text_height), _ = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 2)
        label_y = max(y1 - 10, text_height)
        cv2.putText(image, label, (x1, label_y), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)
    return image

def inference(model, image_paths, input_size=640, conf_thres=0.25, iou_thres=0.65, output_dir="output"):
    """Run the detector on each image and save an annotated copy to ``output_dir``.

    Args:
        model: callable network; it is put in eval mode and called on a 1-image batch.
        image_paths: iterable of image file paths.
        input_size: square input side length forwarded to ``preprocess_image``.
        conf_thres: confidence threshold forwarded to ``non_max_suppression``.
        iou_thres: IoU threshold forwarded to ``non_max_suppression``.
        output_dir: directory for the annotated images (created if missing). Files keep
            the basename of their source image.

    Returns:
        None. Results are written to disk and progress is printed.
    """
    model.eval()
    os.makedirs(output_dir, exist_ok=True)

    for image_path in tqdm(image_paths, desc="Inference"):
        # Preprocess image
        image_tensor, ratio, pad, orig_shape = preprocess_image(image_path, input_size)
        image_tensor = image_tensor.unsqueeze(0)  # Add batch dimension

        # Inference
        with torch.no_grad():
            outputs = model(image_tensor)
            outputs = non_max_suppression(outputs, confidence_threshold=conf_thres, iou_threshold=iou_thres)

        detections = outputs[0]  # only one image in the batch
        has_detections = detections.shape[0] > 0
        if has_detections:
            # Undo the letterbox (subtract padding, divide by the resize ratio) and clip
            # to the original image so boxes never extend past its borders.
            orig_h, orig_w = orig_shape
            detections[:, [0, 2]] = ((detections[:, [0, 2]] - pad[0]) / ratio[0]).clamp(0, orig_w)  # x1, x2
            detections[:, [1, 3]] = ((detections[:, [1, 3]] - pad[1]) / ratio[1]).clamp(0, orig_h)  # y1, y2

        # Load original image for visualization
        orig_image = cv2.imread(image_path)
        if has_detections:
            orig_image = draw_detections(orig_image, detections)
        else:
            print(f"No detections for {image_path}")

        # Save output; cv2.imwrite reports failure through its return value.
        output_path = os.path.join(output_dir, os.path.basename(image_path))
        if not cv2.imwrite(output_path, orig_image):
            print(f"Failed to write {output_path}")

    print(f"Results saved to {output_dir}")

In [63]:
# Settings for this run.
input_size = 640
conf_thres = 0.1  # below inference()'s 0.25 default, so more low-confidence boxes are kept
iou_thres = 0.5   # below the 0.65 default; a lower IoU threshold makes NMS suppress MORE overlapping boxes
output_dir = "output/inference_results"

# Annotate a single training image and write the result under output_dir.
inference(
    model,
    ['/mnt/DATA/DATASETS/data/dlpj/slp/train/images/DJI_20240113121622_0064_T_-_JPG.rf.3aeb1701897dea0a9760b34c966d5873.jpg'],
    input_size,
    conf_thres,
    iou_thres,
    output_dir,
)

Inference: 100%|██████████| 1/1 [00:00<00:00,  7.27it/s]

Results saved to output/inference_results



