In [1]:
import tensorflow as tf
import cv2 as cv
import numpy as np
from strong_sort.strong_sort import StrongSORT
model = tf.lite.Interpreter(r'models\yolo11n_float32.tflite')
model.allocate_tensors()



In [2]:
tracker = StrongSORT(
    model_weights = r'models\osnet_x0_25_msmt17.pt',
    device = "cpu",
    fp16 = True,
    max_dist=0.2,  # Appearance matching threshold (lower = stricter)
    max_iou_distance=0.7,
    max_age=70,  # Frames to keep lost tracks
    n_init=3,  # Frames to confirm a track
    nn_budget=100,
    mc_lambda=0.995,
    ema_alpha=0.9,
)

Model: osnet_x0_25
- params: 203,568
- flops: 82,316,000
Successfully loaded pretrained weights from "models\osnet_x0_25_msmt17.pt"
** The following layers are discarded due to unmatched keys or layer size: ['classifier.weight', 'classifier.bias']


  checkpoint = torch.load(fpath, map_location=map_location)


In [3]:
def deepSort(frame):
    input_details = model.get_input_details()[0]
    output_details = model.get_output_details()[0]

    frame_resized = cv.resize(frame, (input_details['shape'][2], input_details['shape'][1]))
    frame_norm = frame_resized.astype(np.float32) / 255.0
    model.set_tensor(input_details['index'], np.expand_dims(frame_norm, axis=0))
    model.invoke()
    
    raw_output = model.get_tensor(output_details['index']).squeeze().T

    class_probs = raw_output[:, 4:]
    confidence_score = np.max(class_probs, axis=1)
    class_id = np.argmax(class_probs, axis=1)
    bbox = raw_output[:, 0:4]

    conf_thres = 0.4
    mask = (confidence_score >= conf_thres) & (class_id == 0)
    filtered_box = bbox[mask]
    filtered_score = confidence_score[mask]

    if filtered_box.size == 0:
        return

    height, width = frame.shape[:2]
    x_center, y_center, w, h = filtered_box[:, 0], filtered_box[:, 1], filtered_box[:, 2], filtered_box[:, 3]
    x1 = (x_center - w / 2) * width
    y1 = (y_center - h / 2) * height
    x2 = (x_center + w / 2) * width
    y2 = (y_center + h / 2) * height

    boxes_nms = np.stack([x1, y1, x2 - x1, y2 - y1], axis=1).astype(np.float32).tolist()

    nms_thres = 0.4
    indices = cv.dnn.NMSBoxes(
        bboxes=boxes_nms,
        scores=filtered_score.tolist(),
        score_threshold=conf_thres,
        nms_threshold=nms_thres
    )

    detection = []
    if len(indices) > 0:
        indices = indices.flatten()
        for i in indices:
            x1_i, y1_i, x2_i, y2_i = int(x1[i]), int(y1[i]), int(x2[i]), int(y2[i])
            detection.append([x1_i, y1_i, x2_i - x1_i, y2_i - y1_i])
    else:
        return
    
    detection = np.array(detection)

    tracks = tracker.update(
        bbox_xywh = detection,
        classes = 0,
        confidences = filtered_score,
        ori_img=frame_resized
    )

    return tracks

In [4]:
def process_sort():
    vid = cv.VideoCapture(0)
    
    while vid.isOpened():
        _, frame = vid.read()
        if not _:
            print("Failed to capture frame")
            break
        original_h, original_w = frame.shape[:2]
        resized_frame = cv.resize(frame, (640, 640))
        
        tracker_obj = deepSort(resized_frame)

        for obj in tracker.tracker.tracks:
            x1, y1, x2, y2 = obj.to_tlbr()
            cv.rectangle(resized_frame, (x1, y1), (x2, y2), (0, 255, 0), 2)

        cv.imshow("Tracking", resized_frame)
        if cv.waitKey(1) & 0xFF == ord('q'):
            break
        
    cv.destroyAllWindows()
    vid.release()

In [49]:
process_sort()

RuntimeError: Can't call numpy() on Tensor that requires grad. Use tensor.detach().numpy() instead.

In [7]:
def perform():
    vid = cv.VideoCapture(0)

    while vid.isOpened():
        ret, frame = vid.read()
        if not ret:
            print("Failed to capture frame.")
            break

        img_resized = cv.resize(frame, (640, 640)).astype(np.float32) / 255.0
        inp = model.get_input_details()[0]
        out = model.get_output_details()[0]

        model.set_tensor(inp['index'], np.expand_dims(img_resized, axis=0))
        model.invoke()
        res = model.get_tensor(out['index']).squeeze().T

        bbox = res[:, 0:4]
        class_prob = res[:, 4:]
        class_id = np.argmax(class_prob, axis=1)
        confidence_score = np.max(class_prob, axis=1)

        conf_thres = 0.5
        mask = (class_id == 0) & (confidence_score >= conf_thres)

        filtered_box = bbox[mask]
        filtered_score = confidence_score[mask]

        if filtered_box.size == 0:
            print("No detections found.")
            cv.imshow("vid", frame)
            if cv.waitKey(1) & 0xFF == ord("q"):
                break
            continue

        x_center, y_center, w, h = filtered_box[:, 0], filtered_box[:, 1], filtered_box[:, 2], filtered_box[:, 3]
        x1 = (x_center - w / 2) * 640
        y1 = (y_center - h / 2) * 640
        x2 = (x_center + w / 2) * 640
        y2 = (y_center + h / 2) * 640

        nms_thres = 0.5
        boxes_nms = np.stack([x1, y1, x2 - x1, y2 - y1], axis=1).astype(np.float32).tolist()
        
        indices = cv.dnn.NMSBoxes(boxes_nms, filtered_score.tolist(), conf_thres, nms_thres)

        if len(indices) > 0:  # Ensure indices is not empty
            for i in indices.flatten():
                x1_i, y1_i, x2_i, y2_i = int(x1[i]), int(y1[i]), int(x2[i]), int(y2[i])
                cv.rectangle(frame, (x1_i, y1_i), (x2_i, y2_i), (0, 255, 0), 2)

        cv.imshow("vid", frame)
        if cv.waitKey(1) & 0xFF == ord("q"):
            break

    vid.release()
    cv.destroyAllWindows()


In [8]:
perform()

No detections found.
No detections found.
No detections found.
No detections found.
No detections found.
No detections found.
No detections found.
No detections found.
No detections found.
No detections found.
No detections found.
0
No detections found.
No detections found.
No detections found.
3
No detections found.
3
No detections found.
No detections found.
1
2
No detections found.
No detections found.
No detections found.
6
7
5
7
3
2
7
No detections found.
0
7
4
2
5
2
2
2
7
2
5
2
7
7
2
5
5
2
5
8
2
2
7
2
7
8
3
8
8
2
7
6
8
8
8
8
8
8
8
8
8
1
1
3
5
2
4
6
5
3
5
5
5


In [2]:
import cv2
import numpy as np
import onnxruntime
from strong_sort import StrongSORT
from strong_sort.utils.parser import get_config

# Load YOLOv11 ONNX model
onnx_model_path = r"models\yolo11n.onnx"
session = onnxruntime.InferenceSession(onnx_model_path, providers=['CPUExecutionProvider'])

# Initialize StrongSORT tracker with correct fp16 setting for CPU
tracker = StrongSORT(
    model_weights=r'models\osnet_x0_25_msmt17.pt',
    device="cpu",
    fp16=False  # Disabled for CPU compatibility
)

# Detection Parameters
CONFIDENCE_THRESHOLD = 0.5
NMS_THRESHOLD = 0.45

def preprocess_image(image, input_size=(640, 640)):
    """Resize and normalize image for YOLOv11 ONNX model."""
    img = cv2.resize(image, input_size)
    img = img.astype(np.float32) / 255.0
    img = np.transpose(img, (2, 0, 1))  # HWC to CHW
    img = np.expand_dims(img, axis=0)  # Add batch dimension
    return img

def postprocess_detections(outputs, img_shape, input_size=(640, 640)):
    """Parse YOLOv11 ONNX model output into bounding boxes."""
    boxes_xywh, scores, class_ids = [], [], []

    preds = outputs[0].squeeze(0).T  # Shape: (8400, 84)

    scale_x = img_shape[1] / input_size[0]  # Original width / model input width
    scale_y = img_shape[0] / input_size[1]  # Original height / model input height

    for pred in preds:
        class_scores = pred[4:]
        conf = np.max(class_scores)
        class_id = np.argmax(class_scores)

        if conf > CONFIDENCE_THRESHOLD:
            cx, cy, w, h = pred[:4]
            # Convert coordinates to original image scale
            x = (cx - w/2) * scale_x
            y = (cy - h/2) * scale_y
            width = w * scale_x
            height = h * scale_y

            boxes_xywh.append([x, y, width, height])
            scores.append(float(conf))
            class_ids.append(int(class_id))

    # Apply NMS with correct box format (x, y, w, h)
    indices = cv2.dnn.NMSBoxes(boxes_xywh, scores, CONFIDENCE_THRESHOLD, NMS_THRESHOLD)
    final_boxes, final_scores, final_class_ids = [], [], []

    if len(indices) > 0:
        for i in indices.flatten():
            x, y, w, h = boxes_xywh[i]
            final_boxes.append([int(x), int(y), int(x + w), int(y + h)])
            final_scores.append(scores[i])
            final_class_ids.append(class_ids[i])

    return final_boxes, final_scores, final_class_ids

def main():
    cap = cv2.VideoCapture(0)  # Webcam input

    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        
        # Preprocess and run inference
        input_tensor = preprocess_image(frame)
        outputs = session.run(None, {session.get_inputs()[0].name: input_tensor})

        # Postprocess detections
        boxes, scores, class_ids = postprocess_detections(outputs, frame.shape)

        # Prepare detections for tracking
        detections = []
        for box, score, class_id in zip(boxes, scores, class_ids):
            x1, y1, x2, y2 = box
            detections.append([x1, y1, x2-x1, y2-y1, score, class_id])

        detections = np.array(detections, dtype=np.float32) if detections else np.empty((0, 6), dtype=np.float32)

        # Update tracker with correct parameters
        if detections.size > 0:
            tracks = tracker.update(detections=detections, ori_img=frame)
        else:
            tracks = []

        # Draw tracked objects
        for track in tracks:
            track_id = track.track_id
            x, y, w, h = track.to_tlwh().astype(int)
            cv2.rectangle(frame, (x, y), (x + w, y + h), (0, 255, 0), 2)
            cv2.putText(frame, f"ID {track_id}", (x, y - 10),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 255, 0), 2)

        cv2.imshow("Tracking", frame)
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break

    cap.release()
    cv2.destroyAllWindows()

if __name__ == "__main__":
    main()

Model: osnet_x0_25
- params: 203,568
- flops: 82,316,000
Successfully loaded pretrained weights from "models\osnet_x0_25_msmt17.pt"
** The following layers are discarded due to unmatched keys or layer size: ['classifier.weight', 'classifier.bias']


TypeError: update() got an unexpected keyword argument 'detections'