In [1]:
import torch

DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
MODEL_TYPE = "vit_t"

Load the YOLO model

In [2]:
from ultralytics import YOLO

In [3]:
model = YOLO("yolov8n.pt")
model.to(DEVICE)

YOLO(
  (model): DetectionModel(
    (model): Sequential(
      (0): Conv(
        (conv): Conv2d(3, 16, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
        (bn): BatchNorm2d(16, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
        (act): SiLU(inplace=True)
      )
      (1): Conv(
        (conv): Conv2d(16, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
        (bn): BatchNorm2d(32, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
        (act): SiLU(inplace=True)
      )
      (2): C2f(
        (cv1): Conv(
          (conv): Conv2d(32, 32, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn): BatchNorm2d(32, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
          (act): SiLU(inplace=True)
        )
        (cv2): Conv(
          (conv): Conv2d(48, 32, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn): BatchNorm2d(32, eps=0.001, momentum=0.03, affine=True, track_running_s

Load the TinySAM

In [6]:
from tinysam import sam_model_registry, SamPredictor

sam = sam_model_registry[MODEL_TYPE](checkpoint="./weights/tinysam.pth")
sam.to(DEVICE)

Sam(
  (image_encoder): TinyViT(
    (patch_embed): PatchEmbed(
      (seq): Sequential(
        (0): Conv2d_BN(
          (c): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
          (bn): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (1): GELU(approximate='none')
        (2): Conv2d_BN(
          (c): Conv2d(32, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
          (bn): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
    )
    (layers): ModuleList(
      (0): ConvLayer(
        (blocks): ModuleList(
          (0-1): 2 x MBConv(
            (conv1): Conv2d_BN(
              (c): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
              (bn): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            )
            (act1): GELU(approximate='none')
            (conv2): Conv2d_BN(
 

In [10]:
sam_predictor = SamPredictor(sam)

Video inference

In [9]:
import cv2
import numpy as np

In [12]:
video_path = "fight_2.mp4"
cap = cv2.VideoCapture(video_path)

output_path = "result_2.avi"
fourcc = cv2.VideoWriter_fourcc(*'XVID')
fps = int(cap.get(cv2.CAP_PROP_FPS))
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))

while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        break

    composite_mask = np.zeros_like(frame, dtype=np.uint8)

    objects = model(frame, classes=[0], conf=0.6)

    for obj in objects:
        boxes = obj.boxes
        cls = boxes.cls

        classes = ['person']

        for i in range(len(boxes)):
            class_name = classes[int(cls[i])]
            print(class_name)

            #Extort the box coordination
            xyxy = boxes.xyxy[i]
            x1, y1, x2, y2 = xyxy.cpu().numpy()
            
            #Plot the rectangle on to the frame
            cv2.rectangle(frame, (int(x1), int(y1)), (int(x2), int(y2)), (0, 255, 0), 2)

            #Add text to the rectangle
            text = class_name
            font = cv2.FONT_HERSHEY_SIMPLEX
            font_scale = 0.5
            thickness = 2
            text_size, _ = cv2.getTextSize(text, font, font_scale, thickness)
            text_x = int(x1 + 5)
            text_y = int(y1 + text_size[1] + 5)
            cv2.putText(frame, text, (text_x, text_y), font, font_scale, (0, 0, 255), thickness)

            """
            Use the box and inference it to the SAM model
            """

            sam_predictor.set_image(frame)
            input_box = np.array(xyxy.cpu().numpy())

            masks, _, _ = sam_predictor.predict(
                point_coords=None,
                point_labels=None,
                box=input_box[None, :],
            )

            mask = masks[0]
            colored_mask = np.zeros_like(frame)
            colored_mask[mask > 0] = [0, 255, 0]
            composite_mask = cv2.add(composite_mask, colored_mask)
    
    alpha = 0.5
    frame = cv2.addWeighted(frame, 1 - alpha, composite_mask, alpha, 0)

    out.write(frame)
    
    cv2.imshow("Frame with person segmentation", frame)

    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

cap.release()
cv2.destroyAllWindows()


0: 384x640 4 persons, 40.3ms
Speed: 4.9ms preprocess, 40.3ms inference, 7.6ms postprocess per image at shape (1, 3, 384, 640)
person
person
person
person

0: 384x640 4 persons, 8.9ms
Speed: 2.4ms preprocess, 8.9ms inference, 2.3ms postprocess per image at shape (1, 3, 384, 640)
person
person
person
person

0: 384x640 4 persons, 17.8ms
Speed: 3.0ms preprocess, 17.8ms inference, 2.2ms postprocess per image at shape (1, 3, 384, 640)
person
person
person
person

0: 384x640 4 persons, 37.7ms
Speed: 3.8ms preprocess, 37.7ms inference, 6.2ms postprocess per image at shape (1, 3, 384, 640)
person
person
person
person

0: 384x640 4 persons, 13.3ms
Speed: 2.5ms preprocess, 13.3ms inference, 2.5ms postprocess per image at shape (1, 3, 384, 640)
person
person
person
person

0: 384x640 4 persons, 8.6ms
Speed: 1.8ms preprocess, 8.6ms inference, 2.2ms postprocess per image at shape (1, 3, 384, 640)
person
person
person
person

0: 384x640 4 persons, 11.8ms
Speed: 2.5ms preprocess, 11.8ms inference, 3