## Baseline using Yolov8n


In [3]:
from ultralytics import YOLO
import cv2

model = YOLO("yolov8n.pt")
cap = cv2.VideoCapture("/kaggle/input/crowd-5s/crowd_5s.mp4")
ret, frame = cap.read()
if ret:
    results = model(frame)
    annotated = results[0].plot()  # ← стандартная отрисовка YOLO
    cv2.imwrite("test_frame.jpg", annotated)
cap.release()

Creating new Ultralytics Settings v0.0.6 file ✅ 
View Ultralytics Settings with 'yolo settings' or at '/root/.config/Ultralytics/settings.json'
Update Settings with 'yolo settings key=value', i.e. 'yolo settings runs_dir=path/to/dir'. For help see https://docs.ultralytics.com/quickstart/#ultralytics-settings.
[KDownloading https://github.com/ultralytics/assets/releases/download/v8.4.0/yolov8n.pt to 'yolov8n.pt': 100% ━━━━━━━━━━━━ 6.2MB 76.5MB/s 0.1s

0: 384x640 12 persons, 1 stop sign, 2 umbrellas, 65.0ms
Speed: 4.9ms preprocess, 65.0ms inference, 34.2ms postprocess per image at shape (1, 3, 384, 640)


видно что есть лишние объекты и слишком жирная рамка

In [None]:
import cv2

cap = cv2.VideoCapture("../crowd.mp4")

if not cap.isOpened():
    raise IOError("Cannot open video file")

width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
fps = cap.get(cv2.CAP_PROP_FPS)
frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

print(f"Resolution: {width}x{height}")
print(f"FPS: {fps}")
print(f"Total frames: {frame_count}")

cap.release()

In [None]:
from ultralytics import YOLO
import cv2

model = YOLO("yolov8n.pt")

cap = cv2.VideoCapture("./crowd_5s.mp4")
ret, frame = cap.read()

if not ret:
    raise ValueError("Failed to read frame from video!")

results = model(frame, imgsz=1280)
boxes = results[0].boxes

annotated_frame = frame.copy()

for box in boxes:
    cls_id = int(box.cls.item())
    conf = float(box.conf.item())
    xyxy = box.xyxy[0].cpu().numpy()

    if cls_id == 0 and conf > 0.1:
        x1, y1, x2, y2 = map(int, xyxy)
        cv2.rectangle(annotated_frame, (x1, y1), (x2, y2), color=(0, 255, 0), thickness=2)
        label = f"person {conf:.2f}"
        cv2.putText(
            annotated_frame,
            label,
            (x1, y1 - 10),
            cv2.FONT_HERSHEY_SIMPLEX,
            fontScale=0.5,
            color=(0, 255, 0),
            thickness=1,
            lineType=cv2.LINE_AA
        )

cv2.imwrite("annotated_frame1.jpg", annotated_frame)
cap.release()

print("✅ Frame processed and saved as 'annotated_frame.jpg'")

## Сравнение быстродействия трех подходов - Yolov8n , Yolov8s + SAHI , RT-DETR от Baidu 

In [6]:
import cv2
import time
from ultralytics import YOLO, RTDETR
from sahi import AutoDetectionModel
from sahi.predict import get_sliced_prediction
import matplotlib.pyplot as plt

In [7]:
def draw_boxes(frame, boxes, label="person"):
    """Draw bounding boxes and labels on frame."""
    annotated = frame.copy()
    for xyxy, conf in boxes:
        x1, y1, x2, y2 = map(int, xyxy)
        cv2.rectangle(annotated, (x1, y1), (x2, y2), (0, 255, 0), 2)
        text = f"{label} {conf:.2f}"
        cv2.putText(
            annotated, text, (x1, y1 - 10),
            cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 1, cv2.LINE_AA
        )
    return annotated

def extract_person_detections(results):
    """Extract person detections from YOLO/RT-DETR results."""
    boxes = []
    for box in results[0].boxes:
        cls_id = int(box.cls.item())
        conf = float(box.conf.item())
        if cls_id == 0 and conf > 0.3:
            xyxy = box.xyxy[0].cpu().numpy()
            boxes.append((xyxy, conf))
    return boxes

def extract_person_detections_sahi(result):
    """Extract person detections from SAHI result."""
    boxes = []
    for obj in result.object_prediction_list:
        if obj.category.name == "person":
            conf = obj.score.value
            if conf > 0.3:
                x1, y1, x2, y2 = obj.bbox.to_voc_bbox()
                boxes.append(([x1, y1, x2, y2], conf))
    return boxes

In [8]:
VIDEO_PATH = "/kaggle/input/crowd-5s/crowd_5s.mp4"
N_FRAMES = 30

cap = cv2.VideoCapture(VIDEO_PATH)
frames = []
for _ in range(N_FRAMES):
    ret, frame = cap.read()
    if not ret:
        break
    frames.append(frame)
cap.release()



### Yolov8n

In [10]:
model_yolo = YOLO("yolov8n.pt")
model_yolo.to('cuda')
times_yolo = []
last_detections_yolo = []

for i, frame in enumerate(frames):
    start = time.time()
    results = model_yolo(frame, imgsz=1280, verbose=False)
    boxes = extract_person_detections(results)
    elapsed = time.time() - start
    times_yolo.append(elapsed)
    if i == len(frames) - 1:
        last_detections_yolo = boxes
        img_yolo = draw_boxes(frame, boxes)
        cv2.imwrite("yolo8.jpg", img_yolo)

avg_time_yolo = sum(times_yolo) / len(times_yolo)
print(f"YOLOv8n — avg time: {avg_time_yolo:.3f}s, persons in last frame: {len(last_detections_yolo)}")

YOLOv8n — avg time: 0.020s, persons in last frame: 16


### Yolo26

In [27]:
model_yolo = YOLO("yolo26l.pt")
model_yolo.to('cuda')
times_yolo = []
last_detections_yolo = []

for i, frame in enumerate(frames):
    start = time.time()
    results = model_yolo(frame, imgsz=1280, verbose=False)
    boxes = extract_person_detections(results)
    elapsed = time.time() - start
    times_yolo.append(elapsed)
    if i == len(frames) - 1:
        last_detections_yolo = boxes
        img_yolo = draw_boxes(frame, boxes)
        cv2.imwrite("yolov26.jpg", img_yolo)

avg_time_yolo = sum(times_yolo) / len(times_yolo)
print(f"YOLO26l — avg time: {avg_time_yolo:.3f}s, persons in last frame: {len(last_detections_yolo)}")

YOLO26l — avg time: 0.286s, persons in last frame: 20


### Yolov8 + SAHI

In [22]:
sahi_model = AutoDetectionModel.from_pretrained(
    model_type="yolov8",
    model_path="yolov8s.pt",
    confidence_threshold=0.3,
    device="cuda"
)

times_sahi = []
last_detections_sahi = []

for i, frame in enumerate(frames):
    start = time.time()
    result = get_sliced_prediction(
        image=frame,
        detection_model=sahi_model,
        slice_height=512,
        slice_width=512,
        overlap_height_ratio=0.2,
        overlap_width_ratio=0.2
    )
    boxes = extract_person_detections_sahi(result)
    elapsed = time.time() - start
    times_sahi.append(elapsed)
    if i == len(frames) - 1:
        last_detections_sahi = boxes
        img_sahi = draw_boxes(frame, boxes)
        cv2.imwrite("img_sahi.jpg", img_sahi)


avg_time_sahi = sum(times_sahi) / len(times_sahi)
print(f"SAHI (YOLOv8m) — avg time: {avg_time_sahi:.3f}s, persons in last frame: {len(last_detections_sahi)}")

Performing prediction on 15 slices.
Performing prediction on 15 slices.
Performing prediction on 15 slices.
Performing prediction on 15 slices.
Performing prediction on 15 slices.
Performing prediction on 15 slices.
Performing prediction on 15 slices.
Performing prediction on 15 slices.
Performing prediction on 15 slices.
Performing prediction on 15 slices.
Performing prediction on 15 slices.
Performing prediction on 15 slices.
Performing prediction on 15 slices.
Performing prediction on 15 slices.
Performing prediction on 15 slices.
Performing prediction on 15 slices.
Performing prediction on 15 slices.
Performing prediction on 15 slices.
Performing prediction on 15 slices.
Performing prediction on 15 slices.
Performing prediction on 15 slices.
Performing prediction on 15 slices.
Performing prediction on 15 slices.
Performing prediction on 15 slices.
Performing prediction on 15 slices.
Performing prediction on 15 slices.
Performing prediction on 15 slices.
Performing prediction on 15 

### RTDETR

In [23]:
model_rtdetr = RTDETR("rtdetr-l.pt")
model_rtdetr.to("cuda")

times_rtdetr = []
last_detections_rtdetr = []

for i, frame in enumerate(frames):
    start = time.time()
    results = model_rtdetr(frame, imgsz=1280, verbose=False)
    boxes = extract_person_detections(results)  # same format as YOLO
    elapsed = time.time() - start
    times_rtdetr.append(elapsed)
    if i == len(frames) - 1:
        last_detections_rtdetr = boxes
        img_rtdetr = draw_boxes(frame, boxes)
        cv2.imwrite("img_rtdetr.jpg", img_rtdetr)

avg_time_rtdetr = sum(times_rtdetr) / len(times_rtdetr)
print(f"RT-DETR — avg time: {avg_time_rtdetr:.3f}s, persons in last frame: {len(last_detections_rtdetr)}")

RT-DETR — avg time: 0.410s, persons in last frame: 26


### RTDETR + SAHI

In [24]:
sahi_model = AutoDetectionModel.from_pretrained(
    model_type="ultralytics",
    model_path="rtdetr-l.pt",
    device="cuda"
)

times_sahi = []
last_detections_sahi = []

for i, frame in enumerate(frames):
    start = time.time()
    result = get_sliced_prediction(
        image=frame,
        detection_model=sahi_model,
        slice_height=512,
        slice_width=512,
        overlap_height_ratio=0.2,
        overlap_width_ratio=0.2
    )
    boxes = extract_person_detections_sahi(result)
    elapsed = time.time() - start
    times_sahi.append(elapsed)
    if i == len(frames) - 1:
        last_detections_sahi = boxes
        img_sahi = draw_boxes(frame, boxes)
        cv2.imwrite("img_sahi_rtder.jpg", img_sahi)


avg_time_sahi = sum(times_sahi) / len(times_sahi)
print(f"SAHI (YOLOv8m) — avg time: {avg_time_sahi:.3f}s, persons in last frame: {len(last_detections_sahi)}")

Performing prediction on 15 slices.
Performing prediction on 15 slices.
Performing prediction on 15 slices.
Performing prediction on 15 slices.
Performing prediction on 15 slices.
Performing prediction on 15 slices.
Performing prediction on 15 slices.
Performing prediction on 15 slices.
Performing prediction on 15 slices.
Performing prediction on 15 slices.
Performing prediction on 15 slices.
Performing prediction on 15 slices.
Performing prediction on 15 slices.
Performing prediction on 15 slices.
Performing prediction on 15 slices.
Performing prediction on 15 slices.
Performing prediction on 15 slices.
Performing prediction on 15 slices.
Performing prediction on 15 slices.
Performing prediction on 15 slices.
Performing prediction on 15 slices.
Performing prediction on 15 slices.
Performing prediction on 15 slices.
Performing prediction on 15 slices.
Performing prediction on 15 slices.
Performing prediction on 15 slices.
Performing prediction on 15 slices.
Performing prediction on 15 

# Видеопоток


Для лучшего сочетания сохраним видео.

In [11]:
import cv2
import os
from ultralytics import RTDETR
from sahi import AutoDetectionModel
from sahi.predict import get_sliced_prediction


sahi_model = AutoDetectionModel.from_pretrained(
    model_type="rtdetr",
    model_path="rtdetr-l.pt",
    confidence_threshold=0.35,
    device="cuda"
)

input_video_path = "/kaggle/input/full-crowd/crowd.mp4"
output_video_path = "output_sahi_rtdetr.avi"

if not os.path.exists(input_video_path):
    raise FileNotFoundError(f"Input video not found: {input_video_path}")

cap = cv2.VideoCapture(input_video_path)
if not cap.isOpened():
    raise IOError(f"Cannot open video: {input_video_path}")

fps = int(cap.get(cv2.CAP_PROP_FPS))
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

fourcc = cv2.VideoWriter_fourcc(*"XVID")
out = cv2.VideoWriter(output_video_path, fourcc, fps, (width, height))

frame_count = 0
while True:
    ret, frame = cap.read()
    if not ret:
        break

    result = get_sliced_prediction(
        image=frame,
        detection_model=sahi_model,
        slice_height=512,
        slice_width=512,
        overlap_height_ratio=0.2,
        overlap_width_ratio=0.2,
        verbose=0
    )

    boxes = extract_person_detections_sahi(result)
    annotated_frame = draw_boxes(frame, boxes)
    out.write(annotated_frame)
    frame_count += 1

cap.release()
out.release()


[KDownloading https://github.com/ultralytics/assets/releases/download/v8.4.0/rtdetr-l.pt to 'rtdetr-l.pt': 100% ━━━━━━━━━━━━ 63.4MB 179.2MB/s 0.4s0.3s<0.1s
