In [2]:
import time
from pathlib import Path
from tqdm import tqdm

import cv2
import torch
import numpy as np
import pandas as pd
from ultralytics import YOLO
from torchvision.models.detection import ssd300_vgg16, SSD300_VGG16_Weights, fasterrcnn_resnet50_fpn, FasterRCNN_ResNet50_FPN_Weights
import torchvision.transforms.functional as F

In [3]:
# All input clips sit in one folder, so each path is derived from that base.
_VIDEO_DIR = Path("data/videos")

CHANGEABLE_CONDITIONS_VIDEO_PATH = _VIDEO_DIR / "changeable_conditions.mp4"
# NOTE: "WHEATHER" is a misspelling; the name is kept because a later cell uses it.
CLEAR_WHEATHER_VIDEO_PATH = _VIDEO_DIR / "clear_weather.mp4"
DARK_VIDEO_PATH = _VIDEO_DIR / "dark_video.mp4"
POOR_WEATHER_VIDEO_PATH = _VIDEO_DIR / "poor_weather.mp4"

In [4]:
# Confidence and IoU thresholds used as defaults by the evaluation functions
CONF_TH, IOU_TH = 0.3, 0.45

# Index of the "person" class (COCO) in each model's label space
YOLO_PERSON_ID = 0       # COCO: 0 is person
SSD_PERSON_LABEL = 1     # COCO: 1 is person
FASTER_PERSON_LABEL = 1  # COCO: 1 is person

# Use the GPU when CUDA is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
device

device(type='cpu')

In [26]:
# YOLOv8 (ultralytics), loaded from the "yolov8n.pt" checkpoint
yolo_model = YOLO("yolov8n.pt")

# SSD300: default torchvision weights, moved to `device` and switched to eval mode
ssd_weights = SSD300_VGG16_Weights.DEFAULT
ssd_model = ssd300_vgg16(weights=ssd_weights).to(device).eval()
SSD_CATEGORIES = ssd_weights.meta["categories"]

# Faster R-CNN: default torchvision weights, moved to `device` and switched to eval mode
fasterrcnn_weights = FasterRCNN_ResNet50_FPN_Weights.DEFAULT
fasterrcnn_model = fasterrcnn_resnet50_fpn(weights=fasterrcnn_weights).to(device).eval()
FASTER_CATEGORIES = fasterrcnn_weights.meta["categories"]


In [7]:
def make_video_summary(
    *,
    model_name: str,
    video_path,
    total_frames: int,
    processed_frames: int,
    sample_every: int,
    max_frames,
    avg_fps: float,
    total_person_dets: int,
    avg_persons_per_frame: float,
    max_persons_on_frame: int,
    avg_confidence: float,
    conf: float,
    iou: float | None = None,
) -> dict:
    """
    Возвращает словарь с метриками по видео в едином формате
    для YOLO, SSD и Faster R-CNN.
    """
    return {
        "model_name": model_name,
        "video_name": Path(video_path).name,

        "frames_total": total_frames,
        "frames_processed": processed_frames,
        "sample_every": sample_every,
        "max_frames": max_frames,

        "avg_fps": avg_fps,
        "total_person_dets": int(total_person_dets),
        "avg_persons_per_frame": avg_persons_per_frame,
        "max_persons_on_frame": int(max_persons_on_frame),
        "avg_confidence": avg_confidence,
        "conf_th": conf,

        "iou_th": iou, # для YOLO — число, для SSD/Faster — None
    }


In [20]:
def evaluate_video_yolo_simple(
    video_path: Path,
    model,
    model_name: str = "yolov8n",
    conf: float = CONF_TH,
    iou: float = IOU_TH,
    person_id: int = YOLO_PERSON_ID,
    sample_every: int = 1,
    max_frames: int | None = None,
    output_path: Path | None = None,
):
    """
    Run a video through a YOLOv8 model and collect person-detection metrics.

    Metrics are computed only over detections whose class equals
    ``person_id``. When ``output_path`` is given, the frame rendered by
    ``res.plot()`` (all detected classes) is written to the output video.

    Args:
        video_path: path to the input video.
        model: callable YOLO model; called as ``model(frame, conf=, iou=, verbose=False)``.
        model_name: label stored in the summary.
        conf: confidence threshold passed to the model.
        iou: IoU threshold passed to the model.
        person_id: class id counted as "person".
        sample_every: process every N-th frame (must be >= 1).
        max_frames: stop after this many processed frames; ``None`` means no limit.
        output_path: where to save the annotated video; ``None`` disables saving.

    Returns:
        ``(output_path, summary)`` where ``summary`` is the dict built by
        ``make_video_summary()``.

    Raises:
        ValueError: if ``sample_every`` is smaller than 1.
        RuntimeError: if the video cannot be opened.
    """
    # Fail early instead of hitting ZeroDivisionError inside the frame loop.
    if sample_every < 1:
        raise ValueError(f"sample_every must be >= 1, got {sample_every}")

    video_path = Path(video_path)
    cap = cv2.VideoCapture(str(video_path))
    out = None

    total_frames = 0
    processed_frames = 0
    total_person_dets = 0
    max_persons_on_frame = 0
    conf_sum = 0.0
    conf_count = 0
    elapsed = 0.0

    # try/finally guarantees the capture and the writer are released even when
    # the run is interrupted (e.g. KeyboardInterrupt) or inference raises.
    try:
        if not cap.isOpened():
            raise RuntimeError(f"Не удалось открыть видео: {video_path}")

        if output_path is not None:
            width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
            height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
            native_fps = cap.get(cv2.CAP_PROP_FPS) or 0.0

            output_path = Path(output_path)
            output_path.parent.mkdir(parents=True, exist_ok=True)
            fourcc = cv2.VideoWriter_fourcc(*"mp4v")
            # Fall back to 25 fps when the container does not report a frame rate.
            fps_out = native_fps if native_fps > 0 else 25.0
            out = cv2.VideoWriter(str(output_path), fourcc, fps_out, (width, height))

        t0 = time.perf_counter()

        # The limit is checked BEFORE reading/counting a frame, so
        # processed_frames never exceeds max_frames in the summary.
        while max_frames is None or processed_frames < max_frames:
            ok, frame_bgr = cap.read()
            if not ok:
                break

            frame_idx = total_frames
            total_frames += 1

            if frame_idx % sample_every != 0:
                continue

            processed_frames += 1

            results = model(
                frame_bgr,
                conf=conf,
                iou=iou,
                verbose=False,
            )
            res = results[0]

            if res.boxes is not None and len(res.boxes) > 0:
                scores = res.boxes.conf.cpu().numpy()
                classes = res.boxes.cls.cpu().numpy().astype(int)
            else:
                scores = np.empty((0,), dtype=np.float32)
                classes = np.empty((0,), dtype=int)

            # ---- Metrics: persons only ----
            person_scores = scores[classes == person_id]

            num_persons = len(person_scores)
            total_person_dets += num_persons
            max_persons_on_frame = max(max_persons_on_frame, num_persons)

            if num_persons > 0:
                conf_sum += float(person_scores.sum())
                conf_count += num_persons

            if out is not None:
                out.write(res.plot())

        elapsed = time.perf_counter() - t0
    finally:
        cap.release()
        if out is not None:
            out.release()

    avg_fps = processed_frames / elapsed if elapsed > 0 and processed_frames > 0 else 0.0
    avg_persons_per_frame = (
        total_person_dets / processed_frames if processed_frames > 0 else 0.0
    )
    avg_conf = conf_sum / conf_count if conf_count > 0 else 0.0

    summary = make_video_summary(
        model_name=model_name,
        video_path=video_path,
        total_frames=total_frames,
        processed_frames=processed_frames,
        sample_every=sample_every,
        max_frames=max_frames,
        avg_fps=avg_fps,
        total_person_dets=total_person_dets,
        avg_persons_per_frame=avg_persons_per_frame,
        max_persons_on_frame=max_persons_on_frame,
        avg_confidence=avg_conf,
        conf=conf,
        iou=iou,
    )

    return output_path, summary


In [27]:
def evaluate_video_ssd300_simple(
    video_path: Path,
    model,
    device,
    model_name: str = "ssd300_vgg16",
    conf: float = CONF_TH,
    person_label: int = SSD_PERSON_LABEL,
    sample_every: int = 1,
    max_frames: int | None = None,
    output_path: Path | None = None,
):
    """
    Run a video through SSD300 and collect person-detection metrics.

    Metrics are computed only over detections with label ``person_label``
    and ``score >= conf``. When ``output_path`` is given, every detection
    with ``score >= conf`` (any class) is drawn on the frame with a
    "<class_name> <score>" caption and the frame is written to the output video.

    Args:
        video_path: path to the input video.
        model: detection model called as ``model([img_tensor])``; its first
            output must provide "boxes", "scores" and "labels".
        device: torch device the input tensor is moved to.
        model_name: label stored in the summary.
        conf: score threshold applied to the model output.
        person_label: label counted as "person".
        sample_every: process every N-th frame (must be >= 1).
        max_frames: stop after this many processed frames; ``None`` means no limit.
        output_path: where to save the annotated video; ``None`` disables saving.

    Returns:
        ``(output_path, summary)`` where ``summary`` is the dict built by
        ``make_video_summary()`` with ``iou_th`` set to ``None``.

    Raises:
        ValueError: if ``sample_every`` is smaller than 1.
        RuntimeError: if the video cannot be opened.
    """
    # Fail early instead of hitting ZeroDivisionError inside the frame loop.
    if sample_every < 1:
        raise ValueError(f"sample_every must be >= 1, got {sample_every}")

    video_path = Path(video_path)
    cap = cv2.VideoCapture(str(video_path))
    out = None

    total_frames = 0
    processed_frames = 0
    total_person_dets = 0
    max_persons_on_frame = 0
    conf_sum = 0.0
    conf_count = 0
    elapsed = 0.0

    # try/finally guarantees the capture and the writer are released even when
    # the run is interrupted (e.g. KeyboardInterrupt) or inference raises.
    try:
        if not cap.isOpened():
            raise RuntimeError(f"Не удалось открыть видео: {video_path}")

        if output_path is not None:
            width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
            height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
            native_fps = cap.get(cv2.CAP_PROP_FPS) or 0.0

            output_path = Path(output_path)
            output_path.parent.mkdir(parents=True, exist_ok=True)
            fourcc = cv2.VideoWriter_fourcc(*"mp4v")
            # Fall back to 25 fps when the container does not report a frame rate.
            fps_out = native_fps if native_fps > 0 else 25.0
            out = cv2.VideoWriter(str(output_path), fourcc, fps_out, (width, height))

        t0 = time.perf_counter()

        model.eval()
        with torch.no_grad():
            # The limit is checked BEFORE reading/counting a frame, so
            # processed_frames never exceeds max_frames in the summary.
            while max_frames is None or processed_frames < max_frames:
                ok, frame_bgr = cap.read()
                if not ok:
                    break

                frame_idx = total_frames
                total_frames += 1

                if frame_idx % sample_every != 0:
                    continue

                processed_frames += 1

                # OpenCV delivers BGR; the frame is converted to RGB before inference.
                frame_rgb = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB)
                img_tensor = F.to_tensor(frame_rgb).to(device)

                outputs = model([img_tensor])[0]

                boxes = outputs["boxes"].detach().cpu().numpy()
                scores = outputs["scores"].detach().cpu().numpy()
                labels = outputs["labels"].detach().cpu().numpy()

                # ----- Metrics: persons only -----
                person_mask = (labels == person_label) & (scores >= conf)
                person_scores = scores[person_mask]

                num_persons = len(person_scores)
                total_person_dets += num_persons
                max_persons_on_frame = max(max_persons_on_frame, num_persons)

                if num_persons > 0:
                    conf_sum += float(person_scores.sum())
                    conf_count += num_persons

                # ----- Visualisation: every class with score >= conf -----
                if out is not None:
                    vis_mask = scores >= conf

                    for box, lbl, scr in zip(
                        boxes[vis_mask], labels[vis_mask], scores[vis_mask]
                    ):
                        # Plain Python ints: OpenCV drawing calls can reject
                        # NumPy scalar coordinates on some builds.
                        x1, y1, x2, y2 = (int(v) for v in box)

                        # single green colour for every box
                        cv2.rectangle(frame_bgr, (x1, y1), (x2, y2), (0, 255, 0), 2)

                        # caption: <class_name> <score>; unknown ids are shown as numbers
                        class_id = int(lbl)
                        if 0 <= class_id < len(SSD_CATEGORIES):
                            class_name = SSD_CATEGORIES[class_id]
                        else:
                            class_name = str(class_id)

                        text = f"{class_name} {scr:.2f}"
                        cv2.putText(
                            frame_bgr,
                            text,
                            (x1, max(y1 - 5, 10)),  # keep the caption inside the frame
                            cv2.FONT_HERSHEY_SIMPLEX,
                            0.4,
                            (0, 255, 0),
                            1,
                            cv2.LINE_AA,
                        )

                    out.write(frame_bgr)

        elapsed = time.perf_counter() - t0
    finally:
        cap.release()
        if out is not None:
            out.release()

    avg_fps = processed_frames / elapsed if elapsed > 0 and processed_frames > 0 else 0.0
    avg_persons_per_frame = (
        total_person_dets / processed_frames if processed_frames > 0 else 0.0
    )
    avg_conf = conf_sum / conf_count if conf_count > 0 else 0.0

    summary = make_video_summary(
        model_name=model_name,
        video_path=video_path,
        total_frames=total_frames,
        processed_frames=processed_frames,
        sample_every=sample_every,
        max_frames=max_frames,
        avg_fps=avg_fps,
        total_person_dets=total_person_dets,
        avg_persons_per_frame=avg_persons_per_frame,
        max_persons_on_frame=max_persons_on_frame,
        avg_confidence=avg_conf,
        conf=conf,
        iou=None,
    )

    return output_path, summary


In [29]:
def evaluate_video_fasterrcnn_simple(
    video_path: Path,
    model,
    device,
    model_name: str = "fasterrcnn_resnet50_fpn",
    conf: float = CONF_TH,
    person_label: int = FASTER_PERSON_LABEL,
    sample_every: int = 1,
    max_frames: int | None = None,
    output_path: Path | None = None,
):
    """
    Run a video through Faster R-CNN and collect person-detection metrics.

    Metrics are computed only over detections with label ``person_label``
    and ``score >= conf``. When ``output_path`` is given, every detection
    with ``score >= conf`` (any class) is drawn on the frame with a
    "<class_name> <score>" caption and the frame is written to the output video.

    Args:
        video_path: path to the input video.
        model: detection model called as ``model([img_tensor])``; its first
            output must provide "boxes", "scores" and "labels".
        device: torch device the input tensor is moved to.
        model_name: label stored in the summary.
        conf: score threshold applied to the model output.
        person_label: label counted as "person".
        sample_every: process every N-th frame (must be >= 1).
        max_frames: stop after this many processed frames; ``None`` means no limit.
        output_path: where to save the annotated video; ``None`` disables saving.

    Returns:
        ``(output_path, summary)`` where ``summary`` is the dict built by
        ``make_video_summary()`` with ``iou_th`` set to ``None``.

    Raises:
        ValueError: if ``sample_every`` is smaller than 1.
        RuntimeError: if the video cannot be opened.
    """
    # Fail early instead of hitting ZeroDivisionError inside the frame loop.
    if sample_every < 1:
        raise ValueError(f"sample_every must be >= 1, got {sample_every}")

    video_path = Path(video_path)
    cap = cv2.VideoCapture(str(video_path))
    out = None

    total_frames = 0
    processed_frames = 0
    total_person_dets = 0
    max_persons_on_frame = 0
    conf_sum = 0.0
    conf_count = 0
    elapsed = 0.0

    # try/finally guarantees the capture and the writer are released even when
    # the run is interrupted (e.g. KeyboardInterrupt) or inference raises.
    try:
        if not cap.isOpened():
            raise RuntimeError(f"Не удалось открыть видео: {video_path}")

        if output_path is not None:
            width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
            height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
            native_fps = cap.get(cv2.CAP_PROP_FPS) or 0.0

            output_path = Path(output_path)
            output_path.parent.mkdir(parents=True, exist_ok=True)
            fourcc = cv2.VideoWriter_fourcc(*"mp4v")
            # Fall back to 25 fps when the container does not report a frame rate.
            fps_out = native_fps if native_fps > 0 else 25.0
            out = cv2.VideoWriter(str(output_path), fourcc, fps_out, (width, height))

        t0 = time.perf_counter()

        model.eval()
        with torch.no_grad():
            # The limit is checked BEFORE reading/counting a frame, so
            # processed_frames never exceeds max_frames in the summary.
            while max_frames is None or processed_frames < max_frames:
                ok, frame_bgr = cap.read()
                if not ok:
                    break

                frame_idx = total_frames
                total_frames += 1

                if frame_idx % sample_every != 0:
                    continue

                processed_frames += 1

                # OpenCV delivers BGR; the frame is converted to RGB before inference.
                frame_rgb = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB)
                img_tensor = F.to_tensor(frame_rgb).to(device)

                outputs = model([img_tensor])[0]

                boxes = outputs["boxes"].detach().cpu().numpy()
                scores = outputs["scores"].detach().cpu().numpy()
                labels = outputs["labels"].detach().cpu().numpy()

                # ----- Metrics: persons only -----
                person_mask = (labels == person_label) & (scores >= conf)
                person_scores = scores[person_mask]

                num_persons = len(person_scores)
                total_person_dets += num_persons
                max_persons_on_frame = max(max_persons_on_frame, num_persons)

                if num_persons > 0:
                    conf_sum += float(person_scores.sum())
                    conf_count += num_persons

                # ----- Visualisation: every class with score >= conf -----
                if out is not None:
                    vis_mask = scores >= conf

                    for box, lbl, scr in zip(
                        boxes[vis_mask], labels[vis_mask], scores[vis_mask]
                    ):
                        # Plain Python ints: OpenCV drawing calls can reject
                        # NumPy scalar coordinates on some builds.
                        x1, y1, x2, y2 = (int(v) for v in box)

                        cv2.rectangle(frame_bgr, (x1, y1), (x2, y2), (0, 255, 0), 2)

                        # caption: <class_name> <score>; unknown ids are shown as numbers
                        class_id = int(lbl)
                        if 0 <= class_id < len(FASTER_CATEGORIES):
                            class_name = FASTER_CATEGORIES[class_id]
                        else:
                            class_name = str(class_id)

                        text = f"{class_name} {scr:.2f}"
                        cv2.putText(
                            frame_bgr,
                            text,
                            (x1, max(y1 - 5, 10)),  # keep the caption inside the frame
                            cv2.FONT_HERSHEY_SIMPLEX,
                            0.4,
                            (0, 255, 0),
                            1,
                            cv2.LINE_AA,
                        )

                    out.write(frame_bgr)

        elapsed = time.perf_counter() - t0
    finally:
        cap.release()
        if out is not None:
            out.release()

    avg_fps = processed_frames / elapsed if elapsed > 0 and processed_frames > 0 else 0.0
    avg_persons_per_frame = (
        total_person_dets / processed_frames if processed_frames > 0 else 0.0
    )
    avg_conf = conf_sum / conf_count if conf_count > 0 else 0.0

    summary = make_video_summary(
        model_name=model_name,
        video_path=video_path,
        total_frames=total_frames,
        processed_frames=processed_frames,
        sample_every=sample_every,
        max_frames=max_frames,
        avg_fps=avg_fps,
        total_person_dets=total_person_dets,
        avg_persons_per_frame=avg_persons_per_frame,
        max_persons_on_frame=max_persons_on_frame,
        avg_confidence=avg_conf,
        conf=conf,
        iou=None,
    )

    return output_path, summary


In [30]:
def evaluate_models_on_video(
    video_path: Path,
    sample_every: int = 1,
    max_frames: int | None = None,
) -> pd.DataFrame:
    """
    Evaluate one video with YOLOv8, SSD300 and Faster R-CNN.

    Each model writes its annotated video to
    ``data/result/<model_dir>/<video stem>.mp4``; a tqdm bar tracks the
    three runs. The result is a DataFrame with one summary row per model,
    in the order YOLOv8, SSD300, Faster R-CNN.
    """
    video_path = Path(video_path)
    stem = video_path.stem  # e.g. "clear_weather"

    # Destination of the annotated video for each model
    outputs = {
        subdir: Path(f"data/result/{subdir}/{stem}.mp4")
        for subdir in ("yolov8", "ssd300", "fasterrcnn")
    }

    # The evaluate_*_simple functions create these folders too; doing it here
    # as well is harmless.
    for target in outputs.values():
        target.parent.mkdir(parents=True, exist_ok=True)

    # Arguments every evaluator receives
    shared = {
        "video_path": video_path,
        "conf": CONF_TH,
        "sample_every": sample_every,
        "max_frames": max_frames,
    }

    runs = [
        (
            evaluate_video_yolo_simple,
            dict(
                shared,
                model=yolo_model,
                model_name="yolov8n",
                iou=IOU_TH,
                person_id=YOLO_PERSON_ID,
                output_path=outputs["yolov8"],
            ),
        ),
        (
            evaluate_video_ssd300_simple,
            dict(
                shared,
                model=ssd_model,
                device=device,
                model_name="ssd300_vgg16",
                person_label=SSD_PERSON_LABEL,
                output_path=outputs["ssd300"],
            ),
        ),
        (
            evaluate_video_fasterrcnn_simple,
            dict(
                shared,
                model=fasterrcnn_model,
                device=device,
                model_name="fasterrcnn_resnet50_fpn",
                person_label=FASTER_PERSON_LABEL,
                output_path=outputs["fasterrcnn"],
            ),
        ),
    ]

    progress = tqdm(runs, desc=f"Processing {video_path.name}", leave=True)
    # Each evaluator returns (output_path, summary); only the summary is kept.
    summaries = [evaluate(**kwargs)[1] for evaluate, kwargs in progress]

    return pd.DataFrame(summaries)


In [24]:
def evaluate_all_videos(
    video_paths,
    sample_every: int = 1,
    max_frames: int | None = None,
) -> pd.DataFrame:
    """
    Run EVERY video through all three models and return one combined
    DataFrame (one row per model x video).

    Args:
        video_paths: one of
            - a list (or other iterable) of Path/str,
            - a dict mapping alias -> Path/str; the alias is stored in a
              "video_alias" column,
            - a single Path/str, treated as a one-element list.
        sample_every: forwarded to ``evaluate_models_on_video``.
        max_frames: forwarded to ``evaluate_models_on_video``.

    Returns:
        Concatenated per-video DataFrames with a fresh index, or an empty
        DataFrame when no videos were given.
    """
    # A bare string would otherwise be iterated character by character and a
    # bare Path is not iterable at all, so wrap a single path in a list.
    if isinstance(video_paths, (str, Path)):
        video_paths = [video_paths]

    if isinstance(video_paths, dict):
        jobs = list(video_paths.items())
    else:
        jobs = [(None, vp) for vp in video_paths]

    frames = []
    for alias, vp in tqdm(jobs, desc="All videos", leave=True):
        # evaluate_models_on_video builds a new DataFrame on every call, so it
        # can be extended in place without copying.
        df_video = evaluate_models_on_video(
            Path(vp), sample_every=sample_every, max_frames=max_frames
        )
        if alias is not None:
            df_video["video_alias"] = alias
        frames.append(df_video)

    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)


In [31]:
# Short alias -> clip path; evaluate_all_videos stores the alias in "video_alias"
VIDEOS = dict(
    changeable=CHANGEABLE_CONDITIONS_VIDEO_PATH,
    clear=CLEAR_WHEATHER_VIDEO_PATH,
    dark=DARK_VIDEO_PATH,
    poor=POOR_WEATHER_VIDEO_PATH,
)

# Quick check on a single clip (uncomment to use):
# df_one = evaluate_models_on_video(CHANGEABLE_CONDITIONS_VIDEO_PATH, sample_every=2, max_frames=300)
# display(df_one)

# Full run: every clip through all three models
df_all = evaluate_all_videos(VIDEOS)
display(df_all)


All videos:   0%|          | 0/4 [00:00<?, ?it/s]
Processing changeable_conditions.mp4:   0%|          | 0/3 [00:00<?, ?it/s]
Processing changeable_conditions.mp4:  33%|███▎      | 1/3 [01:33<03:06, 93.49s/it]
Processing changeable_conditions.mp4:  67%|██████▋   | 2/3 [10:30<05:54, 354.15s/it]
Processing changeable_conditions.mp4: 100%|██████████| 3/3 [1:10:07<00:00, 1402.53s/it]
All videos:  25%|██▌       | 1/4 [1:10:07<3:30:22, 4207.64s/it]
Processing clear_weather.mp4:   0%|          | 0/3 [00:00<?, ?it/s]
Processing clear_weather.mp4:  33%|███▎      | 1/3 [01:41<03:23, 101.51s/it]
Processing clear_weather.mp4:  67%|██████▋   | 2/3 [12:06<06:49, 409.57s/it]
Processing clear_weather.mp4: 100%|██████████| 3/3 [1:39:22<00:00, 1987.62s/it]
All videos:  50%|█████     | 2/4 [2:49:30<2:54:40, 5240.15s/it]
Processing dark_video.mp4:   0%|          | 0/3 [00:00<?, ?it/s]
Processing dark_video.mp4:  33%|███▎      | 1/3 [01:48<03:37, 108.75s/it]
Processing dark_video.mp4:  67%|██████▋   | 2/3 

KeyboardInterrupt: 