# Данный код я прогонял внутри Kaggle, так как на моём ноутбуке это заняло бы больше времени

## Поэтому вне Kaggle он не запустится: пути к данным и к чекпойнту указывают на /kaggle/input и /kaggle/working

In [None]:
# !pip -q install pytorchvideo decord

In [None]:
import os

# Input locations under the Kaggle dataset mount: raw videos, the cleaned
# annotation table, the label list and the train/val split files.
DATA_ROOT = "/kaggle/input/action-recognition-data/data"
VIDEOS_DIR = os.path.join(DATA_ROOT, "raw")
ANN_PATH = os.path.join(DATA_ROOT, "processed", "annotations_clean.csv")
LABELS_TXT = os.path.join(DATA_ROOT, "labels.txt")
SPLIT_TRAIN = os.path.join(DATA_ROOT, "splits", "train.txt")
SPLIT_VAL = os.path.join(DATA_ROOT, "splits", "val.txt")

# Report up front whether every expected input is present.
print("VIDEOS_DIR exists:", os.path.isdir(VIDEOS_DIR))
for _name, _path in (
    ("ANN_PATH", ANN_PATH),
    ("LABELS_TXT", LABELS_TXT),
    ("SPLIT_TRAIN", SPLIT_TRAIN),
    ("SPLIT_VAL", SPLIT_VAL),
):
    print(f"{_name} exists:", os.path.isfile(_path))


In [None]:
import os
import random
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from decord import VideoReader, cpu

def read_labels(labels_path: str):
    """Load class names from a text file, one name per line.

    Lines are stripped of surrounding whitespace and blank lines are skipped.

    Args:
        labels_path: Path to the UTF-8 encoded labels file.

    Returns:
        A pair ``(labels, label2id)``: the names in file order and a dict
        mapping each name to its position in that list.
    """
    labels = []
    with open(labels_path, "r", encoding="utf-8") as fh:
        for raw_line in fh:
            name = raw_line.strip()
            if name:
                labels.append(name)
    label2id = dict(zip(labels, range(len(labels))))
    return labels, label2id

def load_split_list(path: str):
    """Read a split file and return its non-empty lines, stripped, in file order.

    Args:
        path: Path to the UTF-8 encoded split file (one entry per line).

    Returns:
        List of stripped, non-blank lines.
    """
    entries = []
    with open(path, "r", encoding="utf-8") as fh:
        for raw_line in fh:
            entry = raw_line.strip()
            if entry:
                entries.append(entry)
    return entries

def letterbox_batch_to_square(frames_tchw: torch.Tensor, out_size: int = 224) -> torch.Tensor:
    """Resize frames to fit an ``out_size`` x ``out_size`` square, keeping aspect ratio.

    The longer side is scaled to ``out_size`` with bilinear interpolation and the
    remaining area is filled with zeros, split as evenly as possible between the
    two sides (the extra pixel, if any, goes to the right / bottom).

    Args:
        frames_tchw: 4-D tensor laid out as [T, C, H, W]. Bilinear interpolation
            needs a floating-point dtype.
        out_size: Side length of the square output, in pixels (must be >= 1).

    Returns:
        Tensor of shape [T, C, out_size, out_size].

    Raises:
        ValueError: If ``out_size`` is smaller than 1 or the input is not 4-D.
    """
    if out_size < 1:
        raise ValueError(f"out_size must be >= 1, got {out_size}")

    # Unpacking also enforces that the input is exactly 4-D.
    _, _, height, width = frames_tchw.shape
    scale = out_size / max(height, width)
    # Clamp to at least one pixel: for extreme aspect ratios the short side
    # would round down to 0 and F.interpolate rejects empty output sizes.
    new_h = max(1, int(round(height * scale)))
    new_w = max(1, int(round(width * scale)))
    resized = F.interpolate(frames_tchw, size=(new_h, new_w), mode="bilinear", align_corners=False)

    pad_h = out_size - new_h
    pad_w = out_size - new_w
    pad_left = pad_w // 2
    pad_top = pad_h // 2
    # F.pad takes padding for the last dimension first: (left, right, top, bottom).
    padding = (pad_left, pad_w - pad_left, pad_top, pad_h - pad_top)
    return F.pad(resized, padding, mode="constant", value=0.0)

class ActionClipDataset(Dataset):
    def __init__(
        self,
        ann_csv,
        videos_dir,
        split_txt,
        label2id,
        clip_len=32,
        out_size=224,
        stride=2,
        seed=42,
        normalize_imagenet=True,
        margin_sec: float = 0.5,
    ):
        self.df = pd.read_csv(ann_csv)
        split_videos = set(load_split_list(split_txt))
        self.df = self.df[self.df["video"].isin(split_videos)].reset_index(drop=True)
        if len(self.df) == 0:
            raise ValueError("No annotations for this split.")

        self.videos_dir = videos_dir
        self.clip_len = int(clip_len)
        self.out_size = int(out_size)
        self.stride = int(stride)
        self.rnd = random.Random(seed)
        self.normalize_imagenet = normalize_imagenet
        self.margin_sec = float(margin_sec)

        self.mean = torch.tensor([0.485, 0.456, 0.406]).view(1, 3, 1, 1)
        self.std  = torch.tensor([0.229, 0.224, 0.225]).view(1, 3, 1, 1)

        if "label_id" not in self.df.columns:
            self.df["label_id"] = self.df["label"].map(label2id).astype(int)

    def _get_vr(self, video_name: str):
        path = os.path.join(self.videos_dir, video_name)
        return VideoReader(path, ctx=cpu(0))

    def __len__(self):
        return len(self.df)

    def _safe_bounds_frames(self, vr, start_sec: float, end_sec: float):
        fps = float(vr.get_avg_fps())
        n_frames = len(vr)

        start_f = max(0, int(np.floor(start_sec * fps)))
        end_f = min(n_frames - 1, int(np.ceil(end_sec * fps)))

        margin_f = int(round(self.margin_sec * fps))
        safe_start = start_f + margin_f
        safe_end   = end_f - margin_f
        if safe_end <= safe_start:
            safe_start, safe_end = start_f, end_f

        return safe_start, safe_end, fps, n_frames

    def get_clip_from_segment(self, video: str, start_sec: float, end_sec: float, start_idx: int | None = None):
        """
        Возвращает клип [3,T,H,W] из сегмента. Если start_idx=None — выбирает случайно.
        """
        vr = self._get_vr(video)
        safe_start, safe_end, fps, n_frames = self._safe_bounds_frames(vr, start_sec, end_sec)

        need = 1 + (self.clip_len - 1) * self.stride
        seg_len = safe_end - safe_start + 1

        max_start = safe_start + max(0, seg_len - need)

        if start_idx is None:
            start_idx = self.rnd.randint(safe_start, max_start) if max_start >= safe_start else safe_start
        else:
            start_idx = int(np.clip(start_idx, safe_start, max_start if max_start >= safe_start else safe_start))

        frame_idxs = [start_idx + i * self.stride for i in range(self.clip_len)]
        frame_idxs = [min(i, safe_end) for i in frame_idxs]
        frame_idxs = [min(i, n_frames - 1) for i in frame_idxs]

        frames = vr.get_batch(frame_idxs).asnumpy()  # [T,H,W,3]
        frames = torch.from_numpy(frames).permute(0, 3, 1, 2).float() / 255.0  # [T,3,H,W]
        frames = letterbox_batch_to_square(frames, self.out_size)

        if self.normalize_imagenet:
            frames = (frames - self.mean) / self.std

        x = frames.permute(1, 0, 2, 3).contiguous()  # [3,T,H,W]
        return x

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        video = str(row["video"])
        start_sec = float(row["start_sec"])
        end_sec = float(row["end_sec"])
        y = int(row["label_id"])

        x = self.get_clip_from_segment(video, start_sec, end_sec, start_idx=None)
        return x, torch.tensor(y, dtype=torch.long)

In [None]:
import torch
from torch import nn
from pytorchvideo.models.hub import x3d_s
from torch.optim.lr_scheduler import OneCycleLR

device = "cuda" if torch.cuda.is_available() else "cpu"

labels, label2id = read_labels(LABELS_TXT)
num_classes = len(labels)
print("Classes:", labels)

train_ds = ActionClipDataset(ANN_PATH, VIDEOS_DIR, SPLIT_TRAIN, label2id,
                             clip_len=32, out_size=224, stride=2, seed=42, margin_sec=0.5)
val_ds   = ActionClipDataset(ANN_PATH, VIDEOS_DIR, SPLIT_VAL,   label2id,
                             clip_len=32, out_size=224, stride=2, seed=43, margin_sec=0.5)

train_loader = DataLoader(train_ds, batch_size=1, shuffle=True, num_workers=0, pin_memory=False)
val_loader   = DataLoader(val_ds,   batch_size=1, shuffle=False, num_workers=0, pin_memory=False)

# Pretrained X3D-S with the final projection replaced by a num_classes-way head.
model = x3d_s(pretrained=True)
model.blocks[-1].proj = nn.Linear(model.blocks[-1].proj.in_features, num_classes)
model = model.to(device)

criterion = nn.CrossEntropyLoss(label_smoothing=0.1)  # helps with noisy annotations
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4, weight_decay=1e-4)

# Loss scaling is only active on GPU.
scaler = torch.cuda.amp.GradScaler(enabled=(device == "cuda"))

EPOCHS = 15
ACCUM_STEPS = 4                  # effective batch size ~4
VAL_NUM_CLIPS = 5                # multi-clip evaluation: 3/5/7 are reasonable values

# OneCycleLR is stepped only when optimizer.step runs. train_one_epoch takes one
# step per ACCUM_STEPS micro-batches plus one more for a leftover partial group,
# so the per-epoch count is a ceiling division. Floor division would let the
# scheduler be stepped past its total and raise ValueError.
steps_per_epoch = max(1, (len(train_loader) + ACCUM_STEPS - 1) // ACCUM_STEPS)
scheduler = OneCycleLR(
    optimizer,
    max_lr=1e-4,
    epochs=EPOCHS,
    steps_per_epoch=steps_per_epoch,
    pct_start=0.1,
)

def _val_start_positions(vr, safe_start, safe_end, need, k):
    """
    Равномерные старты по сегменту (детерминированно).
    """
    seg_len = safe_end - safe_start + 1
    max_start = safe_start + max(0, seg_len - need)
    if max_start <= safe_start or k == 1:
        return [safe_start]
    return np.linspace(safe_start, max_start, num=k).round().astype(int).tolist()

@torch.no_grad()
def eval_one_epoch_multiclip(num_clips=5):
    """Evaluate on the validation annotations with multi-clip averaging.

    For every row of ``val_ds.df`` up to ``num_clips`` clips are taken at
    deterministic, evenly spaced positions, their logits are averaged, and the
    loss and prediction are computed from that average.

    Returns:
        ``(mean_loss, accuracy)`` over all validation segments.
    """
    model.eval()
    use_amp = device == "cuda"
    # Frames spanned by one clip; constant for the whole pass.
    clip_span = 1 + (val_ds.clip_len - 1) * val_ds.stride

    loss_sum = 0.0
    hits = 0
    seen = 0

    for _, ann in val_ds.df.iterrows():
        video = str(ann["video"])
        start_sec = float(ann["start_sec"])
        end_sec = float(ann["end_sec"])
        y = int(ann["label_id"])
        target = torch.tensor([y], dtype=torch.long, device=device)

        # Pick the K clip start frames deterministically.
        vr = val_ds._get_vr(video)
        safe_start, safe_end, _fps, _n_frames = val_ds._safe_bounds_frames(vr, start_sec, end_sec)
        starts = _val_start_positions(vr, safe_start, safe_end, clip_span, num_clips)

        running = None
        for first_frame in starts:
            clip = val_ds.get_clip_from_segment(video, start_sec, end_sec, start_idx=first_frame)
            clip = clip.unsqueeze(0).to(device, non_blocking=True)

            with torch.cuda.amp.autocast(enabled=use_amp):
                clip_logits = model(clip)  # [1,C]

            if running is None:
                running = clip_logits
            else:
                running = running + clip_logits

        mean_logits = running / len(starts)

        loss_sum += criterion(mean_logits, target).item()
        predicted = int(mean_logits.argmax(dim=1).item())
        hits += int(predicted == y)
        seen += 1

    return loss_sum / seen, hits / seen

def train_one_epoch(accum_steps=4):
    """Run one training epoch with gradient accumulation and mixed precision.

    Gradients are accumulated over ``accum_steps`` micro-batches before each
    optimizer update; a leftover partial group at the end of the epoch gets its
    own update. The LR scheduler is stepped once per optimizer update.

    Returns:
        ``(mean_loss, accuracy)`` over all training samples of the epoch.
    """

    def _apply_update():
        # One optimizer update followed by one scheduler step.
        scaler.step(optimizer)
        scaler.update()
        optimizer.zero_grad(set_to_none=True)
        scheduler.step()

    model.train()
    use_amp = device == "cuda"
    loss_sum, hits, seen = 0.0, 0, 0

    optimizer.zero_grad(set_to_none=True)

    for step, (x, y) in enumerate(train_loader):
        x = x.to(device, non_blocking=True)
        y = y.to(device, non_blocking=True)

        with torch.cuda.amp.autocast(enabled=use_amp):
            logits = model(x)
            # Scaled so that accumulated gradients average over the group.
            loss = criterion(logits, y) / accum_steps

        scaler.scale(loss).backward()

        # Metrics use the unscaled ("real") loss.
        batch = x.size(0)
        loss_sum += (loss.item() * accum_steps) * batch
        hits += (logits.argmax(1) == y).sum().item()
        seen += batch

        if (step + 1) % accum_steps == 0:
            _apply_update()

    # Flush gradients left over from an incomplete accumulation group.
    if (step + 1) % accum_steps != 0:
        _apply_update()

    return loss_sum / seen, hits / seen

# Start below any reachable accuracy so the first epoch always writes a
# checkpoint. Starting at 0.0 with a strict ">" would save nothing if the
# validation accuracy stayed at 0.0.
best_val_acc = float("-inf")

for epoch in range(1, EPOCHS + 1):
    tr_loss, tr_acc = train_one_epoch(accum_steps=ACCUM_STEPS)
    va_loss, va_acc = eval_one_epoch_multiclip(num_clips=VAL_NUM_CLIPS)

    lr_now = optimizer.param_groups[0]["lr"]
    print(f"Epoch {epoch:02d} | lr {lr_now:.2e} | train loss {tr_loss:.4f} acc {tr_acc:.3f} | val loss {va_loss:.4f} acc {va_acc:.3f}")

    # Keep only the weights with the best validation accuracy seen so far.
    if va_acc > best_val_acc:
        best_val_acc = va_acc
        torch.save({"model": model.state_dict(), "labels": labels}, "/kaggle/working/best_x3d_s.pt")

print("Best val acc:", best_val_acc)