In [2]:
shutil.rmtree('/content/trained_encoders')

NameError: name 'shutil' is not defined

In [3]:
!nvidia-smi

Thu Nov  6 18:37:10 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 580.95.05              Driver Version: 580.95.05      CUDA Version: 13.0     |
+-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 3090        Off |   00000000:0B:00.0  On |                  N/A |
|  0%   46C    P8             19W /  350W |     712MiB /  24576MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+

+----------------------------------------------

In [4]:
import torch
print(torch.version.cuda)
print(torch.cuda.is_available())


12.6
True


In [5]:
# !wandb login

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit: 
Aborted!


In [6]:
# encoders_window_level_v3_fixed2.py
# -*- coding: utf-8 -*-
"""
Window-level Emotion Encoders (Audio + Video), stable on small VMs/Colab.

Fixes vs v3_fixed:
- Do NOT call enable_input_require_grads() (it caused AttributeError with tuple outputs).
- Gradient checkpointing is optional and disabled by default (use_checkpoint=False).
- Keeps do_rescale=False, AMP, workers=0, pin_memory=False, etc.
"""

import os
import json
import warnings
from pathlib import Path
from typing import Dict, List, Optional, Tuple, Union

import numpy as np
import torch
import torch.nn as nn
import torchaudio
from torch.utils.data import Dataset, DataLoader, Subset

# Stable AMP API
from torch.amp import GradScaler, autocast

# Silence noisy torchaudio warnings
warnings.filterwarnings("ignore", message=".*StreamingMediaDecoder.*")
warnings.filterwarnings("ignore", message=".*load_with_torchcodec.*")

# Safer multiprocessing defaults (we still use workers=0 by default)
import torch.multiprocessing as mp
try:
    mp.set_start_method("spawn", force=True)
except RuntimeError:
    pass
try:
    mp.set_sharing_strategy("file_system")
except RuntimeError:
    pass

from transformers import (
    Wav2Vec2ForSequenceClassification,
    HubertForSequenceClassification,
    Wav2Vec2FeatureExtractor,
    AutoImageProcessor,
    TimesformerForVideoClassification,
)

from tqdm import tqdm
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix


# -------------------
# Constants / Labels
# -------------------

EMOTION_TO_ID = {
    "neutral": 0, "calm": 1, "happy": 2, "sad": 3,
    "angry": 4, "fearful": 5, "disgust": 6, "surprised": 7
}
EMOTION_NAMES = ["neutral", "calm", "happy", "sad", "angry", "fearful", "disgust", "surprised"]


# -------------------
# Utils
# -------------------

def ensure_dir(p: Union[str, Path]):
    Path(p).mkdir(parents=True, exist_ok=True)


def uniform_indices(total: int, target: int) -> np.ndarray:
    if total <= 0:
        return np.zeros((target,), dtype=int)
    if total <= target:
        base = np.arange(total)
        pad = np.full(target - total, total - 1, dtype=int)
        return np.concatenate([base, pad])
    return np.round(np.linspace(0, total - 1, target)).astype(int)


def set_seed(seed: int = 42):
    import random
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)


def set_backbone_trainable_timesformer(model: nn.Module, trainable: bool):
    for n, p in model.named_parameters():
        if "classifier" in n:
            continue
        p.requires_grad = trainable


def safe_freeze_wav2vec_feature_encoder(model: nn.Module):
    if hasattr(model, "freeze_feature_encoder"):
        model.freeze_feature_encoder()
    else:
        for n, p in model.named_parameters():
            if "classifier" in n:
                continue
            p.requires_grad = False


def safe_unfreeze_wav2vec_feature_encoder(model: nn.Module):
    for p in model.parameters():
        p.requires_grad = True


# -------------------
# Dataset
# -------------------

class EmotionDataset(Dataset):
    def __init__(
        self,
        metadata_path: Union[str, Path],
        video_max_frames: int = 64,
        audio_target_sr: int = 16000,
        load_audio: bool = True,
        load_video: bool = True,
    ):
        with open(metadata_path, "r", encoding="utf-8") as f:
            self.meta: List[Dict] = json.load(f)
        if len(self.meta) == 0:
            raise ValueError("Empty metadata file.")

        self.video_max_frames = int(video_max_frames)
        self.audio_target_sr = int(audio_target_sr)
        self.load_audio = bool(load_audio)
        self.load_video = bool(load_video)

        m0 = self.meta[0]
        self.uses_npz = "video_npz" in m0

        self.frames_per_clip = int(m0.get("frames_per_clip", m0.get("fixed_T", 32)))
        if "frame_size" in m0:
            self.frame_size = tuple(m0["frame_size"])
        elif "video_size" in m0:
            H, W = m0["video_size"]
            self.frame_size = (W, H)
        else:
            self.frame_size = (224, 224)

    def __len__(self):
        return len(self.meta)

    def _load_audio(self, audio_path: str) -> torch.Tensor:
        wav, sr = torchaudio.load(audio_path)
        if wav.shape[0] > 1:
            wav = wav.mean(dim=0, keepdim=True)
        if sr != self.audio_target_sr:
            wav = torchaudio.transforms.Resample(sr, self.audio_target_sr)(wav)
        return wav.squeeze(0)

    def _load_video_npz(self, npz_path: str) -> Tuple[torch.Tensor, np.ndarray]:
        data = np.load(npz_path)
        frames = data["frames"]
        ts = data.get("timestamps", None)
        if ts is None:
            T = frames.shape[0]
            ts = np.linspace(0.0, float(T - 1) / 25.0, num=T, dtype=np.float32)
        if frames.shape[0] > self.video_max_frames:
            idx = uniform_indices(frames.shape[0], self.video_max_frames)
            frames = frames[idx]
            ts = ts[idx]
        frames = frames.astype(np.float32) / 255.0
        tchw = torch.from_numpy(frames).permute(0, 3, 1, 2).contiguous()
        return tchw, ts

    def _load_video_frames_dir(self, frames_dir: str) -> torch.Tensor:
        frame_files = sorted(Path(frames_dir).glob("frame_*.npy"))
        if len(frame_files) == 0:
            W, H = self.frame_size
            return torch.zeros((self.video_max_frames, 3, H, W), dtype=torch.float32)
        if len(frame_files) > self.video_max_frames:
            idx = uniform_indices(len(frame_files), self.video_max_frames)
            frame_files = [frame_files[i] for i in idx]
        frames = []
        for f in frame_files:
            arr = np.load(f, mmap_mode="r")
            if arr.ndim != 3 or arr.shape[2] != 3:
                W, H = self.frame_size
                arr = np.zeros((H, W, 3), dtype=np.uint8)
            frames.append(arr.astype(np.float32) / 255.0)
        frames = np.stack(frames, axis=0)
        tchw = torch.from_numpy(frames).permute(0, 3, 1, 2).contiguous()
        return tchw

    def __getitem__(self, idx: int) -> Dict:
        rec = self.meta[idx]
        out = {
            "sample_id": rec["sample_id"],
            "emotion_label": EMOTION_TO_ID.get(rec.get("emotion", ""), -1),
            "meta": rec,
        }
        if self.load_audio:
            out["audio"] = self._load_audio(rec["audio_path"])
            out["sample_rate"] = self.audio_target_sr
        if self.load_video:
            if self.uses_npz:
                v, ts = self._load_video_npz(rec["video_npz"])
                out["video"] = v
                out["timestamps"] = torch.from_numpy(ts)
            else:
                v = self._load_video_frames_dir(rec["video_frames_dir"])
                out["video"] = v
                fps = rec.get("target_fps", rec.get("original_fps", 25.0))
                T = v.shape[0]
                ts = np.arange(T, dtype=np.float32) / float(fps)
                out["timestamps"] = torch.from_numpy(ts)
        return out


def emotion_collate(batch: List[Dict]) -> Dict:
    out: Dict[str, Union[List, torch.Tensor]] = {
        "sample_id": [b["sample_id"] for b in batch],
        "emotion_label": torch.tensor([b["emotion_label"] for b in batch], dtype=torch.long),
        "meta": [b["meta"] for b in batch],
    }
    if "audio" in batch[0]:
        out["audio"] = [b["audio"] for b in batch]
        out["sample_rate"] = batch[0]["sample_rate"]
    if "video" in batch[0]:
        out["video"] = torch.stack([b["video"] for b in batch], dim=0)
        out["timestamps"] = [b["timestamps"] for b in batch]
    return out


# Window cropping helpers
def crop_audio_random(wav_1d: torch.Tensor, sr: int, dur_s: float) -> torch.Tensor:
    n = wav_1d.numel()
    L = int(round(dur_s * sr))
    if n <= L:
        pad_val = wav_1d[-1] if n > 0 else torch.tensor(0.0, device=wav_1d.device)
        pad = pad_val.repeat(L - n)
        return torch.cat([wav_1d, pad], 0)
    start = torch.randint(0, n - L + 1, ()).item()
    return wav_1d[start:start + L]


def crop_audio_center(wav_1d: torch.Tensor, sr: int, dur_s: float) -> torch.Tensor:
    n = wav_1d.numel()
    L = int(round(dur_s * sr))
    if n <= L:
        pad_val = wav_1d[-1] if n > 0 else torch.tensor(0.0, device=wav_1d.device)
        pad = pad_val.repeat(L - n)
        return torch.cat([wav_1d, pad], 0)
    start = max(0, (n - L) // 2)
    return wav_1d[start:start + L]


def crop_video_random_T(video_TCHW: torch.Tensor, Ts: int) -> torch.Tensor:
    T = video_TCHW.shape[0]
    if T <= Ts:
        idx = torch.linspace(0, T - 1, Ts).round().long()
        return video_TCHW[idx]
    start = torch.randint(0, T - Ts + 1, ()).item()
    return video_TCHW[start:start + Ts]


def crop_video_center_T(video_TCHW: torch.Tensor, Ts: int) -> torch.Tensor:
    T = video_TCHW.shape[0]
    if T <= Ts:
        idx = torch.linspace(0, T - 1, Ts).round().long()
        return video_TCHW[idx]
    start = (T - Ts) // 2
    return video_TCHW[start:start + Ts]


# Audio encoder
class AudioEmotionEncoder:
    def __init__(
        self,
        model_name: str = "superb/wav2vec2-base-superb-er",
        num_emotions: int = 8,
        lr: float = 1e-5,
        device: Optional[str] = None,
        window_seconds: float = 1.5,
        grad_clip: float = 1.0,
        use_amp: bool = True,
    ):
        self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
        self.num_emotions = num_emotions
        self.window_seconds = float(window_seconds)
        self.grad_clip = float(grad_clip)
        self.use_amp = bool(use_amp)
        self.lr = lr

        if "hubert" in model_name.lower():
            self.model = HubertForSequenceClassification.from_pretrained(
                model_name, num_labels=num_emotions, ignore_mismatched_sizes=True
            )
        else:
            self.model = Wav2Vec2ForSequenceClassification.from_pretrained(
                model_name, num_labels=num_emotions, ignore_mismatched_sizes=True
            )

        self.model.config.output_hidden_states = True
        self.model.to(self.device)

        # Try load feature extractor; if model_name is a local dir without files this may throw.
        try:
            self.processor = Wav2Vec2FeatureExtractor.from_pretrained(model_name)
        except Exception:
            # fallback to a default extractor so validation/inference works
            self.processor = Wav2Vec2FeatureExtractor.from_pretrained("superb/wav2vec2-base-superb-er")

        # create optimizer only from trainable params
        self.optim = torch.optim.AdamW(filter(lambda p: p.requires_grad, self.model.parameters()), lr=self.lr)
        self.crit = nn.CrossEntropyLoss()
        self.emotion_names = EMOTION_NAMES

        # proper GradScaler init
        self.scaler = GradScaler(enabled=(self.use_amp and torch.cuda.is_available()))

    def update_optimizer(self, lr: Optional[float] = None):
        """Re-build the optimizer from current trainable parameters (call after unfreezing)."""
        if lr is not None:
            self.lr = lr
        self.optim = torch.optim.AdamW(filter(lambda p: p.requires_grad, self.model.parameters()), lr=self.lr)

    def save(self, path: Union[str, Path]):
        """Save model + processor so HF's from_pretrained(path) works later."""
        p = Path(path)
        p.mkdir(parents=True, exist_ok=True)
        self.model.save_pretrained(str(p))
        try:
            # processor may be Wav2Vec2FeatureExtractor or Processor
            self.processor.save_pretrained(str(p))
        except Exception:
            # best effort
            pass

    def _prepare(self, batch: Dict, train: bool = True):
        sr = batch["sample_rate"]
        audios = []
        for a in batch["audio"]:
            a = a.to(self.device)
            seg = crop_audio_random(a, sr, self.window_seconds) if train else crop_audio_center(a, sr, self.window_seconds)
            audios.append(seg.cpu().numpy())
        proc = self.processor(
            audios, sampling_rate=sr, return_tensors="pt",
            padding=True, truncation=True, max_length=int(self.window_seconds * sr)
        )
        x = proc["input_values"].to(self.device)
        m = proc.get("attention_mask")
        m = m.to(self.device) if m is not None else None
        y = batch["emotion_label"].to(self.device)
        return x, m, y

    def train_epoch(self, loader: DataLoader) -> Dict[str, float]:
        self.model.train()
        total, preds_all, labels_all = 0.0, [], []
        for batch in tqdm(loader, desc="Training (Audio)"):
            x, m, y = self._prepare(batch, train=True)
            self.optim.zero_grad(set_to_none=True)
            with autocast("cuda", enabled=self.use_amp):
                out = self.model(input_values=x, attention_mask=m)
                loss = self.crit(out.logits, y)
            self.scaler.scale(loss).backward()
            if self.grad_clip is not None:
                self.scaler.unscale_(self.optim)
                nn.utils.clip_grad_norm_(self.model.parameters(), self.grad_clip)
            self.scaler.step(self.optim)
            self.scaler.update()
            total += loss.item()
            preds_all.extend(out.logits.argmax(dim=1).detach().cpu().numpy())
            labels_all.extend(y.detach().cpu().numpy())
        return {
            "loss": total / len(loader),
            "accuracy": accuracy_score(labels_all, preds_all),
            "f1_score": f1_score(labels_all, preds_all, average="weighted")
        }

    @torch.no_grad()
    def validate(self, loader: DataLoader) -> Dict[str, float]:
        self.model.eval()
        total, preds_all, labels_all = 0.0, [], []
        for batch in tqdm(loader, desc="Validation (Audio)"):
            x, m, y = self._prepare(batch, train=False)
            with autocast("cuda", enabled=self.use_amp):
                out = self.model(input_values=x, attention_mask=m)
                loss = self.crit(out.logits, y)
            total += loss.item()
            preds_all.extend(out.logits.argmax(dim=1).detach().cpu().numpy())
            labels_all.extend(y.detach().cpu().numpy())
        cm = confusion_matrix(labels_all, preds_all)
        return {
            "loss": total / len(loader),
            "accuracy": accuracy_score(labels_all, preds_all),
            "f1_score": f1_score(labels_all, preds_all, average="weighted"),
            "confusion_matrix": cm,
            "predictions": preds_all,
            "labels": labels_all
        }

    @torch.no_grad()
    def extract_embeddings_clip(self, audios_1d: List[torch.Tensor], sr: int = 16000, window_seconds: float = 1.5) -> torch.Tensor:
        self.model.eval()
        crops = [crop_audio_center(a.to(self.device), sr, window_seconds).cpu().numpy() for a in audios_1d]
        proc = self.processor(crops, sampling_rate=sr, return_tensors="pt", padding=True, truncation=True,
                              max_length=int(window_seconds * sr))
        x = proc["input_values"].to(self.device)
        m = proc.get("attention_mask"); m = m.to(self.device) if m is not None else None
        out = self.model(input_values=x, attention_mask=m, output_hidden_states=True)
        last = getattr(out, "hidden_states", None)
        last = last[-1] if last is not None else getattr(out, "last_hidden_state")
        return last.mean(dim=1)  # (B, D)

    @torch.no_grad()
    def extract_embeddings_window(self, audio_1d: torch.Tensor, sr: int, t0: float, t1: float) -> torch.Tensor:
        self.model.eval()
        start = int(max(0, round(t0 * sr)))
        end = int(max(start + 1, round(t1 * sr)))
        seg = audio_1d[start:end]
        L = max(1, end - start)
        if seg.numel() < L:
            pad_val = seg[-1] if seg.numel() > 0 else torch.tensor(0.0, device=audio_1d.device)
            seg = torch.cat([seg, pad_val.repeat(L - seg.numel())], 0)
        proc = self.processor([seg.cpu().numpy()], sampling_rate=sr, return_tensors="pt", padding=True, truncation=True, max_length=L)
        x = proc["input_values"].to(self.device)
        m = proc.get("attention_mask"); m = m.to(self.device) if m is not None else None
        out = self.model(input_values=x, attention_mask=m, output_hidden_states=True)
        last = getattr(out, "hidden_states", None); last = last[-1] if last is not None else getattr(out, "last_hidden_state")
        return last.mean(dim=1)


# Video encoder (checkpointing disabled by default to avoid tuple hook crash)
class VideoEmotionEncoder:
    def __init__(
        self,
        model_name: str = "facebook/timesformer-base-finetuned-k400",
        num_emotions: int = 8,
        lr: float = 1e-5,
        frames_for_model: int = 16,
        device: Optional[str] = None,
        grad_clip: float = 1.0,
        use_amp: bool = True,
        use_checkpoint: bool = False,
    ):
        self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
        self.num_emotions = num_emotions
        self.frames_for_model = int(frames_for_model)
        self.grad_clip = float(grad_clip)
        self.use_amp = bool(use_amp)
        self.lr = lr

        self.model = TimesformerForVideoClassification.from_pretrained(
            model_name, num_labels=num_emotions, ignore_mismatched_sizes=True
        )
        self.model.config.output_hidden_states = True

        if use_checkpoint and hasattr(self.model, "gradient_checkpointing_enable"):
            try:
                self.model.gradient_checkpointing_enable()
            except Exception:
                pass

        self.model.to(self.device)

        # Robust processor loading
        try:
            self.processor = AutoImageProcessor.from_pretrained(model_name)
        except Exception:
            # fallback (the HF model should usually have a processor)
            self.processor = AutoImageProcessor.from_pretrained("facebook/timesformer-base-finetuned-k400")

        self.optim = torch.optim.AdamW(filter(lambda p: p.requires_grad, self.model.parameters()), lr=self.lr)
        self.crit = nn.CrossEntropyLoss()
        self.emotion_names = EMOTION_NAMES
        self.scaler = GradScaler(enabled=(self.use_amp and torch.cuda.is_available()))

    def update_optimizer(self, lr: Optional[float] = None):
        if lr is not None:
            self.lr = lr
        self.optim = torch.optim.AdamW(filter(lambda p: p.requires_grad, self.model.parameters()), lr=self.lr)

    def save(self, path: Union[str, Path]):
        p = Path(path)
        p.mkdir(parents=True, exist_ok=True)
        self.model.save_pretrained(str(p))
        try:
            self.processor.save_pretrained(str(p))
        except Exception:
            pass

    def _prepare(self, batch: Dict, train: bool = True):
        vids = []
        Ts = self.frames_for_model
        for v in batch["video"]:
            s = crop_video_random_T(v, Ts) if train else crop_video_center_T(v, Ts)
            frames = [s[i].permute(1, 2, 0).cpu().numpy() for i in range(s.shape[0])]
            vids.append(frames)
        proc = self.processor(vids, return_tensors="pt", do_rescale=False)  # frames already in [0,1]
        x = proc["pixel_values"].to(self.device)
        y = batch["emotion_label"].to(self.device)
        return x, y

    def train_epoch(self, loader: DataLoader) -> Dict[str, float]:
        self.model.train()
        total, preds_all, labels_all = 0.0, [], []
        for batch in tqdm(loader, desc="Training (Video)"):
            x, y = self._prepare(batch, train=True)
            self.optim.zero_grad(set_to_none=True)
            with autocast("cuda", enabled=self.use_amp):
                out = self.model(pixel_values=x)
                loss = self.crit(out.logits, y)
            self.scaler.scale(loss).backward()
            if self.grad_clip is not None:
                self.scaler.unscale_(self.optim)
                nn.utils.clip_grad_norm_(self.model.parameters(), self.grad_clip)
            self.scaler.step(self.optim)
            self.scaler.update()
            total += loss.item()
            preds_all.extend(out.logits.argmax(dim=1).detach().cpu().numpy())
            labels_all.extend(y.detach().cpu().numpy())
        return {
            "loss": total / len(loader),
            "accuracy": accuracy_score(labels_all, preds_all),
            "f1_score": f1_score(labels_all, preds_all, average="weighted"),
        }

    @torch.no_grad()
    def validate(self, loader: DataLoader) -> Dict[str, float]:
        self.model.eval()
        total, preds_all, labels_all = 0.0, [], []
        for batch in tqdm(loader, desc="Validation (Video)"):
            x, y = self._prepare(batch, train=False)
            with autocast("cuda", enabled=self.use_amp):
                out = self.model(pixel_values=x)
                loss = self.crit(out.logits, y)
            total += loss.item()
            preds_all.extend(out.logits.argmax(dim=1).detach().cpu().numpy())
            labels_all.extend(y.detach().cpu().numpy())
        cm = confusion_matrix(labels_all, preds_all)
        return {
            "loss": total / len(loader),
            "accuracy": accuracy_score(labels_all, preds_all),
            "f1_score": f1_score(labels_all, preds_all, average="weighted"),
            "confusion_matrix": cm,
            "predictions": preds_all,
            "labels": labels_all
        }

    @torch.no_grad()
    def extract_embeddings_clip(self, video_TCHW: torch.Tensor, frames_for_model: Optional[int] = None) -> torch.Tensor:
        self.model.eval()
        Ts = frames_for_model or self.frames_for_model
        if video_TCHW.dim() == 4:
            video_TCHW = video_TCHW.unsqueeze(0)
        batch_embs = []
        for v in video_TCHW:
            s = crop_video_center_T(v, Ts)
            frames = [s[i].permute(1, 2, 0).cpu().numpy() for i in range(s.shape[0])]
            proc = self.processor([frames], return_tensors="pt", do_rescale=False)
            x = proc["pixel_values"].to(self.device)
            out = self.model(pixel_values=x, output_hidden_states=True)
            hs = getattr(out, "hidden_states", None)
            last = hs[-1] if hs is not None else getattr(out, "last_hidden_state")
            emb = last.mean(dim=1)
            batch_embs.append(emb)
        return torch.cat(batch_embs, dim=0)

    @torch.no_grad()
    def extract_embeddings_window_from_npz(self, npz_path: str, t0: float, t1: float, Ts: Optional[int] = None) -> torch.Tensor:
        self.model.eval()
        Ts = Ts or self.frames_for_model
        data = np.load(npz_path)
        frames = data["frames"]
        ts = data["timestamps"].astype(np.float32) if "timestamps" in data else np.arange(frames.shape[0], dtype=np.float32) / 25.0
        mask = (ts >= t0) & (ts <= t1)
        sub = frames[mask]
        if sub.shape[0] == 0:
            center = 0.5 * (t0 + t1)
            idx = int(np.argmin(np.abs(ts - center)))
            sub = frames[idx:idx+1]
        idx = uniform_indices(sub.shape[0], Ts)
        sub = sub[idx].astype(np.float32) / 255.0
        frames_list = [sub[i] for i in range(sub.shape[0])]
        proc = self.processor([frames_list], return_tensors="pt", do_rescale=False)
        x = proc["pixel_values"].to(self.device)
        out = self.model(pixel_values=x, output_hidden_states=True)
        hs = getattr(out, "hidden_states", None)
        last = hs[-1] if hs is not None else getattr(out, "last_hidden_state")
        return last.mean(dim=1)


# Trainer
def train_encoders(
    metadata_path: str,
    output_dir: str,
    audio_model: str = "superb/wav2vec2-base-superb-er",
    video_model: str = "facebook/timesformer-base-finetuned-k400",
    num_epochs: int = 20,
    batch_size: int = 4,
    val_split: float = 0.2,
    audio_window_s: float = 1.5,
    video_Ts: int = 16,
    video_max_frames: int = 64,
    use_wandb: bool = True,
    seed: int = 42,
    audio_freeze_epochs: int = 2,
    video_freeze_epochs: int = 1,
):
    set_seed(seed)
    ensure_dir(output_dir)

    WANDB = False
    if use_wandb:
        try:
            import wandb
            wandb.init(project="almost-human-encoders", config=dict(
                audio_model=audio_model, video_model=video_model, num_epochs=num_epochs,
                batch_size=batch_size, val_split=val_split, audio_window_s=audio_window_s,
                video_Ts=video_Ts, seed=seed
            ))
            WANDB = True
        except Exception as e:
            print(f"⚠ W&B init failed: {e}")
            WANDB = False

    base = EmotionDataset(metadata_path, video_max_frames=video_max_frames, load_audio=True, load_video=True)
    N = len(base)
    val_size = int(N * val_split)
    train_size = N - val_size
    indices = torch.randperm(N)
    train_idx, val_idx = indices[:train_size], indices[train_size:]

    ds_audio_train = Subset(EmotionDataset(metadata_path, video_max_frames=video_max_frames, load_audio=True, load_video=False), train_idx)
    ds_audio_val   = Subset(EmotionDataset(metadata_path, video_max_frames=video_max_frames, load_audio=True, load_video=False), val_idx)
    ds_video_train = Subset(EmotionDataset(metadata_path, video_max_frames=video_max_frames, load_audio=False, load_video=True), train_idx)
    ds_video_val   = Subset(EmotionDataset(metadata_path, video_max_frames=video_max_frames, load_audio=False, load_video=True), val_idx)

    train_loader_audio = DataLoader(ds_audio_train, batch_size=batch_size, shuffle=True, num_workers=0, pin_memory=False, collate_fn=emotion_collate)
    val_loader_audio   = DataLoader(ds_audio_val,   batch_size=batch_size, shuffle=False, num_workers=0, pin_memory=False, collate_fn=emotion_collate)
    train_loader_video = DataLoader(ds_video_train, batch_size=batch_size, shuffle=True, num_workers=0, pin_memory=False, collate_fn=emotion_collate)
    val_loader_video   = DataLoader(ds_video_val,   batch_size=batch_size, shuffle=False, num_workers=0, pin_memory=False, collate_fn=emotion_collate)

    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"Device: {device} | Train: {train_size} | Val: {val_size}")

    audio_enc = AudioEmotionEncoder(model_name=audio_model, device=device, window_seconds=audio_window_s, use_amp=True)
    video_enc = VideoEmotionEncoder(model_name=video_model, device=device, frames_for_model=video_Ts, use_amp=True, use_checkpoint=False)

    safe_freeze_wav2vec_feature_encoder(audio_enc.model)
    set_backbone_trainable_timesformer(video_enc.model, trainable=False)

    best_audio_f1 = 0.0
    best_video_f1 = 0.0

    for epoch in range(num_epochs):
        print("\n" + "="*60)
        print(f"Epoch {epoch+1}/{num_epochs}")
        print("="*60)

        if epoch == audio_freeze_epochs:
            safe_unfreeze_wav2vec_feature_encoder(audio_enc.model)
            print("→ Unfroze Wav2Vec2/HubERT feature encoder")
        if epoch == video_freeze_epochs:
            set_backbone_trainable_timesformer(video_enc.model, trainable=True)
            print("→ Unfroze TimeSformer backbone")

        a_train = audio_enc.train_epoch(train_loader_audio)
        a_val = audio_enc.validate(val_loader_audio)
        print(f"[Audio] Train: loss={a_train['loss']:.4f} acc={a_train['accuracy']:.4f} f1={a_train['f1_score']:.4f}")
        print(f"[Audio]   Val: loss={a_val['loss']:.4f} acc={a_val['accuracy']:.4f} f1={a_val['f1_score']:.4f}")

        v_train = video_enc.train_epoch(train_loader_video)
        v_val = video_enc.validate(val_loader_video)
        print(f"[Video] Train: loss={v_train['loss']:.4f} acc={v_train['accuracy']:.4f} f1={v_train['f1_score']:.4f}")
        print(f"[Video]   Val: loss={v_val['loss']:.4f} acc={v_val['accuracy']:.4f} f1={v_val['f1_score']:.4f}")

        if WANDB:
            wandb.log({
                "epoch": epoch + 1,
                "audio/train_loss": a_train["loss"], "audio/train_acc": a_train["accuracy"], "audio/train_f1": a_train["f1_score"],
                "audio/val_loss": a_val["loss"], "audio/val_acc": a_val["accuracy"], "audio/val_f1": a_val["f1_score"],
                "video/train_loss": v_train["loss"], "video/train_acc": v_train["accuracy"], "video/train_f1": v_train["f1_score"],
                "video/val_loss": v_val["loss"], "video/val_acc": v_val["accuracy"], "video/val_f1": v_val["f1_score"],
            })

        if a_val["f1_score"] > best_audio_f1:
            best_audio_f1 = a_val["f1_score"]
            save_path = Path(output_dir) / "best_audio_encoder"
            audio_enc.model.save_pretrained(str(save_path))
            print(f"✓ Saved best audio encoder → {save_path} (F1={best_audio_f1:.4f})")

        if v_val["f1_score"] > best_video_f1:
            best_video_f1 = v_val["f1_score"]
            save_path = Path(output_dir) / "best_video_encoder"
            video_enc.model.save_pretrained(str(save_path))
            print(f"✓ Saved best video encoder → {save_path} (F1={best_video_f1:.4f})")

    print("\n" + "="*60)
    print("Training complete!")
    print(f"Best Audio F1: {best_audio_f1:.4f} | Best Video F1: {best_video_f1:.4f}")
    print("="*60)

    if WANDB:
        wandb.finish()

    return audio_enc, video_enc, best_audio_f1, best_video_f1




In [7]:

if __name__ == "__main__":
    train_encoders(
        metadata_path="/content/processed_data/metadata.json",
        output_dir="/content/trained_encoders",
        audio_model="superb/wav2vec2-base-superb-er",
        video_model="facebook/timesformer-base-finetuned-k400",
        num_epochs=20,
        batch_size=4,
        val_split=0.2,
        audio_window_s=1.5,
        video_Ts=16,
        video_max_frames=64,
        use_wandb=True,
        seed=42,
        audio_freeze_epochs=2,
        video_freeze_epochs=1,
    )


  | |_| | '_ \/ _` / _` |  _/ -_)


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mkatrinpochtar[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Device: cuda | Train: 576 | Val: 144


Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


config.json: 0.00B [00:00, ?B/s]



pytorch_model.bin:   0%|          | 0.00/378M [00:00<?, ?B/s]

Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at superb/wav2vec2-base-superb-er and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([4, 256]) in the checkpoint and torch.Size([8, 256]) in the model instantiated
- classifier.bias: found shape torch.Size([4]) in the checkpoint and torch.Size([8]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


preprocessor_config.json:   0%|          | 0.00/215 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/378M [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/486M [00:00<?, ?B/s]

Some weights of TimesformerForVideoClassification were not initialized from the model checkpoint at facebook/timesformer-base-finetuned-k400 and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([400, 768]) in the checkpoint and torch.Size([8, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([400]) in the checkpoint and torch.Size([8]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


preprocessor_config.json:   0%|          | 0.00/412 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/486M [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.



Epoch 1/20




Training (Audio):   1%|          | 1/144 [00:00<02:00,  1.19it/s][A
Training (Audio):   2%|▏         | 3/144 [00:01<00:39,  3.54it/s][A
Training (Audio):   3%|▎         | 4/144 [00:01<00:31,  4.50it/s][A
Training (Audio):   3%|▎         | 5/144 [00:01<00:25,  5.49it/s][A
Training (Audio):   4%|▍         | 6/144 [00:01<00:21,  6.38it/s][A
Training (Audio):   5%|▍         | 7/144 [00:01<00:19,  7.10it/s][A
Training (Audio):   6%|▌         | 8/144 [00:01<00:17,  7.57it/s][A
Training (Audio):   6%|▋         | 9/144 [00:01<00:16,  8.05it/s][A
Training (Audio):   7%|▋         | 10/144 [00:01<00:16,  8.21it/s][A
Training (Audio):   8%|▊         | 11/144 [00:01<00:15,  8.39it/s][A
Training (Audio):   8%|▊         | 12/144 [00:01<00:15,  8.69it/s][A
Training (Audio):   9%|▉         | 13/144 [00:02<00:15,  8.71it/s][A
Training (Audio):  10%|▉         | 14/144 [00:02<00:14,  8.90it/s][A
Training (Audio):  10%|█         | 15/144 [00:02<00:14,  9.19it/s][A
Training (Audio):  12%|█▏ 

[Audio] Train: loss=2.0724 acc=0.1198 f1=0.0957
[Audio]   Val: loss=2.0573 acc=0.2569 f1=0.1931


Training (Video): 100%|██████████| 144/144 [01:16<00:00,  1.89it/s]
Validation (Video): 100%|██████████| 36/36 [00:19<00:00,  1.87it/s]


[Video] Train: loss=2.1382 acc=0.1719 f1=0.1440
[Video]   Val: loss=2.0686 acc=0.1667 f1=0.1471
✓ Saved best audio encoder → /content/trained_encoders/best_audio_encoder (F1=0.1931)
✓ Saved best video encoder → /content/trained_encoders/best_video_encoder (F1=0.1471)

Epoch 2/20
→ Unfroze TimeSformer backbone


Training (Audio): 100%|██████████| 144/144 [00:10<00:00, 13.37it/s]
Validation (Audio): 100%|██████████| 36/36 [00:01<00:00, 32.34it/s]


[Audio] Train: loss=2.0160 acc=0.2465 f1=0.1710
[Audio]   Val: loss=1.9726 acc=0.2778 f1=0.2073


Training (Video): 100%|██████████| 144/144 [01:49<00:00,  1.32it/s]
Validation (Video): 100%|██████████| 36/36 [00:18<00:00,  1.92it/s]


[Video] Train: loss=1.5132 acc=0.4462 f1=0.4366
[Video]   Val: loss=0.9649 acc=0.6736 f1=0.6420
✓ Saved best audio encoder → /content/trained_encoders/best_audio_encoder (F1=0.2073)
✓ Saved best video encoder → /content/trained_encoders/best_video_encoder (F1=0.6420)

Epoch 3/20
→ Unfroze Wav2Vec2/HubERT feature encoder


Training (Audio): 100%|██████████| 144/144 [00:10<00:00, 13.63it/s]
Validation (Audio): 100%|██████████| 36/36 [00:01<00:00, 32.84it/s]


[Audio] Train: loss=1.8947 acc=0.2726 f1=0.2019
[Audio]   Val: loss=1.8493 acc=0.2917 f1=0.2015


Training (Video): 100%|██████████| 144/144 [01:48<00:00,  1.33it/s]
Validation (Video): 100%|██████████| 36/36 [00:18<00:00,  1.91it/s]


[Video] Train: loss=0.7846 acc=0.7222 f1=0.7202
[Video]   Val: loss=0.8175 acc=0.6806 f1=0.6406

Epoch 4/20


Training (Audio): 100%|██████████| 144/144 [00:14<00:00, 10.21it/s]
Validation (Audio): 100%|██████████| 36/36 [00:01<00:00, 32.48it/s]


[Audio] Train: loss=1.7981 acc=0.3021 f1=0.2047
[Audio]   Val: loss=1.7539 acc=0.3264 f1=0.2191


Training (Video): 100%|██████████| 144/144 [01:48<00:00,  1.32it/s]
Validation (Video): 100%|██████████| 36/36 [00:18<00:00,  1.91it/s]


[Video] Train: loss=0.5224 acc=0.8281 f1=0.8287
[Video]   Val: loss=0.5964 acc=0.7917 f1=0.7812
✓ Saved best audio encoder → /content/trained_encoders/best_audio_encoder (F1=0.2191)
✓ Saved best video encoder → /content/trained_encoders/best_video_encoder (F1=0.7812)

Epoch 5/20


Training (Audio): 100%|██████████| 144/144 [00:10<00:00, 13.72it/s]
Validation (Audio): 100%|██████████| 36/36 [00:01<00:00, 32.41it/s]


[Audio] Train: loss=1.6895 acc=0.3542 f1=0.2483
[Audio]   Val: loss=1.6479 acc=0.3403 f1=0.2446


Training (Video): 100%|██████████| 144/144 [01:49<00:00,  1.32it/s]
Validation (Video): 100%|██████████| 36/36 [00:18<00:00,  1.91it/s]


[Video] Train: loss=0.4221 acc=0.8490 f1=0.8468
[Video]   Val: loss=0.4022 acc=0.8194 f1=0.8172
✓ Saved best audio encoder → /content/trained_encoders/best_audio_encoder (F1=0.2446)
✓ Saved best video encoder → /content/trained_encoders/best_video_encoder (F1=0.8172)

Epoch 6/20


Training (Audio): 100%|██████████| 144/144 [00:10<00:00, 13.61it/s]
Validation (Audio): 100%|██████████| 36/36 [00:01<00:00, 32.56it/s]


[Audio] Train: loss=1.6300 acc=0.3819 f1=0.2900
[Audio]   Val: loss=1.5767 acc=0.3403 f1=0.2279


Training (Video): 100%|██████████| 144/144 [01:43<00:00,  1.39it/s]
Validation (Video): 100%|██████████| 36/36 [00:17<00:00,  2.04it/s]


[Video] Train: loss=0.3024 acc=0.9045 f1=0.9048
[Video]   Val: loss=0.5522 acc=0.8194 f1=0.8139

Epoch 7/20


Training (Audio): 100%|██████████| 144/144 [00:10<00:00, 13.94it/s]
Validation (Audio): 100%|██████████| 36/36 [00:01<00:00, 32.44it/s]


[Audio] Train: loss=1.5420 acc=0.4045 f1=0.3047
[Audio]   Val: loss=1.5965 acc=0.3403 f1=0.2322


Training (Video): 100%|██████████| 144/144 [01:44<00:00,  1.38it/s]
Validation (Video): 100%|██████████| 36/36 [00:17<00:00,  2.00it/s]


[Video] Train: loss=0.2273 acc=0.9288 f1=0.9292
[Video]   Val: loss=0.5012 acc=0.8125 f1=0.8136

Epoch 8/20


Training (Audio): 100%|██████████| 144/144 [00:10<00:00, 13.81it/s]
Validation (Audio): 100%|██████████| 36/36 [00:01<00:00, 31.49it/s]


[Audio] Train: loss=1.5061 acc=0.4271 f1=0.3308
[Audio]   Val: loss=1.5013 acc=0.4028 f1=0.2933


Training (Video): 100%|██████████| 144/144 [01:44<00:00,  1.38it/s]
Validation (Video): 100%|██████████| 36/36 [00:17<00:00,  2.03it/s]


[Video] Train: loss=0.2099 acc=0.9306 f1=0.9313
[Video]   Val: loss=0.5110 acc=0.8333 f1=0.8274
✓ Saved best audio encoder → /content/trained_encoders/best_audio_encoder (F1=0.2933)
✓ Saved best video encoder → /content/trained_encoders/best_video_encoder (F1=0.8274)

Epoch 9/20


Training (Audio): 100%|██████████| 144/144 [00:13<00:00, 10.66it/s]
Validation (Audio): 100%|██████████| 36/36 [00:01<00:00, 31.49it/s]


[Audio] Train: loss=1.4045 acc=0.4878 f1=0.3934
[Audio]   Val: loss=1.4232 acc=0.4028 f1=0.2931


Training (Video): 100%|██████████| 144/144 [01:51<00:00,  1.29it/s]
Validation (Video): 100%|██████████| 36/36 [00:18<00:00,  1.90it/s]


[Video] Train: loss=0.2221 acc=0.9236 f1=0.9237
[Video]   Val: loss=0.4005 acc=0.8958 f1=0.8955
✓ Saved best video encoder → /content/trained_encoders/best_video_encoder (F1=0.8955)

Epoch 10/20


Training (Audio): 100%|██████████| 144/144 [00:10<00:00, 13.85it/s]
Validation (Audio): 100%|██████████| 36/36 [00:01<00:00, 32.32it/s]


[Audio] Train: loss=1.3594 acc=0.4931 f1=0.4068
[Audio]   Val: loss=1.3871 acc=0.4167 f1=0.3015


Training (Video): 100%|██████████| 144/144 [01:49<00:00,  1.32it/s]
Validation (Video): 100%|██████████| 36/36 [00:19<00:00,  1.85it/s]


[Video] Train: loss=0.1794 acc=0.9549 f1=0.9550
[Video]   Val: loss=0.4950 acc=0.8472 f1=0.8481
✓ Saved best audio encoder → /content/trained_encoders/best_audio_encoder (F1=0.3015)

Epoch 11/20


Training (Audio): 100%|██████████| 144/144 [00:09<00:00, 14.54it/s]
Validation (Audio): 100%|██████████| 36/36 [00:01<00:00, 31.79it/s]


[Audio] Train: loss=1.2497 acc=0.5729 f1=0.4938
[Audio]   Val: loss=1.2355 acc=0.5486 f1=0.4828


Training (Video): 100%|██████████| 144/144 [01:49<00:00,  1.32it/s]
Validation (Video): 100%|██████████| 36/36 [00:18<00:00,  1.91it/s]


[Video] Train: loss=0.1456 acc=0.9583 f1=0.9583
[Video]   Val: loss=0.4338 acc=0.8819 f1=0.8800
✓ Saved best audio encoder → /content/trained_encoders/best_audio_encoder (F1=0.4828)

Epoch 12/20


Training (Audio): 100%|██████████| 144/144 [00:13<00:00, 10.93it/s]
Validation (Audio): 100%|██████████| 36/36 [00:01<00:00, 31.17it/s]


[Audio] Train: loss=1.2249 acc=0.5955 f1=0.5304
[Audio]   Val: loss=1.2299 acc=0.5903 f1=0.5289


Training (Video): 100%|██████████| 144/144 [01:51<00:00,  1.30it/s]
Validation (Video): 100%|██████████| 36/36 [00:19<00:00,  1.86it/s]


[Video] Train: loss=0.0970 acc=0.9809 f1=0.9809
[Video]   Val: loss=0.3225 acc=0.8819 f1=0.8809
✓ Saved best audio encoder → /content/trained_encoders/best_audio_encoder (F1=0.5289)

Epoch 13/20


Training (Audio): 100%|██████████| 144/144 [00:09<00:00, 14.49it/s]
Validation (Audio): 100%|██████████| 36/36 [00:01<00:00, 32.08it/s]


[Audio] Train: loss=1.0988 acc=0.6302 f1=0.5657
[Audio]   Val: loss=1.1848 acc=0.5625 f1=0.4912


Training (Video): 100%|██████████| 144/144 [01:51<00:00,  1.30it/s]
Validation (Video): 100%|██████████| 36/36 [00:19<00:00,  1.84it/s]


[Video] Train: loss=0.1122 acc=0.9618 f1=0.9619
[Video]   Val: loss=0.5836 acc=0.8333 f1=0.8281

Epoch 14/20


Training (Audio): 100%|██████████| 144/144 [00:10<00:00, 14.26it/s]
Validation (Audio): 100%|██████████| 36/36 [00:01<00:00, 32.30it/s]


[Audio] Train: loss=1.0982 acc=0.6267 f1=0.5727
[Audio]   Val: loss=1.3265 acc=0.5347 f1=0.4576


Training (Video): 100%|██████████| 144/144 [01:50<00:00,  1.30it/s]
Validation (Video): 100%|██████████| 36/36 [00:20<00:00,  1.79it/s]


[Video] Train: loss=0.1476 acc=0.9618 f1=0.9624
[Video]   Val: loss=0.6223 acc=0.8264 f1=0.8239

Epoch 15/20


Training (Audio): 100%|██████████| 144/144 [00:10<00:00, 14.07it/s]
Validation (Audio): 100%|██████████| 36/36 [00:01<00:00, 31.82it/s]


[Audio] Train: loss=1.0496 acc=0.6597 f1=0.6133
[Audio]   Val: loss=1.1054 acc=0.5972 f1=0.5367


Training (Video): 100%|██████████| 144/144 [01:51<00:00,  1.29it/s]
Validation (Video): 100%|██████████| 36/36 [00:19<00:00,  1.84it/s]


[Video] Train: loss=0.0956 acc=0.9740 f1=0.9739
[Video]   Val: loss=0.5583 acc=0.8681 f1=0.8659
✓ Saved best audio encoder → /content/trained_encoders/best_audio_encoder (F1=0.5367)

Epoch 16/20


Training (Audio): 100%|██████████| 144/144 [00:10<00:00, 14.04it/s]
Validation (Audio): 100%|██████████| 36/36 [00:01<00:00, 31.82it/s]


[Audio] Train: loss=0.9645 acc=0.6927 f1=0.6651
[Audio]   Val: loss=1.0109 acc=0.7083 f1=0.6989


Training (Video): 100%|██████████| 144/144 [01:50<00:00,  1.30it/s]
Validation (Video): 100%|██████████| 36/36 [00:19<00:00,  1.86it/s]


[Video] Train: loss=0.1229 acc=0.9670 f1=0.9670
[Video]   Val: loss=0.6023 acc=0.8750 f1=0.8739
✓ Saved best audio encoder → /content/trained_encoders/best_audio_encoder (F1=0.6989)

Epoch 17/20


Training (Audio): 100%|██████████| 144/144 [00:09<00:00, 15.16it/s]
Validation (Audio): 100%|██████████| 36/36 [00:01<00:00, 33.02it/s]


[Audio] Train: loss=0.8948 acc=0.7274 f1=0.7021
[Audio]   Val: loss=0.9164 acc=0.7222 f1=0.7181


Training (Video): 100%|██████████| 144/144 [01:49<00:00,  1.31it/s]
Validation (Video): 100%|██████████| 36/36 [00:19<00:00,  1.84it/s]


[Video] Train: loss=0.0797 acc=0.9774 f1=0.9775
[Video]   Val: loss=0.6001 acc=0.8403 f1=0.8374
✓ Saved best audio encoder → /content/trained_encoders/best_audio_encoder (F1=0.7181)

Epoch 18/20


Training (Audio): 100%|██████████| 144/144 [00:10<00:00, 13.79it/s]
Validation (Audio): 100%|██████████| 36/36 [00:01<00:00, 31.44it/s]


[Audio] Train: loss=0.8822 acc=0.7378 f1=0.7227
[Audio]   Val: loss=0.9148 acc=0.7292 f1=0.7253


Training (Video): 100%|██████████| 144/144 [01:51<00:00,  1.29it/s]
Validation (Video): 100%|██████████| 36/36 [00:19<00:00,  1.84it/s]


[Video] Train: loss=0.1032 acc=0.9740 f1=0.9740
[Video]   Val: loss=0.5703 acc=0.8611 f1=0.8609
✓ Saved best audio encoder → /content/trained_encoders/best_audio_encoder (F1=0.7253)

Epoch 19/20


Training (Audio): 100%|██████████| 144/144 [00:10<00:00, 13.79it/s]
Validation (Audio): 100%|██████████| 36/36 [00:01<00:00, 33.29it/s]


[Audio] Train: loss=0.8069 acc=0.7726 f1=0.7607
[Audio]   Val: loss=0.9015 acc=0.7778 f1=0.7791


Training (Video): 100%|██████████| 144/144 [01:49<00:00,  1.32it/s]
Validation (Video): 100%|██████████| 36/36 [00:18<00:00,  1.92it/s]


[Video] Train: loss=0.0678 acc=0.9826 f1=0.9826
[Video]   Val: loss=0.6097 acc=0.8681 f1=0.8636
✓ Saved best audio encoder → /content/trained_encoders/best_audio_encoder (F1=0.7791)

Epoch 20/20


Training (Audio): 100%|██████████| 144/144 [00:10<00:00, 14.29it/s]
Validation (Audio): 100%|██████████| 36/36 [00:01<00:00, 33.06it/s]


[Audio] Train: loss=0.7226 acc=0.7882 f1=0.7777
[Audio]   Val: loss=0.8005 acc=0.7847 f1=0.7864


Training (Video): 100%|██████████| 144/144 [01:52<00:00,  1.28it/s]
Validation (Video): 100%|██████████| 36/36 [00:20<00:00,  1.79it/s]


[Video] Train: loss=0.0580 acc=0.9878 f1=0.9879
[Video]   Val: loss=0.4600 acc=0.8750 f1=0.8753
✓ Saved best audio encoder → /content/trained_encoders/best_audio_encoder (F1=0.7864)

Training complete!
Best Audio F1: 0.7864 | Best Video F1: 0.8955


0,1
audio/train_acc,▁▂▃▃▃▄▄▄▅▅▆▆▆▆▇▇▇▇██
audio/train_f1,▁▂▂▂▃▃▃▃▄▄▅▅▆▆▆▇▇▇██
audio/train_loss,██▇▇▆▆▅▅▅▄▄▄▃▃▃▂▂▂▁▁
audio/val_acc,▁▁▁▂▂▂▂▃▃▃▅▅▅▅▆▇▇▇██
audio/val_f1,▁▁▁▁▂▁▁▂▂▂▄▅▅▄▅▇▇▇██
audio/val_loss,██▇▆▆▅▅▅▄▄▃▃▃▄▃▂▂▂▂▁
epoch,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
video/train_acc,▁▃▆▇▇▇▇█▇███████████
video/train_f1,▁▃▆▇▇▇██▇███████████
video/train_loss,█▆▃▃▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁

0,1
audio/train_acc,0.78819
audio/train_f1,0.77765
audio/train_loss,0.72265
audio/val_acc,0.78472
audio/val_f1,0.78639
audio/val_loss,0.80054
epoch,20
video/train_acc,0.98785
video/train_f1,0.98786
video/train_loss,0.05797
