In [3]:
!pip install -q transformers wandb


In [4]:
import wandb
# Authenticate this session with Weights & Biases. When no credentials are cached
# this prompts interactively for an API key (see the cell output), so the cell
# cannot run unattended on a fresh machine.
wandb.login()

  | |_| | '_ \/ _` / _` |  _/ -_)


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mkatrinpochtar[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [5]:
import json
import random
import warnings
from functools import partial
from pathlib import Path

import numpy as np
import torch
import torch.nn as nn
import torchaudio
from torch.amp import GradScaler, autocast
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from sklearn.metrics import f1_score, accuracy_score
from transformers import (
    Wav2Vec2ForSequenceClassification,
    HubertForSequenceClassification,
    Wav2Vec2FeatureExtractor,
    AutoImageProcessor,
    TimesformerForVideoClassification,
)

# NOTE(review): this silences *every* warning for the rest of the session,
# including deprecation and numerical warnings raised by torch/transformers/sklearn.
warnings.filterwarnings("ignore")

# Device string; compared against the literal "cuda" elsewhere to toggle AMP.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# JSON file listing all samples (read by EmotionDataset). Absolute Colab path.
METADATA = "/content/processed_data/metadata.json"
# Root directory under which each experiment saves its best checkpoint.
OUT_DIR = Path("/content/trained_encoders")
# Side effect at import/run time: creates the output directory if missing.
OUT_DIR.mkdir(parents=True, exist_ok=True)
# Size of the classification head requested when loading each model.
NUM_EMOTIONS = 8

print(f"Device: {DEVICE}")




Device: cuda


In [6]:
class EmotionDataset(Dataset):
    """One split of the metadata JSON, yielding a single modality per sample.

    The metadata file is a JSON list of records. Every record must carry
    ``"split"`` and ``"emotion_idx"``; ``"audio_path"`` is read when
    ``modality == "audio"`` and ``"frames_path"`` when ``modality == "video"``.

    Args:
        metadata_path: Path to the metadata JSON file.
        split: Only records whose ``"split"`` equals this value are kept.
        modality: ``"audio"`` or ``"video"``. Any other value yields items
            that contain only the label.
    """

    def __init__(self, metadata_path: str, split: str, modality: str):
        # JSON is UTF-8 by specification; do not depend on the platform default.
        with open(metadata_path, encoding="utf-8") as f:
            data = json.load(f)
        self.samples = [s for s in data if s["split"] == split]
        self.modality = modality

    def __len__(self):
        return len(self.samples)

    @staticmethod
    def _to_mono(wav):
        """Collapse a ``(channels, samples)`` waveform to a 1-D ``(samples,)`` tensor.

        A single-channel input is squeezed (the original behaviour); a
        multi-channel input is averaged across channels. Previously a stereo
        file stayed 2-D after ``squeeze(0)``, which breaks the 1-D cropping and
        padding done downstream.
        """
        if wav.dim() == 1:
            return wav
        if wav.size(0) == 1:
            return wav.squeeze(0)
        return wav.mean(dim=0)

    def __getitem__(self, idx):
        """Return ``{"emotion": label}`` plus ``"audio"`` or ``"video"`` for the modality."""
        s = self.samples[idx]
        item = {"emotion": s["emotion_idx"]}
        if self.modality == "audio":
            # NOTE(review): the file's sample rate is discarded here; downstream
            # code assumes 16 kHz — confirm the preprocessing guarantees that.
            wav, _ = torchaudio.load(s["audio_path"])
            item["audio"] = self._to_mono(wav)
        elif self.modality == "video":
            frames = np.load(s["frames_path"])
            # Axis 3 is moved to position 1 and values are scaled by 1/255
            # (presumably (T, H, W, C) uint8 frames -> (T, C, H, W) in [0, 1]).
            item["video"] = torch.from_numpy(frames).permute(0, 3, 1, 2).float() / 255.0
        return item


def collate_fn(batch):
    """Merge per-sample dicts into one batch dict.

    Labels become a single tensor. Audio stays a Python list of tensors
    (nothing is stacked here), while video tensors are stacked along a new
    leading dimension. Which modality keys are emitted is decided from the
    first sample only.
    """
    labels = torch.tensor([sample["emotion"] for sample in batch])
    collated = {"emotion": labels}
    head = batch[0]
    if "audio" in head:
        collated["audio"] = [sample["audio"] for sample in batch]
    if "video" in head:
        videos = [sample["video"] for sample in batch]
        collated["video"] = torch.stack(videos)
    return collated


In [7]:
def crop_audio(wav, sr, duration, train):
    """Return a window of exactly ``round(duration * sr)`` samples from ``wav``.

    Waveforms that are not longer than the window are zero-padded on the right.
    Longer ones are cropped: at a random offset when ``train`` is true, or
    centred otherwise.
    """
    target_len = int(round(duration * sr))
    surplus = wav.numel() - target_len
    if surplus <= 0:
        # Too short (or exact fit): pad the tail up to the target length.
        return torch.nn.functional.pad(wav, (0, -surplus))
    if train:
        offset = torch.randint(0, surplus + 1, ()).item()
    else:
        offset = surplus // 2
    return wav[offset:offset + target_len]


def crop_video(video, n_frames, train):
    """Return exactly ``n_frames`` frames taken along the first axis of ``video``.

    Clips with more frames than requested are cut to a contiguous window
    (random start when ``train`` is true, centred otherwise). Clips with
    ``n_frames`` or fewer are resampled at evenly spaced, rounded indices, so
    frames repeat when the clip is too short.
    """
    total = video.shape[0]
    if total > n_frames:
        spare = total - n_frames
        first = torch.randint(0, spare + 1, ()).item() if train else spare // 2
        return video[first:first + n_frames]
    picks = torch.linspace(0, total - 1, n_frames).round().long()
    return video[picks]


def prepare_audio(batch, processor, window_s, device, train=True):
    """Turn a collated audio batch into model keyword arguments and labels.

    Each waveform is cropped or padded to ``window_s`` seconds (random crop when
    ``train`` is true, centre crop otherwise), run through ``processor``, and
    moved to ``device``.

    Returns:
        ``(model_inputs, labels)`` where ``model_inputs`` always has
        ``"input_values"`` and has ``"attention_mask"`` only when the processor
        produced one.
    """
    sample_rate = 16000  # fixed rate passed to both the cropper and the processor
    clips = []
    for waveform in batch["audio"]:
        clips.append(crop_audio(waveform, sample_rate, window_s, train).numpy())

    encoded = processor(
        clips,
        sampling_rate=sample_rate,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=int(window_s * sample_rate),
    )

    model_inputs = {"input_values": encoded["input_values"].to(device)}
    if "attention_mask" in encoded:
        model_inputs["attention_mask"] = encoded["attention_mask"].to(device)
    labels = batch["emotion"].to(device)
    return model_inputs, labels


def prepare_video(batch, processor, n_frames, device, train=True):
    """Turn a collated video batch into model keyword arguments and labels.

    Every clip is reduced to ``n_frames`` frames, each frame is converted to a
    channels-last NumPy array, and the nested list of frames is handed to
    ``processor``. ``do_rescale=False`` is passed so the processor does not
    rescale pixel values again.

    Returns:
        ``({"pixel_values": tensor_on_device}, labels_on_device)``.
    """
    frame_lists = []
    for video in batch["video"]:
        sampled = crop_video(video, n_frames, train)
        # Iterating a tensor walks its first axis, i.e. one frame at a time.
        frame_lists.append([frame.permute(1, 2, 0).numpy() for frame in sampled])

    encoded = processor(frame_lists, return_tensors="pt", do_rescale=False)
    model_inputs = {"pixel_values": encoded["pixel_values"].to(device)}
    labels = batch["emotion"].to(device)
    return model_inputs, labels

In [8]:
def train_one_epoch(model, loader, prep_fn, optimizer, scaler):
    """Run one optimisation pass over ``loader`` and report epoch metrics.

    Each step uses mixed precision when ``DEVICE`` is ``"cuda"``, unscales the
    gradients, clips their global norm to 1.0, and then steps through the
    ``GradScaler``.

    Returns:
        Dict with ``"loss"`` (mean of per-batch losses), ``"acc"`` and
        weighted ``"f1"`` computed over all predictions made during the epoch.
    """
    model.train()
    use_amp = DEVICE == "cuda"
    criterion = nn.CrossEntropyLoss()
    running_loss = 0.0
    all_preds, all_labels = [], []

    for batch in tqdm(loader, leave=False):
        inputs, targets = prep_fn(batch, train=True)
        optimizer.zero_grad(set_to_none=True)

        with autocast("cuda", enabled=use_amp):
            logits = model(**inputs).logits
            loss = criterion(logits, targets)

        scaler.scale(loss).backward()
        # Gradients must be unscaled before clipping so the threshold applies
        # to their true magnitude.
        scaler.unscale_(optimizer)
        nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        scaler.step(optimizer)
        scaler.update()

        running_loss += loss.item()
        all_preds.extend(logits.argmax(1).detach().cpu().tolist())
        all_labels.extend(targets.cpu().tolist())

    metrics = {"loss": running_loss / len(loader)}
    metrics["acc"] = accuracy_score(all_labels, all_preds)
    metrics["f1"] = f1_score(all_labels, all_preds, average="weighted")
    return metrics


@torch.no_grad()
def evaluate(model, loader, prep_fn):
    """Score ``model`` on ``loader`` without computing gradients.

    Batches are prepared with ``train=False`` and the forward pass uses mixed
    precision when ``DEVICE`` is ``"cuda"``.

    Returns:
        Dict with ``"loss"`` (mean of per-batch losses), ``"acc"`` and
        weighted ``"f1"`` over the whole loader.
    """
    model.eval()
    use_amp = DEVICE == "cuda"
    criterion = nn.CrossEntropyLoss()
    running_loss = 0.0
    all_preds, all_labels = [], []

    for batch in tqdm(loader, leave=False):
        inputs, targets = prep_fn(batch, train=False)
        with autocast("cuda", enabled=use_amp):
            logits = model(**inputs).logits
            loss = criterion(logits, targets)
        running_loss += loss.item()
        all_preds.extend(logits.argmax(1).cpu().tolist())
        all_labels.extend(targets.cpu().tolist())

    metrics = {"loss": running_loss / len(loader)}
    metrics["acc"] = accuracy_score(all_labels, all_preds)
    metrics["f1"] = f1_score(all_labels, all_preds, average="weighted")
    return metrics

In [9]:
def seed_all(seed=42):
    """Seed Python's ``random``, NumPy, and PyTorch (plus all CUDA devices if present)."""
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)


def run_experiment(cfg):
    """Fine-tune one encoder described by ``cfg`` and keep its best checkpoint.

    Args:
        cfg: Experiment configuration dict. Required keys: ``"name"``,
            ``"modality"`` (``"audio"``; anything else takes the video branch),
            ``"model"`` (checkpoint id), ``"lr"``, ``"batch_size"``,
            ``"epochs"``. Optional keys and defaults: ``"window_s"`` (3.0,
            audio), ``"n_frames"`` (8, video), ``"freeze_epochs"`` (2),
            ``"patience"`` (5), ``"seed"`` (42).

    Returns:
        ``{"name", "best_f1", "path"}`` where ``path`` is the directory holding
        the checkpoint with the best validation F1.

    Side effects:
        Starts and finishes a wandb run, writes the best model and processor
        under ``OUT_DIR / cfg["name"]``, and prints one line per epoch.

    Notes:
        * The wandb run is always closed, with a non-zero exit code when
          training raises, so a crash does not leave a dangling run that the
          next experiment would log into.
        * The first evaluated epoch is always checkpointed, so the returned
          path exists whenever at least one epoch ran (previously nothing was
          saved if validation F1 never rose above 0.0).
    """
    seed_all(cfg.get("seed", 42))
    wandb.init(project="uncanny-valley-encoders", name=cfg["name"],
               group=cfg["modality"], config=cfg, reinit=True)

    modality = cfg["modality"]
    save_path = OUT_DIR / cfg["name"]
    best_f1, patience_cnt = 0.0, 0
    checkpoint_saved = False
    # Pre-bound so the cleanup in ``finally`` is valid even if setup fails early.
    model = optimizer = scaler = None
    exit_code = 1  # flipped to 0 only when the run completes normally

    try:
        train_ds = EmotionDataset(METADATA, "train", modality)
        val_ds = EmotionDataset(METADATA, "val", modality)
        train_loader = DataLoader(train_ds, batch_size=cfg["batch_size"], shuffle=True,
                                  num_workers=0, collate_fn=collate_fn)
        val_loader = DataLoader(val_ds, batch_size=cfg["batch_size"], shuffle=False,
                                num_workers=0, collate_fn=collate_fn)

        if modality == "audio":
            model_cls = (HubertForSequenceClassification if "hubert" in cfg["model"].lower()
                         else Wav2Vec2ForSequenceClassification)
            # ignore_mismatched_sizes lets the classification head be re-created
            # with NUM_EMOTIONS outputs instead of the checkpoint's label count.
            model = model_cls.from_pretrained(
                cfg["model"], num_labels=NUM_EMOTIONS, ignore_mismatched_sizes=True)
            processor = Wav2Vec2FeatureExtractor.from_pretrained(cfg["model"])
            prep_fn = partial(prepare_audio, processor=processor,
                              window_s=cfg.get("window_s", 3.0), device=DEVICE)
            if hasattr(model, "freeze_feature_encoder"):
                model.freeze_feature_encoder()
        else:
            model = TimesformerForVideoClassification.from_pretrained(
                cfg["model"], num_labels=NUM_EMOTIONS, ignore_mismatched_sizes=True)
            processor = AutoImageProcessor.from_pretrained(cfg["model"])
            prep_fn = partial(prepare_video, processor=processor,
                              n_frames=cfg.get("n_frames", 8), device=DEVICE)
            # Warm-up phase for video: train only the classifier head.
            for param_name, param in model.named_parameters():
                if "classifier" not in param_name:
                    param.requires_grad = False

        model.to(DEVICE)
        optimizer = torch.optim.AdamW(
            filter(lambda p: p.requires_grad, model.parameters()), lr=cfg["lr"])
        scaler = GradScaler(enabled=DEVICE == "cuda")

        for epoch in range(cfg["epochs"]):
            # After the warm-up, make every parameter trainable and restart the
            # optimizer at a 10x smaller learning rate (optimizer state is reset).
            # NOTE(review): this also unfreezes the audio feature encoder that
            # freeze_feature_encoder() froze above — confirm that is intended.
            if epoch == cfg.get("freeze_epochs", 2):
                for param in model.parameters():
                    param.requires_grad = True
                optimizer = torch.optim.AdamW(model.parameters(), lr=cfg["lr"] * 0.1)
                scaler = GradScaler(enabled=DEVICE == "cuda")

            t = train_one_epoch(model, train_loader, prep_fn, optimizer, scaler)
            v = evaluate(model, val_loader, prep_fn)

            wandb.log({
                "epoch": epoch + 1,
                "train/loss": t["loss"], "train/acc": t["acc"], "train/f1": t["f1"],
                "val/loss": v["loss"], "val/acc": v["acc"], "val/f1": v["f1"],
                "lr": optimizer.param_groups[0]["lr"],
            })
            print(f"  [{epoch+1:2d}/{cfg['epochs']}] "
                  f"t_f1={t['f1']:.3f} v_f1={v['f1']:.3f} v_loss={v['loss']:.3f}")

            # Save on improvement, and unconditionally the first time so that
            # ``save_path`` is never returned without a checkpoint in it.
            if v["f1"] > best_f1 or not checkpoint_saved:
                best_f1 = v["f1"]
                save_path.mkdir(parents=True, exist_ok=True)
                model.save_pretrained(str(save_path))
                processor.save_pretrained(str(save_path))
                checkpoint_saved = True
                patience_cnt = 0
            else:
                patience_cnt += 1
                if patience_cnt >= cfg.get("patience", 5):
                    print(f"  Early stopping at epoch {epoch+1}")
                    break

        wandb.log({"best_val_f1": best_f1})
        exit_code = 0
    finally:
        wandb.finish(exit_code=exit_code)
        # The optimizer keeps references to the parameters, so dropping only the
        # model would leave them alive; release both before flushing the cache.
        model = optimizer = scaler = None
        torch.cuda.empty_cache()

    print(f"  Best F1: {best_f1:.4f} -> {save_path}\n")
    return {"name": cfg["name"], "best_f1": best_f1, "path": str(save_path)}

In [10]:
def _audio_cfg(family, checkpoint, lr_tag, lr, window_s):
    """Build one audio run config; all audio runs share batch size and schedule."""
    return {
        "name": f"{family}-lr{lr_tag}-w{int(window_s)}s", "modality": "audio",
        "model": checkpoint,
        "lr": lr, "window_s": window_s, "batch_size": 8,
        "epochs": 25, "freeze_epochs": 3, "patience": 5,
    }


def _video_cfg(lr_tag, lr, n_frames, batch_size, freeze_epochs=1):
    """Build one TimeSformer run config; a non-default warm-up is tagged in the name."""
    suffix = "" if freeze_epochs == 1 else f"-freeze{freeze_epochs}"
    return {
        "name": f"timesformer-lr{lr_tag}-{n_frames}f{suffix}", "modality": "video",
        "model": "facebook/timesformer-base-finetuned-k400",
        "lr": lr, "n_frames": n_frames, "batch_size": batch_size,
        "epochs": 15, "freeze_epochs": freeze_epochs, "patience": 4,
    }


# Audio backbones, each swept over the same four settings:
# three learning rates at a 3 s window, plus the middle learning rate at 2 s.
_AUDIO_BACKBONES = [
    ("wav2vec2", "superb/wav2vec2-base-superb-er"),
    ("hubert", "superb/hubert-base-superb-er"),
]
_AUDIO_GRID = [
    ("1e5", 1e-5, 3.0),
    ("3e5", 3e-5, 3.0),
    ("5e5", 5e-5, 3.0),
    ("3e5", 3e-5, 2.0),
]

EXPERIMENTS = [
    _audio_cfg(family, checkpoint, lr_tag, lr, window_s)
    for family, checkpoint in _AUDIO_BACKBONES
    for lr_tag, lr, window_s in _AUDIO_GRID
] + [
    # Video: TimeSformer, 2 learning rates x 2 frame counts (16-frame clips use
    # a smaller batch), plus one run with a longer classifier-only warm-up.
    _video_cfg("1e5", 1e-5, 8, 4),
    _video_cfg("3e5", 3e-5, 8, 4),
    _video_cfg("1e5", 1e-5, 16, 2),
    _video_cfg("3e5", 3e-5, 16, 2),
    _video_cfg("1e5", 1e-5, 8, 4, freeze_epochs=3),
]

# Run every experiment in order; ``results`` grows as each one finishes, so
# completed runs stay available even if a later one fails.
results = []
banner = "=" * 60
for exp in EXPERIMENTS:
    print(f"{banner}\n{exp['name']}\n{banner}")
    results.append(run_experiment(exp))

wav2vec2-lr1e5-w3s




config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/378M [00:00<?, ?B/s]

Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at superb/wav2vec2-base-superb-er and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([4, 256]) in the checkpoint and torch.Size([8, 256]) in the model instantiated
- classifier.bias: found shape torch.Size([4]) in the checkpoint and torch.Size([8]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


preprocessor_config.json:   0%|          | 0.00/215 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/378M [00:00<?, ?B/s]


  0%|          | 0/128 [00:00<?, ?it/s][A
  1%|          | 1/128 [00:09<19:03,  9.00s/it][A
  2%|▏         | 2/128 [00:09<07:57,  3.79s/it][A
  2%|▏         | 3/128 [00:09<04:23,  2.11s/it][A
  3%|▎         | 4/128 [00:09<02:43,  1.32s/it][A
  4%|▍         | 5/128 [00:09<01:48,  1.14it/s][A
  5%|▍         | 6/128 [00:09<01:15,  1.62it/s][A
  5%|▌         | 7/128 [00:09<00:54,  2.23it/s][A
  6%|▋         | 8/128 [00:09<00:40,  2.96it/s][A
  8%|▊         | 10/128 [00:09<00:26,  4.49it/s][A
  9%|▊         | 11/128 [00:10<00:22,  5.19it/s][A
 10%|█         | 13/128 [00:10<00:17,  6.58it/s][A
 11%|█         | 14/128 [00:10<00:16,  7.09it/s][A
 12%|█▎        | 16/128 [00:10<00:14,  8.00it/s][A
 13%|█▎        | 17/128 [00:10<00:13,  8.28it/s][A
 14%|█▍        | 18/128 [00:10<00:12,  8.62it/s][A
 15%|█▍        | 19/128 [00:10<00:12,  8.82it/s][A
 16%|█▌        | 20/128 [00:10<00:11,  9.03it/s][A
 17%|█▋        | 22/128 [00:11<00:11,  9.39it/s][A
 18%|█▊        | 23/128 [00:

  [ 1/25] t_f1=0.137 v_f1=0.137 v_loss=1.897




  [ 2/25] t_f1=0.171 v_f1=0.190 v_loss=1.815




  [ 3/25] t_f1=0.234 v_f1=0.204 v_loss=1.779




  [ 4/25] t_f1=0.299 v_f1=0.241 v_loss=1.776




  [ 5/25] t_f1=0.315 v_f1=0.239 v_loss=1.767




  [ 6/25] t_f1=0.330 v_f1=0.247 v_loss=1.756




  [ 7/25] t_f1=0.348 v_f1=0.257 v_loss=1.750




  [ 8/25] t_f1=0.356 v_f1=0.256 v_loss=1.756




  [ 9/25] t_f1=0.380 v_f1=0.255 v_loss=1.748




  [10/25] t_f1=0.384 v_f1=0.253 v_loss=1.744




  [11/25] t_f1=0.391 v_f1=0.258 v_loss=1.746




  [12/25] t_f1=0.395 v_f1=0.268 v_loss=1.748




  [13/25] t_f1=0.403 v_f1=0.287 v_loss=1.759




  [14/25] t_f1=0.421 v_f1=0.311 v_loss=1.738




  [15/25] t_f1=0.440 v_f1=0.301 v_loss=1.733




  [16/25] t_f1=0.426 v_f1=0.332 v_loss=1.743




  [17/25] t_f1=0.455 v_f1=0.348 v_loss=1.741




  [18/25] t_f1=0.455 v_f1=0.337 v_loss=1.748




  [19/25] t_f1=0.473 v_f1=0.334 v_loss=1.743




  [20/25] t_f1=0.481 v_f1=0.312 v_loss=1.751




  [21/25] t_f1=0.487 v_f1=0.328 v_loss=1.749




  [22/25] t_f1=0.538 v_f1=0.348 v_loss=1.727




  [23/25] t_f1=0.535 v_f1=0.341 v_loss=1.751




  [24/25] t_f1=0.546 v_f1=0.331 v_loss=1.766




  [25/25] t_f1=0.549 v_f1=0.362 v_loss=1.746


0,1
best_val_f1,▁
epoch,▁▁▂▂▂▂▃▃▃▄▄▄▅▅▅▅▆▆▆▇▇▇▇██
lr,███▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/acc,▁▃▃▄▄▅▅▅▅▆▆▆▆▆▆▆▇▇▇▇▇████
train/f1,▁▂▃▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇████
train/loss,█▆▅▄▄▄▃▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▁▁▁
val/acc,▁▂▂▄▄▅▅▅▅▅▅▅▆▇▆██▇▆▅▆▇▆▆▇
val/f1,▁▃▃▄▄▄▅▅▅▅▅▅▆▆▆▇█▇▇▆▇█▇▇█
val/loss,█▅▃▃▃▂▂▂▂▂▂▂▂▁▁▂▂▂▂▂▂▁▂▃▂

0,1
best_val_f1,0.36239
epoch,25.0
lr,0.0
train/acc,0.59902
train/f1,0.54878
train/loss,1.18685
val/acc,0.38333
val/f1,0.36239
val/loss,1.7457


  Best F1: 0.3624 -> /content/trained_encoders/wav2vec2-lr1e5-w3s

wav2vec2-lr3e5-w3s


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at superb/wav2vec2-base-superb-er and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([4, 256]) in the checkpoint and torch.Size([8, 256]) in the model instantiated
- classifier.bias: found shape torch.Size([4]) in the checkpoint and torch.Size([8]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  [ 1/25] t_f1=0.192 v_f1=0.171 v_loss=1.840




  [ 2/25] t_f1=0.306 v_f1=0.344 v_loss=1.654




  [ 3/25] t_f1=0.462 v_f1=0.379 v_loss=1.641




  [ 4/25] t_f1=0.583 v_f1=0.419 v_loss=1.537




  [ 5/25] t_f1=0.632 v_f1=0.447 v_loss=1.522




  [ 6/25] t_f1=0.650 v_f1=0.456 v_loss=1.485




  [ 7/25] t_f1=0.682 v_f1=0.451 v_loss=1.442




  [ 8/25] t_f1=0.688 v_f1=0.452 v_loss=1.489




  [ 9/25] t_f1=0.736 v_f1=0.444 v_loss=1.563




  [10/25] t_f1=0.762 v_f1=0.507 v_loss=1.458




  [11/25] t_f1=0.798 v_f1=0.470 v_loss=1.520




  [12/25] t_f1=0.839 v_f1=0.466 v_loss=1.541




  [13/25] t_f1=0.845 v_f1=0.498 v_loss=1.603




  [14/25] t_f1=0.862 v_f1=0.514 v_loss=1.576




  [15/25] t_f1=0.884 v_f1=0.504 v_loss=1.583




  [16/25] t_f1=0.891 v_f1=0.491 v_loss=1.637




  [17/25] t_f1=0.902 v_f1=0.497 v_loss=1.676




  [18/25] t_f1=0.907 v_f1=0.529 v_loss=1.652




  [19/25] t_f1=0.928 v_f1=0.545 v_loss=1.620




  [20/25] t_f1=0.933 v_f1=0.523 v_loss=1.709




  [21/25] t_f1=0.923 v_f1=0.545 v_loss=1.724




  [22/25] t_f1=0.935 v_f1=0.551 v_loss=1.715




  [23/25] t_f1=0.951 v_f1=0.558 v_loss=1.686




  [24/25] t_f1=0.958 v_f1=0.552 v_loss=1.820




  [25/25] t_f1=0.962 v_f1=0.564 v_loss=1.800


0,1
best_val_f1,▁
epoch,▁▁▂▂▂▂▃▃▃▄▄▄▅▅▅▅▆▆▆▇▇▇▇██
lr,███▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/acc,▁▂▄▅▅▅▆▆▆▆▆▇▇▇▇▇▇▇███████
train/f1,▁▂▃▅▅▅▅▆▆▆▇▇▇▇▇▇▇████████
train/loss,█▇▆▄▄▄▄▄▃▃▃▃▃▃▂▂▂▂▂▂▂▁▁▁▁
val/acc,▁▃▄▅▆▆▆▆▅▇▆▆▆▇▆▆▆▇█▇▇████
val/f1,▁▄▅▅▆▆▆▆▆▇▆▆▇▇▇▇▇▇█▇█████
val/loss,█▅▄▃▂▂▁▂▃▁▂▃▄▃▃▄▅▅▄▆▆▆▅█▇

0,1
best_val_f1,0.56352
epoch,25.0
lr,0.0
train/acc,0.96176
train/f1,0.96168
train/loss,0.23591
val/acc,0.56111
val/f1,0.56352
val/loss,1.79999


  Best F1: 0.5635 -> /content/trained_encoders/wav2vec2-lr3e5-w3s

wav2vec2-lr5e5-w3s


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at superb/wav2vec2-base-superb-er and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([4, 256]) in the checkpoint and torch.Size([8, 256]) in the model instantiated
- classifier.bias: found shape torch.Size([4]) in the checkpoint and torch.Size([8]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  [ 1/25] t_f1=0.201 v_f1=0.149 v_loss=1.889




  [ 2/25] t_f1=0.383 v_f1=0.311 v_loss=1.632




  [ 3/25] t_f1=0.491 v_f1=0.476 v_loss=1.642




  [ 4/25] t_f1=0.697 v_f1=0.503 v_loss=1.482




  [ 5/25] t_f1=0.759 v_f1=0.507 v_loss=1.485




  [ 6/25] t_f1=0.821 v_f1=0.531 v_loss=1.479




  [ 7/25] t_f1=0.853 v_f1=0.560 v_loss=1.488




  [ 8/25] t_f1=0.868 v_f1=0.554 v_loss=1.538




  [ 9/25] t_f1=0.913 v_f1=0.559 v_loss=1.551




  [10/25] t_f1=0.917 v_f1=0.588 v_loss=1.546




  [11/25] t_f1=0.930 v_f1=0.574 v_loss=1.659




  [12/25] t_f1=0.928 v_f1=0.576 v_loss=1.775




  [13/25] t_f1=0.945 v_f1=0.583 v_loss=1.807




  [14/25] t_f1=0.944 v_f1=0.619 v_loss=1.726




  [15/25] t_f1=0.954 v_f1=0.597 v_loss=1.910




  [16/25] t_f1=0.953 v_f1=0.605 v_loss=2.011




  [17/25] t_f1=0.955 v_f1=0.586 v_loss=2.105




  [18/25] t_f1=0.967 v_f1=0.609 v_loss=2.193


                                               

  [19/25] t_f1=0.973 v_f1=0.598 v_loss=2.354
  Early stopping at epoch 19




0,1
best_val_f1,▁
epoch,▁▁▂▂▃▃▃▄▄▅▅▅▆▆▆▇▇██
lr,███▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/acc,▁▃▄▆▆▇▇▇▇▇█████████
train/f1,▁▃▄▅▆▇▇▇▇▇█████████
train/loss,█▇▅▄▄▃▃▃▂▂▂▂▂▂▁▁▁▁▁
val/acc,▁▄▅▆▆▆▇▇▇▇▇▇▇███▇██
val/f1,▁▃▆▆▆▇▇▇▇█▇▇▇███▇██
val/loss,▄▂▂▁▁▁▁▁▂▂▂▃▄▃▄▅▆▇█

0,1
best_val_f1,0.61924
epoch,19.0
lr,1e-05
train/acc,0.97255
train/f1,0.97255
train/loss,0.11747
val/acc,0.58889
val/f1,0.59828
val/loss,2.35417


  Best F1: 0.6192 -> /content/trained_encoders/wav2vec2-lr5e5-w3s

wav2vec2-lr3e5-w2s


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at superb/wav2vec2-base-superb-er and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([4, 256]) in the checkpoint and torch.Size([8, 256]) in the model instantiated
- classifier.bias: found shape torch.Size([4]) in the checkpoint and torch.Size([8]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  [ 1/25] t_f1=0.158 v_f1=0.122 v_loss=1.866




  [ 2/25] t_f1=0.222 v_f1=0.198 v_loss=1.865




  [ 3/25] t_f1=0.372 v_f1=0.236 v_loss=1.833




  [ 4/25] t_f1=0.395 v_f1=0.350 v_loss=1.742




  [ 5/25] t_f1=0.497 v_f1=0.351 v_loss=1.721




  [ 6/25] t_f1=0.491 v_f1=0.383 v_loss=1.726




  [ 7/25] t_f1=0.537 v_f1=0.411 v_loss=1.680




  [ 8/25] t_f1=0.550 v_f1=0.392 v_loss=1.714




  [ 9/25] t_f1=0.567 v_f1=0.394 v_loss=1.679




  [10/25] t_f1=0.603 v_f1=0.384 v_loss=1.634




  [11/25] t_f1=0.639 v_f1=0.391 v_loss=1.660


                                               

  [12/25] t_f1=0.657 v_f1=0.399 v_loss=1.644
  Early stopping at epoch 12




0,1
best_val_f1,▁
epoch,▁▂▂▃▄▄▅▅▆▇▇█
lr,███▁▁▁▁▁▁▁▁▁
train/acc,▁▂▄▅▆▆▇▇▇▇██
train/f1,▁▂▄▄▆▆▆▆▇▇██
train/loss,█▆▅▄▃▃▂▂▂▂▁▁
val/acc,▁▂▃▇▆▇█▇▇▇▇▇
val/f1,▁▃▄▇▇▇███▇██
val/loss,██▇▄▄▄▂▃▂▁▂▁

0,1
best_val_f1,0.41125
epoch,12.0
lr,0.0
train/acc,0.69412
train/f1,0.65689
train/loss,0.98165
val/acc,0.42778
val/f1,0.39864
val/loss,1.64377


  Best F1: 0.4113 -> /content/trained_encoders/wav2vec2-lr3e5-w2s

hubert-lr1e5-w3s


config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/378M [00:00<?, ?B/s]

Some weights of HubertForSequenceClassification were not initialized from the model checkpoint at superb/hubert-base-superb-er and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([4, 256]) in the checkpoint and torch.Size([8, 256]) in the model instantiated
- classifier.bias: found shape torch.Size([4]) in the checkpoint and torch.Size([8]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


preprocessor_config.json:   0%|          | 0.00/213 [00:00<?, ?B/s]

  0%|          | 0/128 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/378M [00:00<?, ?B/s]



  [ 1/25] t_f1=0.145 v_f1=0.151 v_loss=1.962




  [ 2/25] t_f1=0.136 v_f1=0.105 v_loss=1.924




  [ 3/25] t_f1=0.137 v_f1=0.138 v_loss=1.906




  [ 4/25] t_f1=0.163 v_f1=0.145 v_loss=1.866




  [ 5/25] t_f1=0.171 v_f1=0.159 v_loss=1.869




  [ 6/25] t_f1=0.195 v_f1=0.159 v_loss=1.864




  [ 7/25] t_f1=0.188 v_f1=0.170 v_loss=1.866




  [ 8/25] t_f1=0.203 v_f1=0.180 v_loss=1.860




  [ 9/25] t_f1=0.187 v_f1=0.169 v_loss=1.865




  [10/25] t_f1=0.202 v_f1=0.169 v_loss=1.846




  [11/25] t_f1=0.232 v_f1=0.182 v_loss=1.848




  [12/25] t_f1=0.213 v_f1=0.178 v_loss=1.840




  [13/25] t_f1=0.254 v_f1=0.214 v_loss=1.847




  [14/25] t_f1=0.246 v_f1=0.193 v_loss=1.835




  [15/25] t_f1=0.263 v_f1=0.211 v_loss=1.839




  [16/25] t_f1=0.268 v_f1=0.220 v_loss=1.846




  [17/25] t_f1=0.278 v_f1=0.244 v_loss=1.843




  [18/25] t_f1=0.285 v_f1=0.230 v_loss=1.826




  [19/25] t_f1=0.299 v_f1=0.273 v_loss=1.831




  [20/25] t_f1=0.304 v_f1=0.249 v_loss=1.822




  [21/25] t_f1=0.319 v_f1=0.250 v_loss=1.836




  [22/25] t_f1=0.339 v_f1=0.276 v_loss=1.829




  [23/25] t_f1=0.335 v_f1=0.281 v_loss=1.821




  [24/25] t_f1=0.361 v_f1=0.289 v_loss=1.820


                                               

  [25/25] t_f1=0.369 v_f1=0.275 v_loss=1.817




0,1
best_val_f1,▁
epoch,▁▁▂▂▂▂▃▃▃▄▄▄▅▅▅▅▆▆▆▇▇▇▇██
lr,███▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/acc,▁▃▃▄▄▄▄▄▄▄▅▅▆▅▆▆▆▆▆▇▇▇▇██
train/f1,▁▁▁▂▂▃▃▃▃▃▄▃▅▄▅▅▅▅▆▆▆▇▇██
train/loss,█▆▅▄▄▃▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▁▁▁▁
val/acc,▃▁▂▃▃▃▄▄▄▄▄▄▅▄▅▅▆▅▇▆▆▇███
val/f1,▃▁▂▃▃▃▃▄▃▃▄▄▅▄▅▅▆▆▇▆▆▇██▇
val/loss,█▆▅▃▄▃▃▃▃▂▂▂▂▂▂▂▂▁▂▁▂▂▁▁▁

0,1
best_val_f1,0.28948
epoch,25.0
lr,0.0
train/acc,0.44314
train/f1,0.36916
train/loss,1.50773
val/acc,0.36111
val/f1,0.27508
val/loss,1.81675


  Best F1: 0.2895 -> /content/trained_encoders/hubert-lr1e5-w3s

hubert-lr3e5-w3s


Some weights of HubertForSequenceClassification were not initialized from the model checkpoint at superb/hubert-base-superb-er and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([4, 256]) in the checkpoint and torch.Size([8, 256]) in the model instantiated
- classifier.bias: found shape torch.Size([4]) in the checkpoint and torch.Size([8]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  [ 1/25] t_f1=0.164 v_f1=0.125 v_loss=1.927




  [ 2/25] t_f1=0.215 v_f1=0.168 v_loss=1.810




  [ 3/25] t_f1=0.308 v_f1=0.306 v_loss=1.703




  [ 4/25] t_f1=0.433 v_f1=0.322 v_loss=1.679




  [ 5/25] t_f1=0.452 v_f1=0.341 v_loss=1.666




  [ 6/25] t_f1=0.464 v_f1=0.366 v_loss=1.641




  [ 7/25] t_f1=0.482 v_f1=0.355 v_loss=1.643




  [ 8/25] t_f1=0.506 v_f1=0.347 v_loss=1.630




  [ 9/25] t_f1=0.505 v_f1=0.356 v_loss=1.574




  [10/25] t_f1=0.536 v_f1=0.366 v_loss=1.562




  [11/25] t_f1=0.555 v_f1=0.429 v_loss=1.567




  [12/25] t_f1=0.588 v_f1=0.402 v_loss=1.579




  [13/25] t_f1=0.606 v_f1=0.410 v_loss=1.536




  [14/25] t_f1=0.603 v_f1=0.454 v_loss=1.531




  [15/25] t_f1=0.601 v_f1=0.454 v_loss=1.521




  [16/25] t_f1=0.632 v_f1=0.478 v_loss=1.527




  [17/25] t_f1=0.700 v_f1=0.489 v_loss=1.515




  [18/25] t_f1=0.711 v_f1=0.445 v_loss=1.520




  [19/25] t_f1=0.721 v_f1=0.445 v_loss=1.519




  [20/25] t_f1=0.764 v_f1=0.510 v_loss=1.449




  [21/25] t_f1=0.757 v_f1=0.530 v_loss=1.482




  [22/25] t_f1=0.783 v_f1=0.522 v_loss=1.474




  [23/25] t_f1=0.782 v_f1=0.538 v_loss=1.445




  [24/25] t_f1=0.811 v_f1=0.515 v_loss=1.510




  [25/25] t_f1=0.840 v_f1=0.535 v_loss=1.479


0,1
best_val_f1,▁
epoch,▁▁▂▂▂▂▃▃▃▄▄▄▅▅▅▅▆▆▆▇▇▇▇██
lr,███▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/acc,▁▂▃▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇▇██
train/f1,▁▂▂▄▄▄▄▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇▇██
train/loss,█▇▆▅▅▅▄▄▄▄▃▃▃▃▃▃▂▂▂▂▂▁▁▁▁
val/acc,▁▂▄▄▅▅▅▅▅▆▇▆▆▇▇▇▇▆▆██████
val/f1,▁▂▄▄▅▅▅▅▅▅▆▆▆▇▇▇▇▆▆██████
val/loss,█▆▅▄▄▄▄▄▃▃▃▃▂▂▂▂▂▂▂▁▂▁▁▂▂

0,1
best_val_f1,0.53756
epoch,25.0
lr,0.0
train/acc,0.84412
train/f1,0.83963
train/loss,0.74571
val/acc,0.53889
val/f1,0.535
val/loss,1.47942


  Best F1: 0.5376 -> /content/trained_encoders/hubert-lr3e5-w3s

hubert-lr5e5-w3s


Some weights of HubertForSequenceClassification were not initialized from the model checkpoint at superb/hubert-base-superb-er and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([4, 256]) in the checkpoint and torch.Size([8, 256]) in the model instantiated
- classifier.bias: found shape torch.Size([4]) in the checkpoint and torch.Size([8]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  [ 1/25] t_f1=0.170 v_f1=0.150 v_loss=2.002




  [ 2/25] t_f1=0.233 v_f1=0.219 v_loss=1.796




  [ 3/25] t_f1=0.360 v_f1=0.342 v_loss=1.821




  [ 4/25] t_f1=0.478 v_f1=0.310 v_loss=1.635




  [ 5/25] t_f1=0.559 v_f1=0.392 v_loss=1.537




  [ 6/25] t_f1=0.557 v_f1=0.449 v_loss=1.461




  [ 7/25] t_f1=0.621 v_f1=0.474 v_loss=1.430




  [ 8/25] t_f1=0.664 v_f1=0.517 v_loss=1.419




  [ 9/25] t_f1=0.705 v_f1=0.517 v_loss=1.354




  [10/25] t_f1=0.703 v_f1=0.539 v_loss=1.307




  [11/25] t_f1=0.750 v_f1=0.509 v_loss=1.309




  [12/25] t_f1=0.778 v_f1=0.521 v_loss=1.344




  [13/25] t_f1=0.818 v_f1=0.568 v_loss=1.306




  [14/25] t_f1=0.816 v_f1=0.581 v_loss=1.314




  [15/25] t_f1=0.830 v_f1=0.557 v_loss=1.380




  [16/25] t_f1=0.850 v_f1=0.538 v_loss=1.410




  [17/25] t_f1=0.891 v_f1=0.560 v_loss=1.349




  [18/25] t_f1=0.873 v_f1=0.550 v_loss=1.462




  [19/25] t_f1=0.906 v_f1=0.610 v_loss=1.392




  [20/25] t_f1=0.911 v_f1=0.616 v_loss=1.343




  [21/25] t_f1=0.897 v_f1=0.624 v_loss=1.364




  [22/25] t_f1=0.916 v_f1=0.616 v_loss=1.420




  [23/25] t_f1=0.926 v_f1=0.618 v_loss=1.513




  [24/25] t_f1=0.931 v_f1=0.607 v_loss=1.537


                                               

  [25/25] t_f1=0.942 v_f1=0.606 v_loss=1.548




0,1
best_val_f1,▁
epoch,▁▁▂▂▂▂▃▃▃▄▄▄▅▅▅▅▆▆▆▇▇▇▇██
lr,███▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/acc,▁▁▃▄▅▅▅▆▆▆▆▇▇▇▇▇▇▇███████
train/f1,▁▂▃▄▅▅▅▅▆▆▆▇▇▇▇▇█▇███████
train/loss,█▇▆▅▅▅▄▄▄▃▃▃▃▃▂▂▂▂▂▂▂▁▁▁▁
val/acc,▁▂▄▃▅▅▆▇▇▇▆▆▇▇▇▆▇▇███████
val/f1,▁▂▄▃▅▅▆▆▆▇▆▆▇▇▇▇▇▇███████
val/loss,█▆▆▄▃▃▂▂▁▁▁▁▁▁▂▂▁▃▂▁▂▂▃▃▃

0,1
best_val_f1,0.62386
epoch,25.0
lr,1e-05
train/acc,0.94216
train/f1,0.94212
train/loss,0.31352
val/acc,0.60556
val/f1,0.60599
val/loss,1.54775


  Best F1: 0.6239 -> /content/trained_encoders/hubert-lr5e5-w3s

hubert-lr3e5-w2s


Some weights of HubertForSequenceClassification were not initialized from the model checkpoint at superb/hubert-base-superb-er and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([4, 256]) in the checkpoint and torch.Size([8, 256]) in the model instantiated
- classifier.bias: found shape torch.Size([4]) in the checkpoint and torch.Size([8]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  [ 1/25] t_f1=0.157 v_f1=0.111 v_loss=1.855




  [ 2/25] t_f1=0.153 v_f1=0.183 v_loss=1.983




  [ 3/25] t_f1=0.253 v_f1=0.209 v_loss=1.880




  [ 4/25] t_f1=0.276 v_f1=0.233 v_loss=1.876




  [ 5/25] t_f1=0.257 v_f1=0.194 v_loss=1.811




  [ 6/25] t_f1=0.250 v_f1=0.215 v_loss=1.772




  [ 7/25] t_f1=0.254 v_f1=0.218 v_loss=1.739




  [ 8/25] t_f1=0.255 v_f1=0.229 v_loss=1.780


                                               

  [ 9/25] t_f1=0.265 v_f1=0.230 v_loss=1.753
  Early stopping at epoch 9




0,1
best_val_f1,▁
epoch,▁▂▃▄▅▅▆▇█
lr,███▁▁▁▁▁▁
train/acc,▁▃▆██▇███
train/f1,▁▁▇█▇▇▇▇▇
train/loss,█▅▄▂▂▂▁▁▁
val/acc,▁▂▇█▆▇█▇█
val/f1,▁▅▇█▆▇▇██
val/loss,▄█▅▅▃▂▁▂▁

0,1
best_val_f1,0.23269
epoch,9.0
lr,0.0
train/acc,0.38039
train/f1,0.26525
train/loss,1.46878
val/acc,0.34444
val/f1,0.2302
val/loss,1.75284


  Best F1: 0.2327 -> /content/trained_encoders/hubert-lr3e5-w2s

timesformer-lr1e5-8f


config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/486M [00:00<?, ?B/s]

Some weights of TimesformerForVideoClassification were not initialized from the model checkpoint at facebook/timesformer-base-finetuned-k400 and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([400, 768]) in the checkpoint and torch.Size([8, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([400]) in the checkpoint and torch.Size([8]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


preprocessor_config.json:   0%|          | 0.00/412 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/486M [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.

  0%|          | 0/255 [00:00<?, ?it/s][A
  0%|          | 1/255 [00:01<04:31,  1.07s/it][A
  1%|          | 2/255 [00:01<02:13,  1.89it/s][A
  1%|          | 3/255 [00:01<01:28,  2.84it/s][A
  2%|▏         | 4/255 [00:01<01:06,  3.76it/s][A
  2%|▏         | 5/255 [00:01<00:54,  4.55it/s][A
  2%|▏         | 6/255 [00:01<00:47,  5.21it/s][A
  3%|▎         | 7/255 [00:01<00:43,  5.76it/s][A
  3%|▎         | 8/255 [00:02<00:38,  6.34it/s][A
  4%|▎         | 9/255 [00:02<00:36,  6.73it/s][A
  4%|▍         | 10/255 [00:02<00:35,  6.95it/s][A
  4%|▍         | 11/255 [00:02<00:34,  7.10it/s][A
  5%|▍         | 12/255 [00:02<00:33,  7.26it/s][A
  5%|▌         | 

  [ 1/15] t_f1=0.105 v_f1=0.088 v_loss=2.120




  [ 2/15] t_f1=0.219 v_f1=0.184 v_loss=1.918




  [ 3/15] t_f1=0.422 v_f1=0.260 v_loss=1.755




  [ 4/15] t_f1=0.546 v_f1=0.377 v_loss=1.630




  [ 5/15] t_f1=0.598 v_f1=0.337 v_loss=1.616




  [ 6/15] t_f1=0.682 v_f1=0.347 v_loss=1.539




  [ 7/15] t_f1=0.738 v_f1=0.369 v_loss=1.523




  [ 8/15] t_f1=0.766 v_f1=0.391 v_loss=1.466




  [ 9/15] t_f1=0.815 v_f1=0.381 v_loss=1.534




  [10/15] t_f1=0.827 v_f1=0.423 v_loss=1.406




  [11/15] t_f1=0.852 v_f1=0.412 v_loss=1.443




  [12/15] t_f1=0.857 v_f1=0.419 v_loss=1.438




  [13/15] t_f1=0.874 v_f1=0.417 v_loss=1.507




  [14/15] t_f1=0.887 v_f1=0.481 v_loss=1.446


                                               

  [15/15] t_f1=0.888 v_f1=0.441 v_loss=1.399




0,1
best_val_f1,▁
epoch,▁▁▂▃▃▃▄▅▅▅▆▇▇▇█
lr,█▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/acc,▁▂▄▅▅▆▇▇▇▇█████
train/f1,▁▂▄▅▅▆▇▇▇▇█████
train/loss,█▇▆▅▄▄▃▃▂▂▂▂▁▁▁
val/acc,▁▃▅▆▆▆▆▆▆▇▇▇▇█▇
val/f1,▁▃▄▆▅▆▆▆▆▇▇▇▇█▇
val/loss,█▆▄▃▃▂▂▂▂▁▁▁▂▁▁

0,1
best_val_f1,0.48106
epoch,15.0
lr,0.0
train/acc,0.88922
train/f1,0.88834
train/loss,0.40332
val/acc,0.50556
val/f1,0.44145
val/loss,1.39943


  Best F1: 0.4811 -> /content/trained_encoders/timesformer-lr1e5-8f

timesformer-lr3e5-8f


Some weights of TimesformerForVideoClassification were not initialized from the model checkpoint at facebook/timesformer-base-finetuned-k400 and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([400, 768]) in the checkpoint and torch.Size([8, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([400]) in the checkpoint and torch.Size([8]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  [ 1/15] t_f1=0.149 v_f1=0.178 v_loss=2.027




  [ 2/15] t_f1=0.368 v_f1=0.352 v_loss=1.624




  [ 3/15] t_f1=0.595 v_f1=0.314 v_loss=1.549




  [ 4/15] t_f1=0.712 v_f1=0.509 v_loss=1.297




  [ 5/15] t_f1=0.799 v_f1=0.469 v_loss=1.298




  [ 6/15] t_f1=0.858 v_f1=0.458 v_loss=1.463




  [ 7/15] t_f1=0.866 v_f1=0.466 v_loss=1.382


                                               

  [ 8/15] t_f1=0.903 v_f1=0.499 v_loss=1.571
  Early stopping at epoch 8




0,1
best_val_f1,▁
epoch,▁▂▃▄▅▆▇█
lr,█▁▁▁▁▁▁▁
train/acc,▁▃▅▆▇███
train/f1,▁▃▅▆▇███
train/loss,█▇▄▃▂▂▁▁
val/acc,▁▅▅█▇▇▇█
val/f1,▁▅▄█▇▇▇█
val/loss,█▄▃▁▁▃▂▄

0,1
best_val_f1,0.50926
epoch,8.0
lr,0.0
train/acc,0.90294
train/f1,0.90257
train/loss,0.32862
val/acc,0.55
val/f1,0.49942
val/loss,1.5706


  Best F1: 0.5093 -> /content/trained_encoders/timesformer-lr3e5-8f

timesformer-lr1e5-16f


Some weights of TimesformerForVideoClassification were not initialized from the model checkpoint at facebook/timesformer-base-finetuned-k400 and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([400, 768]) in the checkpoint and torch.Size([8, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([400]) in the checkpoint and torch.Size([8]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  [ 1/15] t_f1=0.104 v_f1=0.127 v_loss=2.077




  [ 2/15] t_f1=0.295 v_f1=0.313 v_loss=1.769




  [ 3/15] t_f1=0.513 v_f1=0.315 v_loss=1.595




  [ 4/15] t_f1=0.676 v_f1=0.421 v_loss=1.462




  [ 5/15] t_f1=0.755 v_f1=0.434 v_loss=1.406




  [ 6/15] t_f1=0.818 v_f1=0.474 v_loss=1.329




  [ 7/15] t_f1=0.863 v_f1=0.472 v_loss=1.414




  [ 8/15] t_f1=0.875 v_f1=0.506 v_loss=1.467




  [ 9/15] t_f1=0.912 v_f1=0.444 v_loss=1.572




  [10/15] t_f1=0.921 v_f1=0.504 v_loss=1.450




  [11/15] t_f1=0.932 v_f1=0.520 v_loss=1.564




  [12/15] t_f1=0.945 v_f1=0.519 v_loss=1.589




  [13/15] t_f1=0.954 v_f1=0.538 v_loss=1.713




  [14/15] t_f1=0.961 v_f1=0.532 v_loss=1.713




  [15/15] t_f1=0.960 v_f1=0.546 v_loss=1.635


0,1
best_val_f1,▁
epoch,▁▁▂▃▃▃▄▅▅▅▆▇▇▇█
lr,█▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/acc,▁▂▄▆▆▇▇▇███████
train/f1,▁▃▄▆▆▇▇▇███████
train/loss,█▇▆▄▄▃▂▂▂▂▁▁▁▁▁
val/acc,▁▅▅▆▆▇▇▇▆▇█████
val/f1,▁▄▄▆▆▇▇▇▆▇█████
val/loss,█▅▃▂▂▁▂▂▃▂▃▃▅▅▄

0,1
best_val_f1,0.54606
epoch,15.0
lr,0.0
train/acc,0.96078
train/f1,0.96043
train/loss,0.12252
val/acc,0.6
val/f1,0.54606
val/loss,1.6348


  Best F1: 0.5461 -> /content/trained_encoders/timesformer-lr1e5-16f

timesformer-lr3e5-16f


Some weights of TimesformerForVideoClassification were not initialized from the model checkpoint at facebook/timesformer-base-finetuned-k400 and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([400, 768]) in the checkpoint and torch.Size([8, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([400]) in the checkpoint and torch.Size([8]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  [ 1/15] t_f1=0.170 v_f1=0.246 v_loss=1.955




  [ 2/15] t_f1=0.462 v_f1=0.404 v_loss=1.469




  [ 3/15] t_f1=0.720 v_f1=0.432 v_loss=1.517




  [ 4/15] t_f1=0.811 v_f1=0.482 v_loss=1.588




  [ 5/15] t_f1=0.882 v_f1=0.549 v_loss=1.395




  [ 6/15] t_f1=0.921 v_f1=0.523 v_loss=1.756




  [ 7/15] t_f1=0.944 v_f1=0.590 v_loss=1.552




  [ 8/15] t_f1=0.962 v_f1=0.584 v_loss=1.732




  [ 9/15] t_f1=0.980 v_f1=0.556 v_loss=1.857




  [10/15] t_f1=0.978 v_f1=0.575 v_loss=1.786


                                               

  [11/15] t_f1=0.977 v_f1=0.574 v_loss=1.769
  Early stopping at epoch 11




0,1
best_val_f1,▁
epoch,▁▂▂▃▄▅▅▆▇▇█
lr,█▁▁▁▁▁▁▁▁▁▁
train/acc,▁▄▆▇▇▇█████
train/f1,▁▄▆▇▇▇█████
train/loss,█▆▄▃▂▂▁▁▁▁▁
val/acc,▁▅▅▆▇▇██▇▇█
val/f1,▁▄▅▆▇▇██▇██
val/loss,█▂▃▃▁▆▃▅▇▆▆

0,1
best_val_f1,0.58972
epoch,11.0
lr,0.0
train/acc,0.97647
train/f1,0.97653
train/loss,0.0652
val/acc,0.61667
val/f1,0.57418
val/loss,1.76896


  Best F1: 0.5897 -> /content/trained_encoders/timesformer-lr3e5-16f

timesformer-lr1e5-8f-freeze3


Some weights of TimesformerForVideoClassification were not initialized from the model checkpoint at facebook/timesformer-base-finetuned-k400 and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([400, 768]) in the checkpoint and torch.Size([8, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([400]) in the checkpoint and torch.Size([8]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  [ 1/15] t_f1=0.105 v_f1=0.088 v_loss=2.120




  [ 2/15] t_f1=0.137 v_f1=0.119 v_loss=2.067




  [ 3/15] t_f1=0.183 v_f1=0.159 v_loss=2.024




  [ 4/15] t_f1=0.296 v_f1=0.259 v_loss=1.847




  [ 5/15] t_f1=0.458 v_f1=0.250 v_loss=1.743




  [ 6/15] t_f1=0.573 v_f1=0.299 v_loss=1.637




  [ 7/15] t_f1=0.629 v_f1=0.317 v_loss=1.589




  [ 8/15] t_f1=0.686 v_f1=0.394 v_loss=1.506




  [ 9/15] t_f1=0.745 v_f1=0.360 v_loss=1.546




  [10/15] t_f1=0.770 v_f1=0.424 v_loss=1.399




  [11/15] t_f1=0.810 v_f1=0.411 v_loss=1.444




  [12/15] t_f1=0.818 v_f1=0.421 v_loss=1.409




  [13/15] t_f1=0.844 v_f1=0.419 v_loss=1.473




  [14/15] t_f1=0.872 v_f1=0.464 v_loss=1.415


                                               

  [15/15] t_f1=0.862 v_f1=0.460 v_loss=1.372




0,1
best_val_f1,▁
epoch,▁▁▂▃▃▃▄▅▅▅▆▇▇▇█
lr,███▁▁▁▁▁▁▁▁▁▁▁▁
train/acc,▁▁▁▂▄▅▆▆▇▇▇▇███
train/f1,▁▁▂▃▄▅▆▆▇▇▇████
train/loss,██▇▇▆▅▄▃▃▂▂▂▁▁▁
val/acc,▁▁▂▄▅▅▆▇▆▇▇▇▇██
val/f1,▁▂▂▄▄▅▅▇▆▇▇▇▇██
val/loss,██▇▅▄▃▃▂▃▁▂▁▂▁▁

0,1
best_val_f1,0.46417
epoch,15.0
lr,0.0
train/acc,0.86373
train/f1,0.86207
train/loss,0.48112
val/acc,0.51111
val/f1,0.45973
val/loss,1.37221


  Best F1: 0.4642 -> /content/trained_encoders/timesformer-lr1e5-8f-freeze3



In [11]:
# Leaderboard of every entry in `results`, highest best-validation F1 first.
print()
print("=" * 60)
print("RESULTS SUMMARY")
print("=" * 60)
# Column layout: 30-char left-aligned name, then a 12-char right-aligned score.
print("{:30s} {:>12s}".format("Name", "Best Val F1"))
print("-" * 44)
# Negating the key sorts descending while keeping ties in their original order.
for r in sorted(results, key=lambda rec: -rec["best_f1"]):
    print("{:30s} {:12.4f}".format(r["name"], r["best_f1"]))


RESULTS SUMMARY
Name                            Best Val F1
--------------------------------------------
hubert-lr5e5-w3s                     0.6239
wav2vec2-lr5e5-w3s                   0.6192
timesformer-lr3e5-16f                0.5897
wav2vec2-lr3e5-w3s                   0.5635
timesformer-lr1e5-16f                0.5461
hubert-lr3e5-w3s                     0.5376
timesformer-lr3e5-8f                 0.5093
timesformer-lr1e5-8f                 0.4811
timesformer-lr1e5-8f-freeze3         0.4642
wav2vec2-lr3e5-w2s                   0.4113
wav2vec2-lr1e5-w3s                   0.3624
hubert-lr1e5-w3s                     0.2895
hubert-lr3e5-w2s                     0.2327


In [12]:
# Release GPU memory once the runs above have finished.
# `torch.cuda.empty_cache()` can only hand back cached blocks that no live
# tensor occupies, so collect garbage first: unreachable objects (for example
# reference cycles) may otherwise keep CUDA tensors alive and pin their memory.
import gc

import torch

gc.collect()
torch.cuda.empty_cache()
print("GPU RAM cleaned.")

GPU RAM cleaned.


|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      |  17536 KiB |   7016 MiB | 740660 GiB | 740660 GiB |
|       from large pool |  17536 KiB |   7010 MiB | 734773 GiB | 734773 GiB |
|       from small pool |      0 KiB |    111 MiB |   5886 GiB |   5886 GiB |
|---------------------------------------------------------------------------|
| Active memory         |  17536 KiB |   7016 MiB | 740660 GiB | 740660 GiB |
|       from large pool |  17536 KiB |   7010 MiB | 734773 GiB | 734773 GiB |
|       from small pool |      0 KiB |    111 MiB |   5886 GiB |   5886 GiB |
|---------------------------------------------------------------