In [None]:
# 1️⃣ Install the CMU Multimodal SDK
# Clones the SDK repository, switches the notebook's working directory into
# the checkout, and pip-installs it from source together with `validators`.
# NOTE(review): nothing here is version-pinned, and because of the `%cd` a
# second run of this cell clones inside the previous checkout.
!git clone https://github.com/ecfm/CMU-MultimodalSDK.git
%cd CMU-MultimodalSDK
!pip install .
!pip install validators

# 2️⃣ Import required modules
from mmsdk import mmdatasdk as md
import os
import torch

# 3️⃣ Define the paths to your MOSEI .csd files
# One .csd file per computational sequence; the dict key is the name under
# which the sequence is addressed on the loaded dataset.
mosei_csd_files = {
    "CMU_MOSEI_TimestampedWordVectors": "/content/drive/MyDrive/EmotionFusion/MOSEI_features/CMU_MOSEI_TimestampedWordVectors.csd",
    "CMU_MOSEI_COVAREP": "/content/drive/MyDrive/EmotionFusion/MOSEI_features/CMU_MOSEI_COVAREP.csd",
    "CMU_MOSEI_VisualFacet42": "/content/drive/MyDrive/EmotionFusion/MOSEI_features/CMU_MOSEI_VisualFacet42.csd",
    "CMU_MOSEI_Labels": "/content/drive/MyDrive/EmotionFusion/MOSEI_features/CMU_MOSEI_Labels.csd"
}

#  Load the dataset
dataset = md.mmdataset(mosei_csd_files)

#  Inspect available modalities and sample keys
label_keys = list(dataset["CMU_MOSEI_Labels"].keys())
print("Modalities loaded:", dataset.keys())
print("Sample utterance keys (first 5):", label_keys[:5])

#  Access features for a single utterance (looked up once; the original cell
#  repeated the same lookups twice)
sample_key = label_keys[0]  # example: pick first utterance
text_features = dataset["CMU_MOSEI_TimestampedWordVectors"][sample_key]['features']
audio_features = dataset["CMU_MOSEI_COVAREP"][sample_key]['features']
visual_features = dataset["CMU_MOSEI_VisualFacet42"][sample_key]['features']

# ✅ get label data directly from dataset.data dict
label_data = dataset['CMU_MOSEI_Labels'].data[sample_key]['features']
label = label_data  # kept so later cells that expect `label` still find it

# The label features are 2-D (later cells reduce them with `.mean(0)`), so the
# sentiment / emotion split has to be taken inside a single row.  Indexing the
# array itself with [0] / [1:] would select whole rows instead.
first_label_row = label_data[0]
sentiment_score = first_label_row[0]
emotion_classes = first_label_row[1:]


print(f"\nFeatures for sample {sample_key}:")
print(f"  Text: shape {text_features.shape}")
print(f"  Audio: shape {audio_features.shape}")
print(f"  Visual: shape {visual_features.shape}")
print(f"  Label sentiment: {sentiment_score}")
print(f"  Label emotions: {emotion_classes}")



In [None]:

# Align every computational sequence to the word-vector sequence.
print("⏳ Aligning all modalities to text...")
dataset.align('CMU_MOSEI_TimestampedWordVectors')
print("✅ Alignment complete.")


# Sanity check: the first label key must also exist in the text sequence.
# Membership is tested against `.keys()` so the check does not depend on the
# SDK sequence class implementing `__contains__`.
sample_key = list(dataset['CMU_MOSEI_Labels'].keys())[0]
if sample_key in dataset['CMU_MOSEI_TimestampedWordVectors'].keys():
    print(f"✅ Sample {sample_key} verified in aligned dataset.")
else:
    print("⚠️ Sample not found after alignment.")

In [None]:
import pickle

# Persist the aligned dataset object to Google Drive so later sessions can
# skip the alignment step.
# NOTE(review): unpickling this file later requires the SDK (`mmsdk`) to be
# importable, since the pickled object is the SDK dataset itself.
save_path = "/content/drive/MyDrive/EmotionFusion/Aligned_MOSEI.pkl"

with open(save_path, 'wb') as f:
    pickle.dump(dataset, f)

print(f"✅ Saved aligned dataset to {save_path}")


In [None]:

# Fresh-session bootstrap: reinstall the SDK (needed to unpickle the dataset
# object) and reload the aligned dataset saved by the previous cell.
!git clone https://github.com/ecfm/CMU-MultimodalSDK.git
%cd CMU-MultimodalSDK
!pip install .
!pip install validators


from mmsdk import mmdatasdk as md
import pickle

# NOTE(review): pickle.load executes arbitrary code from the file; only load
# pickles you produced yourself.
with open("/content/drive/MyDrive/EmotionFusion/Aligned_MOSEI.pkl", "rb") as f:
    dataset = pickle.load(f)

In [None]:
# Mount Google Drive (the dataset pickle and checkpoints live there), then
# install the SDK and training dependencies for this session.
from google.colab import drive
drive.mount('/content/drive')

# NOTE(review): all installs below are unpinned.
!git clone https://github.com/ecfm/CMU-MultimodalSDK.git
%cd CMU-MultimodalSDK
!pip install .
!pip install validators
!pip install pytorch_lightning

In [None]:

import pickle, torch, torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pack_padded_sequence
import pytorch_lightning as pl
from pytorch_lightning import Trainer, Callback
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from torchmetrics.classification import MultilabelF1Score, MultilabelAccuracy

def safe_tensor(arr, clamp=10_000.0):
    """Convert ``arr`` into a finite, bounded float32 tensor.

    NaN, +inf and -inf entries are replaced by 0, after which every value is
    limited to the interval ``[-clamp, clamp]``.
    """
    raw = torch.tensor(arr, dtype=torch.float32)
    finite = raw.nan_to_num(nan=0.0, posinf=0.0, neginf=0.0)
    return finite.clamp(min=-clamp, max=clamp)

def z_norm(seq, eps=1e-8):
    """Standardise ``seq`` along dimension 0.

    Each column has its mean removed and is divided by its population
    standard deviation plus ``eps`` (which keeps constant columns finite).
    """
    center = seq.mean(0, keepdim=True)
    spread = seq.std(0, keepdim=True, unbiased=False)
    return seq.sub(center).div(spread.add(eps))

# Load the aligned dataset object produced by the alignment cell.
# NOTE(review): pickle.load runs arbitrary code from the file — only load
# pickles from a trusted source.
with open("/content/drive/MyDrive/EmotionFusion/Aligned_MOSEI.pkl", "rb") as f:
    dataset = pickle.load(f)
print(" Dataset loaded.")

from mmsdk.mmdatasdk.dataset.standard_datasets.CMU_MOSEI.cmu_mosei_std_folds import \
    standard_train_fold, standard_valid_fold
def vid_id(k):
    """Return the part of key ``k`` before its first '[', without leading '-'.

    NOTE(review): if identifiers in the fold lists can themselves start with
    '-', stripping it here would make those entries unmatchable — confirm.
    """
    prefix, _, _ = k.partition('[')
    return prefix.lstrip('-')

# Keys of every sequence, and the subset of keys present in all four of them.
keys = {name: dataset[name].keys() for name in
        ['CMU_MOSEI_TimestampedWordVectors','CMU_MOSEI_COVAREP',
         'CMU_MOSEI_VisualFacet42','CMU_MOSEI_Labels']}
avail = set.intersection(*map(set, keys.values()))

# The fold definitions are tested once per key; converting them to sets turns
# each membership test into a hash lookup instead of a scan of the fold.
_train_fold = set(standard_train_fold)
_valid_fold = set(standard_valid_fold)

# Keep label keys whose video id belongs to the fold and that have every modality.
train_ids = [k for k in keys['CMU_MOSEI_Labels']
             if k in avail and vid_id(k) in _train_fold]
val_ids   = [k for k in keys['CMU_MOSEI_Labels']
             if k in avail and vid_id(k) in _valid_fold]
print(f" Filtered splits: {len(train_ids)} train • {len(val_ids)} val")

class MOSEI(Dataset):
    """Map-style dataset over the module-level ``dataset`` for a list of keys.

    Every item is ``(text, audio, visual, label)``.  The three feature tensors
    are sanitised with ``safe_tensor`` and standardised with ``z_norm``; the
    label is the mean over dimension 0 of the label features, with NaNs
    replaced and values clamped to [0, 1].
    """

    _FEATURE_SEQUENCES = ('CMU_MOSEI_TimestampedWordVectors',
                          'CMU_MOSEI_COVAREP',
                          'CMU_MOSEI_VisualFacet42')

    def __init__(self, ids):
        self.ids = ids

    def __len__(self):
        return len(self.ids)

    def __getitem__(self, idx):
        key = self.ids[idx]
        # Same order as the returned tuple: text, audio, visual.
        x_t, x_a, x_v = [z_norm(safe_tensor(dataset[name][key]['features']))
                         for name in self._FEATURE_SEQUENCES]
        raw_label = torch.tensor(dataset['CMU_MOSEI_Labels'][key]['features'],
                                 dtype=torch.float32)
        y = torch.nan_to_num(raw_label.mean(0)).clamp(0, 1)
        return x_t, x_a, x_v, y

def collate(batch):
    """Pad a list of ``(text, audio, visual, label)`` samples into a batch.

    Returns ``(text, text_len, audio, audio_len, visual, visual_len, labels)``
    where each modality is zero-padded along dimension 1 (batch first) and the
    ``*_len`` tensors hold the original sequence lengths.
    """
    texts, audios, visuals, labels = zip(*batch)
    pieces = []
    for seqs in (texts, audios, visuals):
        lengths = torch.tensor([s.size(0) for s in seqs])
        pieces.append(nn.utils.rnn.pad_sequence(seqs, batch_first=True))
        pieces.append(lengths)
    pieces.append(torch.stack(labels))
    return tuple(pieces)

# Batch loaders: the training loader shuffles, the validation loader does not.
train_loader = DataLoader(MOSEI(train_ids), batch_size=32, shuffle=True,
                          collate_fn=collate, num_workers=4, pin_memory=True)
val_loader   = DataLoader(MOSEI(val_ids), batch_size=32, shuffle=False,
                          collate_fn=collate, num_workers=4, pin_memory=True)

# Per-label count of training samples whose label value is > 0, turned into
# the negative/positive ratio used as `pos_weight` by the loss.
bin_counts = torch.zeros(7)
for *_features, y in MOSEI(train_ids):
    bin_counts += (y > 0).float()
pos_weight = (len(train_ids) - bin_counts) / (bin_counts + 1e-6)
print(f"  pos_weight = {pos_weight.tolist()}")

class XModal(pl.LightningModule):
    """Late-fusion GRU classifier over text, audio and visual sequences.

    Each modality is encoded by its own single-layer GRU; the three final
    hidden states are concatenated and mapped to 7 logits by a linear layer.
    Training uses ``BCEWithLogitsLoss`` with the supplied ``pos_weight``.

    ``thresh`` is the decision threshold applied to the sigmoid outputs.  It
    is a plain Python attribute, so it is written to / read from checkpoints
    explicitly through the checkpoint hooks at the bottom of the class.
    """

    def __init__(self, pos_weight):
        """``pos_weight``: tensor of 7 per-label positive weights for the loss."""
        super().__init__()
        self.text_rnn   = nn.GRU(300,256,batch_first=True)
        self.audio_rnn  = nn.GRU(74 ,128,batch_first=True)
        self.visual_rnn = nn.GRU(35 ,128,batch_first=True)
        self.fc = nn.Linear(256+128+128,7)
        self.loss = nn.BCEWithLogitsLoss(pos_weight=pos_weight)
        self.thresh = 0.3
        self._micro = MultilabelF1Score(7,threshold=self.thresh,average='micro')
        self._macro = MultilabelF1Score(7,threshold=self.thresh,average='macro')
        self._acc   = MultilabelAccuracy(7,threshold=self.thresh)

    @staticmethod
    def _last(rnn,x,l):
        """Run ``rnn`` over padded batch ``x`` (lengths ``l``) and return the final hidden state."""
        packed = pack_padded_sequence(x,l.cpu(),batch_first=True,enforce_sorted=False)
        return rnn(packed)[1].squeeze(0)

    def forward(self,t,tlen,a,alen,v,vlen):
        """Return the 7 logits for a padded (text, audio, visual) batch."""
        h = torch.cat([self._last(self.text_rnn,t,tlen),
                       self._last(self.audio_rnn,a,alen),
                       self._last(self.visual_rnn,v,vlen)],-1)
        return self.fc(h)

    def _common_step(self,b):
        """Return ``(loss, logits, binary_targets)`` for one collated batch."""
        t,tlen,a,alen,v,vlen,y = b
        logit = self(t,tlen,a,alen,v,vlen)
        loss = self.loss(logit,y)
        # The loss sees the soft labels; the metrics see them binarised at > 0.
        return loss, logit, (y>0).int()

    def training_step(self,b,_):
        loss,_,_ = self._common_step(b)
        self.log("train_loss",loss,prog_bar=True)
        return loss

    def validation_step(self,b,_):
        loss,logit,yt = self._common_step(b)
        self.log("val_loss",loss,prog_bar=True)
        # Predictions are thresholded here, so the metrics receive hard 0/1 values.
        pred = (torch.sigmoid(logit)>self.thresh).int()
        self._micro.update(pred,yt); self._macro.update(pred,yt); self._acc.update(pred,yt)

    def on_validation_epoch_end(self):
        micro = self._micro.compute(); macro = self._macro.compute(); acc = self._acc.compute()
        self.log_dict({"val_f1_micro":micro,"val_f1_macro":macro,"val_acc":acc},prog_bar=True)
        self._micro.reset(); self._macro.reset(); self._acc.reset()

    def configure_optimizers(self):
        return torch.optim.AdamW(self.parameters(),lr=1e-4,amsgrad=True)

    def on_save_checkpoint(self, checkpoint):
        """Store the current decision threshold alongside the weights."""
        checkpoint["thresh"] = float(self.thresh)

    def on_load_checkpoint(self, checkpoint):
        """Restore the decision threshold; older checkpoints keep the default."""
        self.thresh = float(checkpoint.get("thresh", self.thresh))

class SweepThreshold(Callback):
    """After validation, pick the decision threshold with the best micro-F1.

    Every ``every`` epochs the callback runs the module over the module-level
    ``val_loader``, evaluates 10 thresholds between 0.05 and 0.5 and stores
    the best one on ``pl_module.thresh``.  The metrics the module logged for
    the epoch that just finished were computed with the previous threshold.
    """

    def __init__(self, every=1):
        self.every = every

    def on_validation_epoch_end(self,tr,pl_module):
        # Skip Lightning's pre-training sanity check: sweeping on an untrained
        # model over a full validation pass is wasted work.
        if tr.sanity_checking or (tr.current_epoch+1)%self.every:
            return
        logits, targets = [], []
        pl_module.eval(); dev = pl_module.device
        with torch.no_grad():
            for b in val_loader:
                b = [x.to(dev) for x in b]
                logits.append(pl_module(*b[:-1]).cpu())
                targets.append((b[-1]>0).int().cpu())
        # Sigmoid and metric object are loop-invariant, so build them once.
        probs, targets = torch.sigmoid(torch.cat(logits)), torch.cat(targets)
        scorer = MultilabelF1Score(7,threshold=0.5,average='micro')
        best_t, best = pl_module.thresh, 0.0
        for t in torch.linspace(0.05,0.5,10):
            f1 = scorer((probs>t).int(),targets).item()
            scorer.reset()  # keep candidates independent of each other
            if f1 > best:
                best, best_t = f1, t.item()
        pl_module.thresh = best_t
        print(f"🔄  new decision threshold = {best_t:.2f} (micro-F1={best:.3f})")


# Keep only the checkpoint with the best validation micro-F1, and stop early
# when that metric has not improved for 5 validation runs.
ckpt = ModelCheckpoint("/content/drive/MyDrive/EmotionFusion/checkpoints",
                       filename="best-{epoch:02d}-{val_f1_micro:.3f}",
                       monitor="val_f1_micro",mode="max",save_top_k=1)
early = EarlyStopping(monitor="val_f1_micro",mode="max",patience=5,verbose=True)

# `devices=1` is valid for both the GPU and the CPU accelerator; passing
# `devices=None` on a CPU-only runtime is rejected by Lightning 2.x.
trainer = Trainer(max_epochs=30,accelerator="auto",
                  devices=1,
                  gradient_clip_val=1.0,
                  callbacks=[ckpt,early,SweepThreshold()])

model = XModal(pos_weight)
trainer.fit(model,train_loader,val_loader)


In [None]:
import pickle, torch, torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pack_padded_sequence
import pytorch_lightning as pl
from pytorch_lightning import Trainer
from torchmetrics.classification import MultilabelF1Score, MultilabelAccuracy


def safe_tensor(arr, clamp=10_000.0):
    """Build a float32 tensor from ``arr`` with non-finite values zeroed.

    After NaN / ±inf are mapped to 0 the result is clipped to
    ``[-clamp, clamp]``.
    """
    cleaned = torch.nan_to_num(
        torch.tensor(arr, dtype=torch.float32),
        nan=0.0, posinf=0.0, neginf=0.0,
    )
    lower, upper = -clamp, clamp
    return cleaned.clamp(lower, upper)

def z_norm(seq, eps=1e-8):
    """Z-score ``seq`` column-wise over dimension 0.

    Uses the population standard deviation; ``eps`` is added to the
    denominator so zero-variance columns yield 0 rather than NaN.
    """
    mean_row = seq.mean(0, keepdim=True)
    std_row = seq.std(0, keepdim=True, unbiased=False)
    centered = seq - mean_row
    return centered / (std_row + eps)


# Reload the aligned dataset for evaluation.
# NOTE(review): pickle.load runs arbitrary code from the file — only load
# pickles from a trusted source.
with open("/content/drive/MyDrive/EmotionFusion/Aligned_MOSEI.pkl", "rb") as f:
    dataset = pickle.load(f)

from mmsdk.mmdatasdk.dataset.standard_datasets.CMU_MOSEI.cmu_mosei_std_folds import standard_test_fold
def vid_id(k):
    """Strip the ``[...]`` suffix and any leading '-' characters from key ``k``.

    NOTE(review): ids in the fold list that begin with '-' would no longer
    match after the ``lstrip`` — confirm against the fold contents.
    """
    bracket = k.find('[')
    head = k if bracket < 0 else k[:bracket]
    return head.lstrip('-')

# Keys of every sequence, and the subset of keys present in all four of them.
keys = {name: dataset[name].keys() for name in
        ['CMU_MOSEI_TimestampedWordVectors','CMU_MOSEI_COVAREP',
         'CMU_MOSEI_VisualFacet42','CMU_MOSEI_Labels']}
avail = set.intersection(*map(set, keys.values()))

# Set lookup instead of scanning the fold list once per key.
_test_fold = set(standard_test_fold)
test_ids = [k for k in keys['CMU_MOSEI_Labels']
            if k in avail and vid_id(k) in _test_fold]
print(f"✅ Test split size: {len(test_ids)}")


class MOSEITest(Dataset):
    """Test-split view of the module-level ``dataset``.

    Items are ``(text, audio, visual, label)``; features go through
    ``safe_tensor`` then ``z_norm``, labels are averaged over dimension 0,
    NaN-cleaned and clamped to [0, 1].
    """

    def __init__(self, ids):
        self.ids = ids

    def __len__(self):
        return len(self.ids)

    @staticmethod
    def _features(sequence_name, key):
        """Load, sanitise and standardise one modality for ``key``."""
        return z_norm(safe_tensor(dataset[sequence_name][key]['features']))

    def __getitem__(self, idx):
        key = self.ids[idx]
        t = self._features('CMU_MOSEI_TimestampedWordVectors', key)
        a = self._features('CMU_MOSEI_COVAREP', key)
        v = self._features('CMU_MOSEI_VisualFacet42', key)
        label_rows = torch.tensor(dataset['CMU_MOSEI_Labels'][key]['features'],
                                  dtype=torch.float32)
        y = torch.nan_to_num(label_rows.mean(0)).clamp(0, 1)
        return t, a, v, y

def collate(batch):
    """Turn ``(text, audio, visual, label)`` samples into padded batch tensors.

    Output order: text, text lengths, audio, audio lengths, visual, visual
    lengths, stacked labels.  Padding is zeros, batch dimension first.
    """
    def _pad_with_lengths(seqs):
        return (nn.utils.rnn.pad_sequence(seqs, batch_first=True),
                torch.tensor([s.size(0) for s in seqs]))

    texts, audios, visuals, labels = zip(*batch)
    t_pad, tlen = _pad_with_lengths(texts)
    a_pad, alen = _pad_with_lengths(audios)
    v_pad, vlen = _pad_with_lengths(visuals)
    return (t_pad, tlen, a_pad, alen, v_pad, vlen, torch.stack(labels))

# Deterministic (unshuffled) loader over the test split, batched by `collate`.
test_loader = DataLoader(MOSEITest(test_ids), batch_size=32, shuffle=False,
                         collate_fn=collate, num_workers=4, pin_memory=True)


class XModal(pl.LightningModule):
    """Inference-side copy of the GRU fusion model, matching the training layout.

    The loss module is kept so that its ``pos_weight`` buffer exists when the
    checkpoint's state dict is loaded.
    """

    def __init__(self, pos_weight=None):
        """``pos_weight`` defaults to ones(7); a fresh tensor is built per instance.

        (A tensor literal as the default argument would be created once and
        shared — and overwritten in place — by every instance.)
        """
        super().__init__()
        if pos_weight is None:
            pos_weight = torch.ones(7)
        self.text_rnn = nn.GRU(300,256,batch_first=True)
        self.audio_rnn= nn.GRU(74 ,128,batch_first=True)
        self.visual_rnn=nn.GRU(35 ,128,batch_first=True)
        self.fc = nn.Linear(256+128+128,7)
        self.loss = nn.BCEWithLogitsLoss(pos_weight=pos_weight)
        # Default decision threshold; replaced in `on_load_checkpoint` when the
        # checkpoint carries a "thresh" entry.
        self.thresh = 0.3

    @staticmethod
    def _last(rnn,x,l):
        """Run ``rnn`` over padded batch ``x`` (lengths ``l``) and return the final hidden state."""
        packed = pack_padded_sequence(x,l.cpu(),batch_first=True,enforce_sorted=False)
        return rnn(packed)[1].squeeze(0)

    def forward(self,t,tlen,a,alen,v,vlen):
        """Return the 7 logits for a padded (text, audio, visual) batch."""
        h=torch.cat([self._last(self.text_rnn,t,tlen),
                     self._last(self.audio_rnn,a,alen),
                     self._last(self.visual_rnn,v,vlen)],-1)
        return self.fc(h)

    def on_load_checkpoint(self, checkpoint):
        """Pick up a stored decision threshold; a plain attribute is not part of the state dict."""
        self.thresh = float(checkpoint.get("thresh", self.thresh))


# Restore the best checkpoint and evaluate it on the test split.
ckpt_path = "/content/drive/MyDrive/EmotionFusion/checkpoints/best-epoch=12-val_f1_micro=0.459.ckpt"
model = XModal.load_from_checkpoint(ckpt_path)
print(f"🔑  Loaded checkpoint from {ckpt_path}")
# `thresh` is a plain attribute: unless the model class restores it from the
# checkpoint explicitly, this is simply the class default.
print(f"🔎  Decision threshold in use: {model.thresh:.2f}")

device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device).eval()

micro = MultilabelF1Score(7,threshold=model.thresh,average='micro').to(device)
macro = MultilabelF1Score(7,threshold=model.thresh,average='macro').to(device)
acc   = MultilabelAccuracy(7,threshold=model.thresh).to(device)
per_class_f1 = MultilabelF1Score(7,threshold=model.thresh,average=None).to(device)

with torch.no_grad():
    for b in test_loader:
        b = [x.to(device) for x in b]
        logits = model(*b[:-1])
        # Hard 0/1 predictions and targets are passed to the metrics.
        preds  = (torch.sigmoid(logits)>model.thresh).int()
        target = (b[-1]>0).int()
        micro.update(preds,target); macro.update(preds,target)
        acc.update(preds,target);   per_class_f1.update(preds,target)

print(f"\n===== Test-set results =====")
print(f"Micro-F1  : {micro.compute():.3f}")
print(f"Macro-F1  : {macro.compute():.3f}")
print(f"Accuracy  : {acc.compute():.3f}")
# Column order follows the MOSEI label sequence: column 0 is the sentiment
# score (the inspection cell reads it as such), followed by the six emotions.
# NOTE(review): emotion order taken from the CMU-MultimodalSDK documentation —
# confirm against the label .csd metadata.
print(f"Per-class F1 [sentiment, happy, sad, anger, surprise, disgust, fear]:\n  {per_class_f1.compute().cpu().numpy().round(3)}")


In [None]:
%%bash
# Replace the preinstalled scientific / PyTorch stack with pinned versions.
# The uninstall is best-effort (`|| true`) so missing packages do not abort.
echo "🚀  Fast reinstall using wheels only"
pip -q uninstall -y torch torchvision torchaudio triton pytorch-lightning scipy numpy pandas gensim openai-whisper || true

pip -q install numpy==1.26.4 scipy==1.11.4 pandas==2.2.2 tqdm           # will pull wheels
# CUDA 12.1 builds come from the PyTorch wheel index rather than PyPI.
pip -q install torch==2.1.2+cu121 torchvision==0.16.2+cu121 torchaudio==2.1.2+cu121 \
               --index-url https://download.pytorch.org/whl/cu121
# NOTE(review): the triton pin here may conflict with the triton version the
# torch build above depends on — check pip's resolver warnings.
pip -q install pytorch-lightning==2.1.3 triton==2.0.0 openai-whisper==20230314 gensim==4.3.2


In [None]:
!pip install whisper
!pip install gensim
from google.colab import drive
drive.mount('/content/drive')

In [None]:
%%bash
# Install everything the feature-extraction cells import.  `set -e` aborts on
# the first failing command; the uninstall is exempted with `|| true`.
set -e
echo "🧹 uninstall any orphan whisper"
pip -q uninstall -y whisper || true

# NOTE(review): the message below says "CPU-only", but the torch packages are
# requested from the default index without a CPU-specific index URL — confirm
# which build is actually installed.
echo "📦 install CPU-only wheels (Python 3.11 compatible)…"
pip -q install --upgrade pip
pip -q install \
      numpy==1.26.4 scipy==1.11.4 pandas==2.2.2 tqdm scikit-learn \
      torch==2.2.1 torchaudio==2.2.1 torchvision==0.17.1           \
      git+https://github.com/openai/whisper.git                    \
      gensim==4.3.2 py-feat==0.5.0 opencv-python-headless==4.9.0.80

echo "✅  all deps ready"


In [None]:

from pathlib import Path
import csv, pickle, numpy as np, cv2, subprocess, tempfile
import torch, torchaudio, torchvision, whisper, gensim.downloader as api
from tqdm.auto import tqdm
from sklearn.decomposition import PCA
import torchvision.transforms as T
from google.colab import drive

# 0. Mount Google Drive
drive.mount('/content/drive', force_remount=True)

root    = Path("/content/drive/MyDrive/movie_clips")   # mp4 + labels.csv now on Drive
lab_csv = root / "labels.csv"
assert lab_csv.exists(), "labels.csv missing!"

# labels.csv: a `clip_id` column followed by one integer column per emotion.
# Everything after the first header field is treated as an emotion column.
# `newline=""` is what the csv module expects for files it parses itself.
with lab_csv.open(newline="") as f:
    rdr = csv.DictReader(f)
    emo_cols = rdr.fieldnames[1:]
    labels = {row["clip_id"]: np.array([int(row[c]) for c in emo_cols], np.float32)
              for row in rdr}


whisp = whisper.load_model("base")                      # CPU is fine
w2v   = api.load("word2vec-google-news-300")

# 74 mel bands, so the audio feature width matches the model's audio input size.
mel74 = torchaudio.transforms.MelSpectrogram(sample_rate=16_000, n_mels=74)

resnet_full = torchvision.models.resnet18(weights="DEFAULT").eval()
feature_net = torch.nn.Sequential(*list(resnet_full.children())[:-1]).eval()  # outputs (B,512,1,1)
# NOTE(review): no ImageNet mean/std normalisation is applied before the
# pretrained network — confirm this is intended.
to224 = T.Compose([T.ToPILImage(), T.Resize(224), T.ToTensor()])


# The 512 -> 35 projection is fitted on random noise, i.e. it is effectively a
# random projection rather than a data-driven PCA.  Seeding both the noise and
# the solver makes the projection — and therefore the saved features —
# identical from run to run.
_pca_rng = np.random.default_rng(0)
pca35 = PCA(35, random_state=0).fit(_pca_rng.standard_normal((400, 512)))

def text_vec(words):
    """Stack one 300-d embedding per word from the module-level ``w2v``.

    Words missing from ``w2v`` contribute a zero vector; an empty word list
    yields a single all-zero row of shape (1, 300).
    """
    if not words:
        return np.zeros((1, 300), np.float32)
    rows = []
    for word in words:
        if word in w2v:
            rows.append(w2v[word])
        else:
            rows.append(np.zeros(300, np.float32))
    return np.stack(rows)

def audio_vec(mp4_path):
    """Return a (1, 74) time-averaged mel-spectrogram for a video's audio track.

    The audio is decoded by ``ffmpeg`` into a temporary 16 kHz mono WAV file,
    loaded with torchaudio and passed through the module-level ``mel74``
    transform; the mel bands are then averaged over time.

    Raises:
        subprocess.CalledProcessError: if ffmpeg exits with an error (for
            example when the file has no decodable audio).  Without
            ``check=True`` the failure would only surface later as a
            confusing load error on an empty WAV file.
    """
    with tempfile.NamedTemporaryFile(suffix=".wav") as tmp:
        subprocess.run(
            ["ffmpeg", "-loglevel", "error", "-y", "-i", str(mp4_path),
             "-ar", "16000", "-ac", "1", tmp.name],
            check=True,
        )
        # Load while the temporary file still exists.
        sig, _ = torchaudio.load(tmp.name)
    return mel74(sig)[0].mean(-1, keepdim=True).T.numpy()        # (1,74)

def visual_seq(mp4_path):
    """Return per-frame visual features of shape (n_frames, 35) for a video.

    Roughly one frame per second is sampled (every ``int(fps)``-th frame, with
    25 assumed when the container reports no frame rate).  Each sampled frame
    is embedded with the module-level ``feature_net`` and projected to 35
    dimensions with ``pca35``.  A single all-zero row is returned when no
    frame could be read.
    """
    cap = cv2.VideoCapture(str(mp4_path))
    try:
        fps = cap.get(cv2.CAP_PROP_FPS) or 25
        step = max(int(fps), 1)
        frames, f = [], 0
        with torch.no_grad():
            while cap.isOpened():
                ok, frame = cap.read()
                if not ok:
                    break
                if f % step == 0:
                    # OpenCV delivers BGR; reverse the channel axis to get RGB.
                    x = to224(frame[:, :, ::-1]).unsqueeze(0)        # (1,3,224,224)
                    vec = feature_net(x).squeeze().numpy()          # (512,)
                    frames.append(pca35.transform(vec[None])[0])    # (35,)
                f += 1
    finally:
        # Release the capture handle even if decoding or inference fails.
        cap.release()
    return np.stack(frames) if frames else np.zeros((1, 35), np.float32)


# Output mirrors the MOSEI sequence names so the same model code can read it.
out = {k:{} for k in ["CMU_MOSEI_TimestampedWordVectors",
                      "CMU_MOSEI_COVAREP",
                      "CMU_MOSEI_VisualFacet42",
                      "CMU_MOSEI_Labels"]}

# Sorting gives a deterministic clip order and lets tqdm show a total.
for mp4 in tqdm(sorted(root.glob("*.mp4")), desc="extracting"):
    cid = mp4.stem
    if cid not in labels:
        # Checked up front so a missing label does not abort the run after the
        # expensive extraction work for this clip has already been done.
        tqdm.write(f"⚠️ no label for {cid}; clip skipped")
        continue
    # --- text ---
    seg   = whisp.transcribe(str(mp4), language="en", word_timestamps=True, verbose=False)
    # Whisper's word strings carry surrounding whitespace; strip it together
    # with the punctuation, otherwise the embedding lookup misses the word.
    words = [w["word"].lower().strip(" \t\r\n.,?!") for s in seg["segments"] for w in s["words"]]
    words = [w for w in words if w]   # drop tokens that were only punctuation
    out["CMU_MOSEI_TimestampedWordVectors"][cid] = {"features": text_vec(words)}
    # --- audio ---
    out["CMU_MOSEI_COVAREP"][cid] = {"features": audio_vec(str(mp4))}
    # --- visual ---
    out["CMU_MOSEI_VisualFacet42"][cid] = {"features": visual_seq(mp4)}
    # --- label ---
    out["CMU_MOSEI_Labels"][cid] = {"features": labels[cid]}

# ---------- save to Drive ----------
save_path = "/content/drive/MyDrive/movie_clips_simple.pkl"
with open(save_path, "wb") as f:
    pickle.dump(out, f)

print(f"🎉  Saved movie_clips_simple.pkl with {len(out['CMU_MOSEI_Labels'])} clips → {save_path}")


In [None]:


import pickle, torch, torch.nn as nn, pytorch_lightning as pl
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence
from torchmetrics.classification import MultilabelF1Score, MultilabelAccuracy


# The extraction cell writes the pickle to Google Drive, so read it from
# there; the session-local path is kept as a fallback for manual copies.
import os

_movie_pkl_candidates = ["/content/drive/MyDrive/movie_clips_simple.pkl",
                         "/content/movie_clips_simple.pkl"]
_movie_pkl_path = next((p for p in _movie_pkl_candidates if os.path.exists(p)),
                       _movie_pkl_candidates[0])

# NOTE(review): pickle.load runs arbitrary code — only load trusted files.
with open(_movie_pkl_path, "rb") as f:
    movie_data = pickle.load(f)

clip_ids = list(movie_data["CMU_MOSEI_Labels"])
print("🔢 clips:", len(clip_ids))


class MovieDS(Dataset):
    """Dataset over the module-level ``movie_data`` for the given clip ids.

    Items are ``(text, audio, visual, label)`` float32 tensors built directly
    from the stored features.
    NOTE(review): unlike the MOSEI datasets defined earlier in this notebook,
    no sanitising or z-normalisation is applied here — confirm that matches
    how the loaded checkpoint was trained.
    """

    _SEQUENCES = ("CMU_MOSEI_TimestampedWordVectors",
                  "CMU_MOSEI_COVAREP",
                  "CMU_MOSEI_VisualFacet42",
                  "CMU_MOSEI_Labels")

    def __init__(self, ids):
        self.ids = ids

    def __len__(self):
        return len(self.ids)

    def __getitem__(self, idx):
        clip = self.ids[idx]
        txt, aud, vis, lab = (
            torch.tensor(movie_data[name][clip]["features"], dtype=torch.float32)
            for name in self._SEQUENCES
        )
        return txt, aud, vis, lab


def collate(batch):
    """Pad ``(text, audio, visual, label)`` samples into batch-first tensors.

    Returns text, text lengths, audio, audio lengths, visual, visual lengths
    and the stacked labels, in that order.
    """
    columns = list(zip(*batch))
    labels = columns.pop()          # last column holds the label tensors
    result = []
    for seqs in columns:            # text, audio, visual — in sample order
        result += [pad_sequence(seqs, True),
                   torch.tensor([s.size(0) for s in seqs])]
    return (*result, torch.stack(labels))

# Single pass over all clips in stored order; padding is done by `collate`.
loader = DataLoader(MovieDS(clip_ids), batch_size=32, shuffle=False, collate_fn=collate)

class XModal(pl.LightningModule):
    """Inference-only copy of the GRU fusion model (no loss module)."""

    def __init__(self):
        super().__init__()
        self.text_rnn   = nn.GRU(300, 256, batch_first=True)
        self.audio_rnn  = nn.GRU( 74, 128, batch_first=True)
        self.visual_rnn = nn.GRU( 35, 128, batch_first=True)
        self.fc = nn.Linear(256+128+128, 7)
        # Default decision threshold; replaced in `on_load_checkpoint` when
        # the checkpoint carries a "thresh" entry.
        self.thresh = 0.30

    def _last(self, rnn, x, lengths):
        """Return the final hidden state of ``rnn`` for each padded sequence."""
        _, h_n = rnn(pack_padded_sequence(x, lengths.cpu(), batch_first=True, enforce_sorted=False))
        return h_n.squeeze(0)                   # (B, hidden)

    def forward(self, t, tlen, a, alen, v, vlen):
        """Return the 7 logits for a padded (text, audio, visual) batch."""
        h = torch.cat([self._last(self.text_rnn,  t, tlen),
                       self._last(self.audio_rnn, a, alen),
                       self._last(self.visual_rnn,v, vlen)], dim=-1)
        return self.fc(h)

    def on_load_checkpoint(self, checkpoint):
        """Pick up a stored decision threshold; a plain attribute is not part of the state dict."""
        self.thresh = float(checkpoint.get("thresh", self.thresh))

# Load the trained weights.  `strict=False` tolerates state-dict entries that
# this loss-free copy of the model does not define.
ckpt_path = "/content/drive/MyDrive/EmotionFusion/checkpoints/best-epoch=12-val_f1_micro=0.459.ckpt"
model = XModal.load_from_checkpoint(ckpt_path, strict=False).eval()
device = "cpu"                                  # stay on CPU
model.to(device)
print(f"🔑  Loaded checkpoint: {ckpt_path}")
print(f"🔎  Decision threshold: {model.thresh:.2f}")


micro = MultilabelF1Score(num_labels=7, threshold=model.thresh, average="micro").to(device)
macro = MultilabelF1Score(num_labels=7, threshold=model.thresh, average="macro").to(device)
acc   = MultilabelAccuracy(num_labels=7, threshold=model.thresh).to(device)

with torch.no_grad():
    for batch in loader:
        batch = [x.to(device) for x in batch]
        logits = model(*batch[:-1])
        # Hard 0/1 predictions and targets are passed to the metrics.
        preds  = (torch.sigmoid(logits) > model.thresh).int()
        target = (batch[-1] > 0).int()
        micro.update(preds, target)
        macro.update(preds, target)
        acc.update(preds, target)

# Report the number of clips actually evaluated instead of a fixed count.
print(f"\n=====  Results on {len(clip_ids)} movie clips  =====")
print("Micro-F1 :", micro.compute().item())
print("Macro-F1 :", macro.compute().item())
print("Accuracy :", acc.compute().item())


In [None]:
import os
import glob
import pickle
import cv2
import numpy as np
import torch
import torchaudio
import opensmile
import whisper
import gensim.downloader as api
from pathlib import Path
from tqdm import tqdm
from collections import defaultdict, Counter
from torchvision.models import resnet50, ResNet50_Weights
import torchvision.transforms as T

# 1. Configuration
DEVICE       = "cuda" if torch.cuda.is_available() else "cpu"
DRIVE_ROOT   = "/content/drive/MyDrive"
CAER_ROOT    = f"{DRIVE_ROOT}/CAER"
OUT_PKL_PATH = f"{DRIVE_ROOT}/caer_aligned.pkl"
# Folder names of the CAER emotion classes, and for each of them (by position)
# the index it is written to in the 7-slot label vector.
# NOTE(review): this mapping places 'Happy' at index 0 and 'Neutral' at index
# 6, while the MOSEI cells of this notebook treat index 0 as the sentiment
# column — confirm the intended label layout.
EMOTIONS     = ['Anger', 'Disgust', 'Fear', 'Happy', 'Neutral', 'Sad', 'Surprise']
CAER2MOSEI   = [2, 4, 5, 0, 6, 1, 3]

# 2. Load models once
print(f"⚙ Device: {DEVICE}")
whisper_model = whisper.load_model("base").to(DEVICE)
smile         = opensmile.Smile(feature_set=opensmile.FeatureSet.ComParE_2016,
                               feature_level=opensmile.FeatureLevel.LowLevelDescriptors)
resnet        = resnet50(weights=ResNet50_Weights.DEFAULT).to(DEVICE).eval()
# Replace the classification head so the network outputs pooled features.
resnet.fc     = torch.nn.Identity()
glove         = api.load("glove-wiki-gigaword-300")

# Frame preprocessing: fixed 224x224 resize, tensor conversion, then
# per-channel mean/std normalisation.
img_tf = T.Compose([
    T.ToPILImage(),
    T.Resize((224, 224)),
    T.ToTensor(),
    T.Normalize([0.485, 0.456, 0.406],
                [0.229, 0.224, 0.225]),
])

# 3. Extraction helpers, each returns (True, features) or (False, exception)
@torch.no_grad()
def extract_visual(path, every_n=10):
    """Embed every ``every_n``-th frame of the video at ``path`` with ``resnet``.

    Returns:
        ``(True, features)`` with one row per sampled frame on success, or
        ``(False, exception)`` if anything fails, including the case where no
        frame could be read.
    """
    cap = None
    try:
        cap, feats, idx = cv2.VideoCapture(path), [], 0
        while True:
            ok, frame = cap.read()
            if not ok:
                break
            if idx % every_n == 0:
                # OpenCV decodes to BGR; the transform pipeline expects RGB.
                x = img_tf(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))\
                      .unsqueeze(0).to(DEVICE)
                feats.append(resnet(x).cpu())
            idx += 1
        if not feats:
            raise RuntimeError("No frames extracted")
        return True, torch.vstack(feats)
    except Exception as e:
        return False, e
    finally:
        # Release the capture on every path, including the error path that
        # previously returned without releasing it.
        if cap is not None:
            cap.release()

def extract_audio(path):
    """Compute openSMILE features for the audio of ``path`` at 16 kHz.

    Returns ``(True, float32 tensor of the feature table)`` on success and
    ``(False, exception)`` on any failure, including an empty feature table.
    """
    target_sr = 16000
    try:
        waveform, sample_rate = torchaudio.load(path)
        if sample_rate != target_sr:
            waveform = torchaudio.functional.resample(waveform, sample_rate, target_sr)
        table = smile.process_signal(waveform.squeeze().numpy(), target_sr)
        if table.empty:
            raise RuntimeError("Empty audio features")
        features = torch.tensor(table.values, dtype=torch.float32)
        return True, features
    except Exception as err:
        return False, err

def extract_text(path):
    """Transcribe ``path`` with Whisper and return the mean GloVe vector.

    Returns ``(True, float32 vector)`` on success and ``(False, exception)``
    on any failure, including a transcript with no word found in ``glove``.
    """
    import string  # local import: only needed for the punctuation table

    try:
        res    = whisper_model.transcribe(path, language="en", fp16=(DEVICE=="cuda"))
        # Whitespace tokens keep attached punctuation ("hello,"), which would
        # make the vocabulary lookup fail and silently drop the word.
        tokens = (w.strip(string.punctuation) for w in res["text"].lower().split())
        vecs   = [glove[w] for w in tokens if w and w in glove]
        if not vecs:
            raise RuntimeError("No valid words in transcription")
        return True, np.mean(vecs, axis=0).astype(np.float32)
    except Exception as e:
        return False, e

# 4. Main processing loop
# 4. Main processing loop
out = {
    "CMU_MOSEI_TimestampedWordVectors": {},
    "CMU_MOSEI_COVAREP": {},
    "CMU_MOSEI_VisualFacet42": {},
    "CMU_MOSEI_Labels": {},
}
failures      = defaultdict(list)   # clip id -> list of "<modality>: <error>"
emotion_counts = Counter()          # emotion -> clips with all three modalities
total_clips    = 0

for split in ["train", "validation", "test"]:
    for emo in EMOTIONS:
        files = glob.glob(f"{CAER_ROOT}/{split}/{emo}/*.avi")
        print(f"📂 {split}/{emo}: {len(files)} videos")
        for vid in tqdm(files, desc=f"{split}/{emo}", ncols=80):
            total_clips += 1
            # File stems alone can repeat across split/emotion folders, in
            # which case later clips would overwrite earlier ones; qualify the
            # id with its folder so every clip keeps its own entry.
            cid = f"{split}_{emo}_{Path(vid).stem}"

            # Run each extractor
            ok_t, txt = extract_text(vid)
            ok_a, aud = extract_audio(vid)
            ok_v, vis = extract_visual(vid)

            # Record successes
            if ok_t:
                out["CMU_MOSEI_TimestampedWordVectors"][cid] = {"features": torch.tensor(txt)}
            else:
                failures[cid].append(f"text: {txt}")

            if ok_a:
                out["CMU_MOSEI_COVAREP"][cid] = {"features": aud}
            else:
                failures[cid].append(f"audio: {aud}")

            if ok_v:
                out["CMU_MOSEI_VisualFacet42"][cid] = {"features": vis}
            else:
                failures[cid].append(f"vision: {vis}")

            # Always store label (we know the emotion).  Consumers must
            # therefore not assume every labelled clip has all modalities.
            oh = np.zeros(len(EMOTIONS), np.float32)
            oh[CAER2MOSEI[EMOTIONS.index(emo)]] = 1
            out["CMU_MOSEI_Labels"][cid] = {"features": oh}

            # Count if **all three** modalities succeeded
            if ok_t and ok_a and ok_v:
                emotion_counts[emo] += 1

# 5. Report summary
print(f"\n✅ Processed {total_clips} clips.")
print("✅ Fully-successful clips per emotion:")
for emo in emotion_counts:
    print(f"  - {emo}: {emotion_counts[emo]}")

print(f"\n⚠ {len(failures)} clips had at least one failure.")
# One line per failed clip, listing every modality error, for manual inspection.
with open("failure_log.txt", "w") as fw:
    fw.writelines(f"{cid}: " + "; ".join(errs) + "\n"
                  for cid, errs in failures.items())

# 6. Save output
# An existing pickle is moved aside to "<path>.bak" before being replaced.
backup = OUT_PKL_PATH + ".bak"
_existing = Path(OUT_PKL_PATH)
if _existing.exists():
    _existing.rename(backup)
    print(f"🔄 Backed up existing pickle to {backup}")
with open(OUT_PKL_PATH, "wb") as f:
    pickle.dump(out, f)
print(f"✅ Saved new pickle → {OUT_PKL_PATH}")


In [None]:
import os
import glob
import pickle
import cv2
import numpy as np
import torch
import torchaudio
import opensmile
import whisper
import gensim.downloader as api
from pathlib import Path
from tqdm import tqdm
from collections import Counter
from torchvision.models import resnet50, ResNet50_Weights
import torchvision.transforms as T
from google.colab import drive

# 1. Mount Drive
drive.mount('/content/drive', force_remount=True)

# 2. Configuration
DEVICE       = "cuda" if torch.cuda.is_available() else "cpu"
DRIVE_ROOT   = "/content/drive/MyDrive"
CAER_ROOT    = f"{DRIVE_ROOT}/CAER"
OUT_PKL_PATH = f"{DRIVE_ROOT}/caer_aligned_full.pkl"  # final output

# Folder names of the CAER emotion classes, and for each of them (by position)
# the index it is written to in the 7-slot label vector.
# NOTE(review): 'Happy' maps to index 0 and 'Neutral' to index 6, while the
# MOSEI cells of this notebook treat index 0 as the sentiment column — confirm
# the intended label layout.
EMOTIONS   = ['Anger','Disgust','Fear','Happy','Neutral','Sad','Surprise']
CAER2MOSEI = [2,4,5,0,6,1,3]

# 3. Prepare empty output
# Keys mirror the MOSEI sequence names so downstream code can stay unchanged.
out = {
    "CMU_MOSEI_TimestampedWordVectors": {},
    "CMU_MOSEI_COVAREP": {},
    "CMU_MOSEI_VisualFacet42": {},
    "CMU_MOSEI_Labels": {},
}

# 4. Load models
print(f"⚙ Device: {DEVICE}")
whisper_model = whisper.load_model("base").to(DEVICE)
smile = opensmile.Smile(
    feature_set=opensmile.FeatureSet.ComParE_2016,
    feature_level=opensmile.FeatureLevel.LowLevelDescriptors
)
resnet = resnet50(weights=ResNet50_Weights.DEFAULT).to(DEVICE).eval()
# Replace the classification head so the network outputs pooled features.
resnet.fc = torch.nn.Identity()
glove = api.load("glove-wiki-gigaword-300")

# Frame preprocessing: fixed 224x224 resize, tensor conversion, then
# per-channel mean/std normalisation.
img_tf = T.Compose([
    T.ToPILImage(),
    T.Resize((224,224)),
    T.ToTensor(),
    T.Normalize([0.485,0.456,0.406],[0.229,0.224,0.225]),
])

# 5. Compute zero-pad placeholders
# Used in place of a modality whose extraction failed.  Each placeholder has
# the same rank as the real features so batching code can treat both alike:
#   text   -> 1-D mean word vector
#   audio  -> 2-D (frames, AUDIO_DIM); previously a 1-D vector, which cannot
#             be padded together with real 2-D audio features
#   visual -> 2-D (frames, 2048)
TEXT_DIM  = glove.vector_size
# Probe openSMILE with one second of silence to learn its feature width.
AUDIO_DIM = smile.process_signal(np.zeros(16000), 16000).values.shape[1]
ZERO_TEXT = torch.zeros(TEXT_DIM, dtype=torch.float32)
ZERO_AUD  = torch.zeros(1, AUDIO_DIM, dtype=torch.float32)  # one frame of audio features
ZERO_VIS  = torch.zeros(1, 2048, dtype=torch.float32)  # one frame of ResNet features

# 6. Extraction helpers
@torch.no_grad()
def extract_visual(path, every_n=10):
    """Embed every ``every_n``-th frame of the video at ``path`` with ``resnet``.

    Returns:
        Tensor with one feature row per sampled frame.

    Raises:
        RuntimeError: if no frame could be read from the file.
    """
    cap, feats, idx = cv2.VideoCapture(path), [], 0
    try:
        while True:
            ok, frame = cap.read()
            if not ok:
                break
            if idx % every_n == 0:
                # OpenCV decodes to BGR; the transform pipeline expects RGB.
                x = img_tf(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)).unsqueeze(0).to(DEVICE)
                feats.append(resnet(x).cpu())
            idx += 1
    finally:
        # Release the capture even when preprocessing or inference raises.
        cap.release()
    if not feats:
        raise RuntimeError("No frames extracted")
    return torch.vstack(feats)

def extract_audio(path):
    """Return openSMILE features of the audio in ``path`` as a float32 tensor.

    The waveform is resampled to 16 kHz when necessary.

    Raises:
        RuntimeError: if openSMILE produces an empty feature table.
    """
    target_sr = 16000
    waveform, sample_rate = torchaudio.load(path)
    if sample_rate != target_sr:
        waveform = torchaudio.functional.resample(waveform, sample_rate, target_sr)
    table = smile.process_signal(waveform.squeeze().numpy(), target_sr)
    if table.empty:
        raise RuntimeError("Empty audio features")
    return torch.tensor(table.values, dtype=torch.float32)

def extract_text(path):
    """Transcribe ``path`` with Whisper and return the mean GloVe vector (float32).

    Raises:
        RuntimeError: if no transcribed word is found in ``glove``.
    """
    import string  # local import: only needed for the punctuation table

    outp = whisper_model.transcribe(path, language="en", fp16=(DEVICE=="cuda"))
    # Whitespace tokens keep attached punctuation ("hello,"), which would make
    # the vocabulary lookup fail and silently drop the word.
    toks = (w.strip(string.punctuation) for w in outp["text"].lower().split())
    vecs = [glove[w] for w in toks if w and w in glove]
    if not vecs:
        raise RuntimeError("No valid words")
    return np.mean(vecs, axis=0).astype(np.float32)

# 7. Main loop: process all clips with unique IDs
# A modality that cannot be extracted falls back to its zero placeholder, so
# every clip ends up with all four entries.  Only `Exception` is caught (a
# bare `except:` would also swallow Ctrl-C / kernel interrupts), and the
# number of fallbacks per modality is reported instead of being hidden.
fallback_counts = {"text": 0, "audio": 0, "visual": 0}

for split in ["train","validation","test"]:
    for emo in EMOTIONS:
        vids = glob.glob(f"{CAER_ROOT}/{split}/{emo}/*.avi")
        print(f"📂 {split}/{emo}: {len(vids)} clips")
        for vid_path in tqdm(vids, desc=f"{split}/{emo}", ncols=80):
            stem = Path(vid_path).stem
            cid  = f"{split}_{emo}_{stem}"  # unique ID per split/emotion

            # Text features
            try:
                txt = torch.tensor(extract_text(vid_path))
            except Exception:
                txt = ZERO_TEXT
                fallback_counts["text"] += 1
            out["CMU_MOSEI_TimestampedWordVectors"][cid] = {"features": txt}

            # Audio features
            try:
                aud = extract_audio(vid_path)
            except Exception:
                aud = ZERO_AUD
                fallback_counts["audio"] += 1
            out["CMU_MOSEI_COVAREP"][cid] = {"features": aud}

            # Visual features
            try:
                vis = extract_visual(vid_path)
            except Exception:
                vis = ZERO_VIS
                fallback_counts["visual"] += 1
            out["CMU_MOSEI_VisualFacet42"][cid] = {"features": vis}

            # One-hot label
            oh = np.zeros(len(EMOTIONS), np.float32)
            oh[CAER2MOSEI[EMOTIONS.index(emo)]] = 1
            out["CMU_MOSEI_Labels"][cid] = {"features": oh}

print(f"⚠ zero-placeholder fallbacks per modality: {fallback_counts}")

# 8. Save the full-run pickle
with open(OUT_PKL_PATH, "wb") as f:
    pickle.dump(out, f)
print(f"✅ Saved full-run pickle with {len(out['CMU_MOSEI_Labels'])} clips → {OUT_PKL_PATH}")


In [None]:
import os
import pickle
import torch
from torch import nn
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence
from torch.utils.data import Dataset, DataLoader, random_split
import pytorch_lightning as pl
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from torchmetrics.classification import MultilabelF1Score, MultilabelAccuracy


class CAERDataset(Dataset):
    """Map-style dataset over a pickled feature dictionary using MOSEI key names.

    The pickle must contain the four ``CMU_MOSEI_*`` top-level keys read in
    ``__getitem__``; each maps a clip id to a dict with a ``'features'`` entry.
    """

    def __init__(self, pkl_path):
        """Load the whole pickle at ``pkl_path`` and index the label clip ids."""
        # NOTE: pickle.load can execute arbitrary code; only open trusted files.
        with open(pkl_path, 'rb') as handle:
            self.data = pickle.load(handle)
        self.ids = [clip_id for clip_id in self.data['CMU_MOSEI_Labels']]

    def __len__(self):
        """Number of clips, i.e. number of label entries."""
        return len(self.ids)

    def __getitem__(self, idx):
        """Return ``(text, audio, visual, label)`` tensors for the idx-th clip."""
        clip_id = self.ids[idx]
        streams = (
            'CMU_MOSEI_TimestampedWordVectors',
            'CMU_MOSEI_COVAREP',
            'CMU_MOSEI_VisualFacet42',
            'CMU_MOSEI_Labels',
        )
        return tuple(
            torch.as_tensor(self.data[stream][clip_id]['features'])
            for stream in streams
        )

def coll(batch):
    """Collate ``(text, audio, visual, label)`` samples into padded batch tensors.

    Text vectors are stacked and given a length-1 time axis; audio and visual
    sequences are zero-padded along time to the longest item in the batch.

    Returns:
        ``(t, tlen, a_padded, alens, v_padded, vlens, y)``.
    """
    texts, audios, visuals, labels = zip(*batch)

    # Text: one vector per clip -> (B, 1, D_text), every sequence has length 1.
    text_batch = torch.stack(texts).unsqueeze(1)
    text_lens = torch.ones(len(texts), dtype=torch.long)

    # Audio / visual: remember true lengths, then pad to (B, T_max, D).
    audio_lens = torch.tensor([seq.size(0) for seq in audios])
    visual_lens = torch.tensor([seq.size(0) for seq in visuals])
    audio_batch = pad_sequence(audios, batch_first=True)
    visual_batch = pad_sequence(visuals, batch_first=True)

    label_batch = torch.stack(labels)
    return (text_batch, text_lens,
            audio_batch, audio_lens,
            visual_batch, visual_lens,
            label_batch)


class XModal(pl.LightningModule):
    """Late-fusion classifier: one GRU per modality, concatenated into a linear head.

    Input feature sizes are 300 (text), 74 (audio) and 35 (visual); the head
    emits 7 logits trained with ``BCEWithLogitsLoss``.
    """

    def __init__(self):
        super().__init__()
        self.text_rnn   = nn.GRU(input_size=300, hidden_size=256, batch_first=True)
        self.audio_rnn  = nn.GRU(input_size=74,  hidden_size=128, batch_first=True)
        self.visual_rnn = nn.GRU(input_size=35,  hidden_size=128, batch_first=True)
        # Head input = concatenation of the three final hidden states.
        self.fc         = nn.Linear(256 + 128 + 128, 7)

        # Plain BCE (no pos_weight) for the main run.
        self.loss   = nn.BCEWithLogitsLoss()
        # Decision threshold applied to sigmoid probabilities during validation.
        self.thresh = 0.3
        self._micro = MultilabelF1Score(7, threshold=self.thresh, average='micro')
        self._acc   = MultilabelAccuracy(7, threshold=self.thresh)

    def _last(self, rnn, x, lengths):
        """Run ``rnn`` over padded ``x`` and return its final hidden state per sample."""
        # Packing makes the GRU stop at each sequence's true length, so padding
        # never contributes to the final state.
        packed = pack_padded_sequence(
            x, lengths.cpu(), batch_first=True, enforce_sorted=False
        )
        _, h_n = rnn(packed)
        return h_n.squeeze(0)

    def forward(self, t, tlen, a, alen, v, vlen):
        """Return 7 logits per sample from padded text/audio/visual batches."""
        h_text   = self._last(self.text_rnn,   t, tlen)
        h_audio  = self._last(self.audio_rnn,  a, alen)
        h_visual = self._last(self.visual_rnn, v, vlen)
        fused = torch.cat([h_text, h_audio, h_visual], dim=-1)
        return self.fc(fused)

    def training_step(self, batch, _):
        """Compute and log the BCE training loss for one batch."""
        *inputs, y = batch
        loss = self.loss(self(*inputs), y)
        self.log("train_loss", loss, prog_bar=True)
        return loss

    def validation_step(self, batch, _):
        """Threshold the predictions and accumulate micro-F1 / accuracy."""
        *inputs, y = batch
        probs = torch.sigmoid(self(*inputs))
        preds = (probs > self.thresh).int()
        target = y.int()
        self._micro.update(preds, target)
        self._acc.update(preds, target)

    def on_validation_epoch_end(self):
        """Log the epoch-level metrics, then clear their accumulated state."""
        self.log("val_f1_micro", self._micro.compute(), prog_bar=True)
        self.log("val_acc",      self._acc.compute(),    prog_bar=True)
        self._micro.reset()
        self._acc.reset()

    def configure_optimizers(self):
        """Adam (AMSGrad variant) over all parameters at lr=1e-4."""
        return torch.optim.Adam(self.parameters(), lr=1e-4, amsgrad=True)


class XModalProj(pl.LightningModule):
    """Fine-tuning wrapper: projects CAER features into the backbone's input sizes.

    The GRUs and the linear head are taken from an ``XModal`` checkpoint; two
    new linear layers map 65-d audio to 74-d and 2048-d visual to 35-d before
    the corresponding GRU.
    """

    def __init__(self, ckpt_path):
        super().__init__()
        # Reuse the pretrained modules directly (strict=False tolerates
        # checkpoint/model key differences).
        pretrained = XModal.load_from_checkpoint(ckpt_path, strict=False)
        self.text_rnn   = pretrained.text_rnn
        self.audio_rnn  = pretrained.audio_rnn
        self.visual_rnn = pretrained.visual_rnn
        self.fc         = pretrained.fc

        # Projection layers (CAER dims -> backbone dims).
        self.a_proj = nn.Linear(65,   74,   bias=False)
        self.v_proj = nn.Linear(2048, 35,   bias=True)
        # Audio starts as a (rectangular) identity map; the visual weight
        # starts at zero, so initially only its bias reaches the visual GRU.
        nn.init.eye_(self.a_proj.weight)
        nn.init.zeros_(self.v_proj.weight)

        self.dropout = nn.Dropout(0.3)
        self.loss    = nn.BCEWithLogitsLoss()
        self.thresh  = pretrained.thresh
        self.f1      = MultilabelF1Score(7, threshold=self.thresh, average='micro')

    def _last(self, rnn, x, lengths):
        """Return the final hidden state of ``rnn`` for each padded sequence in ``x``."""
        packed = pack_padded_sequence(
            x, lengths.cpu(), batch_first=True, enforce_sorted=False
        )
        _, h_n = rnn(packed)
        return h_n.squeeze(0)

    def forward(self, t, tlen, a, alen, v, vlen):
        """Project audio/visual inputs, encode all three streams, return 7 logits."""
        audio_in  = self.a_proj(a)
        visual_in = self.v_proj(v)
        h_text   = self._last(self.text_rnn,   t,         tlen)
        h_audio  = self._last(self.audio_rnn,  audio_in,  alen)
        h_visual = self._last(self.visual_rnn, visual_in, vlen)
        fused = torch.cat([h_text, h_audio, h_visual], dim=-1)
        return self.fc(self.dropout(fused))

    def training_step(self, batch, _):
        """Compute and log the BCE loss for one training batch."""
        *inputs, y = batch
        loss = self.loss(self(*inputs), y)
        self.log("train_loss", loss)
        return loss

    def validation_step(self, batch, _):
        """Accumulate micro-F1 on thresholded predictions."""
        *inputs, y = batch
        probs = torch.sigmoid(self(*inputs))
        self.f1.update((probs > self.thresh).int(), y.int())

    def on_validation_epoch_end(self):
        """Log the epoch's micro-F1 as ``val_f1`` and reset the metric."""
        self.log("val_f1", self.f1.compute(), prog_bar=True)
        self.f1.reset()

    def configure_optimizers(self):
        """Adam with two groups: lr=1e-5 for the GRUs, lr=1e-4 for projections and head."""
        slow_group = [
            *self.text_rnn.parameters(),
            *self.audio_rnn.parameters(),
            *self.visual_rnn.parameters(),
        ]
        fast_group = [
            *self.a_proj.parameters(),
            *self.v_proj.parameters(),
            *self.fc.parameters(),
        ]
        param_groups = [
            {'params': slow_group, 'lr': 1e-5},
            {'params': fast_group, 'lr': 1e-4},
        ]
        return torch.optim.Adam(param_groups, amsgrad=True)


if __name__ == "__main__":
    # Fine-tune the MOSEI-pretrained backbone on the CAER feature pickle and
    # keep the checkpoint with the best validation micro-F1.
    DRIVE_ROOT   = "/content/drive/MyDrive"
    CAER_PKL     = f"{DRIVE_ROOT}/caer_aligned_full.pkl"
    MOSEI_CKPT   = f"{DRIVE_ROOT}/EmotionFusion/checkpoints/best-epoch=12-val_f1_micro=0.459.ckpt"
    CKPT_DIR     = f"{DRIVE_ROOT}/CAER_finetune_checkpoints"
    SPLIT_SEED   = 42

    os.makedirs(CKPT_DIR, exist_ok=True)

    ds      = CAERDataset(CAER_PKL)
    N       = len(ds)
    train_s = int(0.7 * N)
    val_s   = int(0.15 * N)
    test_s  = N - train_s - val_s   # remainder, so the three sizes sum to N
    # Seeded generator: the 70/15/15 partition is reproducible across kernel
    # restarts.  Without it, every call draws a different partition, so a
    # later evaluation could not recover the held-out test subset; any code
    # that re-creates this split needs the same seed and sizes.
    train_ds, val_ds, _ = random_split(
        ds,
        [train_s, val_s, test_s],
        generator=torch.Generator().manual_seed(SPLIT_SEED),
    )

    train_loader = DataLoader(train_ds, batch_size=32, shuffle=True,  collate_fn=coll, num_workers=4)
    val_loader   = DataLoader(val_ds,   batch_size=32, shuffle=False, collate_fn=coll, num_workers=4)

    # Keep only the single best checkpoint by validation F1.
    ckpt_cb = ModelCheckpoint(
        dirpath=CKPT_DIR,
        filename="best-caer-{epoch:02d}-{val_f1:.3f}",
        monitor="val_f1",
        mode="max",
        save_top_k=1
    )
    # Stop once val_f1 has not improved for 5 consecutive validation checks.
    stop_cb = EarlyStopping(monitor="val_f1", mode="max", patience=5, verbose=True)

    trainer = Trainer(
        max_epochs=30,
        accelerator="auto",
        callbacks=[ckpt_cb, stop_cb]
    )

    model_ft = XModalProj(MOSEI_CKPT)
    print("⚙️ Starting fine-tuning on CAER...")
    trainer.fit(model_ft, train_loader, val_loader)
    print("✅ Done. Best checkpoint →", ckpt_cb.best_model_path)


In [None]:
import pickle
import torch
from torch import nn
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence
from torch.utils.data import Dataset, DataLoader, random_split
import pytorch_lightning as pl
from torchmetrics.classification import (
    MultilabelF1Score,
    MultilabelPrecision,
    MultilabelRecall,
    MultilabelAccuracy
)

# ---------------------- CAER Dataset ----------------------
class CAERDataset(Dataset):
    """Dataset view over a pickled dict keyed by the four ``CMU_MOSEI_*`` names.

    Every top-level key maps clip id -> ``{'features': ...}``; the clip ids are
    taken from the labels entry.
    """

    # Order of the streams returned by __getitem__: text, audio, visual, label.
    _STREAMS = (
        'CMU_MOSEI_TimestampedWordVectors',
        'CMU_MOSEI_COVAREP',
        'CMU_MOSEI_VisualFacet42',
        'CMU_MOSEI_Labels',
    )

    def __init__(self, pkl_path):
        """Read the pickle at ``pkl_path`` fully into memory."""
        # NOTE: pickle.load can execute arbitrary code; only open trusted files.
        with open(pkl_path, 'rb') as fh:
            loaded = pickle.load(fh)
        self.data = loaded
        self.ids = list(loaded['CMU_MOSEI_Labels'])

    def __len__(self):
        """Number of clips."""
        return len(self.ids)

    def __getitem__(self, idx):
        """Return the ``(text, audio, visual, label)`` tensors of one clip."""
        key = self.ids[idx]
        tensors = []
        for stream in self._STREAMS:
            tensors.append(torch.as_tensor(self.data[stream][key]['features']))
        return tuple(tensors)

def coll(batch):
    """Build one padded batch from ``(text, audio, visual, label)`` samples.

    Returns:
        ``(t, tlen, a_padded, alens, v_padded, vlens, y)`` where text has a
        length-1 time axis and audio/visual are zero-padded to the longest
        sequence in the batch.
    """
    columns = list(zip(*batch))
    text_items, audio_items, visual_items, label_items = columns

    def pad_with_lengths(seqs):
        # True lengths are needed later to pack the padded batch.
        lengths = torch.tensor([s.shape[0] for s in seqs])
        return pad_sequence(seqs, batch_first=True), lengths

    t = torch.stack(text_items).unsqueeze(1)              # (B, 1, D_text)
    tlen = torch.ones(len(text_items), dtype=torch.long)  # every text length is 1
    a_padded, alens = pad_with_lengths(audio_items)       # (B, T_a, D_audio)
    v_padded, vlens = pad_with_lengths(visual_items)      # (B, T_v, D_visual)
    y = torch.stack(label_items)                          # (B, n_labels)
    return t, tlen, a_padded, alens, v_padded, vlens, y

# ---------------------- Fine-tunable Model Definition ----------------------
class XModalProj(pl.LightningModule):
    """Inference-side definition of the fine-tuned model.

    The module names mirror the fine-tuning model so that a checkpoint's state
    dict can populate them; nothing here loads weights by itself.
    """

    def __init__(self):
        super().__init__()
        # Backbone slots (to be filled from a checkpoint).
        self.text_rnn   = nn.GRU(input_size=300, hidden_size=256, batch_first=True)
        self.audio_rnn  = nn.GRU(input_size=74,  hidden_size=128, batch_first=True)
        self.visual_rnn = nn.GRU(input_size=35,  hidden_size=128, batch_first=True)
        self.fc         = nn.Linear(256 + 128 + 128, 7)

        # Projections from CAER feature sizes (65 / 2048) to backbone sizes (74 / 35).
        self.a_proj = nn.Linear(65,   74, bias=False)
        self.v_proj = nn.Linear(2048, 35, bias=True)
        nn.init.eye_(self.a_proj.weight)
        nn.init.zeros_(self.v_proj.weight)

        self.dropout = nn.Dropout(0.3)
        self.loss    = nn.BCEWithLogitsLoss()
        self.thresh  = 0.3

    def forward(self, t, tlen, a, alen, v, vlen):
        """Return 7 logits per sample from padded text/audio/visual batches."""
        # (rnn, input, lengths) per modality; audio and visual are projected first.
        streams = (
            (self.text_rnn,   t,              tlen),
            (self.audio_rnn,  self.a_proj(a), alen),
            (self.visual_rnn, self.v_proj(v), vlen),
        )
        finals = []
        for rnn, seq, lengths in streams:
            packed = pack_padded_sequence(
                seq, lengths.cpu(), batch_first=True, enforce_sorted=False
            )
            _, h_n = rnn(packed)          # final hidden state at each true length
            finals.append(h_n.squeeze(0))
        fused = self.dropout(torch.cat(finals, dim=-1))
        return self.fc(fused)

# ---------------------- Evaluation Script ----------------------
if __name__ == "__main__":
    # Per-class evaluation of the fine-tuned checkpoint on a held-out CAER subset.
    # Paths
    DRIVE_ROOT     = "/content/drive/MyDrive"
    PICKLE_PATH    = f"{DRIVE_ROOT}/caer_aligned_full.pkl"
    BEST_CKPT_PATH = f"{DRIVE_ROOT}/CAER_finetune_checkpoints/best-caer-epoch=25-val_f1=0.391.ckpt"
    SPLIT_SEED     = 42

    # Fall back to CPU instead of crashing on runtimes without a GPU.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Load dataset and split
    ds = CAERDataset(PICKLE_PATH)
    N  = len(ds)
    train_sz = int(0.7 * N)
    val_sz   = int(0.15 * N)
    test_sz  = N - train_sz - val_sz
    # random_split draws a fresh permutation on every call.  Unseeded, this
    # "test" subset is an arbitrary 15% that can overlap the clips used for
    # training, which inflates the scores.  The seed makes the partition
    # deterministic.
    # NOTE(review): it only equals the training-time hold-out if the training
    # run split with the same seed and sizes — confirm before reporting numbers.
    _, _, test_ds = random_split(
        ds,
        [train_sz, val_sz, test_sz],
        generator=torch.Generator().manual_seed(SPLIT_SEED),
    )
    test_loader = DataLoader(test_ds, batch_size=32, shuffle=False, collate_fn=coll)

    # Load model from checkpoint (fills text_rnn, audio_rnn, visual_rnn, fc)
    model = XModalProj.load_from_checkpoint(BEST_CKPT_PATH, strict=False)
    model.eval().to(device)

    # Metrics (average='none' -> one value per label)
    threshold = 0.3
    f1    = MultilabelF1Score(num_labels=7, threshold=threshold, average='none').to(device)
    prec  = MultilabelPrecision(num_labels=7, threshold=threshold, average='none').to(device)
    rec   = MultilabelRecall(num_labels=7, threshold=threshold, average='none').to(device)
    acc   = MultilabelAccuracy(num_labels=7, threshold=threshold, average='none').to(device)

    # Run evaluation
    print("🔍 Evaluating on CAER test set...")
    with torch.no_grad():
        for batch in test_loader:
            t, tlen, a, alen, v, vlen, y = [x.to(device) for x in batch]
            logits = model(t, tlen, a, alen, v, vlen)
            preds  = (torch.sigmoid(logits) > threshold).int()
            y_bin  = y.int()   # already on `device`
            f1.update(preds, y_bin)
            prec.update(preds, y_bin)
            rec.update(preds, y_bin)
            acc.update(preds, y_bin)

    # Report results
    # NOTE(review): assumes label index i corresponds to class_names[i] — confirm
    # against the mapping used when the label vectors were built.
    class_names = ["happy","sad","angry","surprise","disgust","fear","neutral"]
    import pandas as pd
    df = pd.DataFrame({
        "emotion":   class_names,
        "F1":        f1.compute().cpu().numpy(),
        "Precision": prec.compute().cpu().numpy(),
        "Recall":    rec.compute().cpu().numpy(),
        "Accuracy":  acc.compute().cpu().numpy()
    })
    print(df.to_string(index=False))


In [None]:
import pickle
import torch
import torch.nn as nn
import pytorch_lightning as pl
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence
from torchmetrics.classification import MultilabelF1Score, MultilabelAccuracy

# 1. Read the movie-clip feature pickle and collect its clip ids
MOVIE_PKL = "/content/drive/MyDrive/movie_clips_simple.pkl"
with open(MOVIE_PKL, "rb") as pkl_file:
    movie_data = pickle.load(pkl_file)

# Clip ids are the keys of the labels entry.
clip_ids = [cid for cid in movie_data["CMU_MOSEI_Labels"]]
print(f"🔢 Loaded {len(clip_ids)} movie clips")

# 2. Dataset & collate
class MovieDS(Dataset):
    """Dataset over the module-level ``movie_data`` dict, restricted to ``ids``."""

    # Order of the returned streams: text, audio, visual, label.
    _STREAMS = (
        "CMU_MOSEI_TimestampedWordVectors",
        "CMU_MOSEI_COVAREP",
        "CMU_MOSEI_VisualFacet42",
        "CMU_MOSEI_Labels",
    )

    def __init__(self, ids):
        self.ids = ids

    def __len__(self):
        return len(self.ids)

    def __getitem__(self, idx):
        """Return ``(text, audio, visual, label)`` as new float32 tensors."""
        clip = self.ids[idx]
        return tuple(
            torch.tensor(movie_data[stream][clip]["features"], dtype=torch.float32)
            for stream in self._STREAMS
        )

def collate(batch):
    """Zero-pad each modality along time and return the batch with true lengths.

    Returns:
        ``(t_padded, tlen, a_padded, alen, v_padded, vlen, y)``.
    """
    texts, audios, visuals, labels = zip(*batch)

    packed_out = []
    for seqs in (texts, audios, visuals):
        lengths = torch.tensor([seq.size(0) for seq in seqs])
        packed_out.append(pad_sequence(seqs, batch_first=True))
        packed_out.append(lengths)
    packed_out.append(torch.stack(labels))
    return tuple(packed_out)

# Evaluation-only loader: every clip once, in stored order.
loader = DataLoader(
    dataset=MovieDS(clip_ids),
    batch_size=16,
    shuffle=False,
    collate_fn=collate,
)

# 3. Model definition (must match training)
class XModal(pl.LightningModule):
    """Inference-only copy of the three-GRU fusion model.

    The module names and sizes must match the checkpoint being loaded.
    """

    def __init__(self):
        super().__init__()
        self.text_rnn   = nn.GRU(input_size=300, hidden_size=256, batch_first=True)
        self.audio_rnn  = nn.GRU(input_size=74,  hidden_size=128, batch_first=True)
        self.visual_rnn = nn.GRU(input_size=35,  hidden_size=128, batch_first=True)
        self.fc         = nn.Linear(256+128+128, 7)
        # Probability threshold used to binarise predictions.
        self.thresh     = 0.30

    def _last(self, rnn, x, lengths):
        """Final hidden state of ``rnn`` for each padded sequence in ``x``."""
        packed = pack_padded_sequence(
            x, lengths.cpu(), batch_first=True, enforce_sorted=False
        )
        final_state = rnn(packed)[1]
        return final_state.squeeze(0)

    def forward(self, t, tlen, a, alen, v, vlen):
        """Return 7 logits per sample."""
        h_text   = self._last(self.text_rnn,   t, tlen)
        h_audio  = self._last(self.audio_rnn,  a, alen)
        h_visual = self._last(self.visual_rnn, v, vlen)
        return self.fc(torch.cat([h_text, h_audio, h_visual], dim=-1))

# 4. Evaluation function
def eval_model(ckpt_path, loader, device="cuda"):
    """Score a checkpoint on ``loader`` and return ``(macro_f1, accuracy)``.

    Args:
        ckpt_path: Lightning checkpoint loaded into ``XModal`` with
            ``strict=False``.
        loader: Yields ``(t, tlen, a, alen, v, vlen, y)`` batches.
        device: Device for the model, the metrics and the batches.

    Returns:
        Two Python floats: macro-averaged multilabel F1 and multilabel accuracy.
    """
    model = XModal.load_from_checkpoint(ckpt_path, strict=False)
    model = model.eval().to(device)
    thresh = model.thresh

    metric_args = dict(num_labels=7, threshold=thresh)
    f1_metric = MultilabelF1Score(average="macro", **metric_args).to(device)
    acc_metric = MultilabelAccuracy(**metric_args).to(device)

    with torch.no_grad():
        for batch in loader:
            *inputs, labels = (item.to(device) for item in batch)
            probs = torch.sigmoid(model(*inputs))
            preds = (probs > thresh).int()
            # Any positive label value counts as "present".
            target = (labels > 0).int()
            for metric in (f1_metric, acc_metric):
                metric.update(preds, target)

    macro_f1 = f1_metric.compute().item()
    accuracy = acc_metric.compute().item()
    return macro_f1, accuracy

# 5. Checkpoints to compare: MOSEI-only (zero-shot) vs. CAER-fine-tuned
zero_ckpt = "/content/drive/MyDrive/EmotionFusion/checkpoints/best-epoch=12-val_f1_micro=0.459.ckpt"
caer_ckpt = "/content/drive/MyDrive/CAER_finetune_checkpoints/best-caer-epoch=25-val_f1=0.391.ckpt"

# Use the GPU when one is present, otherwise run on CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

print("🔍 Evaluating zero‐shot MOSEI model...")
f1_zero, acc_zero = eval_model(ckpt_path=zero_ckpt, loader=loader, device=device)
print(f"Zero-shot MOSEI → Movie clips  | Macro-F1: {f1_zero:.4f} | Accuracy: {acc_zero:.4f}")

print("\n🔍 Evaluating CAER-fine-tuned model...")
f1_caer, acc_caer = eval_model(ckpt_path=caer_ckpt, loader=loader, device=device)
print(f"CAER-fine-tuned → Movie clips | Macro-F1: {f1_caer:.4f} | Accuracy: {acc_caer:.4f}")


In [None]:
# 1) Install Git LFS from apt (quietly) and enable its hooks for this user.
#    Presumably the repository's media files are LFS-tracked — verify on the repo page.
!apt-get update -qq && apt-get install -qq git-lfs
!git lfs install

# 2) Clone the CREMA-D repo (with all FLV video files) into /content/CREMA-D.
#    /content is local runtime storage, so this must be re-run on a new runtime.
!git clone https://github.com/CheyneyComputerScience/CREMA-D.git /content/CREMA-D

# 3) Sanity check: list the first 10 entries of the VideoFlash folder
#    to confirm the FLV files are present.
!ls /content/CREMA-D/VideoFlash | head -n 10


In [None]:
import os, glob, subprocess


# Re-encode every CREMA-D FLV to H.264/AAC MP4 in a Drive folder.
out_dir = "/content/drive/MyDrive/CREMA-D_MP4"
os.makedirs(out_dir, exist_ok=True)

flvs = glob.glob("/content/CREMA-D/VideoFlash/*.flv")

# Encoder settings shared by every conversion.
video_opts = ["-c:v", "libx264", "-preset", "fast", "-crf", "23"]
audio_opts = ["-c:a", "aac", "-b:a", "128k"]

for f in flvs:
    # Same base name as the source, with the extension swapped for .mp4.
    name = os.path.basename(f).rsplit(".", 1)[0]
    mp4 = os.path.join(out_dir, f"{name}.mp4")
    print("Converting", f, "→", mp4)
    # -y overwrites an existing output; check=True aborts on the first ffmpeg failure.
    command = ["ffmpeg", "-y", "-i", f] + video_opts + audio_opts + [mp4]
    subprocess.run(command, check=True)



In [None]:


from pathlib import Path
import pickle, numpy as np, cv2, subprocess, tempfile
import torch, torchaudio, torchvision
from tqdm.auto import tqdm
from sklearn.decomposition import PCA
import torchvision.transforms as T
from google.colab import drive


drive.mount('/content/drive', force_remount=True)


video_root = Path("/content/drive/MyDrive/CREMA-D_MP4")
assert video_root.exists(), "Make sure CREMA-D_MP4 exists!"

# Audio front-end: 74 mel bands, so the pooled vector matches the 74-d audio input.
mel74 = torchaudio.transforms.MelSpectrogram(sample_rate=16_000, n_mels=74)
# Visual front-end: ResNet-18 without its final classification layer.
resnet_full = torchvision.models.resnet18(weights="DEFAULT").eval()
feature_net = torch.nn.Sequential(*list(resnet_full.children())[:-1]).eval()
to224 = T.Compose([T.ToPILImage(), T.Resize(224), T.ToTensor()])


# 512 -> 35 reduction for the ResNet features.
# NOTE(review): the PCA is fitted on Gaussian noise, not on real ResNet
# features, so it acts as an arbitrary linear projection rather than a
# data-driven one.  Seeding both the noise and the solver at least makes the
# projection identical on every run; unseeded, features extracted in different
# kernel sessions were not comparable with each other.
PCA_SEED = 0
pca35 = PCA(35, random_state=PCA_SEED).fit(
    np.random.default_rng(PCA_SEED).standard_normal((400, 512))
)


def extract_covarep_from_mp4(mp4_path):
    """Return a time-averaged 74-band mel vector for a clip's audio track.

    The audio is decoded by ffmpeg to a temporary mono 16 kHz WAV, converted
    with ``mel74`` and averaged over time.

    Returns:
        float32 NumPy array of shape (1, 74).

    Raises:
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
    """
    ffmpeg_cmd = ["ffmpeg", "-y", "-i", str(mp4_path)]
    ffmpeg_cmd += ["-ar", "16000", "-ac", "1", "-loglevel", "error"]

    # The temp file is deleted when the block exits, so load the audio inside it.
    with tempfile.NamedTemporaryFile(suffix=".wav") as wav_file:
        subprocess.run(ffmpeg_cmd + [wav_file.name], check=True)
        waveform, _sample_rate = torchaudio.load(wav_file.name)

    mel = mel74(waveform)[0]                 # first channel: (74, frames)
    pooled = mel.mean(-1, keepdim=True)      # average over frames: (74, 1)
    return pooled.T.numpy().astype(np.float32)

def extract_visualfacet42_from_mp4(mp4_path):
    """Return one 35-d visual feature vector per sampled frame of a video.

    One frame is kept every ``int(fps)`` frames (roughly one per second); each
    kept frame goes through ``to224`` -> ``feature_net`` -> ``pca35``.

    Returns:
        float32 NumPy array of shape (n_sampled_frames, 35), or a single
        all-zero row of shape (1, 35) when no frame could be read.
    """
    cap = cv2.VideoCapture(str(mp4_path))
    frames = []
    try:
        # A reported FPS of 0 (unknown) falls back to 25.
        fps = cap.get(cv2.CAP_PROP_FPS) or 25
        step = max(int(fps), 1)
        f = 0
        with torch.no_grad():
            while True:
                ok, frame = cap.read()
                if not ok:
                    break
                if f % step == 0:
                    # Reverse the channel axis (OpenCV decodes BGR -> RGB).
                    # Resize(224) scales the shorter side to 224 and keeps the
                    # aspect ratio, so x is (1, 3, H, W), not always 224x224.
                    x   = to224(frame[:, :, ::-1]).unsqueeze(0)
                    vec = feature_net(x).squeeze().numpy()          # (512,)
                    frames.append(pca35.transform(vec[None])[0])    # (35,)
                f += 1
    finally:
        # Always free the decoder, even if feature extraction raises.
        cap.release()
    return (np.stack(frames, axis=0) if frames else np.zeros((1,35),np.float32)).astype(np.float32)


# Output container: one clip-id -> {"features": ...} mapping per stream.
out = {
    stream: {}
    for stream in (
        "CMU_MOSEI_TimestampedWordVectors",
        "CMU_MOSEI_COVAREP",
        "CMU_MOSEI_VisualFacet42",
        "CMU_MOSEI_Labels",
    )
}

# Emotion code (third "_"-separated field of the file name) -> label index.
# NOTE(review): this order (ANG, DIS, HAP, SAD, FEA, NEU) differs from the
# `class_names` order printed by the CAER evaluation cell (happy, sad, angry,
# surprise, disgust, fear, neutral).  Confirm which index order the
# checkpoints were trained with before trusting per-class results.
code2idx = {"ANG":0,"DIS":1,"HAP":2,"SAD":3,"FEA":4,"NEU":5}

for idx, mp4_fp in enumerate(tqdm(sorted(video_root.glob("*.mp4")))):
    base = mp4_fp.stem   # e.g. "1001_DFA_ANG_XX"
    key  = "{:05d}_{}".format(idx, base)

    # No transcript is extracted: text is a single all-zero 300-d step.
    text_entry = {"features": np.zeros((1,300), np.float32)}
    out["CMU_MOSEI_TimestampedWordVectors"][key] = text_entry

    cov = extract_covarep_from_mp4(mp4_fp)
    out["CMU_MOSEI_COVAREP"][key] = {"features": cov}

    vis = extract_visualfacet42_from_mp4(mp4_fp)
    out["CMU_MOSEI_VisualFacet42"][key] = {"features": vis}

    # One-hot 7-d label; stays all-zero when the name has no recognised code.
    parts = base.split("_")
    emo_code = parts[2] if len(parts) > 2 else None
    y7 = np.zeros(7, np.float32)
    label_slot = code2idx.get(emo_code)
    if label_slot is not None:
        y7[label_slot] = 1.0
    out["CMU_MOSEI_Labels"][key] = {"features": y7}


save_path = "/content/drive/MyDrive/crema_d_features.pkl"
with open(save_path, "wb") as f:
    pickle.dump(out, f)

print(f"🎉 Saved {len(out['CMU_MOSEI_Labels'])} CREMA-D feature entries → {save_path}")


In [None]:


import pickle
import torch
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence
from torch import nn
from torchmetrics.classification import MultilabelF1Score, MultilabelAccuracy

# 1️⃣ Read the CREMA-D feature pickle and list its utterance ids
CREMA_PKL = "/content/drive/MyDrive/crema_d_features.pkl"
with open(CREMA_PKL, "rb") as pkl_file:
    data = pickle.load(pkl_file)

# Utterance ids are the keys of the labels entry.
uids = [uid for uid in data["CMU_MOSEI_Labels"]]

# 2️⃣ Dataset & collate
class CremaDataset(Dataset):
    """Dataset over a feature dict keyed by the four ``CMU_MOSEI_*`` names.

    Args:
        uids: Utterance ids to expose, in iteration order.
        data: Dict mapping each stream name to ``{uid: {"features": ndarray}}``.
    """

    def __init__(self, uids, data):
        self.uids = uids
        self.data = data

    def __len__(self):
        return len(self.uids)

    def __getitem__(self, idx):
        """Return ``(text, audio, visual, label)`` tensors sharing memory with the arrays."""
        uid = self.uids[idx]

        def as_tensor(stream):
            # from_numpy keeps the array's dtype and does not copy.
            return torch.from_numpy(self.data[stream][uid]["features"])

        return (
            as_tensor("CMU_MOSEI_TimestampedWordVectors"),
            as_tensor("CMU_MOSEI_COVAREP"),
            as_tensor("CMU_MOSEI_VisualFacet42"),
            as_tensor("CMU_MOSEI_Labels"),
        )

def coll(batch):
    """Zero-pad text, audio and visual sequences and stack the labels.

    Returns:
        ``(t_p, tlen, a_p, alen, v_p, vlen, y)``; each padded tensor is
        (B, T_max, D) for its modality and each ``*len`` holds the true
        sequence lengths.
    """
    texts, audios, visuals, labels = zip(*batch)

    def lengths_of(seqs):
        return torch.tensor([seq.size(0) for seq in seqs])

    def padded(seqs):
        return pad_sequence(seqs, batch_first=True)

    return (
        padded(texts),   lengths_of(texts),     # (B, T_t, D_text)
        padded(audios),  lengths_of(audios),    # (B, T_a, D_audio)
        padded(visuals), lengths_of(visuals),   # (B, T_v, D_visual)
        torch.stack(labels),                    # (B, n_labels)
    )

# Evaluation loader over all CREMA-D utterances (default order, 2 worker processes).
dataset = CremaDataset(uids=uids, data=data)
loader = DataLoader(
    dataset,
    batch_size=16,
    collate_fn=coll,
    num_workers=2,
)

# 3️⃣ Define the core model class
class XModal(nn.Module):
    """Plain ``nn.Module`` copy of the three-GRU fusion model, for loading a state dict."""

    def __init__(self):
        super().__init__()
        self.text_rnn   = nn.GRU(input_size=300, hidden_size=256, batch_first=True)
        self.audio_rnn  = nn.GRU(input_size=74,  hidden_size=128, batch_first=True)
        self.visual_rnn = nn.GRU(input_size=35,  hidden_size=128, batch_first=True)
        self.fc         = nn.Linear(256+128+128, 7)
        # Probability threshold used by callers to binarise predictions.
        self.thresh     = 0.3

    def forward(self, t, tlen, a, alen, v, vlen):
        """Return 7 logits per sample from padded batches and their true lengths."""
        streams = (
            (self.text_rnn,   t, tlen),
            (self.audio_rnn,  a, alen),
            (self.visual_rnn, v, vlen),
        )
        finals = []
        for rnn, seq, lengths in streams:
            # Packing stops each GRU at the sequence's true length.
            packed = pack_padded_sequence(
                seq, lengths.cpu(), batch_first=True, enforce_sorted=False
            )
            _, h_n = rnn(packed)
            finals.append(h_n.squeeze(0))
        return self.fc(torch.cat(finals, dim=-1))

# 4️⃣ Evaluation function
def evaluate_model(ckpt_path, loader, device="cuda"):
    """Load a checkpoint into ``XModal`` and score it on ``loader``.

    Args:
        ckpt_path: Checkpoint file whose ``"state_dict"`` entry holds the weights.
        loader: Yields ``(t, tlen, a, alen, v, vlen, y)`` batches.
        device: Device for the model, the metrics and the batches.

    Returns:
        ``(macro_f1, accuracy)`` as Python floats.

    Raises:
        RuntimeError: If the checkpoint lacks any weight that ``XModal`` defines.
    """
    # load model
    model = XModal().to(device)
    state = torch.load(ckpt_path, map_location=device)
    # strict=False is needed to ignore extra checkpoint entries, but it also
    # hides *missing* ones.  A missing weight would leave that layer randomly
    # initialised and the reported scores meaningless, so fail loudly instead.
    load_result = model.load_state_dict(state["state_dict"], strict=False)
    if load_result.missing_keys:
        raise RuntimeError(
            f"Checkpoint {ckpt_path} is missing model weights: {load_result.missing_keys}"
        )
    model.eval()

    f1_metric = MultilabelF1Score(num_labels=7, threshold=model.thresh, average="macro").to(device)
    acc_metric = MultilabelAccuracy    (num_labels=7, threshold=model.thresh).to(device)

    with torch.no_grad():
        for batch in loader:
            t, tlen, a, alen, v, vlen, y = [b.to(device) for b in batch]
            logits = model(t, tlen, a, alen, v, vlen)
            preds  = (torch.sigmoid(logits) > model.thresh).int()
            target = y.int()
            f1_metric.update(preds, target)
            acc_metric.update(preds, target)

    f1  = f1_metric.compute().item()
    acc = acc_metric.compute().item()
    return f1, acc

# 5️⃣ Checkpoints to compare
MOSEI_CKPT = "/content/drive/MyDrive/EmotionFusion/checkpoints/best-epoch=12-val_f1_micro=0.459.ckpt"
CAER_CKPT  = "/content/drive/MyDrive/CAER_finetune_checkpoints/best-caer-epoch=25-val_f1=0.391.ckpt"

# 6️⃣ Score both checkpoints on CREMA-D (GPU when available, otherwise CPU)
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

f1_zero, acc_zero = evaluate_model(ckpt_path=MOSEI_CKPT, loader=loader, device=device)
print(f"Zero-shot MOSEI → CREMA-D | Macro-F1: {f1_zero:.4f} | Accuracy: {acc_zero:.4f}")

f1_caer, acc_caer = evaluate_model(ckpt_path=CAER_CKPT, loader=loader, device=device)
print(f"CAER-fine-tuned → CREMA-D | Macro-F1: {f1_caer:.4f} | Accuracy: {acc_caer:.4f}")


In [None]:
import pickle, torch, numpy as np
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence
from sklearn.metrics import f1_score
import matplotlib.pyplot as plt

# 1) Read the movie-clip feature pickle and list its utterance ids
MOVIE_PKL = "/content/drive/MyDrive/movie_clips_simple.pkl"
with open(MOVIE_PKL, "rb") as pkl_file:
    data = pickle.load(pkl_file)

# Utterance ids are the keys of the labels entry.
uids = [uid for uid in data["CMU_MOSEI_Labels"]]

# 2) Dataset + collate
class MovieDS(Dataset):
    """Dataset over the module-level ``data`` dict, restricted to the ids in ``u``."""

    # Order of the returned streams: text, audio, visual, label.
    _STREAMS = (
        "CMU_MOSEI_TimestampedWordVectors",
        "CMU_MOSEI_COVAREP",
        "CMU_MOSEI_VisualFacet42",
        "CMU_MOSEI_Labels",
    )

    def __init__(self, u):
        self.u = u

    def __len__(self):
        return len(self.u)

    def __getitem__(self, i):
        """Return ``(text, audio, visual, label)`` as float32 tensors."""
        uid = self.u[i]
        return tuple(
            torch.from_numpy(data[stream][uid]["features"]).float()
            for stream in self._STREAMS
        )

def coll(b):
    """Zero-pad each modality along time; return padded tensors, lengths and labels.

    Returns:
        ``(t, tlen, a, alen, v, vlen, y)``.
    """
    texts, audios, visuals, labels = zip(*b)

    collated = []
    for seqs in (texts, audios, visuals):
        collated.append(pad_sequence(seqs, batch_first=True))
        collated.append(torch.tensor([seq.size(0) for seq in seqs]))
    collated.append(torch.stack(labels))
    return tuple(collated)

# Evaluation loader over every movie clip, in stored order.
loader = DataLoader(
    dataset=MovieDS(uids),
    batch_size=16,
    collate_fn=coll,
)

# 3) Core model
class XModal(torch.nn.Module):
    """Three-GRU fusion model as a plain module, for loading checkpoint weights."""

    def __init__(self):
        super().__init__()
        gru = torch.nn.GRU
        self.text_rnn   = gru(input_size=300, hidden_size=256, batch_first=True)
        self.audio_rnn  = gru(input_size=74,  hidden_size=128, batch_first=True)
        self.visual_rnn = gru(input_size=35,  hidden_size=128, batch_first=True)
        self.fc         = torch.nn.Linear(256+128+128, 7)

    def forward(self,t,tlen,a,alen,v,vlen):
        """Return 7 logits per sample from padded batches and their true lengths."""
        finals = []
        for rnn, seq, lengths in (
            (self.text_rnn,   t, tlen),
            (self.audio_rnn,  a, alen),
            (self.visual_rnn, v, vlen),
        ):
            packed = pack_padded_sequence(
                seq, lengths.cpu(), batch_first=True, enforce_sorted=False
            )
            final_state = rnn(packed)[1]     # h_n: (1, B, hidden)
            finals.append(final_state.squeeze(0))
        return self.fc(torch.cat(finals, dim=-1))

# 4) Checkpoints
paths = {
  "Zero-shot MOSEI": "/content/drive/MyDrive/EmotionFusion/checkpoints/best-epoch=12-val_f1_micro=0.459.ckpt",
  "CAER-fine-tuned": "/content/drive/MyDrive/CAER_finetune_checkpoints/best-caer-epoch=25-val_f1=0.391.ckpt"
}

device = "cuda" if torch.cuda.is_available() else "cpu"
results = {}

# Prepare plots
plt.figure(figsize=(10, 4))

for name, ckpt in paths.items():
    # load model
    model = XModal().to(device)
    sd = torch.load(ckpt, map_location=device)["state_dict"]
    # strict=False tolerates extra checkpoint entries; missing ones would leave
    # layers randomly initialised, so treat them as an error.
    load_result = model.load_state_dict(sd, strict=False)
    if load_result.missing_keys:
        raise RuntimeError(f"Checkpoint {ckpt} is missing model weights: {load_result.missing_keys}")
    model.eval()

    # collect logits & targets
    all_logits, all_targets = [], []
    with torch.no_grad():
        for t,tlen,a,alen,v,vlen,y in loader:
            t, a, v = t.to(device), a.to(device), v.to(device)
            logits = model(t,tlen,a,alen,v,vlen).cpu().numpy()
            all_logits.append(logits)
            all_targets.append(y.numpy())
    logits = np.concatenate(all_logits)
    # Binarise the labels (any positive value = present), matching the
    # `(y > 0)` rule of the torchmetrics-based movie-clip evaluation.  For
    # labels that are already 0/1 this changes nothing; for graded labels it
    # keeps sklearn's f1_score from rejecting continuous targets.
    targets = (np.concatenate(all_targets) > 0).astype(int)
    # Sigmoid once, outside the threshold loop.
    probs = 1/(1+np.exp(-logits))

    # sweep thresholds
    ths = np.linspace(0.1, 0.9, 17)
    f1s, accs = [], []
    for th in ths:
        preds = (probs > th).astype(int)
        f1s.append(f1_score(targets, preds, average="macro", zero_division=0))
        # element-wise accuracy over every (clip, label) cell
        accs.append((preds == targets).mean())

    # find best F1 and best accuracy thresholds
    best_f1_idx  = int(np.argmax(f1s))
    best_acc_idx = int(np.argmax(accs))
    results[name] = {
        "best_f1":  (ths[best_f1_idx], f1s[best_f1_idx]),
        "best_acc": (ths[best_acc_idx], accs[best_acc_idx])
    }

    # plot (both checkpoints share the same two axes)
    plt.subplot(1, 2, 1)
    plt.plot(ths, f1s, marker='o', label=name)
    plt.title("Macro-F1 vs Threshold")
    plt.xlabel("Threshold"); plt.ylabel("Macro-F1"); plt.grid(True)
    plt.subplot(1, 2, 2)
    plt.plot(ths, accs, marker='o', label=name)
    plt.title("Accuracy vs Threshold")
    plt.xlabel("Threshold"); plt.ylabel("Accuracy"); plt.grid(True)

# finalize
for ax in plt.gcf().axes:
    ax.legend()
plt.tight_layout()
plt.show()

# print summary
# NOTE: the thresholds are selected on the same clips they are scored on, so
# these "best" values are optimistic.
for name, stats in results.items():
    th_f1,  f1  = stats["best_f1"]
    th_acc, acc = stats["best_acc"]
    print(f"{name}:")
    print(f"  Best Macro-F1  = {f1:.4f} at threshold {th_f1:.2f}")
    print(f"  Best Accuracy  = {acc:.4f} at threshold {th_acc:.2f}")


In [None]:
import pickle, torch, numpy as np
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence
from sklearn.metrics import f1_score, accuracy_score

# --- helper to evaluate one dataset + model ckpt ---
def evaluate_at_best_f1(data_pkl, ckpt_path, batch_size=16, device="cpu"):
    """Score a checkpoint on a feature pickle at its best macro-F1 threshold.

    Args:
        data_pkl: Pickle holding the four ``CMU_MOSEI_*`` stream dicts
            (clip id -> ``{"features": ndarray}``).
        ckpt_path: Checkpoint file whose ``"state_dict"`` entry holds the weights.
        batch_size: Evaluation batch size.
        device: Device the model and the inputs are moved to.

    Returns:
        Dict with ``"th"`` (best threshold among 17 values in [0.1, 0.9]),
        ``"f1"`` (macro-F1 at that threshold) and ``"acc"`` (element-wise
        accuracy at that threshold).  The threshold is chosen on the same data
        it is scored on, so the result is optimistic.

    Raises:
        RuntimeError: If the checkpoint lacks any weight that the model defines.
    """
    # 1) load features
    # NOTE: pickle.load can execute arbitrary code; only open trusted files.
    with open(data_pkl, "rb") as f:
        data = pickle.load(f)
    uids = list(data["CMU_MOSEI_Labels"].keys())

    # 2) dataset + collate
    class DS(Dataset):
        """Float32 tensor view over ``data`` for the ids in ``u``."""
        def __init__(self,u): self.u=u
        def __len__(self): return len(self.u)
        def __getitem__(self,i):
            uid = self.u[i]
            t = torch.from_numpy(data["CMU_MOSEI_TimestampedWordVectors"][uid]["features"]).float()
            a = torch.from_numpy(data["CMU_MOSEI_COVAREP"][uid]["features"]).float()
            v = torch.from_numpy(data["CMU_MOSEI_VisualFacet42"][uid]["features"]).float()
            y = torch.from_numpy(data["CMU_MOSEI_Labels"][uid]["features"]).float()
            return t,a,v,y

    def coll(b):
        """Zero-pad each modality along time and return true lengths alongside."""
        ts,as_,vs,ys = zip(*b)
        t_p = pad_sequence(ts , batch_first=True);   tlen = torch.tensor([x.size(0) for x in ts])
        a_p = pad_sequence(as_, batch_first=True);   alen = torch.tensor([x.size(0) for x in as_])
        v_p = pad_sequence(vs , batch_first=True);   vlen = torch.tensor([x.size(0) for x in vs])
        return t_p,tlen,a_p,alen,v_p,vlen,torch.stack(ys)

    loader = DataLoader(DS(uids), batch_size=batch_size, collate_fn=coll)

    # 3) model definition must match your XModal
    class XModal(torch.nn.Module):
        """Three-GRU fusion model; names and sizes must match the checkpoint."""
        def __init__(self):
            super().__init__()
            self.text_rnn   = torch.nn.GRU(300,256,batch_first=True)
            self.audio_rnn  = torch.nn.GRU( 74,128,batch_first=True)
            self.visual_rnn = torch.nn.GRU( 35,128,batch_first=True)
            self.fc         = torch.nn.Linear(256+128+128,7)
        def forward(self,t,tlen,a,alen,v,vlen):
            def last(rnn,x,l):
                # final hidden state at each sequence's true length
                return rnn(pack_padded_sequence(x,l.cpu(),batch_first=True,enforce_sorted=False))[1].squeeze(0)
            h = torch.cat([
                last(self.text_rnn,  t,   tlen),
                last(self.audio_rnn, a,   alen),
                last(self.visual_rnn,v,   vlen),
            ], dim=-1)
            return self.fc(h)

    # 4) load model
    model = XModal().to(device)
    sd = torch.load(ckpt_path, map_location=device)["state_dict"]
    # strict=False tolerates extra checkpoint entries; missing ones would leave
    # layers randomly initialised and make the scores meaningless.
    load_result = model.load_state_dict(sd, strict=False)
    if load_result.missing_keys:
        raise RuntimeError(
            f"Checkpoint {ckpt_path} is missing model weights: {load_result.missing_keys}"
        )
    model.eval()

    # 5) collect all logits & targets
    all_logits, all_targets = [], []
    with torch.no_grad():
        for t,tlen,a,alen,v,vlen,y in loader:
            t,a,v = t.to(device), a.to(device), v.to(device)
            logits = model(t,tlen,a,alen,v,vlen).cpu().numpy()
            all_logits.append(logits)
            all_targets.append(y.numpy())
    logits = np.concatenate(all_logits)
    # Binarise labels (any positive value = present).  A no-op for 0/1 labels;
    # for graded labels it keeps f1_score from rejecting continuous targets.
    targets = (np.concatenate(all_targets) > 0).astype(int)
    # Sigmoid once, outside the threshold loop.
    probs = 1/(1+np.exp(-logits))

    # 6) sweep thresholds
    ths = np.linspace(0.1,0.9,17)
    best = {"th": None, "f1": -1, "acc": None}
    for th in ths:
        preds = (probs > th).astype(int)
        f1  = f1_score(targets, preds, average="macro", zero_division=0)
        acc = accuracy_score(targets.flatten(), preds.flatten())
        # strict '>' keeps the lowest threshold among ties
        if f1 > best["f1"]:
            best.update(th=th, f1=f1, acc=acc)
    return best

# --- evaluate every (checkpoint, dataset) pair ---
results = {}
models = {
    "Zero-shot MOSEI": "/content/drive/MyDrive/EmotionFusion/checkpoints/best-epoch=12-val_f1_micro=0.459.ckpt",
    "CAER-fine-tuned": "/content/drive/MyDrive/CAER_finetune_checkpoints/best-caer-epoch=25-val_f1=0.391.ckpt"
}
datasets = {
    "Movie Clips":   "/content/drive/MyDrive/movie_clips_simple.pkl",
    "CREMA-D (4.5k)":"/content/drive/MyDrive/crema_d_features.pkl"
}

for mdl_name, ckpt in models.items():
    for ds_name, pkl in datasets.items():
        # GPU when available, otherwise CPU.
        run_device = "cuda" if torch.cuda.is_available() else "cpu"
        # Result key doubles as the row label; it is split on " on " below.
        key = f"{mdl_name} on {ds_name}"
        results[key] = evaluate_at_best_f1(data_pkl=pkl, ckpt_path=ckpt, device=run_device)

# --- fixed-width summary table ---
print(f"{'Model':<20} {'Dataset':<18} {'Thresh':>6} {'Macro-F1':>10} {'Accuracy':>10}")
print("-"*60)
for k,v in results.items():
    mdl, ds = k.split(" on ")
    print(f"{mdl:<20} {ds:<18} {v['th']:6.2f} {v['f1']:10.4f} {v['acc']:10.4f}")



In [None]:
import pickle
import numpy as np

# Target feature widths of the MOSEI-format streams written below.
TEXT_DIM, AUDIO_DIM, VISUAL_DIM = 300, 74, 35


def _fit_feature_dim(x, dim):
    """Return ``x`` as a 2-D array with exactly ``dim`` columns.

    A 1-D input is treated as a single row.  Arrays wider than ``dim`` keep
    only their first ``dim`` columns; narrower arrays are zero-padded on the
    right.  The number of rows is never changed.
    """
    x = np.asarray(x)
    if x.ndim == 1:
        # Applied to every stream (previously only text), so a 1-D audio or
        # visual array no longer fails on ``shape[1]``.
        x = np.expand_dims(x, 0)
    width = x.shape[1]
    if width >= dim:
        return x[:, :dim]
    return np.pad(x, ((0, 0), (0, dim - width)))


# Load original CAER pickle
# NOTE: pickle.load can execute arbitrary code; only open pickles you created.
with open("/content/drive/MyDrive/caer_aligned_full.pkl", "rb") as f:
    data = pickle.load(f)

# Prepare new dictionary (same four top-level streams as the input)
new_data = {
    "CMU_MOSEI_TimestampedWordVectors": {},
    "CMU_MOSEI_COVAREP": {},
    "CMU_MOSEI_VisualFacet42": {},
    "CMU_MOSEI_Labels": {}
}

for k in data['CMU_MOSEI_Labels'].keys():
    # TEXT → target shape: [T, 300]
    t = _fit_feature_dim(data['CMU_MOSEI_TimestampedWordVectors'][k]['features'], TEXT_DIM)

    # AUDIO → target shape: [T, 74]
    a = _fit_feature_dim(data['CMU_MOSEI_COVAREP'][k]['features'], AUDIO_DIM)

    # VISUAL → target shape: [T, 35] (only the first 35 columns are kept)
    v = _fit_feature_dim(data['CMU_MOSEI_VisualFacet42'][k]['features'], VISUAL_DIM)

    # LABEL (copied through unchanged)
    y = data['CMU_MOSEI_Labels'][k]['features']

    # Store in new dict
    new_data['CMU_MOSEI_TimestampedWordVectors'][k] = {'features': t}
    new_data['CMU_MOSEI_COVAREP'][k] = {'features': a}
    new_data['CMU_MOSEI_VisualFacet42'][k] = {'features': v}
    new_data['CMU_MOSEI_Labels'][k] = {'features': y}

# Save new pickle
with open("/content/drive/MyDrive/caer_aligned_mosei_format.pkl", "wb") as f:
    pickle.dump(new_data, f)

print("✅ New pickle saved as caer_aligned_mosei_format.pkl")


In [None]:
import pickle, torch, numpy as np
from torch.utils.data import Dataset, DataLoader, random_split
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence
import torch.nn as nn
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from torchmetrics.classification import MultilabelF1Score, MultilabelAccuracy
from sklearn.metrics import f1_score, accuracy_score

# ──────────────── 1. DATASET & DATALOADER ────────────────
class CAERDataset(Dataset):
    """Map-style dataset over a MOSEI-format pickle.

    The pickle is a dict of streams; each stream maps a sample id to a dict
    holding a ``'features'`` array.  Sample ids are taken from the
    ``'CMU_MOSEI_Labels'`` stream.
    """

    # (stream name, feature width used when a sequence is empty)
    _SEQUENCE_STREAMS = (
        ('CMU_MOSEI_TimestampedWordVectors', 300),
        ('CMU_MOSEI_COVAREP', 74),
        ('CMU_MOSEI_VisualFacet42', 35),
    )

    def __init__(self, pkl_path):
        """Load the whole pickle at ``pkl_path`` into memory."""
        with open(pkl_path, 'rb') as f:
            data = pickle.load(f)
        self.data = data
        self.ids  = list(data['CMU_MOSEI_Labels'].keys())

    def __len__(self):
        """Number of samples (one per label entry)."""
        return len(self.ids)

    def __getitem__(self, idx):
        """Return ``(text, audio, visual, label)`` float32 tensors for sample ``idx``."""
        sample_id = self.ids[idx]

        def fetch(stream):
            return torch.as_tensor(self.data[stream][sample_id]['features'], dtype=torch.float32)

        sequences = [fetch(stream) for stream, _ in self._SEQUENCE_STREAMS]
        label = fetch('CMU_MOSEI_Labels')

        # Patch: a zero-length sequence is swapped for one all-zero frame.
        for pos, (_, width) in enumerate(self._SEQUENCE_STREAMS):
            if sequences[pos].size(0) == 0:
                sequences[pos] = torch.zeros(1, width)

        return (*sequences, label)


def collate_fn(batch):
    """Collate ``(text, audio, visual, label)`` samples into one padded batch.

    Returns ``(t_p, t_l, a_p, a_l, v_p, v_l, y)``: for each modality a
    batch-first zero-padded tensor followed by a tensor of the original
    per-sample lengths (size along dim 0), then the stacked labels.
    """
    texts, audios, visuals, labels = zip(*batch)

    def pad_with_lengths(seqs):
        # Padded batch plus the pre-padding length of every sequence.
        padded = pad_sequence(seqs, batch_first=True)
        lengths = torch.tensor([seq.size(0) for seq in seqs])
        return padded, lengths

    t_p, t_l = pad_with_lengths(texts)
    a_p, a_l = pad_with_lengths(audios)
    v_p, v_l = pad_with_lengths(visuals)
    return t_p, t_l, a_p, a_l, v_p, v_l, torch.stack(labels)

# Load the MOSEI-format CAER pickle and split it 70 / 15 / 15 (train / val / test).
DATA_PKL = "/content/drive/MyDrive/caer_aligned_mosei_format.pkl"
dataset  = CAERDataset(DATA_PKL)
N        = len(dataset)
train_n, val_n = int(0.7*N), int(0.15*N)
test_n   = N - train_n - val_n   # remainder, so the three sizes always sum to N
# Fixed seed: the same three subsets are produced on every run.
split_rng = torch.Generator().manual_seed(42)
train_ds, val_ds, test_ds = random_split(dataset, [train_n, val_n, test_n], generator=split_rng)

# All three loaders share these settings; only the training loader shuffles.
loader_kwargs = dict(batch_size=32, collate_fn=collate_fn, num_workers=4)
train_loader = DataLoader(train_ds, shuffle=True,  **loader_kwargs)
val_loader   = DataLoader(val_ds,   shuffle=False, **loader_kwargs)
test_loader  = DataLoader(test_ds,  shuffle=False, **loader_kwargs)

# ──────────────── 2. SCRATCH MODEL ────────────────
class ScratchXModal(pl.LightningModule):
    """Three-stream GRU classifier with a 7-output multi-label head.

    Text, audio and visual sequences are each encoded by their own GRU; the
    final hidden states are concatenated and mapped to 7 logits by one linear
    layer.  Training minimises ``BCEWithLogitsLoss``; validation tracks
    macro-F1 and accuracy at a fixed probability threshold.
    """

    def __init__(self):
        super().__init__()
        # Construction order is kept as-is so seeded initialisation is unchanged.
        self.text_rnn   = nn.GRU(input_size=300, hidden_size=256, batch_first=True)
        self.audio_rnn  = nn.GRU(input_size=74,  hidden_size=128, batch_first=True)
        self.visual_rnn = nn.GRU(input_size=35,  hidden_size=128, batch_first=True)
        self.fc         = nn.Linear(in_features=256+128+128, out_features=7)
        self.loss       = nn.BCEWithLogitsLoss()
        # One threshold is shared by the manual binarisation and both metrics.
        self.thresh     = 0.3
        self.f1         = MultilabelF1Score(num_labels=7, threshold=self.thresh, average='macro')
        self.acc        = MultilabelAccuracy(num_labels=7, threshold=self.thresh)

    @staticmethod
    def _last_hidden(rnn, x, lengths):
        """Run ``rnn`` over the padded batch ``x`` and return its final hidden state."""
        packed = pack_padded_sequence(x, lengths.cpu(),
                                      batch_first=True,
                                      enforce_sorted=False)
        _, h_n = rnn(packed)
        # Drop the leading layer axis of the returned hidden state.
        return h_n.squeeze(0)

    def forward(self, t, tl, a, al, v, vl):
        """Return logits from padded text/audio/visual batches and their lengths."""
        encoded = [
            self._last_hidden(self.text_rnn,   t, tl),
            self._last_hidden(self.audio_rnn,  a, al),
            self._last_hidden(self.visual_rnn, v, vl),
        ]
        return self.fc(torch.cat(encoded, dim=-1))

    def training_step(self, batch, _):
        """Compute and log the BCE-with-logits loss for one batch."""
        *inputs, y = batch
        loss = self.loss(self(*inputs), y)
        self.log("train_loss", loss)
        return loss

    def validation_step(self, batch, _):
        """Binarise the sigmoid outputs at ``self.thresh`` and update both metrics."""
        *inputs, y = batch
        probs  = torch.sigmoid(self(*inputs))
        preds  = (probs > self.thresh).int()
        target = y.int()
        for metric in (self.f1, self.acc):
            metric.update(preds, target)

    def on_validation_epoch_end(self):
        """Log the epoch-level metrics, then clear their accumulated state."""
        self.log("val_macro_f1", self.f1.compute(), prog_bar=True)
        self.log("val_acc",      self.acc.compute(), prog_bar=True)
        self.f1.reset()
        self.acc.reset()

    def configure_optimizers(self):
        """Adam over all parameters with learning rate 1e-4."""
        return torch.optim.Adam(self.parameters(), lr=1e-4)

# ──────────────── 3. TRAIN & SAVE ────────────────
# Checkpointing and early stopping both watch validation macro-F1 (higher is better).
monitor_cfg = dict(monitor="val_macro_f1", mode="max")

# Keep only the single best checkpoint.
ckpt_cb = ModelCheckpoint(
    filename="scratch-caer-{epoch:02d}-{val_macro_f1:.3f}",
    save_top_k=1,
    **monitor_cfg,
)
# Stop after 5 validation checks without improvement.
stop_cb = EarlyStopping(patience=5, **monitor_cfg)

trainer = pl.Trainer(max_epochs=100, accelerator="auto", callbacks=[ckpt_cb, stop_cb])
model = ScratchXModal()
trainer.fit(model, train_loader, val_loader)

print("Best Scratch-CAER ckpt:", ckpt_cb.best_model_path)


In [None]:
import pickle
import torch
from torch import nn
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence
from torch.utils.data import Dataset, DataLoader, random_split
import pytorch_lightning as pl
from torchmetrics.classification import (
    MultilabelF1Score,
    MultilabelPrecision,
    MultilabelRecall,
    MultilabelAccuracy
)

# ---------- Dataset ----------
class CAERMoseiFormatDataset(Dataset):
    """Map-style dataset over the MOSEI-format CAER pickle.

    The pickle is a dict of streams; each stream maps a sample id to a dict
    holding a ``"features"`` array.  Sample ids come from the
    ``"CMU_MOSEI_Labels"`` stream.
    """

    # (stream name, feature width of the placeholder frame).  The widths match
    # the input sizes of the three GRUs in XModal.
    _SEQUENCE_STREAMS = (
        ("CMU_MOSEI_TimestampedWordVectors", 300),
        ("CMU_MOSEI_COVAREP", 74),
        ("CMU_MOSEI_VisualFacet42", 35),
    )

    def __init__(self, pkl_path):
        """Load the whole pickle at ``pkl_path`` into memory."""
        with open(pkl_path, 'rb') as f:
            self.data = pickle.load(f)
        self.ids = list(self.data["CMU_MOSEI_Labels"].keys())

    def __len__(self):
        """Number of samples (one per label entry)."""
        return len(self.ids)

    def __getitem__(self, idx):
        """Return ``(text, audio, visual, label)`` float32 tensors for sample ``idx``.

        A zero-length sequence is replaced by a single all-zero frame, the same
        patch the training-cell ``CAERDataset`` applies to this pickle; without
        it ``pack_padded_sequence`` rejects the batch.
        """
        k = self.ids[idx]

        def get(name):
            return torch.tensor(self.data[name][k]["features"], dtype=torch.float32)

        sequences = []
        for name, width in self._SEQUENCE_STREAMS:
            seq = get(name)
            if seq.size(0) == 0:
                seq = torch.zeros(1, width)
            sequences.append(seq)
        return (*sequences, get("CMU_MOSEI_Labels"))

def collate(batch):
    """Turn a list of ``(text, audio, visual, label)`` samples into batch tensors.

    Returns ``(t_p, t_l, a_p, a_l, v_p, v_l, y)``: each modality contributes a
    batch-first zero-padded tensor and a tensor of per-sample lengths (size
    along dim 0); ``y`` is the stacked labels.
    """
    texts, audios, visuals, labels = zip(*batch)

    pieces = []
    for seqs in (texts, audios, visuals):
        pieces.append(pad_sequence(seqs, batch_first=True))
        pieces.append(torch.tensor([seq.size(0) for seq in seqs]))

    return (*pieces, torch.stack(labels))

# ---------- Model ----------
class XModal(pl.LightningModule):
    """Inference-only three-stream GRU classifier with a 7-logit linear head.

    The attribute names (``text_rnn``, ``audio_rnn``, ``visual_rnn``, ``fc``)
    are the keys a checkpoint's state dict is matched against.
    """

    def __init__(self):
        super().__init__()
        self.text_rnn   = nn.GRU(input_size=300, hidden_size=256, batch_first=True)
        self.audio_rnn  = nn.GRU(input_size=74,  hidden_size=128, batch_first=True)
        self.visual_rnn = nn.GRU(input_size=35,  hidden_size=128, batch_first=True)
        self.fc         = nn.Linear(in_features=256 + 128 + 128, out_features=7)

    def forward(self, t, tl, a, al, v, vl):
        """Return logits from padded text/audio/visual batches and their lengths."""
        streams = (
            (self.text_rnn,   t, tl),
            (self.audio_rnn,  a, al),
            (self.visual_rnn, v, vl),
        )
        finals = []
        for rnn, seq, lengths in streams:
            packed = pack_padded_sequence(seq, lengths.cpu(), batch_first=True, enforce_sorted=False)
            _, h_n = rnn(packed)
            # Drop the leading layer axis of the final hidden state.
            finals.append(h_n.squeeze(0))
        return self.fc(torch.cat(finals, dim=-1))

# ---------- Evaluation ----------
if __name__ == "__main__":
    # Zero-shot evaluation: the MOSEI-trained checkpoint is scored on the CAER
    # test split and per-emotion F1 / precision / recall / accuracy are printed.
    pkl_path  = "/content/drive/MyDrive/caer_aligned_mosei_format.pkl"
    ckpt_path = "/content/drive/MyDrive/EmotionFusion/checkpoints/best-epoch=12-val_f1_micro=0.459.ckpt"

    # Use the GPU when present, otherwise run on CPU instead of crashing on .cuda().
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    ds = CAERMoseiFormatDataset(pkl_path)
    N = len(ds)
    train_sz = int(0.7 * N)
    val_sz   = int(0.15 * N)
    test_sz  = N - train_sz - val_sz  # ensures exact sum
    # Seed 42 is the seed the scratch-training cell uses to split this same
    # pickle, so the test subset is reproducible and is the same held-out 15%
    # rather than a fresh random draw on every run.
    _, _, test_ds = random_split(
        ds, [train_sz, val_sz, test_sz],
        generator=torch.Generator().manual_seed(42),
    )
    test_loader = DataLoader(test_ds, batch_size=32, shuffle=False, collate_fn=collate)

    # strict=False skips checkpoint keys that do not match XModal.
    # NOTE(review): confirm the checkpoint uses these layer names; any layer
    # that is not matched keeps its random initialisation.
    model = XModal.load_from_checkpoint(ckpt_path, map_location=device, strict=False)
    model.eval().to(device)

    # Metrics (average='none' keeps one value per label)
    threshold = 0.3
    f1    = MultilabelF1Score(num_labels=7, threshold=threshold, average='none').to(device)
    prec  = MultilabelPrecision(num_labels=7, threshold=threshold, average='none').to(device)
    rec   = MultilabelRecall(num_labels=7, threshold=threshold, average='none').to(device)
    acc   = MultilabelAccuracy(num_labels=7, threshold=threshold, average='none').to(device)

    # Run
    print("🔍 Evaluating Zero-Shot MOSEI model on CAER...")
    with torch.no_grad():
        for batch in test_loader:
            t, tl, a, al, v, vl, y = [x.to(device) for x in batch]
            logits = model(t, tl, a, al, v, vl)
            preds  = (torch.sigmoid(logits) > threshold).int()
            target = y.int()
            for metric in (f1, prec, rec, acc):
                metric.update(preds, target)

    # Results: one row per emotion
    import pandas as pd
    class_names = ["happy", "sad", "angry", "surprise", "disgust", "fear", "neutral"]
    df = pd.DataFrame({
        "Emotion":   class_names,
        "F1":        f1.compute().cpu().numpy(),
        "Precision": prec.compute().cpu().numpy(),
        "Recall":    rec.compute().cpu().numpy(),
        "Accuracy":  acc.compute().cpu().numpy()
    })
    print(df.to_string(index=False))


In [None]:
import pickle
import torch
from torch import nn
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence
from torch.utils.data import Dataset, DataLoader, random_split
import pytorch_lightning as pl
from torchmetrics.classification import (
    MultilabelF1Score,
    MultilabelPrecision,
    MultilabelRecall,
    MultilabelAccuracy
)

# ---------------------- CAER Dataset ----------------------
class CAERDataset(Dataset):
    """Map-style dataset over the original (un-converted) CAER pickle.

    The pickle is a dict of streams; each stream maps a sample id to a dict
    holding a ``'features'`` array.  Sample ids come from the
    ``'CMU_MOSEI_Labels'`` stream.  This definition shadows the ``CAERDataset``
    declared in the scratch-training cell.
    """

    # Placeholder widths for empty sequences; they equal the input sizes of
    # XModalProj's a_proj (65) and v_proj (2048).
    AUDIO_DIM  = 65
    VISUAL_DIM = 2048

    def __init__(self, pkl_path):
        """Load the whole pickle at ``pkl_path`` into memory."""
        with open(pkl_path, 'rb') as f:
            data = pickle.load(f)
        self.data = data
        self.ids  = list(data['CMU_MOSEI_Labels'].keys())

    def __len__(self):
        """Number of samples (one per label entry)."""
        return len(self.ids)

    def __getitem__(self, idx):
        """Return ``(text, audio, visual, label)`` float32 tensors for sample ``idx``.

        Everything is cast to float32 so float64 numpy features cannot clash
        with the float32 model weights.  A zero-length audio or visual
        sequence becomes one all-zero frame, because ``pack_padded_sequence``
        rejects zero lengths.
        """
        k = self.ids[idx]

        def get(name):
            return torch.as_tensor(self.data[name][k]['features'], dtype=torch.float32)

        t = get('CMU_MOSEI_TimestampedWordVectors')
        a = get('CMU_MOSEI_COVAREP')
        v = get('CMU_MOSEI_VisualFacet42')
        y = get('CMU_MOSEI_Labels')

        if a.size(0) == 0: a = torch.zeros(1, self.AUDIO_DIM)
        if v.size(0) == 0: v = torch.zeros(1, self.VISUAL_DIM)
        return t, a, v, y

def coll(batch):
    """Collate ``(text, audio, visual, label)`` samples for the projection model.

    Each text entry is stacked as-is and given a length-1 sequence axis, so
    every text length is reported as 1.  Audio and visual sequences are
    zero-padded batch-first and returned with their original lengths.

    Returns ``(t, tlen, a_padded, alens, v_padded, vlens, y)``.
    """
    texts, audios, visuals, labels = zip(*batch)

    def frame_counts(seqs):
        return torch.tensor([seq.shape[0] for seq in seqs])

    # Text: (B, D) -> (B, 1, D); all lengths are 1.
    t    = torch.unsqueeze(torch.stack(texts), 1)
    tlen = torch.ones(len(texts), dtype=torch.long)

    # Audio / visual: pad to the longest sequence in the batch.
    a_padded, alens = pad_sequence(audios,  batch_first=True), frame_counts(audios)
    v_padded, vlens = pad_sequence(visuals, batch_first=True), frame_counts(visuals)

    y = torch.stack(labels)
    return t, tlen, a_padded, alens, v_padded, vlens, y

# ---------------------- Fine-tunable Model Definition ----------------------
class XModalProj(pl.LightningModule):
    """Three-stream GRU classifier with linear adapters in front of two streams.

    Audio (65-d) and visual (2048-d) inputs are first projected to the 74-d
    and 35-d inputs of the audio and visual GRUs.  The final hidden states of
    the three GRUs are concatenated, passed through dropout and mapped to 7
    logits.
    """

    def __init__(self):
        super().__init__()
        # Encoder and head slots whose weights are expected to come from a checkpoint.
        self.text_rnn   = nn.GRU(input_size=300, hidden_size=256, batch_first=True)
        self.audio_rnn  = nn.GRU(input_size=74,  hidden_size=128, batch_first=True)
        self.visual_rnn = nn.GRU(input_size=35,  hidden_size=128, batch_first=True)
        self.fc         = nn.Linear(in_features=256 + 128 + 128, out_features=7)

        # Input adapters: the audio weight starts as an identity pattern and
        # the visual weight starts at zero (so only its bias contributes).
        self.a_proj = nn.Linear(in_features=65,   out_features=74, bias=False)
        self.v_proj = nn.Linear(in_features=2048, out_features=35, bias=True)
        nn.init.eye_(self.a_proj.weight)
        nn.init.zeros_(self.v_proj.weight)

        self.dropout = nn.Dropout(p=0.3)
        self.loss    = nn.BCEWithLogitsLoss()
        self.thresh  = 0.3

    @staticmethod
    def _last_hidden(rnn, x, lengths):
        """Run ``rnn`` over the padded batch ``x`` and return its final hidden state."""
        packed = pack_padded_sequence(x, lengths.cpu(),
                                      batch_first=True,
                                      enforce_sorted=False)
        _, h_n = rnn(packed)
        return h_n.squeeze(0)

    def forward(self, t, tlen, a, alen, v, vlen):
        """Return logits for a batch of text / audio / visual inputs and lengths."""
        # Bring the audio and visual features to the GRU input widths.
        audio_in  = self.a_proj(a)
        visual_in = self.v_proj(v)

        summaries = [
            self._last_hidden(self.text_rnn,   t,         tlen),
            self._last_hidden(self.audio_rnn,  audio_in,  alen),
            self._last_hidden(self.visual_rnn, visual_in, vlen),
        ]
        fused = self.dropout(torch.cat(summaries, dim=-1))
        return self.fc(fused)

# ---------------------- Evaluation Script ----------------------
if __name__ == "__main__":
    # Evaluate the CAER-fine-tuned checkpoint on a CAER test split and print
    # per-emotion F1 / precision / recall / accuracy.

    # Paths
    DRIVE_ROOT     = "/content/drive/MyDrive"
    PICKLE_PATH    = f"{DRIVE_ROOT}/caer_aligned_full.pkl"
    BEST_CKPT_PATH = f"{DRIVE_ROOT}/CAER_finetune_checkpoints/best-caer-epoch=25-val_f1=0.391.ckpt"

    # Use the GPU when present, otherwise run on CPU instead of crashing on .cuda().
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Load dataset and split
    ds = CAERDataset(PICKLE_PATH)
    N  = len(ds)
    train_sz = int(0.7 * N)
    val_sz   = int(0.15 * N)
    test_sz  = N - train_sz - val_sz
    # A seeded generator makes the reported subset reproducible; 42 is the seed
    # the scratch-training cell uses for its split.
    # NOTE(review): confirm the fine-tuning run split with the same seed;
    # if it did not, this "test" subset overlaps its training data.
    _, _, test_ds = random_split(
        ds, [train_sz, val_sz, test_sz],
        generator=torch.Generator().manual_seed(42),
    )
    test_loader = DataLoader(test_ds, batch_size=32, shuffle=False, collate_fn=coll)

    # Load model from checkpoint.  strict=False skips keys that do not match
    # XModalProj, so a mismatched layer silently keeps its initial weights.
    model = XModalProj.load_from_checkpoint(BEST_CKPT_PATH, map_location=device, strict=False)
    model.eval().to(device)

    # Metrics (average='none' keeps one value per label)
    threshold = 0.3
    f1    = MultilabelF1Score(num_labels=7, threshold=threshold, average='none').to(device)
    prec  = MultilabelPrecision(num_labels=7, threshold=threshold, average='none').to(device)
    rec   = MultilabelRecall(num_labels=7, threshold=threshold, average='none').to(device)
    acc   = MultilabelAccuracy(num_labels=7, threshold=threshold, average='none').to(device)

    # Run evaluation
    print("🔍 Evaluating on CAER test set...")
    with torch.no_grad():
        for batch in test_loader:
            t, tlen, a, alen, v, vlen, y = [x.to(device) for x in batch]
            logits = model(t, tlen, a, alen, v, vlen)
            preds  = (torch.sigmoid(logits) > threshold).int()
            y_bin  = y.int()  # already on `device`
            for metric in (f1, prec, rec, acc):
                metric.update(preds, y_bin)

    # Report results: one row per emotion
    class_names = ["happy","sad","angry","surprise","disgust","fear","neutral"]
    import pandas as pd
    df = pd.DataFrame({
        "emotion":   class_names,
        "F1":        f1.compute().cpu().numpy(),
        "Precision": prec.compute().cpu().numpy(),
        "Recall":    rec.compute().cpu().numpy(),
        "Accuracy":  acc.compute().cpu().numpy()
    })
    print(df.to_string(index=False))

import pickle
import torch
from torch import nn
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence
from torch.utils.data import Dataset, DataLoader, random_split
import pytorch_lightning as pl
from torchmetrics.classification import (
    MultilabelF1Score,
    MultilabelPrecision,
    MultilabelRecall,
    MultilabelAccuracy
)
import pandas as pd

# ---------------------- Dataset ----------------------
class CAERScratchDataset(Dataset):
    """Map-style dataset over the MOSEI-format CAER pickle (evaluation side).

    It applies the same float32 cast and empty-sequence patch as the
    ``CAERDataset`` used to train on this pickle.
    """

    # (stream name, feature width of the placeholder frame).  The widths match
    # the input sizes of the three GRUs in ScratchXModal.
    _SEQUENCE_STREAMS = (
        ('CMU_MOSEI_TimestampedWordVectors', 300),
        ('CMU_MOSEI_COVAREP', 74),
        ('CMU_MOSEI_VisualFacet42', 35),
    )

    def __init__(self, pkl_path):
        """Load the whole pickle at ``pkl_path`` into memory."""
        with open(pkl_path, 'rb') as f:
            data = pickle.load(f)
        self.data = data
        self.ids  = list(data['CMU_MOSEI_Labels'].keys())

    def __len__(self):
        """Number of samples (one per label entry)."""
        return len(self.ids)

    def __getitem__(self, idx):
        """Return ``(text, audio, visual, label)`` float32 tensors for sample ``idx``.

        The cast keeps float64 numpy features from clashing with the float32
        model weights.  A zero-length sequence becomes one all-zero frame,
        because ``pack_padded_sequence`` rejects zero lengths.
        """
        k = self.ids[idx]

        def get(name):
            return torch.as_tensor(self.data[name][k]['features'], dtype=torch.float32)

        sequences = []
        for name, width in self._SEQUENCE_STREAMS:
            seq = get(name)
            if seq.size(0) == 0:
                seq = torch.zeros(1, width)
            sequences.append(seq)
        t, a, v = sequences
        y = get('CMU_MOSEI_Labels')
        return t, a, v, y

def coll(batch):
    """Collate ``(text, audio, visual, label)`` samples into one padded batch.

    Returns ``(t, tlen, a_p, alen, v_p, vlen, y)``: each modality as a
    batch-first zero-padded tensor followed by its per-sample lengths (size
    along dim 0), then the stacked labels.
    """
    texts, audios, visuals, labels = zip(*batch)

    def lengths_of(seqs):
        return torch.tensor([seq.size(0) for seq in seqs])

    padded = [pad_sequence(seqs, batch_first=True) for seqs in (texts, audios, visuals)]
    counts = [lengths_of(seqs) for seqs in (texts, audios, visuals)]

    return (
        padded[0], counts[0],
        padded[1], counts[1],
        padded[2], counts[2],
        torch.stack(labels),
    )

# ---------------------- Scratch Model ----------------------
class ScratchXModal(pl.LightningModule):
    """Evaluation-side three-stream GRU classifier with a 7-logit head.

    Each modality has its own GRU.  Their final hidden states are
    concatenated, passed through dropout (p=0.3) and mapped to 7 logits.
    """

    def __init__(self):
        super().__init__()
        self.text_rnn   = nn.GRU(input_size=300, hidden_size=256, batch_first=True)
        self.audio_rnn  = nn.GRU(input_size=74,  hidden_size=128, batch_first=True)
        self.visual_rnn = nn.GRU(input_size=35,  hidden_size=128, batch_first=True)
        self.fc         = nn.Linear(in_features=256 + 128 + 128, out_features=7)
        self.dropout    = nn.Dropout(p=0.3)

    def forward(self, t, tlen, a, alen, v, vlen):
        """Return logits from padded text/audio/visual batches and their lengths."""
        finals = []
        for rnn, seq, lengths in (
            (self.text_rnn,   t, tlen),
            (self.audio_rnn,  a, alen),
            (self.visual_rnn, v, vlen),
        ):
            packed = pack_padded_sequence(seq, lengths.cpu(),
                                          batch_first=True,
                                          enforce_sorted=False)
            _, h_n = rnn(packed)
            # Drop the leading layer axis of the final hidden state.
            finals.append(h_n.squeeze(0))

        fused = torch.cat(finals, dim=-1)
        return self.fc(self.dropout(fused))

# ---------------------- Evaluation ----------------------
# Score the scratch-trained CAER checkpoint on the held-out test split and
# print per-emotion F1 / precision / recall / accuracy.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
pkl_path  = "/content/drive/MyDrive/caer_aligned_mosei_format.pkl"
ckpt_path = "/content/drive/MyDrive/scratch-caer-epoch=33-val_macro_f1=0.134.ckpt"

# Load dataset
ds = CAERScratchDataset(pkl_path)
N = len(ds)
train_sz, val_sz = int(0.7 * N), int(0.15 * N)
test_sz = N - train_sz - val_sz
# The scratch-training cell splits this same pickle with seed 42.  Reusing the
# seed reproduces that split, so `test_ds` is the subset the model never
# trained on; an unseeded split would mix training samples into the test set.
_, _, test_ds = random_split(
    ds, [train_sz, val_sz, test_sz],
    generator=torch.Generator().manual_seed(42),
)
test_loader = DataLoader(test_ds, batch_size=32, shuffle=False, collate_fn=coll)

# Load model.  strict=False skips checkpoint keys that do not match this class.
model = ScratchXModal.load_from_checkpoint(ckpt_path, strict=False)
model.eval().to(device)

# Metrics (average='none' keeps one value per label)
threshold = 0.3
f1    = MultilabelF1Score(num_labels=7, threshold=threshold, average='none').to(device)
prec  = MultilabelPrecision(num_labels=7, threshold=threshold, average='none').to(device)
rec   = MultilabelRecall(num_labels=7, threshold=threshold, average='none').to(device)
acc   = MultilabelAccuracy(num_labels=7, threshold=threshold, average='none').to(device)

# Evaluate
print("🔍 Evaluating Scratch-CAER model on test set...")
with torch.no_grad():
    for batch in test_loader:
        t, tlen, a, alen, v, vlen, y = [x.to(device) for x in batch]
        logits = model(t, tlen, a, alen, v, vlen)
        preds  = (torch.sigmoid(logits) > threshold).int()
        target = y.int()
        for metric in (f1, prec, rec, acc):
            metric.update(preds, target)

# Report: one row per emotion
class_names = ["happy", "sad", "angry", "surprise", "disgust", "fear", "neutral"]
df = pd.DataFrame({
    "emotion":   class_names,
    "F1":        f1.compute().cpu().numpy(),
    "Precision": prec.compute().cpu().numpy(),
    "Recall":    rec.compute().cpu().numpy(),
    "Accuracy":  acc.compute().cpu().numpy()
})
print(df.to_string(index=False))
