In [3]:
!pip install pandas

Collecting pandas
  Downloading pandas-2.3.3-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (91 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m91.2/91.2 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
Downloading pandas-2.3.3-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (12.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m78.1 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
[?25hInstalling collected packages: pandas
Successfully installed pandas-2.3.3
[0m

In [7]:
import os
import torch

# Debug-only CUDA switches, written to the process environment.
#   CUDA_LAUNCH_BLOCKING: intended to surface CUDA errors early instead of a silent crash.
#   TORCH_USE_CUDA_DSA:   intended to enable device-side assertions (more out-of-bounds reports).
# NOTE(review): confirm that TORCH_USE_CUDA_DSA has any effect as a runtime environment
# variable on a prebuilt wheel, and that these are set before CUDA is first initialised.
os.environ.update({
    "CUDA_LAUNCH_BLOCKING": "1",
    "TORCH_USE_CUDA_DSA": "1",
})

# Deterministic cuDNN behaviour, no autotuning.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Turn cuDNN off entirely to check whether cuDNN is the culprit.
# Only for the fault-isolation phase; re-enable once the pipeline runs.
torch.backends.cudnn.enabled = False


In [8]:
import os
import torch
import pandas as pd
from tqdm import tqdm
from imagebind import data
from imagebind.models import imagebind_model
from imagebind.models.imagebind_model import ModalityType
import gc


In [9]:

# Dataset locations.
# Set SPLIT to "dev" or "test" to choose which MELD split is processed; all four
# paths below follow the same <root>/<split>/... layout, so nothing else needs editing.
MELD_ROOT = "/workspace/dataset/MELD"
SPLIT = "test"

CSV_PATH = f"{MELD_ROOT}/{SPLIT}/{SPLIT}_sent_emo.csv"  # utterance annotation table
VIDEO_DIR = f"{MELD_ROOT}/{SPLIT}/{SPLIT}_splits"       # per-utterance .mp4 clips
AUDIO_DIR = f"{MELD_ROOT}/{SPLIT}/wav"                  # per-utterance .wav files
SAVE_PATH = f"{MELD_ROOT}/{SPLIT}/pt"                   # output directory for .pt embeddings

# Model loading: use the first GPU when CUDA is available, otherwise the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

# Build ImageBind-Huge with pretrained weights requested, switch it to inference
# mode and move it to the selected device.
model = imagebind_model.imagebind_huge(pretrained=True)
model = model.eval().to(device)

# Read the annotation CSV. The file is comma-separated; change `sep` to "\t"
# if a tab-separated export is used instead.
df = pd.read_csv(
    CSV_PATH,
    sep=",",
)
# Scratch list; no cell visible in this notebook reads it.
results = list()


In [10]:
# Preview the first rows to confirm the CSV was parsed into the expected columns.
df.head()

Unnamed: 0,Sr No.,Utterance,Speaker,Emotion,Sentiment,Dialogue_ID,Utterance_ID,Season,Episode,StartTime,EndTime
0,1,"Oh my God, hes lost it. Hes totally lost it.",Phoebe,sadness,negative,0,0,4,7,"00:20:57,256","00:21:00,049"
1,2,What?,Monica,surprise,negative,0,1,4,7,"00:21:01,927","00:21:03,261"
2,3,"Or! Or, we could go to the bank, close our acc...",Ross,neutral,neutral,1,0,4,4,"00:12:24,660","00:12:30,915"
3,4,Youre a genius!,Chandler,joy,positive,1,1,4,4,"00:12:32,334","00:12:33,960"
4,5,"Aww, man, now we wont be bank buddies!",Joey,sadness,negative,1,2,4,4,"00:12:34,211","00:12:37,505"


In [11]:
import os, time, gc, torch
from contextlib import contextmanager

# Recommendation while tracking down the failure: request deterministic cuDNN
# kernels and keep the cuDNN benchmark autotuner switched off.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

def print_gpu_snapshot(tag=""):
    """Print one line with the cuDNN switch, CUDA availability and GPU memory usage.

    Args:
        tag: Free-form label shown in square brackets at the start of the line.

    The three memory figures (currently allocated, reserved, and peak allocated)
    are converted from bytes by dividing by 1024 twice and printed with the
    suffix "MB". When CUDA is not available the allocator is not queried and
    all three figures are reported as 0.0, so the helper is safe on the CPU
    fallback path.
    """
    bytes_per_mb = 1024 * 1024
    has_cuda = torch.cuda.is_available()
    if has_cuda:
        alloc = torch.cuda.memory_allocated() / bytes_per_mb
        reserved = torch.cuda.memory_reserved() / bytes_per_mb
        peak = torch.cuda.max_memory_allocated() / bytes_per_mb
    else:
        # Nothing to measure without a CUDA device.
        alloc = reserved = peak = 0.0
    print(f"[{tag}] cudnn={torch.backends.cudnn.enabled} | "
          f"CUDA={has_cuda} | "
          f"alloc={alloc:.1f}MB reserved={reserved:.1f}MB peak={peak:.1f}MB")

def assert_on_cuda(t, name="tensor"):
    """Print the device and shape of `t`, labelled with `name`.

    Despite the function name, nothing is asserted and nothing is raised: the
    helper only reports. Objects that are not tensors produce a placeholder line.

    Args:
        t: Object to inspect.
        name: Label used in the printed line.
    """
    if not torch.is_tensor(t):
        print(f"  {name}: (not a tensor)")
        return
    print(f"  {name}: device={t.device} shape={tuple(t.shape)}")

@contextmanager
def cudnn_off_only_here():
    """Context manager that disables cuDNN for the duration of the `with` body.

    The value of `torch.backends.cudnn.enabled` seen on entry is written back
    on exit, including when the body raises.
    """
    backend = torch.backends.cudnn
    was_enabled = backend.enabled
    backend.enabled = False
    try:
        yield
    finally:
        # Always restore the caller's setting.
        backend.enabled = was_enabled

def force_cuda_burn(size=4096, device="cuda"):
    """Run one small matrix multiplication to verify the device really does work.

    Two random `size` x `size` matrices are created on `device`, multiplied,
    and the sum of the product is read back with `.item()`, which makes the
    call wait for the computation to finish.

    Args:
        size: Side length of the square matrices (default 4096, as before).
        device: Device the matrices are created on (default "cuda", as before).

    Returns:
        None. The computed value is discarded.
    """
    a = torch.randn(size, size, device=device)
    b = torch.randn(size, size, device=device)
    # .item() pulls the scalar to the host, i.e. acts as the synchronisation point.
    (a @ b).sum().item()



In [4]:
from imagebind import data
import inspect, importlib
# Diagnostics: show which imagebind.data file is actually imported, where the audio
# loader is defined, and whether a `waveform2melspec_fixed` attribute exists on it.
print(f"data module path: {data.__file__}")
print(f"audio func path: {inspect.getsourcefile(data.load_and_transform_audio_data)}")
print(f"waveform2melspec_fixed in module?: {hasattr(data, 'waveform2melspec_fixed')}")


data module path: /ImageBind/imagebind/data.py
audio func path: /ImageBind/imagebind/data.py
waveform2melspec_fixed in module?: False


In [None]:
# Extract one text/audio/video embedding triple per CSV row and store it as
# <SAVE_PATH>/dia{Dialogue_ID}_utt{Utterance_ID}.pt. The loop is resumable:
# rows whose output file already exists are skipped.

# torch.save does not create parent directories, so make sure the target exists.
os.makedirs(SAVE_PATH, exist_ok=True)

# Mixed precision is only enabled when the model actually runs on a CUDA device.
use_amp = str(device).startswith("cuda")

# Outcome counters, reported once the loop has finished.
n_saved = n_existing = n_missing = n_failed = 0

for _, row in tqdm(df.iterrows(), total=len(df)):
    dia_id = int(row["Dialogue_ID"])
    utt_id = int(row["Utterance_ID"])
    uid = f"dia{dia_id}_utt{utt_id}"

    # Resume support: skip rows that were already processed.
    out_file = os.path.join(SAVE_PATH, f"{uid}.pt")
    if os.path.exists(out_file):
        n_existing += 1
        continue

    # Both media files are required; rows without them are counted and skipped.
    video_path = os.path.join(VIDEO_DIR, f"{uid}.mp4")
    audio_path = os.path.join(AUDIO_DIR, f"{uid}.wav")
    if not (os.path.exists(video_path) and os.path.exists(audio_path)):
        n_missing += 1
        continue

    # ========== Preprocessing ==========
    # Shape notes are carried over from the original author -- TODO confirm:
    #   text [1, L], vision [1, 15, 3, T, 224, 224], audio [1, 3, 1, 128, 204]
    try:
        inputs = {
            ModalityType.TEXT:   data.load_and_transform_text([str(row["Utterance"])], device),
            ModalityType.VISION: data.load_and_transform_video_data([video_path], device),
            ModalityType.AUDIO:  data.load_and_transform_audio_data([audio_path], device),
        }
    except Exception as e:
        print(f"[WARN] preprocess failed on {uid}: {e}")
        n_failed += 1
        continue

    # ========== Single forward pass (no chunking) ==========
    try:
        with torch.no_grad(), torch.autocast(device_type="cuda", enabled=use_amp):
            emb = model(inputs)
    except RuntimeError as err:
        # Only label genuine out-of-memory failures as OOM; anything else is a generic error.
        is_oom = "out of memory" in str(err).lower()
        print(f"[{'OOM' if is_oom else 'ERR'}] {uid}: {err}")
        # Original author's hint for OOM: lower the UniformTemporalSubsample frame
        # count below 2, or drop autocast.
        n_failed += 1
        del inputs
        gc.collect()
        if is_oom:
            # Hand cached blocks back so the next sample starts from a clean allocator.
            torch.cuda.empty_cache()
        continue

    # Move results to the CPU and store them as float32 so the saved dtype does
    # not depend on whether autocast was active.
    # Expected shapes per the original author's notes: [1, 1024] each -- TODO confirm.
    text_emb  = emb[ModalityType.TEXT].detach().cpu().float()
    audio_emb = emb[ModalityType.AUDIO].detach().cpu().float()
    video_raw = emb[ModalityType.VISION].detach().cpu().float()
    if video_raw.dim() == 3:   # e.g. [1, N, D] -> [N, D]
        video_raw = video_raw.squeeze(0)
    # Average over the leading axis so the video embedding is a single row.
    video_emb = video_raw.mean(dim=0, keepdim=True)

    # ========== Save ==========
    # Write to a temporary name first and rename afterwards: an interrupted run then
    # never leaves a truncated .pt behind that the resume check above would skip.
    tmp_file = out_file + ".tmp"
    torch.save({
        "id": uid,
        "text_emb":  text_emb,
        "audio_emb": audio_emb,
        "video_emb": video_emb,
        "meta": {
            "Speaker":   row["Speaker"],
            "Emotion":   row["Emotion"],
            "Sentiment": row["Sentiment"],
            "Season":    row["Season"],
            "Episode":   row["Episode"],
            "StartTime": row["StartTime"],
            "EndTime":   row["EndTime"],
        }
    }, tmp_file)
    os.replace(tmp_file, out_file)
    n_saved += 1

    # Drop per-sample tensors before the next iteration to keep memory flat.
    del inputs, emb, text_emb, audio_emb, video_emb, video_raw
    torch.cuda.empty_cache()
    gc.collect()

print(f" Done: saved={n_saved} already_present={n_existing} "
      f"missing_media={n_missing} failed={n_failed}")

  1%|          | 21/2610 [01:16<2:05:09,  2.90s/it]