In [7]:
!pip install transformers==4.40.1 soundfile ftfy regex einops scipy tqdm librosa timm opencv-python-headless ffmpeg-python py7zr



Collecting transformers==4.40.1
  Downloading transformers-4.40.1-py3-none-any.whl.metadata (137 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m138.0/138.0 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
Collecting opencv-python-headless
  Downloading opencv_python_headless-4.12.0.88-cp37-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (19 kB)
Collecting py7zr
  Downloading py7zr-1.0.0-py3-none-any.whl.metadata (17 kB)
Collecting tokenizers<0.20,>=0.19 (from transformers==4.40.1)
  Downloading tokenizers-0.19.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting numpy>=1.17 (from transformers==4.40.1)
  Downloading numpy-2.2.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (62 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.0/62.0 kB[0m [31m26.5 MB/s[0m eta [36m0:00:00[0m
Collecting texttable (from py7zr)
  Downloading texttable-1.7.0-py2.py3-none-any.whl.metadata (9.8 kB

In [4]:
pip install --upgrade pytorchvideo


[0mNote: you may need to restart the kernel to use updated packages.


In [2]:
import os
import re
import json

# === Dataset path configuration ===
root_dir = "/workspace/dataset/S1"  # contains Ses01F_impro01, Ses01F_impro02, ...
txt_dir = os.path.join(root_dir, "TXT")  # holds Ses01F_impro01.txt etc.

# === Regex for one transcript line: "<utt_id> [<start>-<end>]: <text>" ===
pattern = re.compile(r"(\S+)\s*\[(\d+\.\d+)-(\d+\.\d+)\]:\s*(.*)")

# === Output records: one dict per utterance that has both media files ===
dataset = []

# Walk the transcript files in sorted order. os.listdir() returns entries in
# arbitrary order, which would make the written JSON differ between runs.
for fname in sorted(os.listdir(txt_dir)):
    if not fname.endswith(".txt"):
        continue

    # The transcript's base name is also the name of the folder holding its segments.
    video_name = os.path.splitext(fname)[0]
    txt_path = os.path.join(txt_dir, fname)
    video_dir = os.path.join(root_dir, video_name)

    if not os.path.exists(video_dir):
        print(f"[WARN] missing video folder for {video_name}")
        continue

    with open(txt_path, "r", encoding="utf-8") as f:
        # Iterate the file lazily; lines that do not match the pattern are skipped.
        for line in f:
            line = line.strip()
            m = pattern.match(line)
            if not m:
                continue

            # The start/end timestamps are parsed by the regex but not stored.
            utt_id, _start, _end, text = m.groups()
            video_path = os.path.join(video_dir, f"{utt_id}.mp4")
            audio_path = os.path.join(video_dir, f"{utt_id}.wav")

            # Keep an utterance only when both its video and audio segment exist.
            if not (os.path.exists(video_path) and os.path.exists(audio_path)):
                print(f"[WARN] missing segment for {utt_id}")
                continue

            dataset.append({
                "utt_id": utt_id,
                "video_path": video_path,
                "audio_path": audio_path,
                "text": text.strip()
            })

print(f"Parsed {len(dataset)} utterances in total.")
os.makedirs(os.path.join(root_dir, "meta"), exist_ok=True)

out_json = os.path.join(root_dir, "meta", "train_meta.json")
with open(out_json, "w", encoding="utf-8") as f:
    # ensure_ascii=False keeps non-ASCII transcript text readable in the file.
    json.dump(dataset, f, ensure_ascii=False, indent=2)

print(f" Metadata saved to: {out_json}")


Parsed 120 utterances in total.
 Metadata saved to: /workspace/dataset/S1/meta/train_meta.json


In [5]:
!pip uninstall -y torch torchvision torchaudio pytorchvideo transformers accelerate

!pip install torch==2.1.2 torchvision==0.16.2 torchaudio==2.1.2 pytorchvideo==0.1.5




Found existing installation: torch 2.1.2
Uninstalling torch-2.1.2:
  Successfully uninstalled torch-2.1.2
Found existing installation: torchvision 0.16.2
Uninstalling torchvision-0.16.2:
  Successfully uninstalled torchvision-0.16.2
Found existing installation: torchaudio 2.1.2
Uninstalling torchaudio-2.1.2:
  Successfully uninstalled torchaudio-2.1.2
Found existing installation: pytorchvideo 0.1.5
Uninstalling pytorchvideo-0.1.5:
  Successfully uninstalled pytorchvideo-0.1.5
Found existing installation: transformers 4.57.1
Uninstalling transformers-4.57.1:
  Successfully uninstalled transformers-4.57.1
[0mCollecting torch==2.1.2
  Using cached torch-2.1.2-cp310-cp310-manylinux1_x86_64.whl.metadata (25 kB)
Collecting torchvision==0.16.2
  Using cached torchvision-0.16.2-cp310-cp310-manylinux1_x86_64.whl.metadata (6.6 kB)
Collecting torchaudio==2.1.2
  Using cached torchaudio-2.1.2-cp310-cp310-manylinux1_x86_64.whl.metadata (6.4 kB)
Collecting pytorchvideo==0.1.5
  Using cached pytorch

In [6]:
# Report which builds of the torch stack the kernel actually imports.
import torch
import torchvision
import torchaudio

print(torch.__version__, torchvision.__version__, torchaudio.__version__)

2.1.2+cu121 0.16.2+cu121 2.1.2+cu121


In [None]:
'''
 Reference: ImageBind's own definition of load_and_transform_vision_data (kept for comparison only).
def load_and_transform_vision_data(image_paths, device):
    if image_paths is None:
        return None

    image_outputs = []

    data_transform = transforms.Compose(
        [
            transforms.Resize(
                224, interpolation=transforms.InterpolationMode.BICUBIC
            ),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            transforms.Normalize(
                mean=(0.48145466, 0.4578275, 0.40821073),
                std=(0.26862954, 0.26130258, 0.27577711),
            ),
        ]
    )
    
    for image_path in image_paths:
        with open(image_path, "rb") as fopen:
            image = Image.open(fopen).convert("RGB")

        image = data_transform(image).to(device)
        image_outputs.append(image)
    return torch.stack(image_outputs, dim=0)
    
'''

In [12]:
import os
import json
import torch
from pytorchvideo.data.encoded_video import EncodedVideo
from torchvision.transforms import Compose, Lambda
from pytorchvideo.transforms import UniformTemporalSubsample
from imagebind import data
from imagebind.models import imagebind_model
from imagebind.models.imagebind_model import ModalityType
import torchvision.transforms as T
# Use the first CUDA device when one is available; otherwise fall back to CPU
# so the cell does not crash on machines without a GPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

# Load the pretrained ImageBind-Huge model and use it as a frozen feature extractor.
model = imagebind_model.imagebind_huge(pretrained=True)
model.requires_grad_(False)  # no parameter needs gradients
model.eval().to(device)

# Input metadata (JSON list of utterances) and the output folder for the .pt files.
meta_path = "/workspace/dataset/S1/meta/train_meta.json"
save_dir = "/workspace/dataset/S1/features"
os.makedirs(save_dir, exist_ok=True)


def load_and_transform_vision_data_from_tensor(video_tensor, device):
    """
    Preprocess decoded video frames the way ImageBind preprocesses still images.

    Mirrors ``imagebind.data.load_and_transform_vision_data``: bicubic resize of
    the short side to 224, centre crop to 224x224, scaling to [0, 1] and
    normalisation with the ImageBind vision mean/std.

    Args:
        video_tensor: torch.Tensor of shape [C, T, H, W] with C == 3. Accepted
            value ranges are uint8, float in 0..255 (what the decoded clip from
            ``load_video_gpu`` is expected to contain) or float in [0, 1]. A
            float tensor whose maximum exceeds 1 is treated as 0..255.
        device: device on which the preprocessing runs and the result lives.

    Returns:
        torch.Tensor of shape [T, 3, 224, 224], float32, normalised, on ``device``.

    Raises:
        ValueError: if ``video_tensor`` is not a non-empty [3, T, H, W] tensor.
    """
    if video_tensor.ndim != 4 or video_tensor.shape[0] != 3 or video_tensor.numel() == 0:
        raise ValueError(
            f"expected a non-empty [3, T, H, W] tensor, got shape {tuple(video_tensor.shape)}"
        )

    size = 224
    # Standard normalisation constants of the ImageBind vision branch,
    # shaped to broadcast over [T, 3, H, W].
    mean = torch.tensor([0.48145466, 0.4578275, 0.40821073], device=device).view(1, 3, 1, 1)
    std = torch.tensor([0.26862954, 0.26130258, 0.27577711], device=device).view(1, 3, 1, 1)

    is_uint8 = video_tensor.dtype == torch.uint8
    # [C, T, H, W] -> [T, C, H, W] so the frames form a batch of images.
    frames = video_tensor.to(device=device, dtype=torch.float32).permute(1, 0, 2, 3).contiguous()

    # Bring pixel values into [0, 1]. The previous implementation passed
    # 0..255 floats to ToPILImage, which multiplies floats by 255 before the
    # uint8 cast and therefore wrapped the values around.
    if is_uint8 or bool(frames.max() > 1.0):
        frames = frames / 255.0

    # Resize so the short side equals `size` while keeping the aspect ratio.
    height, width = frames.shape[-2:]
    if height <= width:
        new_h, new_w = size, max(size, int(size * width / height))
    else:
        new_h, new_w = max(size, int(size * height / width)), size
    frames = torch.nn.functional.interpolate(
        frames,
        size=(new_h, new_w),
        mode="bicubic",
        align_corners=False,
        antialias=True,
    )
    # Bicubic interpolation can overshoot slightly; keep values in range.
    frames = frames.clamp(0.0, 1.0)

    # Centre crop to size x size.
    top = (new_h - size) // 2
    left = (new_w - size) // 2
    frames = frames[:, :, top:top + size, left:left + size]

    return (frames - mean) / std


def load_video_gpu(video_path, num_frames=16, max_duration=15.0):
    """
    Decode the start of a video with PyTorchVideo and return exactly
    ``num_frames`` frames, without writing intermediate JPEG files.

    Note: despite the name, this function never moves data to a GPU; the
    returned tensor stays wherever the decoder produced it (presumably CPU).

    Args:
        video_path: path of the video file to decode.
        num_frames: number of frames in the returned tensor.
        max_duration: only the first ``max_duration`` seconds are decoded.

    Returns:
        Tensor of shape (C, num_frames, H, W). Longer clips are uniformly
        subsampled in time; shorter clips are tiled along the time axis and
        truncated to ``num_frames``.

    Raises:
        ValueError: if no video frames could be decoded from ``video_path``.
    """
    video = EncodedVideo.from_path(video_path)
    try:
        # Truncate long videos to the first `max_duration` seconds.
        end_sec = min(video.duration, max_duration)
        clip = video.get_clip(start_sec=0, end_sec=end_sec)
    finally:
        # Release the decoder container; this function is called once per
        # utterance, so unclosed containers would accumulate.
        video.close()

    video_data = clip["video"]  # (C, T, H, W)
    if video_data is None or video_data.shape[1] == 0:
        raise ValueError(f"no video frames decoded from {video_path}")

    # A descriptive name avoids shadowing the module-level alias `T`
    # (torchvision.transforms).
    total_frames = video_data.shape[1]
    if total_frames > num_frames:
        sampler = UniformTemporalSubsample(num_frames)
        video_data = sampler(video_data)
    elif total_frames < num_frames:
        # Tile the clip along time until it is long enough, then cut to size.
        repeat = (num_frames // total_frames) + 1
        video_data = video_data.repeat(1, repeat, 1, 1)[:, :num_frames]

    return video_data  # (3, num_frames, H, W)


def extract_embeddings(meta_file):
    """
    Compute ImageBind text/audio/vision embeddings for every utterance listed
    in ``meta_file`` and save them as one ``.pt`` file per source video folder.

    Relies on the module-level ``model``, ``device`` and ``save_dir``.

    Args:
        meta_file: path to a JSON file holding a list of dicts with the keys
            "utt_id", "text", "audio_path" and "video_path".

    Side effects:
        Writes ``<save_dir>/<folder>.pt`` for each folder, containing a dict
        ``utt_id -> {"text_emb", "audio_emb", "vision_emb"}`` of CPU tensors,
        and prints progress messages.
    """
    with open(meta_file, "r", encoding="utf-8") as f:
        items = json.load(f)

    # Group utterances by the folder that holds their video segment
    # (e.g. Ses01F_impro01). os.path is used so the key does not depend on
    # which separator the stored paths use.
    grouped = {}
    for item in items:
        vid = os.path.basename(os.path.dirname(item["video_path"]))
        grouped.setdefault(vid, []).append(item)

    for vid, utterances in grouped.items():
        print(f" Processing {vid} ...")
        all_embeddings = {}

        for item in utterances:
            utt_id = item["utt_id"]
            text = item["text"]
            audio_path = item["audio_path"]
            video_path = item["video_path"]

            # Decode 16 frames and preprocess them into a batch of images.
            video_tensor = load_video_gpu(video_path, num_frames=16)
            vision_input = load_and_transform_vision_data_from_tensor(video_tensor, device)

            # Text and audio go through ImageBind's own loaders.
            text_input = data.load_and_transform_text([text], device)
            audio_input = data.load_and_transform_audio_data([audio_path], device)

            inputs = {
                ModalityType.TEXT: text_input,
                ModalityType.AUDIO: audio_input,
                ModalityType.VISION: vision_input,
            }

            with torch.no_grad():
                emb = model(inputs)

            all_embeddings[utt_id] = {
                "text_emb": emb[ModalityType.TEXT].cpu(),
                "audio_emb": emb[ModalityType.AUDIO].cpu(),
                # Mean-pool the per-frame embeddings into a single row.
                "vision_emb": emb[ModalityType.VISION].mean(dim=0, keepdim=True).cpu(),
            }

        # Save all embeddings of the current video folder in one file.
        save_path = os.path.join(save_dir, f"{vid}.pt")
        torch.save(all_embeddings, save_path)
        print(f" Saved {save_path} ({len(all_embeddings)} utterances)")

    print(" All videos processed!")


# Entry point: run the extraction over the metadata file named by `meta_path`.
if __name__ == "__main__":
    extract_embeddings(meta_path)


 Processing Ses01F_impro03 ...




 Saved /workspace/dataset/S1/features/Ses01F_impro03.pt (52 utterances)
 Processing Ses01F_impro01 ...




 Saved /workspace/dataset/S1/features/Ses01F_impro01.pt (30 utterances)
 Processing Ses01F_impro02 ...




 Saved /workspace/dataset/S1/features/Ses01F_impro02.pt (38 utterances)
 All videos processed!
