In [1]:
import os
import re
import json
import copy
import requests
import warnings

import cv2
import torch
import numpy as np
import pandas as pd
from tqdm import tqdm
from PIL import Image
from moviepy.editor import VideoFileClip
from transformers import Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoProcessor
%matplotlib inline

os.environ["CUDA_VISIBLE_DEVICES"] = "1"
warnings.filterwarnings("ignore")

2025-03-14 12:06:43.388642: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-03-14 12:06:43.395711: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-03-14 12:06:43.404699: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-03-14 12:06:43.407463: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-03-14 12:06:43.414249: I tensorflow/core/platform/cpu_feature_guar

In [2]:
data_path = '/shareds/IEMOCAP/IEMOCAP_full_release/'
save_path = '/shareds/IEMOCAP/IEMOCAP_video'

In [3]:
emotion2short = {'Neutral': 'neu', 
                 'Excited': 'exc', 
                 'Frustration': 'fru', 
                 'Sadness': 'sad', 
                 'Anger': 'ang', 
                 'Happiness': 'hap' 
                }

# Extract video

In [4]:
def clip_video(video_path, start_time, end_time, output_path):
    video = VideoFileClip(video_path)
    start_sec = float(start_time) + 2 / 100
    end_sec = float(end_time) + 2 / 100
    video_subclip = video.subclip(start_sec, min(video.duration, end_sec))
    video_subclip.write_videofile(output_path, codec="libx264", audio_codec="aac", verbose=False, logger=None)


def get_emotion(label):
    emotions = dict()
    for row in ''.join(label).split('\n\n')[1:-1]:
        row = row.split('\n')
        head = row[0].split('\t')
        name = head[1]
        emo = head[2]
        if emo not in list(emotion2short.values()):
            answers = ''
            for e in row[1:]:
                if e[0] == 'C':
                    answers += e.split('\t')[1] + ' '
            answers = answers.split('; ')
            answers = [e for e in answers if e in emotion2short]
            if answers:
                emo = emotion2short[max(set(answers), key=answers.count)]
            else:
                emo = np.nan
        emotions[name] = emo
    return emotions

In [5]:
label_pattern = r'^(\S+)\s+\[([0-9]+\.[0-9]+)-([0-9]+\.[0-9]+)\]:\s*(.*)$'

os.makedirs(save_path, exist_ok=True)

markup = pd.DataFrame(columns=['session', 'fn', 'idx', 'sex', 'emotion', 'text'])
for i in tqdm(range(1, 6)):
    os.makedirs(f'{save_path}/Session{i}', exist_ok=True)
    for fn in os.listdir(f'{data_path}Session{i}/dialog/transcriptions/'):
        if fn[0] == '.':
            continue
        os.makedirs(f'{save_path}/Session{i}/{fn[:-4]}', exist_ok=True)
        
        with open(f'{data_path}/Session{i}/dialog/transcriptions/{fn}', 'r') as f:
            labels = f.readlines()

        with open(f'{data_path}/Session{i}/dialog/EmoEvaluation/{fn}', 'r') as f:
            emotions = f.readlines()
            emotions = get_emotion(emotions)

        for j, label in enumerate(labels):
            match = re.match(label_pattern, label)
            if match is None:
                # print(f"Invalid label: {label}")
                continue
            row_name, start_time, end_time, text = [match.group(i) for i in range(1, 5)]

            video_path = f'{data_path}Session{i}/dialog/avi/DivX/{fn[:-4]}.avi'
            if not os.path.isfile(video_path):
                print(f"No such file or directory: {video_path}")
                continue
        
            video_save_path = f'{save_path}/Session{i}/{fn[:-4]}/{j}.avi'   

            if label.split(' ')[0] in emotions:
                clip_video(video_path, start_time, end_time, video_save_path)
                markup.loc[markup.shape[0]] = [i, fn[:-4], j, row_name[-4], emotions[label.split(' ')[0]], text]
markup = markup.dropna()
markup.to_csv(f'{save_path}/markup.csv', index=False)

100%|██████████████████████████████████████████| 5/5 [1:08:14<00:00, 818.98s/it]


# Get description

In [5]:
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct")
processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct")
model = Qwen2_5_VLForConditionalGeneration.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct", device_map="auto", load_in_4bit=True)
_ = model.eval()

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

In [6]:
data = pd.read_csv(f'{save_path}/markup.csv')
data.head()

Unnamed: 0,session,fn,idx,sex,emotion,text
0,1,Ses01F_impro07,0,M,exc,Did you get the letter?
1,1,Ses01F_impro07,1,F,exc,"Yes. There's a big envelope it says, you're i..."
2,1,Ses01F_impro07,2,M,exc,Yeah. That is so awesome.
3,1,Ses01F_impro07,4,M,exc,Oh my God. What are you going to do? [LAUGHTER]
4,1,Ses01F_impro07,5,F,exc,So I have to move back to the ghetto but...I k...


In [7]:
def extract_first_frame(video_path, resize_shape=(224, 224)):
    cap = cv2.VideoCapture(video_path)
    ret, frame = cap.read()
    cap.release()
    if not ret:
        return None
    
    frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    image = Image.fromarray(frame)

    if resize_shape:
        image = image.resize(resize_shape, Image.Resampling.LANCZOS)
    return image

In [None]:
batch_size = 8                    
max_new_tokens = 256  
video_desc = ["Nan"] * len(data) 

num_rows = len(data)
for start_idx in tqdm(range(0, num_rows, batch_size), total=(num_rows // batch_size + 1)):
    end_idx = min(start_idx + batch_size, num_rows)
    
    batch = data.iloc[start_idx:end_idx]

    batch_images = []
    batch_texts = []
    
    for i, row in batch.iterrows():
        sess, file, local_idx, emotion, *rest = row
        video_path = f"{save_path}/Session{sess}/{file}/{local_idx}.avi"
        
        image = extract_first_frame(video_path)
        if image is None:
            batch_images.append(None)
            batch_texts.append(None)
            continue
        
        conversation = [
            {
                "role": "system",
                "content": (
                    "You are a helpful assistant that provides a thorough analysis of the image. "
                    "Focus on describing what is visible, including objects, background, color palette, "
                    "composition, mood, and any context clues. "
                    "Provide a detailed multi-sentence description rather than a single-sentence answer. "
                    "Focus on what can be directly observed."
                )
            },
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": image},
                    {"type": "text", "text": (
                        "Please describe in detail what you see in this image. "
                        "Comment on the objects, the overall context, and any visual characteristics. "
                        "Use at least three sentences."
                    )}
                ]
            }
        ]
        
        text_for_model = processor.apply_chat_template(
            conversation,
            add_generation_prompt=True,
            tokenize=False
        )
        
        batch_images.append(image)
        batch_texts.append(text_for_model)
    
    valid_indices = [idx for idx, img in enumerate(batch_images) if img is not None]
    if len(valid_indices) != 0:

        all_images = [batch_images[i] for i in valid_indices]
        all_texts  = [batch_texts[i]  for i in valid_indices]
    
        inputs = processor(
            text=all_texts,
            images=all_images,
            return_tensors="pt",
            padding=True
        ).to(model.device)
    
        with torch.no_grad():
            generate_ids = model.generate(**inputs, max_new_tokens=max_new_tokens)
        
        prompt_length = inputs.input_ids.shape[1]
        generate_ids = generate_ids[:, prompt_length:]
        
        responses = processor.batch_decode(
            generate_ids,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False
        )
    
        for idx_local, resp in zip(valid_indices, responses):
            video_desc[idx_local] = resp

 35%|███▍      | 437/1253 [4:15:35<8:03:43, 35.57s/it] 

In [None]:
data["video_caption"] = video_desc
data.head()

In [None]:
data.to_csv("../data/IEMOCAP/modified_video_data.csv", index=False)