In [1]:
import os
import re 
import json
import warnings

import cv2
import torch
import librosa
import numpy as np
import pandas as pd
from tqdm import tqdm
from io import BytesIO
from pydub import AudioSegment
from urllib.request import urlopen
from transformers import Qwen2AudioForConditionalGeneration, AutoProcessor

os.environ["CUDA_VISIBLE_DEVICES"] = "0"
warnings.filterwarnings("ignore")

2025-03-13 16:36:21.634649: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-03-13 16:36:21.642421: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-03-13 16:36:21.651569: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-03-13 16:36:21.654330: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-03-13 16:36:21.661448: I tensorflow/core/platform/cpu_feature_guar

In [2]:
data_path = '/shareds/IEMOCAP/IEMOCAP_full_release/'
save_path = '/shareds/IEMOCAP/IEMOCAP_audio'

In [3]:
emotion2short = {'Neutral': 'neu', 
                 'Excited': 'exc', 
                 'Frustration': 'fru', 
                 'Sadness': 'sad', 
                 'Anger': 'ang', 
                 'Happiness': 'hap' 
                }

# Extract audio

In [4]:
def get_emotion(label):
    emotions = dict()
    for row in ''.join(label).split('\n\n')[1:-1]:
        row = row.split('\n')
        head = row[0].split('\t')
        name = head[1]
        emo = head[2]
        if emo not in list(emotion2short.values()):
            answers = ''
            for e in row[1:]:
                if e[0] == 'C':
                    answers += e.split('\t')[1] + ' '
            answers = answers.split('; ')
            answers = [e for e in answers if e in emotion2short]
            if answers:
                emo = emotion2short[max(set(answers), key=answers.count)]
            else:
                emo = np.nan
        emotions[name] = emo
    return emotions

In [5]:
label_pattern = r'^(\S+)\s+\[([0-9]+\.[0-9]+)-([0-9]+\.[0-9]+)\]:\s*(.*)$'

os.makedirs(save_path, exist_ok=True)

markup = pd.DataFrame(columns=['session', 'fn', 'idx', 'sex', 'emotion', 'text'])
for i in tqdm(range(1, 6)):
    os.makedirs(f'{save_path}/Session{i}', exist_ok=True)
    for fn in os.listdir(f'{data_path}Session{i}/dialog/transcriptions/'):
        if fn[0] == '.':
            continue
        os.makedirs(f'{save_path}/Session{i}/{fn[:-4]}', exist_ok=True)
        
        with open(f'{data_path}/Session{i}/dialog/transcriptions/{fn}', 'r') as f:
            labels = f.readlines()

        with open(f'{data_path}/Session{i}/dialog/EmoEvaluation/{fn}', 'r') as f:
            emotions = f.readlines()
            emotions = get_emotion(emotions)
            
        for j, label in enumerate(labels):
            match = re.match(label_pattern, label)
            if match is None:
                # print(f"Invalid label: {label}")
                continue
            row_name, start_time, end_time, text = [match.group(i) for i in range(1, 5)]

            audio_path = f'{data_path}/Session{i}/dialog/wav/{fn[:-4]}.wav'
            if not os.path.isfile(audio_path):
                print(f"No such file or directory: {audio_path}")
                continue
                
            start_sec = float(start_time) + 2 / 100
            end_sec = float(end_time) + 2 / 100
        
            audio = AudioSegment.from_wav(audio_path)
        
            start_ms = int(start_sec * 1000)
            end_ms = int(end_sec * 1000)
        
            clip = audio[start_ms:end_ms]
            audio_save_path = f'{save_path}/Session{i}/{fn[:-4]}/{j}.wav'
            if label.split(' ')[0] in emotions:
                clip.export(audio_save_path, format="wav")
                markup.loc[markup.shape[0]] = [i, fn[:-4], j, row_name[-4], emotions[label.split(' ')[0]], text]
markup = markup.dropna()
markup.to_csv(f'{save_path}/markup.csv', index=False)

100%|█████████████████████████████████████████████| 5/5 [00:36<00:00,  7.36s/it]


# Get description

In [6]:
processor = AutoProcessor.from_pretrained("Qwen/Qwen2-Audio-7B-Instruct")
model = Qwen2AudioForConditionalGeneration.from_pretrained("Qwen/Qwen2-Audio-7B-Instruct", device_map="auto", load_in_4bit=True)
_ = model.eval()

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
The model weights are not tied. Please use the `tie_weights` method before using the `infer_auto_device` function.


Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

In [7]:
data = pd.read_csv(f'{save_path}/markup.csv')
data.head()

Unnamed: 0,session,fn,idx,sex,emotion,text
0,1,Ses01F_impro07,0,M,exc,Did you get the letter?
1,1,Ses01F_impro07,1,F,exc,"Yes. There's a big envelope it says, you're i..."
2,1,Ses01F_impro07,2,M,exc,Yeah. That is so awesome.
3,1,Ses01F_impro07,4,M,exc,Oh my God. What are you going to do? [LAUGHTER]
4,1,Ses01F_impro07,5,F,exc,So I have to move back to the ghetto but...I k...


In [8]:
batch_size = 8
max_new_tokens = 256    
audio_desc = ["Nan"] * len(data)  

num_rows = len(data)
for start_idx in tqdm(range(0, num_rows, batch_size)):
    end_idx = min(start_idx + batch_size, num_rows)
    batch = data.iloc[start_idx:end_idx]

    all_texts = []
    all_audios = []
    valid_indices = []  

    for i, row in batch.iterrows():
        sess, file, local_idx, sex, emotion, _ = row
        audio_path = f"{save_path}/Session{sess}/{file}/{local_idx}.wav"

        conversation = [
            {
                "role": "system",
                "content": (
                    "You are a helpful assistant that provides a thorough analysis of the speaker’s voice. "
                    "Focus on describing their tone, intonation, pitch, volume, pace, emotional nuances, "
                    "and any distinguishing characteristics. Provide a detailed multi-sentence description "
                    "rather than a single-word or single-sentence answer. Avoid guessing personal data like "
                    "name or exact age, and focus instead on audible cues and impressions from the voice."
                )
            },
            {
                "role": "user",
                "content": [
                    {
                        "type": "audio",
                        "audio_url": "file://" + audio_path
                    },
                    {
                        "type": "text",
                        "text": (
                            "Please describe in detail what the speaker’s voice sounds like. "
                            "Comment on pitch, speed, intonation, emotional tone, and any other notable traits. "
                            "Use at least three sentences."
                        )
                    }
                ]
            }
        ]
        text = processor.apply_chat_template(
            conversation,
            add_generation_prompt=True,
            tokenize=False
        )

        try:
            au, sr = librosa.load(
                audio_path,
                sr=processor.feature_extractor.sampling_rate
            )
            all_texts.append(text)
            all_audios.append(au)
            valid_indices.append(i) 
        except Exception as e:
            print(f"Error loading {audio_path}: {e}")
            # data.at[i, "audio_caption"] = "Nan"

    if not valid_indices:
        continue

    inputs = processor(
        text=all_texts,
        audios=all_audios,
        return_tensors="pt",
        padding=True,
        sampling_rate=processor.feature_extractor.sampling_rate
    ).to("cuda")

    with torch.no_grad():
        generate_ids = model.generate(**inputs, max_new_tokens=max_new_tokens)

    prompt_length = inputs.input_ids.shape[1]

    generate_ids = generate_ids[:, prompt_length:]

    responses = processor.batch_decode(
        generate_ids,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False
    )

    for idx_local, resp in zip(valid_indices, responses):
        audio_desc[idx_local] = resp

  0%|                                                  | 0/1255 [00:00<?, ?it/s]The `seen_tokens` attribute is deprecated and will be removed in v4.41. Use the `cache_position` model input instead.
100%|█████████████████████████████████████| 1255/1255 [5:31:55<00:00, 15.87s/it]


In [9]:
data['audio_caption'] = audio_desc
data.head()

Unnamed: 0,session,fn,idx,sex,emotion,text,audio_caption
0,1,Ses01F_impro07,0,M,exc,Did you get the letter?,The speaker's voice is that of an English male...
1,1,Ses01F_impro07,1,F,exc,"Yes. There's a big envelope it says, you're i...",The speaker's voice is high-pitched with a you...
2,1,Ses01F_impro07,2,M,exc,Yeah. That is so awesome.,The speaker's voice has a bright quality with ...
3,1,Ses01F_impro07,4,M,exc,Oh my God. What are you going to do? [LAUGHTER],The speaker's voice has a light and airy quali...
4,1,Ses01F_impro07,5,F,exc,So I have to move back to the ghetto but...I k...,The speaker's voice has a light and airy quali...


In [10]:
data.to_csv("../data/IEMOCAP/modified_data1303.csv", index=False)