In [1]:
import json
import base64
from datasets import load_from_disk
from openai import AsyncOpenAI
from dotenv import load_dotenv
import asyncio
import wave
import numpy as np
import io

# Read variables from a .env file into the process environment (python-dotenv).
# NOTE(review): presumably this is how the OpenAI API key reaches the client
# created below — confirm against the deployment setup.
load_dotenv()
# Module-level async client, constructed with no explicit arguments;
# process_sample() awaits chat-completion calls on it.
client = AsyncOpenAI()

In [2]:
def array_to_wav_bytes(audio_array, sample_rate):
    """
    Encode a mono audio signal as an in-memory 16-bit PCM WAV file.

    Parameters
    ----------
    audio_array : array-like
        Audio samples. ``int16`` input is written unchanged. Input of any
        other dtype is assumed to hold normalized samples in [-1, 1]: values
        are clipped to that range, multiplied by 32767 and truncated toward
        zero. NaN samples are written as silence (0).
    sample_rate : int
        Sampling rate in Hz, stored in the WAV header.

    Returns
    -------
    bytes
        The complete WAV file (header followed by the sample frames).

    Notes
    -----
    The header always declares a single channel, so multi-channel input is
    not supported; its samples would be written back-to-back as one channel.
    Integer input other than ``int16`` is not rescaled from its own range;
    it goes through the same clip-to-[-1, 1] path as float input.
    """
    # asarray avoids copying when the caller already passes an ndarray.
    samples = np.asarray(audio_array)
    if samples.dtype != np.int16:
        if np.issubdtype(samples.dtype, np.floating):
            # NaN passes through np.clip untouched and has no integer
            # representation, so casting it is undefined and raises a
            # RuntimeWarning. Map it to 0 (silence) first. +/-inf become
            # large finite values here and are clipped to +/-1 just below.
            samples = np.nan_to_num(samples)
        # Convert normalized audio to int16 (truncates toward zero).
        samples = (np.clip(samples, -1, 1) * 32767).astype(np.int16)

    with io.BytesIO() as buffer:
        # wave does not close a file object it did not open itself, so the
        # buffer is still readable once the inner block has finalized the header.
        with wave.open(buffer, 'wb') as wf:
            wf.setnchannels(1)   # mono
            wf.setsampwidth(2)   # int16 is 2 bytes
            wf.setframerate(sample_rate)
            wf.writeframes(samples.tobytes())
        return buffer.getvalue()

In [3]:
async def process_sample(sample):
    """
    Ask gpt-4o-audio-preview for an emotion caption of one dataset record.

    The record's waveform is WAV-encoded, base64-encoded and sent in a single
    user message together with a fixed instruction prompt and a short text
    block listing the record's transcript, pitch statistics, major emotion
    and speaking rate.

    Args:
        sample: Mapping providing "file", "audio" (with "array" and
            "sampling_rate"), "transcription", "pitch_mean", "pitch_std",
            "major_emotion" and "speaking_rate".

    Returns:
        A ``(file, response)`` tuple: the record's "file" value, used as its
        identifier, and the object returned by the chat completion call.
    """
    # The "file" field doubles as the record's primary key.
    sample_id = sample["file"]

    audio = sample["audio"]
    waveform, rate = audio["array"], audio["sampling_rate"]

    # WAV-encode the waveform, then base64 it so it can travel inside the request.
    wav_b64 = base64.b64encode(array_to_wav_bytes(waveform, rate)).decode('utf-8')

    # Fixed instruction describing the kind of caption wanted.
    instruction = "".join([
        "Generate a detailed and descriptive emotion caption based solely on the vocal qualities of the following audio recording. ",
        "The caption should capture nuances such as tone, inflection, and speech characteristics without referencing the scene or contextual details. ",
        "The caption should be in the form of a sentence. ",
        "Avoid simply stating an emotion word; instead, describe the vocal expression. For example, 'The voice was vehement, the tone revealing inner dissatisfaction and complaint.'",
    ])

    # Per-record metadata, one item per line.
    metadata = "\n".join([
        f"Transcript: {sample['transcription']}",
        f"Pitch: mean {sample['pitch_mean']}, std {sample['pitch_std']}",
        f"Major Emotion: {sample['major_emotion']}",
        f"Speaking Rate: {sample['speaking_rate']}",
    ])

    # One user message: instruction, metadata, then the audio itself.
    user_content = [
        {"type": "text", "text": instruction},
        {"type": "text", "text": metadata},
        {"type": "input_audio", "input_audio": {"data": wav_b64, "format": "wav"}},
    ]

    # Text-only output is requested from the audio-capable model.
    completion = await client.chat.completions.create(
        model="gpt-4o-audio-preview",
        modalities=["text"],
        messages=[{"role": "user", "content": user_content}],
    )
    return sample_id, completion

In [4]:
# Load the dataset from disk.
# "iemocap" is a relative path, so it is resolved against the notebook's
# current working directory.
ds = load_from_disk("iemocap")
# Only the "train" split is used by the cells below.
train_data = ds["train"]
# Print the first record to inspect the available fields (file, audio,
# transcription, pitch statistics, emotion labels, ...).
print(train_data[0])

{'file': 'Ses01F_impro01_F000.wav', 'audio': {'path': 'Ses01F_impro01_F000.wav', 'array': array([-0.0050354 , -0.00497437, -0.0038147 , ..., -0.00265503,
       -0.00317383, -0.00418091]), 'sampling_rate': 16000}, 'frustrated': 0.0062500000931322575, 'angry': 0.0062500000931322575, 'sad': 0.0062500000931322575, 'disgust': 0.0062500000931322575, 'excited': 0.0062500000931322575, 'fear': 0.0062500000931322575, 'neutral': 0.949999988079071, 'surprise': 0.0062500000931322575, 'happy': 0.0062500000931322575, 'EmoAct': 2.3333330154418945, 'EmoVal': 2.6666669845581055, 'EmoDom': 2.0, 'gender': 'Female', 'transcription': ' Excuse me.', 'major_emotion': 'neutral', 'speaking_rate': 5.139999866485596, 'pitch_mean': 202.79881286621094, 'pitch_std': 76.12785339355469, 'rms': 0.00788376946002245, 'relative_db': -17.938434600830078}


In [5]:
# Take the first ten records of the training split as a small subset and
# print its first record.
# NOTE(review): `first_ten_samples` is not referenced by any other cell
# visible here — confirm whether this cell is still needed.
first_ten_samples = train_data.select(range(10))
print(first_ten_samples[0])


{'file': 'Ses01F_impro01_F000.wav', 'audio': {'path': 'Ses01F_impro01_F000.wav', 'array': array([-0.0050354 , -0.00497437, -0.0038147 , ..., -0.00265503,
       -0.00317383, -0.00418091]), 'sampling_rate': 16000}, 'frustrated': 0.0062500000931322575, 'angry': 0.0062500000931322575, 'sad': 0.0062500000931322575, 'disgust': 0.0062500000931322575, 'excited': 0.0062500000931322575, 'fear': 0.0062500000931322575, 'neutral': 0.949999988079071, 'surprise': 0.0062500000931322575, 'happy': 0.0062500000931322575, 'EmoAct': 2.3333330154418945, 'EmoVal': 2.6666669845581055, 'EmoDom': 2.0, 'gender': 'Female', 'transcription': ' Excuse me.', 'major_emotion': 'neutral', 'speaking_rate': 5.139999866485596, 'pitch_mean': 202.79881286621094, 'pitch_std': 76.12785339355469, 'rms': 0.00788376946002245, 'relative_db': -17.938434600830078}


In [6]:
import pandas as pd
import math

# Process samples in batches of 500
batch_size = 500
total_samples = len(train_data)
num_batches = math.ceil(total_samples / batch_size)

results_data = []

for batch_idx in range(num_batches):
    start_idx = batch_idx * batch_size
    end_idx = min((batch_idx + 1) * batch_size, total_samples)
    
    print(f"Processing batch {batch_idx + 1}/{num_batches} (samples {start_idx} to {end_idx-1})")
    
    # Create tasks for this batch only
    batch_samples = train_data.select(range(start_idx, end_idx))
    batch_tasks = [asyncio.create_task(process_sample(sample)) for sample in batch_samples]
    
    # Process this batch
    batch_results = await asyncio.gather(*batch_tasks)
    
    # Collect results from this batch
    for custom_id, response in batch_results:
        content = response.choices[0].message.content
        results_data.append({"id": custom_id, "response": content})
    
    # Save intermediate results after each batch
    interim_df = pd.DataFrame(results_data)
    interim_df.to_csv(f"gpt4o_audio_responses_batch_{batch_idx+1}.csv", index=False)
    print(f"Batch {batch_idx+1} completed and saved")

# Convert all results to DataFrame and save to final CSV
results_df = pd.DataFrame(results_data)
results_df.to_csv("gpt4o_audio_responses.csv", index=False)
print(f"All results saved to gpt4o_audio_responses.csv")


Processing batch 1/21 (samples 0 to 499)
Batch 1 completed and saved
Processing batch 2/21 (samples 500 to 999)
Batch 2 completed and saved
Processing batch 3/21 (samples 1000 to 1499)
Batch 3 completed and saved
Processing batch 4/21 (samples 1500 to 1999)
Batch 4 completed and saved
Processing batch 5/21 (samples 2000 to 2499)
Batch 5 completed and saved
Processing batch 6/21 (samples 2500 to 2999)
Batch 6 completed and saved
Processing batch 7/21 (samples 3000 to 3499)
Batch 7 completed and saved
Processing batch 8/21 (samples 3500 to 3999)
Batch 8 completed and saved
Processing batch 9/21 (samples 4000 to 4499)
Batch 9 completed and saved
Processing batch 10/21 (samples 4500 to 4999)
Batch 10 completed and saved
Processing batch 11/21 (samples 5000 to 5499)
Batch 11 completed and saved
Processing batch 12/21 (samples 5500 to 5999)
Batch 12 completed and saved
Processing batch 13/21 (samples 6000 to 6499)
Batch 13 completed and saved
Processing batch 14/21 (samples 6500 to 6999)
Bat