In [1]:
import os
import json
import base64
import asyncio
from openai import AsyncOpenAI, OpenAIError
import time # For retry delays
from pydantic import BaseModel, Field, ValidationError
from dotenv import load_dotenv

# For Hugging Face datasets
from datasets import load_from_disk # ADDED
import numpy as np # Often used with audio arrays
import soundfile as sf # For writing audio array to WAV in memory - ADDED
import io # For in-memory buffer - ADDED

# Apply nest_asyncio
try:
    import nest_asyncio
    nest_asyncio.apply()
    print("nest_asyncio applied.")
except ImportError:
    print("Warning: nest_asyncio not found. If running in a Jupyter-like environment, you might need it (`pip install nest_asyncio`).")
except RuntimeError as e:
    print(f"nest_asyncio note: {e}")

print("Libraries imported.")
print("Ensure you have 'datasets', 'soundfile', 'numpy' installed: pip install datasets soundfile numpy")

  from .autonotebook import tqdm as notebook_tqdm


nest_asyncio applied.
Libraries imported.
Ensure you have 'datasets', 'soundfile', 'numpy' installed: pip install datasets soundfile numpy


In [2]:
# --- Essential Configuration ---
IEMOCAP_ROOT_DIR = "../iemocap/"

# --- Input/Output Files and Model ---
INPUT_JSON_FILE = "mbt_test_generations.json" # Your input file (generated by previous scripts)
OUTPUT_RESULTS_FILE = f"{INPUT_JSON_FILE}_audio_results.json" # Where detailed results will be saved
OPENAI_MODEL = "gpt-4o-audio-preview"  # Recommended model. Or "gpt-4o-audio-preview" if you have specific reasons/access.

# --- Test Mode ---
# Set to True to process only the first valid item for testing.
# Set to False to process all items in the input file.
TEST_SINGLE_ITEM = False

# --- Load Environment Variables (for OPENAI_API_KEY) ---
load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# --- Initial Checks & Print Configuration ---
if not OPENAI_API_KEY:
    print("ðŸ”´ WARNING: OPENAI_API_KEY not found in environment variables or .env file.")
    print("   Please ensure it's set for the script to run.")
else:
    # To avoid printing the full key, just confirm it's loaded
    print(f"ðŸŸ¢ OpenAI API key loaded (first 5 chars): {OPENAI_API_KEY[:5]}...")

print("\nðŸ“‹ Configuration Loaded:")
print(f"  IEMOCAP Root Dir: {IEMOCAP_ROOT_DIR}")
print(f"  Input JSON File:  {INPUT_JSON_FILE}")
print(f"  Output JSON File: {OUTPUT_RESULTS_FILE}")
print(f"  OpenAI Model:     {OPENAI_MODEL}")
print(f"  Test Single Item: {TEST_SINGLE_ITEM}")



ðŸŸ¢ OpenAI API key loaded (first 5 chars): sk-pr...

ðŸ“‹ Configuration Loaded:
  IEMOCAP Root Dir: ../iemocap/
  Input JSON File:  mbt_test_generations.json
  Output JSON File: mbt_test_generations.json_audio_results.json
  OpenAI Model:     gpt-4o-audio-preview
  Test Single Item: False


In [3]:
class EvaluationResponse(BaseModel):
    justification: str = Field(..., description="Brief justification for the score.")
    is_match: bool = Field(..., description="True if the generated caption is roughly similar at all to the audio content, False otherwise.")

In [4]:
def encode_audio_array_to_base64(audio_array: np.ndarray, sampling_rate: int):
    """
    Encodes a NumPy audio array to a base64 WAV string.
    """
    try:
        # Ensure audio_array is a NumPy array
        if not isinstance(audio_array, np.ndarray):
            audio_array = np.array(audio_array)

        # Normalize to float32 if it's not, as soundfile prefers it for PCM_16 subtype if input is float
        # Or ensure it's in a suitable integer range if directly writing int16
        if audio_array.dtype != np.float32:
             # Assuming it might be int16 or similar, scale to -1 to 1 for float32 conversion
            if np.issubdtype(audio_array.dtype, np.integer):
                max_val = np.iinfo(audio_array.dtype).max
                audio_array = audio_array.astype(np.float32) / max_val
            else: # If some other float type, just convert
                audio_array = audio_array.astype(np.float32)


        buffer = io.BytesIO()
        # Writing as 16-bit PCM WAV, common format
        sf.write(buffer, audio_array, sampling_rate, format='WAV', subtype='PCM_16')
        buffer.seek(0) # Reset buffer's position to the beginning
        wav_data = buffer.read()
        return base64.b64encode(wav_data).decode('utf-8')
    except Exception as e:
        print(f"Error encoding audio array to base64: {e}")
        import traceback
        traceback.print_exc() # Print full traceback for debugging
        return None

print("Helper function 'encode_audio_array_to_base64' defined.")

Helper function 'encode_audio_array_to_base64' defined.


In [5]:
async def evaluate_item_with_openai(
    client: AsyncOpenAI,
    item_index: int,
    wav_filename_from_json: str, 
    audio_array: np.ndarray,
    sampling_rate: int,
    generated_caption: str,
    model_name: str,
    max_retries: int = 3,
    initial_delay: int = 5
):
    if not isinstance(generated_caption, str):
        generated_caption = str(generated_caption) if generated_caption is not None else ""
    if not generated_caption.strip():
        return {
            "item_index": item_index, "wav_filename": wav_filename_from_json,
            "generated_caption": generated_caption, "is_match": None, "justification": None,
            "error": "Generated caption was empty or whitespace.", "raw_openai_response": None
        }

    encoded_audio = encode_audio_array_to_base64(audio_array, sampling_rate)
    if not encoded_audio:
        return {
            "item_index": item_index, "wav_filename": wav_filename_from_json,
            "generated_caption": generated_caption, "is_match": None, "justification": None,
            "error": f"Failed to encode audio data for {wav_filename_from_json}", "raw_openai_response": None
        }
        

    system_prompt_text = (
        "You are an AI assistant with a single, specific task: to evaluate an **emotion caption** against an **audio recording**. "
        "You will be given an audio recording and a text caption. Your ONLY goal is to determine if the emotion described in the caption accurately reflects the emotional content conveyed in the audio. "
        "Consider vocal tone, inflection, prosody, and other audible emotional cues in the audio. The caption is about the *emotion* expressed, not a literal transcription of words. "
        "**You are NOT to transcribe the audio. You are NOT to identify speakers or analyze speaker identity. You are NOT to perform any task other than this emotional content evaluation of the caption against the audio.** "
        "Your entire response MUST be a single, valid JSON object enclosed in ```json ... ``` marks, providing a brief justification for your emotional assessment and a boolean 'is_match' field. "
        "Strictly adhere to the JSON format requested by the user."
    )
    user_prompt_text_template = (
        "Here is an audio recording and a 'Generated Emotion Caption'. "
        "Please evaluate if this 'Generated Emotion Caption' accurately describes the **emotional characteristics** conveyed in the audio recording. "
        "Focus SOLELY on the emotional alignment between the caption and the audio. Do not attempt to identify the speaker or transcribe the audio content. "
        "Provide a brief justification for your decision regarding the emotional match, and then state if it's a match (true/false).\n\n"
        "Generated Emotion Caption: \"{caption}\"\n\n"
        "Respond ONLY with a single valid JSON object enclosed in triple backticks (```json ... ```). "
        "Do not include any explanatory text before or after this JSON block. The JSON object must conform to the following structure:\n"
        "```json\n"
        "{{\n"
        "    \"justification\": \"<Your brief justification regarding the emotional accuracy of the caption for the audio>\",\n"
        "    \"is_match\": <true_if_caption_accurately_describes_audio_emotion_else_false>\n"
        "}}\n"
        "```"
    )
    user_prompt_text = user_prompt_text_template.format(caption=generated_caption)
    
    messages = [
        {"role": "system", "content": system_prompt_text},
        {
            "role": "user",
            "content": [
                {"type": "input_audio", "input_audio": {"data": encoded_audio, "format": "wav"}},
                {"type": "text", "text": user_prompt_text}
            ]
        }
    ]

    current_retry = 0
    delay = initial_delay
    last_error_message = None
    raw_response_content = None

    while current_retry <= max_retries:
        try:
            api_params = {
                "model": model_name,
                "messages": messages,
                # "response_format": {"type": "json_object"}, # REMOVED for manual JSON mode
                "temperature": 0.1,
            }
            if "gpt-4o" in model_name.lower() or "audio-preview" in model_name.lower():
                 api_params["modalities"] = ["text"]

            completion = await client.chat.completions.create(**api_params)
            raw_response_content = completion.choices[0].message.content
            
            # Manual cleaning for JSON, as model might still wrap output or add text
            cleaned_response_content = raw_response_content.strip()
            if cleaned_response_content.startswith("```json"):
                cleaned_response_content = cleaned_response_content[len("```json"):].strip()
            if cleaned_response_content.endswith("```"):
                cleaned_response_content = cleaned_response_content[:-len("```")].strip()
            
            # Additional cleaning: sometimes the model might still add "Sure, here is the JSON:"
            # This is a simple attempt; more robust regex might be needed if issues persist.
            if "}\n" in cleaned_response_content : # check if json ends prematurely and there is more text
                cleaned_response_content = cleaned_response_content.split("}\n")[0] + "}"


            try:
                parsed_json = json.loads(cleaned_response_content)
                eval_response = EvaluationResponse(**parsed_json)
                return {
                    "item_index": item_index, "wav_filename": wav_filename_from_json,
                    "generated_caption": generated_caption, 
                    "is_match": eval_response.is_match, 
                    "justification": eval_response.justification,
                    "error": None, 
                    "raw_openai_response": raw_response_content 
                }
            except json.JSONDecodeError as e_json:
                last_error_message = f"Response not valid JSON (Attempt {current_retry+1}): {e_json}. Cleaned Response: '{cleaned_response_content}'. Raw: '{raw_response_content}'"
            except ValidationError as e_val:
                last_error_message = f"Response JSON did not match schema (Attempt {current_retry+1}): {e_val}. Parsed JSON: '{parsed_json if 'parsed_json' in locals() else 'Error before parsing to dict'}'"
            except Exception as e_parse:
                last_error_message = f"Unexpected error parsing OpenAI response (Attempt {current_retry+1}): {e_parse}. Cleaned Response: '{cleaned_response_content}'"
            
            print(f"  Item {item_index} (Parse Error, Attempt {current_retry+1}): {last_error_message}")

        except OpenAIError as e:
            last_error_message = f"OpenAI API Error (Attempt {current_retry+1}): {type(e).__name__} - {e}"
            print(f"  Item {item_index} ({type(e).__name__} on API call, Attempt {current_retry+1}): {e}")
            if isinstance(e, OpenAIError) and hasattr(e, 'status_code') and e.status_code in [429, 500, 503, 504] or isinstance(e, asyncio.TimeoutError):
                if current_retry == max_retries: break
            else: break 
        except Exception as e:
            last_error_message = f"Unexpected Error during API call (Attempt {current_retry+1}): {type(e).__name__} - {e}"
            print(f"  Item {item_index} (Unexpected Error on API call, Attempt {current_retry+1}): {e}")
            break
        
        current_retry += 1
        if current_retry <= max_retries:
             print(f"    Retrying item {item_index} in {delay}s...")
             await asyncio.sleep(delay)
             delay *= 2
        else: break

    return {
        "item_index": item_index, "wav_filename": wav_filename_from_json,
        "generated_caption": generated_caption, "is_match": None, "justification": None,
        "error": last_error_message or "Max retries exceeded or an unrecoverable error occurred.",
        "raw_openai_response": raw_response_content
    }

print("Core OpenAI evaluation function 'evaluate_item_with_openai' (using Manual JSON Mode) defined.")

Core OpenAI evaluation function 'evaluate_item_with_openai' (using Manual JSON Mode) defined.


In [6]:
from datasets import load_from_disk # Ensure this is imported if Cell 1 was not re-run

async def run_main_evaluations(
    iemocap_root_dir_param, 
    input_json_file_param, 
    output_json_file_param, 
    openai_model_param, 
    test_single_item_param
):
    if not OPENAI_API_KEY:
        print("ðŸ”´ FATAL ERROR: OPENAI_API_KEY is not set. Cannot proceed.")
        return

    hf_dataset = None
    filename_to_hf_idx_map = {}
    try:
        print(f"\nLoading Hugging Face dataset from disk: {iemocap_root_dir_param}...")
        loaded_object = load_from_disk(iemocap_root_dir_param)
        if hasattr(loaded_object, 'keys'): 
            if 'train' in loaded_object:
                hf_dataset = loaded_object['train']
                print(f"Loaded 'train' split. Items: {len(hf_dataset)}")
            elif loaded_object:
                first_split_key = list(loaded_object.keys())[0]
                hf_dataset = loaded_object[first_split_key]
                print(f"Loaded '{first_split_key}' split. Items: {len(hf_dataset)}")
            else: raise ValueError("Loaded DatasetDict is empty.")
        else: 
            hf_dataset = loaded_object
            print(f"Loaded single Hugging Face Dataset. Items: {len(hf_dataset)}")

        for idx, hf_item in enumerate(hf_dataset):
            hf_item_file_field = hf_item.get("file")
            if hf_item_file_field:
                basename = os.path.basename(str(hf_item_file_field)) 
                filename_to_hf_idx_map[basename] = idx
        print(f"Created lookup map for {len(filename_to_hf_idx_map)} HF dataset items.")
        if not filename_to_hf_idx_map: print("ðŸ”´ Warning: HF dataset lookup map is empty.")
    except Exception as e:
        print(f"ðŸ”´ FATAL ERROR loading/processing HF dataset from '{iemocap_root_dir_param}': {e}")
        import traceback
        traceback.print_exc()
        return
        
    print(f"\nLoading generated captions from '{input_json_file_param}'...")
    if not os.path.exists(input_json_file_param):
        print(f"ðŸ”´ FATAL ERROR: Input JSON file not found: {input_json_file_param}")
        return
    try:
        with open(input_json_file_param, 'r', encoding='utf-8') as f:
            input_captions_data = json.load(f)
        if not isinstance(input_captions_data, list):
            print(f"ðŸ”´ FATAL ERROR: Input JSON '{input_json_file_param}' must be a list.")
            return
        print(f"Loaded {len(input_captions_data)} items from '{input_json_file_param}'.")
    except Exception as e:
        print(f"ðŸ”´ FATAL ERROR loading input JSON '{input_json_file_param}': {e}")
        return
    
    if not input_captions_data: print("No data to process from input captions JSON."); return

    client = AsyncOpenAI(api_key=OPENAI_API_KEY)

    if test_single_item_param:
        print("\n--- Running in SINGLE ITEM TEST MODE ---")
        first_valid_item_processed = False
        for i, data_item_from_json in enumerate(input_captions_data):
            wav_filename_from_json = data_item_from_json.get("wav_filename")
            generated_caption = data_item_from_json.get("generated_caption")
            if not wav_filename_from_json: continue
            if not isinstance(generated_caption, str) or not generated_caption.strip(): continue
            
            hf_item_idx = filename_to_hf_idx_map.get(wav_filename_from_json)
            if hf_item_idx is not None:
                hf_item = hf_dataset[hf_item_idx]
                audio_data_dict = hf_item.get("audio")
                if audio_data_dict and 'array' in audio_data_dict and 'sampling_rate' in audio_data_dict:
                    audio_array, sampling_rate = audio_data_dict["array"], audio_data_dict["sampling_rate"]
                    print(f"\nAttempting test with (JSON Index {i}, HF Index {hf_item_idx}): {wav_filename_from_json}")
                    test_result = await evaluate_item_with_openai(
                        client, i + 1, wav_filename_from_json, audio_array, sampling_rate, generated_caption, openai_model_param
                    )
                    print("\n--- Single Item Test Result ---"); print(json.dumps(test_result, indent=2, ensure_ascii=False))
                    first_valid_item_processed = True; break 
                else: print(f"  Skipping item {i+1} ('{wav_filename_from_json}'): Audio data malformed in HF dataset.")
        if not first_valid_item_processed: print("\nðŸ”´ No valid item found to test.")
        await client.close(); return

    tasks, skipped_not_in_hf, skipped_no_caption, skipped_bad_audio, processed_count = [], 0, 0, 0, 0
    print(f"\nPreparing ALL evaluation tasks for '{openai_model_param}'...")
    for i, data_item_from_json in enumerate(input_captions_data):
        wav_filename_from_json = data_item_from_json.get("wav_filename")
        generated_caption = data_item_from_json.get("generated_caption")
        if not wav_filename_from_json: continue
        if not isinstance(generated_caption, str) or not generated_caption.strip(): skipped_no_caption += 1; continue
        hf_item_idx = filename_to_hf_idx_map.get(wav_filename_from_json)
        if hf_item_idx is None: skipped_not_in_hf += 1; continue
        hf_item = hf_dataset[hf_item_idx]
        audio_data_dict = hf_item.get("audio")
        if not (audio_data_dict and 'array' in audio_data_dict and 'sampling_rate' in audio_data_dict):
            skipped_bad_audio += 1; continue
        audio_array, sampling_rate = audio_data_dict["array"], audio_data_dict["sampling_rate"]
        processed_count += 1
        tasks.append(evaluate_item_with_openai(client, i + 1, wav_filename_from_json, audio_array, sampling_rate, generated_caption, openai_model_param))
    
    if skipped_not_in_hf: print(f"\nNote: Skipped {skipped_not_in_hf} items (not found in HF dataset).")
    if skipped_no_caption: print(f"Note: Skipped {skipped_no_caption} items (missing/empty caption).")
    if skipped_bad_audio: print(f"Note: Skipped {skipped_bad_audio} items (bad audio in HF dataset).")
    if not tasks: print("ðŸ”´ No valid items to evaluate."); await client.close(); return

    print(f"\nCreated {len(tasks)} tasks. Starting concurrent OpenAI evaluations...")
    results_from_api = await asyncio.gather(*tasks, return_exceptions=True)
    print("\nAll OpenAI API calls complete.")
    await client.close()

    final_results = [res if not isinstance(res, Exception) else {"error": f"Unhandled exception: {str(res)}"} for res in results_from_api]
    print(f"\nSaving detailed results to '{output_json_file_param}'...")
    try:
        with open(output_json_file_param, "w", encoding='utf-8') as f: json.dump(final_results, f, indent=2, ensure_ascii=False)
        print(f"Successfully saved results.")
    except Exception as e: print(f"ðŸ”´ Error saving results: {e}")

    valid_evals = [r for r in final_results if isinstance(r, dict) and r.get("is_match") is not None]
    if valid_evals:
        true_matches = sum(1 for r in valid_evals if r.get("is_match")); false_matches = len(valid_evals) - true_matches
        match_rate = (true_matches / len(valid_evals) * 100) if valid_evals else 0.0
        print("\n--- Evaluation Summary ---")
        print(f"Total items in input JSON: {len(input_captions_data)}")
        print(f"Tasks created (valid audio & caption): {processed_count}")
        print(f"Successfully evaluated by OpenAI ('is_match' obtained): {len(valid_evals)}")
        if valid_evals:
            print(f"  - True Matches: {true_matches}\n  - False Matches: {false_matches}")
            print(f"Match Rate (among successfully evaluated): {match_rate:.2f}%")
        error_count = sum(1 for r in final_results if isinstance(r, dict) and r.get("error"))
        print(f"Items with errors: {error_count}")
        if error_count: print("  (Review output JSON for error details)")
    else: print("\nNo items successfully evaluated.")
    
    return final_results

print("Main orchestration function 'run_main_evaluations' (for HF Datasets & JSON Mode) defined.")

Main orchestration function 'run_main_evaluations' (for HF Datasets & JSON Mode) defined.


In [7]:
final_evaluation_results = asyncio.run(run_main_evaluations(
    IEMOCAP_ROOT_DIR, 
    INPUT_JSON_FILE, 
    OUTPUT_RESULTS_FILE, 
    OPENAI_MODEL, 
    TEST_SINGLE_ITEM  # This argument carries the True/False flag
))
print("\n--- Evaluation Script Finished ---")


Loading Hugging Face dataset from disk: ../iemocap/...
Loaded 'train' split. Items: 10039
Created lookup map for 10039 HF dataset items.

Loading generated captions from 'mbt_test_generations.json'...
Loaded 980 items from 'mbt_test_generations.json'.

Preparing ALL evaluation tasks for 'gpt-4o-audio-preview'...

Created 980 tasks. Starting concurrent OpenAI evaluations...

All OpenAI API calls complete.

Saving detailed results to 'mbt_test_generations.json_audio_results.json'...
Successfully saved results.

--- Evaluation Summary ---
Total items in input JSON: 980
Tasks created (valid audio & caption): 980
Successfully evaluated by OpenAI ('is_match' obtained): 980
  - True Matches: 594
  - False Matches: 386
Match Rate (among successfully evaluated): 60.61%
Items with errors: 0

--- Evaluation Script Finished ---


In [8]:
print(final_evaluation_results)

[{'item_index': 1, 'wav_filename': 'Ses05M_script03_1_M015.wav', 'generated_caption': 'The voice carried a sharp edge, with a tone that conveyed a sense of irritation and impatience, underscored by a clipped and brisk delivery that hinted at underlying frustration and a desire to swiftly address the matter', 'is_match': True, 'justification': 'The audio conveyed a tone that was sharp and brisk, with a sense of irritation and impatience. The delivery was clipped, suggesting underlying frustration, which aligns with the description in the caption.', 'error': None, 'raw_openai_response': '```json\n{\n    "justification": "The audio conveyed a tone that was sharp and brisk, with a sense of irritation and impatience. The delivery was clipped, suggesting underlying frustration, which aligns with the description in the caption.",\n    "is_match": true\n}\n```'}, {'item_index': 2, 'wav_filename': 'Ses01F_impro03_F005.wav', 'generated_caption': "The voice carried a sharp edge, with a tone that 

In [9]:
from datetime import datetime # For timestamping the appended summary

def calculate_and_append_summary(results_list, source_json_filename, summary_log_file="audio_evaluations.txt"):
    """
    Calculates summary statistics from evaluation results, prints them, 
    and appends them to a log file.
    """
    if not results_list or not isinstance(results_list, list):
        print("\nðŸ”´ Summary calculation skipped: No results list provided or list is empty.")
        return

    # Items for which an API call was at least attempted and a result dictionary was formed.
    # This might include items that resulted in an error during the API call or parsing.
    total_items_in_results_list = len(results_list) 
                                                     
    # Successfully evaluated items are those where 'is_match' is not None (i.e., True or False)
    # AND there was no overriding error preventing the 'is_match' field from being populated.
    successful_evals = [
        r for r in results_list 
        if isinstance(r, dict) and r.get("is_match") is not None and r.get("error") is None
    ]
    num_successful_evals = len(successful_evals)

    true_matches = sum(1 for r in successful_evals if r.get("is_match") is True)
    # false_matches can be derived if needed: num_successful_evals - true_matches
    
    # Processing Success Rate: Percentage of items in the results list that yielded a successful 'is_match' evaluation.
    processing_success_rate_percent = (num_successful_evals / total_items_in_results_list * 100) if total_items_in_results_list > 0 else 0.0
    
    # Model Match Rate: Percentage of *successfully evaluated* items that were 'is_match: True'.
    model_match_rate_percent = (true_matches / num_successful_evals * 100) if num_successful_evals > 0 else 0.0

    # Status Counts based on the structure of items in `results_list`
    status_counts = {}
    for item_result in results_list:
        if not isinstance(item_result, dict):
            status = "Malformed Result Entry"
        elif item_result.get("is_match") is not None and item_result.get("error") is None:
            status = "Success ('is_match' obtained)"
        elif item_result.get("error"):
            # Simplified error status for summary
            error_type = str(item_result.get('error', 'Unknown Error')).split(':')[0] # Get general error type
            status = f"Error ({error_type})"
        else:
            status = "Unknown or Incomplete"
        status_counts[status] = status_counts.get(status, 0) + 1
    
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    # --- Prepare summary string ---
    summary_title = f"Calculating Accuracy / Summary Statistics for {source_json_filename}"
    summary_lines = [
        f"\n\n{timestamp} - Evaluation Run Summary",
        "=" * len(summary_title),
        summary_title,
        "=" * len(summary_title),
        f"\nTotal Items in Results List (processed or errored): {total_items_in_results_list}",
        f"Successful Evaluations ('is_match' obtained): {num_successful_evals}",
        f"  Processing Success Rate (successful / total in list): {processing_success_rate_percent:.1f}%",
        "-" * 30,
        f"Pairs Classified as 'is_match: True' by Model: {true_matches} (out of {num_successful_evals} successful)",
        f"Model Match Rate (True / successful evals): {model_match_rate_percent:.1f}%",
        "-" * 30,
        "Status Counts from Results List:"
    ]
    for status, count in sorted(status_counts.items()): # Sort for consistent order
        summary_lines.append(f"  - {status}: {count}")
    summary_lines.append("\nNote: 'Model Match Rate' reflects how often the model determined a match among successfully evaluated items.")
    summary_lines.append("The 'Processing Success Rate' reflects how many items in the final results list got a definitive 'is_match' value.")
    summary_lines.append("Items skipped before API call (e.g., audio not found) are not included in 'Total Items in Results List'.")
    summary_lines.append("=" * len(summary_title) + "\n")

    summary_string_for_display_and_file = "\n".join(summary_lines)

    # Print to console
    print(summary_string_for_display_and_file)

    # Append to file
    try:
        with open(summary_log_file, "a", encoding="utf-8") as f:
            f.write(summary_string_for_display_and_file)
        print(f"Summary appended to '{summary_log_file}'")
    except Exception as e:
        print(f"ðŸ”´ Error appending summary to file '{summary_log_file}': {e}")


if 'final_evaluation_results' in locals() and final_evaluation_results is not None:
    if TEST_SINGLE_ITEM: # From Cell 2
        print("\nNote: Summary calculation is based on a single test item because TEST_SINGLE_ITEM was True.")
        print("For a full summary, set TEST_SINGLE_ITEM to False and re-run the evaluation (Cell 8).")
    
    calculate_and_append_summary(final_evaluation_results, INPUT_JSON_FILE)
else:
    print("\nSummary calculation skipped: 'final_evaluation_results' not found or is empty.")
    print("Please ensure the main evaluation (Cell 8) ran successfully and produced results.")



2025-05-07 01:01:13 - Evaluation Run Summary
Calculating Accuracy / Summary Statistics for mbt_test_generations.json

Total Items in Results List (processed or errored): 980
Successful Evaluations ('is_match' obtained): 980
  Processing Success Rate (successful / total in list): 100.0%
------------------------------
Pairs Classified as 'is_match: True' by Model: 594 (out of 980 successful)
Model Match Rate (True / successful evals): 60.6%
------------------------------
Status Counts from Results List:
  - Success ('is_match' obtained): 980

Note: 'Model Match Rate' reflects how often the model determined a match among successfully evaluated items.
The 'Processing Success Rate' reflects how many items in the final results list got a definitive 'is_match' value.
Items skipped before API call (e.g., audio not found) are not included in 'Total Items in Results List'.

Summary appended to 'audio_evaluations.txt'
