In [1]:
# Block 1: Imports and Setup (Async, Pydantic, Reasoning Summary)
import os
import json
import asyncio
import sys
import traceback # For detailed error logging
from openai import AsyncOpenAI # Import the asynchronous client
from pydantic import BaseModel, Field # Import Pydantic
from dotenv import load_dotenv

# --- Pydantic Schema Definition (Simplified) ---
class EvaluationMatchResult(BaseModel):
    """Structure for the simple match evaluation output from the model."""
    # NOTE: the docstring above and the Field description below are both emitted
    # by model_json_schema(); that schema is embedded in the evaluation prompts,
    # so rewording either string changes the text sent to the model.

    # Required boolean verdict (no default is declared, so the schema lists it
    # under "required").
    is_match: bool = Field(..., description="True if the generated caption is roughly similar at all to the ground truth, False otherwise.")
    # No reasoning field on purpose: the reasoning summary is requested separately
    # through the `reasoning` argument of the API call, not as structured output.

# --- nest_asyncio Setup ---
# Best-effort patching of asyncio so that a loop can be started from inside an
# already-running one. The runner cell later calls asyncio.run(), which
# presumably needs this patch when executed inside a notebook kernel.
try:
    # Required for running asyncio loops in environments like Jupyter
    import nest_asyncio
    nest_asyncio.apply()
    print("nest_asyncio applied.")
except ImportError:
    # Optional dependency: only warn so the rest of the cell still executes.
    print("Warning: nest_asyncio not found. Install (`pip install nest_asyncio`) if needed for your environment.")
except RuntimeError as e:
    # Handles cases where nest_asyncio might be applied already or unnecessary
    print(f"nest_asyncio note: {e}")

# --- Configuration & Client Initialization ---
# Step 1: pull variables from a local .env file (best effort, never fatal).
try:
    load_dotenv()
    print("Attempted to load environment variables from .env file.")
except Exception as dotenv_error:
    print(f"Could not load .env file (often ignorable): {dotenv_error}")

# Step 2: the API key is mandatory -- abort the cell if it is missing or empty.
api_key = os.environ.get("OPENAI_API_KEY")
if not api_key:
    raise ValueError("OpenAI API key not found. Please set the OPENAI_API_KEY environment variable.")
print("OpenAI API key found.")

# Step 3: build the asynchronous client. On failure `client` stays None so the
# later cells can detect the problem and refuse to run.
client = None
try:
    client = AsyncOpenAI(api_key=api_key)
    print("AsyncOpenAI client initialized.")
except Exception as init_error:
    print(f"Error initializing AsyncOpenAI client: {init_error}")
    client = None

# Model and File Config
# Tunable constants read by every later cell.
EVALUATION_MODEL = "o3-mini"                     # reasoning model used as the judge
REASONING_EFFORT = "medium"                      # effort level passed in the `reasoning` argument
REASONING_SUMMARY = "auto"                       # summary mode: 'auto', 'detailed', 'concise', or None
INPUT_JSON_FILE = "xnorm_test_generations.json"  # JSON file holding the caption pairs

# Echo the effective configuration (including the schema sent to the model) in
# one write so the cell output documents the run.
print("\n".join([
    "\nConfiguration:",
    f"  Model: '{EVALUATION_MODEL}'",
    f"  Reasoning Effort: '{REASONING_EFFORT}'",
    f"  Reasoning Summary Request: '{REASONING_SUMMARY}'",
    f"  Input File: '{INPUT_JSON_FILE}'",
    f"  Output Schema for Match Result: {json.dumps(EvaluationMatchResult.model_json_schema(), indent=2)}",
]))

nest_asyncio applied.
Attempted to load environment variables from .env file.
OpenAI API key found.
AsyncOpenAI client initialized.

Configuration:
  Model: 'o3-mini'
  Reasoning Effort: 'medium'
  Reasoning Summary Request: 'auto'
  Input File: 'xnorm_test_generations.json'
  Output Schema for Match Result: {
  "description": "Structure for the simple match evaluation output from the model.",
  "properties": {
    "is_match": {
      "description": "True if the generated caption is roughly similar at all to the ground truth, False otherwise.",
      "title": "Is Match",
      "type": "boolean"
    }
  },
  "required": [
    "is_match"
  ],
  "title": "EvaluationMatchResult",
  "type": "object"
}


In [2]:
# Block 2: Load Data from File

# `caption_pairs` is the module-level list consumed by the evaluation cells.
# It stays empty unless the file exists, parses as JSON and holds a JSON array.
caption_pairs = []
print(f"\nAttempting to load data from '{INPUT_JSON_FILE}'...")

try:
    # Open directly instead of checking os.path.exists() first: this avoids the
    # check-then-open race and reports a missing file the same way in all cases.
    with open(INPUT_JSON_FILE, 'r', encoding='utf-8') as f:
        loaded_data = json.load(f)
except FileNotFoundError:
    # Include the working directory so a wrong relative path is easy to spot.
    print(f"Error: The file '{INPUT_JSON_FILE}' was not found in the current directory '{os.getcwd()}'")
except json.JSONDecodeError as e:
    # Provide more specific feedback for JSON errors
    print(f"Error decoding JSON from '{INPUT_JSON_FILE}'. Please check the file for syntax errors (e.g., missing commas, quotes): {e}")
except Exception as e:  # Any other unexpected error during file I/O
    print(f"An unexpected error occurred while loading the file: {type(e).__name__}: {e}")
else:
    # Basic validation: the evaluation loop iterates over a list of pairs.
    if isinstance(loaded_data, list):
        caption_pairs = loaded_data
        print(f"Successfully loaded {len(caption_pairs)} caption pairs from '{INPUT_JSON_FILE}'.")
    else:
        print(f"Error: Expected a JSON list in '{INPUT_JSON_FILE}', but found type {type(loaded_data)}. Please ensure the file contains a valid JSON array.")

# Show the first item as a sanity check. The truthiness test guarantees the
# list is non-empty, so indexing [0] cannot fail here.
if caption_pairs:
    print("Sample of loaded data (first item):")
    print(json.dumps(caption_pairs[0], indent=2))
else:
    # Reached when loading failed or the file contained an empty list
    print("Failed to load valid data or the file is empty. Evaluation cannot proceed.")


Attempting to load data from 'xnorm_test_generations.json'...
Successfully loaded 980 caption pairs from 'xnorm_test_generations.json'.
Sample of loaded data (first item):
{
  "generated_caption": "The voice carried a sharp and clipped quality, with a tone that conveyed a sense of exasperation and impatience, as if the speaker was struggling to maintain composure amidst a situation that seemed to be bothersome and un",
  "ground_truth": "The voice carried a lively urgency, with animated pitch variations that expressed a dynamic blend of excitement and sharpness, giving an edgy vibrance to the words."
}


In [3]:
# Block 2b: Smoke Test - Single Evaluation (client.responses.parse with reasoning summary)

async def test_single_evaluation_parse():
    """
    Smoke-test the evaluation setup with exactly one API request.

    Sends the first entry of `caption_pairs` through client.responses.parse(),
    asking for an EvaluationMatchResult as structured output together with a
    reasoning summary, then prints the full response, the parsed result and the
    first reasoning item found. All problems are reported with print();
    the function returns None and does not raise for API errors.
    """
    print("\n--- Starting Single Evaluation Test (using responses.parse) ---")

    # Guard clauses: both the client and the data must be available.
    if not client:
        print("Test failed: Client not initialized.")
        return
    if not caption_pairs:
        print("Test failed: No data loaded.")
        return

    try:
        sample = caption_pairs[0]
        sample_number = 1
        if not isinstance(sample, dict):
            print("Test failed: First item not a dict.")
            return
        gt_caption = sample.get("ground_truth")
        gen_caption = sample.get("generated_caption")
        if gt_caption is None or gen_caption is None:
            print("Test failed: Missing keys in first pair.")
            return

        print(f"Using Pair {sample_number}:")
        print(f"  Ground Truth: {gt_caption[:100]}...")
        print(f"  Generated:    {gen_caption[:100]}...")

        # The JSON schema of the expected answer is spelled out in the system prompt.
        schema_json = json.dumps(EvaluationMatchResult.model_json_schema(), indent=2)
        system_prompt = f"""You are an AI assistant evaluating audio captions.
Compare 'generated_caption' and 'ground_truth' for the *same* audio.
Determine if they are **roughly similar at all** based on audio characteristics (tone, emotion, etc.). Any resemblance counts.
Your response MUST be a JSON object matching this schema for the match status:
{schema_json}
"""

        user_prompt = f"""Evaluate the following pair and provide the match result in the required JSON format:

Ground Truth:
"{gt_caption}"

Generated Caption:
"{gen_caption}"
"""

        print("\nMaking API call using client.responses.parse() with:")
        print(f"  model='{EVALUATION_MODEL}'")
        print("  text_format=EvaluationMatchResult")
        print(f"  reasoning={{'effort': '{REASONING_EFFORT}', 'summary': '{REASONING_SUMMARY}'}}  <-- Attempting to pass this")

        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ]
        reasoning_options = {"effort": REASONING_EFFORT, "summary": REASONING_SUMMARY}
        response = await client.responses.parse(
            model=EVALUATION_MODEL,
            input=messages,
            text_format=EvaluationMatchResult,
            reasoning=reasoning_options,
        )

        print("\n--- API Call Successful (using parse) ---")
        print("\n--- Full Response Object (JSON Dump) ---")
        try:
            print(response.model_dump_json(indent=2))
        except Exception as dump_err:
            print(f"Could not dump response: {dump_err}\nRaw: {response}")

        # Structured output: expected to be an EvaluationMatchResult instance.
        print("\n--- Checking for Structured Output (output_parsed) ---")
        parsed = getattr(response, 'output_parsed', None)
        if not parsed:
            raw_text = getattr(response, 'output_text', '[No raw text available]')
            print(f"'output_parsed' not found. Raw text: {raw_text}")
        elif isinstance(parsed, EvaluationMatchResult):
            print("Found expected structured output (EvaluationMatchResult):")
            print(parsed.model_dump())
        else:
            print(f"Found 'output_parsed', unexpected type: {type(parsed)}\nValue: {parsed}")

        # Reasoning summary: report the first output item of type 'reasoning'.
        print("\n--- Checking for Reasoning Summary ---")
        output_items = []
        if hasattr(response, 'output') and isinstance(response.output, list):
            output_items = response.output
        for item in output_items:
            if getattr(item, 'type', None) == 'reasoning' and hasattr(item, 'summary'):
                print("Found reasoning summary item:")
                print(item.summary)
                break
        else:
            # Loop finished without `break`: no reasoning item was present.
            print("Reasoning summary not found in expected locations.")

        print("\n--- Single Evaluation Test (using parse) Finished ---")

    except Exception as e:
        print("\n--- An Error Occurred During the Test (using parse) ---")
        error_type = type(e).__name__
        error_str = str(e).lower()
        print(f"{error_type}: {e}")
        # Heuristic hints derived from the error text.
        reasoning_rejected = any(
            marker in error_str
            for marker in (
                "unexpected keyword argument 'reasoning'",
                "does not support parameter 'reasoning'",
            )
        )
        if reasoning_rejected:
            print("*** Error confirms client.responses.parse() does not accept 'reasoning'. Use client.responses.create(). ***")
        elif "text_format" in error_str or "response_format" in error_str:
            print("Error might relate to text_format incompatibility.")
             

# Top-level `await` is evaluated by the notebook itself; outside a notebook,
# run the coroutine with asyncio.run(test_single_evaluation_parse()) instead.
await test_single_evaluation_parse()



--- Starting Single Evaluation Test (using responses.parse) ---
Using Pair 1:
  Ground Truth: The voice carried a lively urgency, with animated pitch variations that expressed a dynamic blend of...
  Generated:    The voice carried a sharp and clipped quality, with a tone that conveyed a sense of exasperation and...

Making API call using client.responses.parse() with:
  model='o3-mini'
  text_format=EvaluationMatchResult
  reasoning={'effort': 'medium', 'summary': 'auto'}  <-- Attempting to pass this

--- API Call Successful (using parse) ---

--- Full Response Object (JSON Dump) ---
{
  "id": "resp_680fdccaaf4c8191896434b554046a3a01f057eeda9b3fa1",
  "created_at": 1745870026.0,
  "error": null,
  "incomplete_details": null,
  "instructions": null,
  "metadata": {},
  "model": "o3-mini-2025-01-31",
  "object": "response",
  "output": [
    {
      "id": "rs_680fdccf08308191942832678d1b5e3201f057eeda9b3fa1",
      "summary": [
        {
          "text": "**Evaluating caption similari

In [4]:
# Block 3: Define Async Worker Function (Using `parse` with Reasoning)

async def evaluate_caption_pair_async(pair_index, ground_truth, generated_caption):
    """
    Judge one caption pair with a single client.responses.parse() request.

    The request asks for an EvaluationMatchResult as structured output and for
    a reasoning summary. Failures are never raised to the caller; they are
    returned in place of the response so that a batch can keep going.

    Args:
        pair_index (int): Identifier of the pair, echoed back in the result.
        ground_truth (str): The reference caption.
        generated_caption (str): The caption produced by the model under test.

    Returns:
        tuple: (pair_index, outcome) where outcome is the full response object
               on success, or the Exception instance describing why the pair
               could not be evaluated (missing client, empty caption, API error).
    """
    # Cheap precondition checks -- no request is sent when they fail.
    if not client:
        return (pair_index, RuntimeError("OpenAI client not initialized"))
    if not (ground_truth and generated_caption):
        return (pair_index, ValueError("Input caption(s) empty"))

    # The JSON schema of the expected answer is spelled out in the system prompt.
    schema_json = json.dumps(EvaluationMatchResult.model_json_schema(), indent=2)
    system_prompt = f"""You are an AI assistant evaluating audio captions.
Compare 'generated_caption' and 'ground_truth' for the *same* audio.
Determine if they are **roughly similar at all** based on audio characteristics (tone, emotion, etc.). Any resemblance counts.
Your response MUST be a JSON object matching this schema for the match status:
{schema_json}
"""
    user_prompt = f"""Evaluate the following pair and provide the match result in the required JSON format:

Ground Truth:
"{ground_truth}"

Generated Caption:
"{generated_caption}"
"""
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]
    reasoning_options = {"effort": REASONING_EFFORT, "summary": REASONING_SUMMARY}

    try:
        response = await client.responses.parse(
            model=EVALUATION_MODEL,
            input=messages,
            text_format=EvaluationMatchResult,  # structured output
            reasoning=reasoning_options,        # reasoning + summary
        )
    except Exception as api_error:
        print(f"API Error for Pair {pair_index}: {type(api_error).__name__}: {api_error}")
        # Point out the one failure mode that suggests the parameters are rejected.
        lowered = str(api_error).lower()
        rejection_markers = (
            "unexpected keyword argument 'reasoning'",
            "does not support parameter 'reasoning'",
        )
        if any(marker in lowered for marker in rejection_markers):
            print("-> Note: Error suggests 'parse' might not reliably accept 'reasoning'.")
        return (pair_index, api_error)

    # Success: hand back the whole response so the caller can read both the
    # parsed result and the reasoning items.
    return (pair_index, response)

print("\nAsync worker function 'evaluate_caption_pair_async' defined.")


Async worker function 'evaluate_caption_pair_async' defined.


In [5]:
# Block 4: Concurrent Loop and Result Processing

def _extract_reasoning_summary(response_obj, pair_index):
    """Return the reasoning-summary text found in a response, or None.

    Scans `response_obj.output` for items whose `type` is 'reasoning' and joins
    the `text` of every summary part of the first item that has any (blank line
    between parts), so multi-part summaries are not truncated to the first part.
    """
    output_items = getattr(response_obj, 'output', None)
    if not isinstance(output_items, list):
        return None

    for item in output_items:
        if getattr(item, 'type', None) != 'reasoning' or not hasattr(item, 'summary'):
            continue
        try:
            summary = item.summary
            # The summary is normally a list of parts; tolerate a single object too.
            parts = summary if isinstance(summary, list) else [summary]
            texts = [str(part.text) for part in parts if getattr(part, 'text', None)]
            summary_text = "\n\n".join(texts) if texts else None
        except Exception as summary_err:
            print(f" -> Error extracting summary text for Pair {pair_index}: {summary_err}")
            continue
        if summary_text:
            print(f" -> Found Reasoning Summary for Pair {pair_index}")
            return summary_text
    return None


def _build_pair_result(pair_index, original_pair_data, outcome):
    """Turn one worker outcome (response object or exception) into a result dict."""
    processed_result = {
        "pair_index": pair_index,
        "ground_truth": original_pair_data.get("ground_truth"),
        "generated_caption": original_pair_data.get("generated_caption"),
        "match_result": None,       # structured {is_match: bool}
        "reasoning_summary": None,  # separate reasoning text
        "status": "Unknown",
        "error_info": None,
    }

    # BaseException (not just Exception) so a cancelled task is also recorded.
    if isinstance(outcome, BaseException):
        error_type = type(outcome).__name__
        processed_result["status"] = f"Failed ({error_type})"
        processed_result["error_info"] = str(outcome)
        print(f"Processed Pair {pair_index}: Failed ({error_type})")
        return processed_result

    # 1. Structured output (match result)
    parsed = getattr(outcome, 'output_parsed', None)
    if not parsed:
        processed_result["status"] = "Failed (No Structured Output)"
        processed_result["error_info"] = "output_parsed attribute not found or empty."
        print(f"Processed Pair {pair_index}: Failed (No Structured Output)")
    elif isinstance(parsed, EvaluationMatchResult):
        processed_result["match_result"] = parsed.model_dump()
        processed_result["status"] = "Success (Structured)"
        print(f"Processed Pair {pair_index}: Success (Structured)")
    else:
        processed_result["status"] = "Failed (Unexpected Parsed Type)"
        processed_result["error_info"] = f"Parsed type: {type(parsed)}"
        print(f"Processed Pair {pair_index}: Failed (Unexpected Parsed Type)")

    # 2. Reasoning summary (extracted regardless of structured-output success)
    summary_text = _extract_reasoning_summary(outcome, pair_index)
    processed_result["reasoning_summary"] = summary_text
    if processed_result["status"].startswith("Failed") and summary_text:
        # A failed parse can still be partially useful if a summary came back.
        processed_result["status"] += " (Summary Found)"
    return processed_result


async def main_concurrent_evaluation():
    """
    Evaluate every valid entry of `caption_pairs` concurrently.

    One task per valid pair is created with evaluate_caption_pair_async and all
    tasks are awaited together. Entries that are not dicts or lack the
    'ground_truth' / 'generated_caption' keys are skipped with a message.

    Returns:
        list[dict]: One result per scheduled pair, ordered by 1-based
        `pair_index`, with keys pair_index, ground_truth, generated_caption,
        match_result, reasoning_summary, status and error_info. An empty list
        is returned when the client or the data is missing, or when no pair is
        valid.
    """
    if not client:
        print("Cannot proceed: Client not initialized.")
        return []
    if not caption_pairs:
        print("Cannot proceed: No data loaded.")
        return []

    print(f"\n--- Starting Concurrent Evaluation of {len(caption_pairs)} Pairs ---")

    # NOTE(review): all requests are started at once with no concurrency cap;
    # consider a semaphore if API rate limits become a problem.
    tasks = []
    valid_pairs_indices = []  # 1-based indices, parallel to `tasks`
    for pair_index, pair in enumerate(caption_pairs, start=1):
        if not isinstance(pair, dict):
            print(f"Skipping item {pair_index}: Not a dictionary.")
            continue
        gt_caption = pair.get("ground_truth")
        gen_caption = pair.get("generated_caption")
        if gt_caption is None or gen_caption is None:
            print(f"Skipping pair {pair_index}: Missing 'ground_truth' or 'generated_caption' key.")
            continue
        tasks.append(asyncio.create_task(
            evaluate_caption_pair_async(pair_index, gt_caption, gen_caption)
        ))
        valid_pairs_indices.append(pair_index)

    if not tasks:
        print("No valid pairs found to create evaluation tasks.")
        return []
    print(f"Created {len(tasks)} evaluation tasks. Running concurrently...")

    # return_exceptions=True prevents one failure from stopping the others.
    raw_results = await asyncio.gather(*tasks, return_exceptions=True)
    print(f"\nAll {len(raw_results)} tasks completed. Processing results...")

    # gather() returns results in the order the tasks were passed, so each
    # result can be paired with the index it was scheduled under. This lets a
    # task-level exception be recorded against its pair instead of being dropped.
    final_results = []
    for pair_index, result_item in zip(valid_pairs_indices, raw_results):
        if isinstance(result_item, BaseException):
            print(f"Error during task gathering (asyncio internal): {result_item}")
            outcome = result_item
        else:
            # Worker contract: (pair_index, response_object | Exception)
            _, outcome = result_item
        final_results.append(
            _build_pair_result(pair_index, caption_pairs[pair_index - 1], outcome)
        )

    # Already in index order by construction; the sort keeps that explicit.
    final_results.sort(key=lambda entry: entry.get("pair_index", float('inf')))
    print("\n--- Result Processing Finished ---")
    return final_results

print("\nMain concurrent evaluation function 'main_concurrent_evaluation' defined.")

# --- Block 5: Runner ---

if __name__ == "__main__":
    # Run the full evaluation only when both prerequisites from the earlier
    # cells are in place: loaded data and an initialised client.
    if caption_pairs and client:
        print("\n" + "="*50)
        print(" Executing Main Concurrent Evaluation Loop ".center(50, "="))
        print("="*50)

        # Run the main async function that manages concurrency
        final_evaluation_results = asyncio.run(main_concurrent_evaluation())

        # --- Save final results ---
        if final_evaluation_results:
            # Strip only a trailing '.json' extension. str.replace() would also
            # remove '.json' occurring elsewhere in the path (e.g. a directory
            # called 'runs.json_v2'), producing a wrong output location.
            json_suffix = '.json'
            if INPUT_JSON_FILE.endswith(json_suffix):
                output_stem = INPUT_JSON_FILE[:-len(json_suffix)]
            else:
                output_stem = INPUT_JSON_FILE
            output_filename = f"{output_stem}_results.json"
            print(f"\nSaving {len(final_evaluation_results)} results to '{output_filename}'...")
            try:
                with open(output_filename, "w", encoding='utf-8') as outfile:
                    # default=str is a safety net for values json cannot encode natively
                    json.dump(final_evaluation_results, outfile, indent=2, ensure_ascii=False, default=str)
                print("Successfully saved results.")
            except Exception as e:
                print(f"\nError saving results to '{output_filename}': {e}")
        else:
            print("\nNo results were generated or processed.")

    elif not client:
        print("\nEvaluation not started: OpenAI client failed to initialize.")
    else:  # caption_pairs is empty or failed loading
        print("\nEvaluation not started: No data was loaded successfully.")

# --- End of Code ---


Main concurrent evaluation function 'main_concurrent_evaluation' defined.

=== Executing Main Concurrent Evaluation Loop ====

--- Starting Concurrent Evaluation of 980 Pairs ---
Created 980 evaluation tasks. Running concurrently...

All 980 tasks completed. Processing results...
Processed Pair 1: Success (Structured)
 -> Found Reasoning Summary for Pair 1
Processed Pair 2: Success (Structured)
Processed Pair 3: Success (Structured)
 -> Found Reasoning Summary for Pair 3
Processed Pair 4: Success (Structured)
Processed Pair 5: Success (Structured)
 -> Found Reasoning Summary for Pair 5
Processed Pair 6: Success (Structured)
 -> Found Reasoning Summary for Pair 6
Processed Pair 7: Success (Structured)
Processed Pair 8: Success (Structured)
 -> Found Reasoning Summary for Pair 8
Processed Pair 9: Success (Structured)
Processed Pair 10: Success (Structured)
 -> Found Reasoning Summary for Pair 10
Processed Pair 11: Success (Structured)
Processed Pair 12: Success (Structured)
Processed Pa

In [6]:
# Block 6: Calculate and Display Accuracy / Summary Statistics (Corrected Status Check)

def calculate_accuracy_stats(results_list):
    """
    Summarise evaluation results: how many pairs were judged and how many matched.

    A pair counts as successfully evaluated when its status is exactly
    "Success (Structured)"; among those, it counts as a match only when
    match_result is a dict whose 'is_match' value is the boolean True.

    Args:
        results_list (list): Result dictionaries as produced by the main
                             concurrent evaluation loop.

    Returns:
        dict: total_pairs_processed, successful_structured_evals,
              processing_success_rate_percent, matches_found_by_model,
              model_match_rate_percent (relative to successful evaluations)
              and status_counts. None when the input is empty or not a list.
    """
    if not (results_list and isinstance(results_list, list)):
        print("Accuracy calculation skipped: Input results list is invalid or empty.")
        return None

    total = len(results_list)
    print(f"\n--- Calculating Statistics for {total} Processed Pairs ---")

    # Pass 1: tally every status string (entries without one count as "Unknown").
    statuses = [entry.get("status", "Unknown") for entry in results_list]
    tally = {}
    for status in statuses:
        tally[status] = tally.get(status, 0) + 1

    # Pass 2: look at match results only for successfully structured evaluations.
    evaluated = [
        entry for entry, status in zip(results_list, statuses)
        if status == "Success (Structured)"
    ]
    n_evaluated = len(evaluated)
    n_matches = 0
    for entry in evaluated:
        verdict = entry.get("match_result")
        # Only an explicit boolean True counts; truthy stand-ins such as 1 do not.
        if isinstance(verdict, dict) and verdict.get("is_match") is True:
            n_matches += 1

    # Percentages, guarding the divisions against a zero denominator.
    success_pct = (n_evaluated / total * 100) if total > 0 else 0
    match_pct = (n_matches / n_evaluated * 100) if n_evaluated > 0 else 0

    return {
        "total_pairs_processed": total,
        "successful_structured_evals": n_evaluated,
        "processing_success_rate_percent": round(success_pct, 2),
        "matches_found_by_model": n_matches,
        "model_match_rate_percent": round(match_pct, 2),
        "status_counts": tally,
    }

# --- Run the Accuracy Calculation ---
# Assumes 'final_evaluation_results' holds the results list.

# Report only when the evaluation cell has run in this session and produced
# at least one result; otherwise explain why nothing is shown.
if not ('final_evaluation_results' in locals() and final_evaluation_results):
    print("\nAccuracy calculation skipped: 'final_evaluation_results' variable not found or is empty.")
    print("Please ensure the main evaluation loop (Block 4/5) ran successfully and produced results.")
else:
    print("\n" + "="*50)
    print(" Calculating Accuracy / Summary Statistics ".center(50, "="))
    print("="*50)

    accuracy_summary = calculate_accuracy_stats(final_evaluation_results)

    if accuracy_summary:
        # Assemble the report line by line and emit it in a single write.
        report_lines = [
            f"\nTotal Pairs Processed: {accuracy_summary['total_pairs_processed']}",
            f"Successful Structured Evaluations ('is_match' obtained): {accuracy_summary['successful_structured_evals']}",
            f"Processing Success Rate: {accuracy_summary['processing_success_rate_percent']}%",
            "-" * 30,
            f"Pairs Classified as 'is_match: True' by Model: {accuracy_summary['matches_found_by_model']} (out of {accuracy_summary['successful_structured_evals']} successful)",
            f"Model Match Rate (among successful evals): {accuracy_summary['model_match_rate_percent']}%",
            "-" * 30,
            "Status Counts:",
        ]
        report_lines.extend(
            f"  - {status_label}: {occurrences}"
            for status_label, occurrences in accuracy_summary.get('status_counts', {}).items()
        )
        report_lines.append("\nNote: 'Model Match Rate' reflects how often the model found captions similar based on the 'roughly similar at all' criteria, among pairs it successfully evaluated with a structured output.")
        print("\n".join(report_lines))


=== Calculating Accuracy / Summary Statistics ====

--- Calculating Statistics for 980 Processed Pairs ---

Total Pairs Processed: 980
Successful Structured Evaluations ('is_match' obtained): 980
Processing Success Rate: 100.0%
------------------------------
Pairs Classified as 'is_match: True' by Model: 601 (out of 980 successful)
Model Match Rate (among successful evals): 61.33%
------------------------------
Status Counts:
  - Success (Structured): 980

Note: 'Model Match Rate' reflects how often the model found captions similar based on the 'roughly similar at all' criteria, among pairs it successfully evaluated with a structured output.
