In [1]:
import json
from collections import defaultdict

# Category map (same as judge_baseline.ipynb), written as the instruction
# keys that belong to each category.
_KEYS_BY_CATEGORY = {
    "TextEditing": (
        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 13, 15, 19, 54, 55,
    ),
    "VisualFormatting": (
        11, 12, 14, 16, 18, 20, 21, 22, 23, 24, 25, 26, 27, 28,
        40, 41, 42, 45, 53,
    ),
    "LayoutAndImageAdjustment": (
        17, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 43, 44, 49,
    ),
    "SlideStructure": (
        46, 47, 48, 50, 51, 52,
    ),
}

# Invert the grouping into {"<key>": "<category>"}; sorting numerically keeps
# the entries in ascending key order ("0", "1", ..., "55").
CATEGORY_MAP = {
    str(key): category
    for key, category in sorted(
        (key, category)
        for category, keys in _KEYS_BY_CATEGORY.items()
        for key in keys
    )
}

def analyze_jsonl(filepath, category_map=None):
    """Analyze a JSONL log file and calculate statistics by category.

    Each non-blank line of *filepath* is parsed as one JSON object. Lines that
    are not valid JSON, or that do not hold a JSON object, are reported on
    stdout and skipped.

    Args:
        filepath: Path to the UTF-8 encoded JSONL file.
        category_map: Optional mapping from base instruction key (the part of
            ``instruction_key`` before the first "-") to a category name.
            Defaults to the module-level ``CATEGORY_MAP``.

    Returns:
        A ``(results, overall_stats)`` tuple. ``results`` maps each category
        found in the log to a dict with the keys ``total_entries``,
        ``successful_entries``, ``success_rate``, ``avg_input_tokens``,
        ``avg_output_tokens``, ``avg_price`` and ``avg_execution_time``.
        ``overall_stats`` has the same keys, computed over every parsed
        record, including records whose key maps to no category.
    """
    if category_map is None:
        category_map = CATEGORY_MAP

    def metric_values(records, field):
        # A missing or null metric counts as 0 so the record still weighs on
        # the average instead of breaking sum().
        return [record.get(field) or 0 for record in records]

    def mean(values):
        return sum(values) / len(values) if values else 0

    def summarize(records):
        # Shared by the per-category and the overall report so both are
        # always computed and formatted the same way.
        total = len(records)
        successes = sum(1 for record in records if record.get('status') == "success")
        success_rate = (successes / total) * 100 if total > 0 else 0
        return {
            'total_entries': total,
            'successful_entries': successes,
            'success_rate': f"{success_rate:.2f}%",
            'avg_input_tokens': round(mean(metric_values(records, 'input_token'))),
            'avg_output_tokens': round(mean(metric_values(records, 'output_token'))),
            'avg_price': f"{mean(metric_values(records, 'price')):.6f}",
            'avg_execution_time': f"{mean(metric_values(records, 'execution_time')):.2f}s"
        }

    # Read and parse the JSONL file
    entries = []
    with open(filepath, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                entry = json.loads(line)
            except json.JSONDecodeError:
                print(f"Error parsing line: {line[:50]}...")
                continue
            if not isinstance(entry, dict):
                # Valid JSON but not an object (e.g. a bare number or list):
                # it has no fields to aggregate.
                print(f"Skipping non-object line: {line[:50]}...")
                continue
            entries.append(entry)

    # Group entries by category
    entries_by_category = defaultdict(list)
    for entry in entries:
        instruction_key = entry.get('instruction_key')
        if instruction_key is None:
            # No key means no category; the entry still counts overall.
            continue

        # Keys may carry a variant suffix (like "6-2") and may be stored as
        # JSON numbers; normalise to the string before the first dash.
        base_key = str(instruction_key).split("-")[0]

        category = category_map.get(base_key)
        if category:
            entries_by_category[category].append(entry)

    results = {
        category: summarize(records)
        for category, records in entries_by_category.items()
    }
    overall_stats = summarize(entries)

    return results, overall_stats

def print_results(category_results, overall_stats):
    """Write the per-category table and the overall summary to stdout.

    Args:
        category_results: Mapping of category name to its statistics dict,
            as returned by ``analyze_jsonl``. Rows are printed in sorted
            category order.
        overall_stats: Statistics dict covering all entries.
    """
    # (column title, key in the statistics dict, column width)
    table_columns = (
        ('Success Rate', 'success_rate', 15),
        ('Avg Input', 'avg_input_tokens', 12),
        ('Avg Output', 'avg_output_tokens', 12),
        ('Avg Price', 'avg_price', 12),
        ('Avg Time', 'avg_execution_time', 12),
    )
    # (label, key in the overall statistics dict)
    summary_fields = (
        ('Total Entries', 'total_entries'),
        ('Successful Entries', 'successful_entries'),
        ('Overall Success Rate', 'success_rate'),
        ('Average Input Tokens', 'avg_input_tokens'),
        ('Average Output Tokens', 'avg_output_tokens'),
        ('Average Price', 'avg_price'),
        ('Average Execution Time', 'avg_execution_time'),
    )

    # Category table: every cell is left-aligned and padded to its width.
    print("\n===== STATISTICS BY CATEGORY =====")
    header_cells = [f"{'Category':<25}"]
    header_cells.extend(f"{title:<{width}}" for title, _, width in table_columns)
    print(" ".join(header_cells))
    print("-" * 90)

    for name in sorted(category_results):
        row = category_results[name]
        cells = [f"{name:<25}"]
        cells.extend(f"{row[key]:<{width}}" for _, key, width in table_columns)
        print(" ".join(cells))

    # Overall summary, one "label: value" line per field.
    print("\n===== OVERALL STATISTICS =====")
    for label, key in summary_fields:
        print(f"{label}: {overall_stats[key]}")

# Execute analysis: aggregate the experiment log into per-category and
# overall statistics, then print both reports.
if __name__ == "__main__":
    # Path to the JSONL log, relative to the current working directory.
    filepath = "result_ufo_evaluation/ufo_experiment_log.jsonl"
    category_results, overall_stats = analyze_jsonl(filepath)
    print_results(category_results, overall_stats)


===== STATISTICS BY CATEGORY =====
Category                  Success Rate    Avg Input    Avg Output   Avg Price    Avg Time    
------------------------------------------------------------------------------------------
LayoutAndImageAdjustment  66.32%          108480       2505         0.017775     128.23s     
SlideStructure            82.22%          72463        1828         0.011966     99.18s      
TextEditing               67.24%          102040       2266         0.016666     117.85s     
VisualFormatting          86.99%          93271        2156         0.015284     122.25s     

===== OVERALL STATISTICS =====
Total Entries: 379
Successful Entries: 285
Overall Success Rate: 75.20%
Average Input Tokens: 97297
Average Output Tokens: 2238
Average Price: 0.015937
Average Execution Time: 119.66s


In [None]:
import json
from collections import defaultdict
import statistics  # Added for variance calculation

# Category map (same as judge_baseline.ipynb), written as the instruction
# keys that belong to each category.
_KEYS_BY_CATEGORY = {
    "TextEditing": (
        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 13, 15, 19, 54, 55,
    ),
    "VisualFormatting": (
        11, 12, 14, 16, 18, 20, 21, 22, 23, 24, 25, 26, 27, 28,
        40, 41, 42, 45, 53,
    ),
    "LayoutAndImageAdjustment": (
        17, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 43, 44, 49,
    ),
    "SlideStructure": (
        46, 47, 48, 50, 51, 52,
    ),
}

# Invert the grouping into {"<key>": "<category>"}; sorting numerically keeps
# the entries in ascending key order ("0", "1", ..., "55").
CATEGORY_MAP = {
    str(key): category
    for key, category in sorted(
        (key, category)
        for category, keys in _KEYS_BY_CATEGORY.items()
        for key in keys
    )
}

def analyze_jsonl(filepath, category_map=None):
    """Analyze a JSONL log file and calculate statistics by category.

    Each non-blank line of *filepath* is parsed as one JSON object. Lines that
    are not valid JSON, or that do not hold a JSON object, are reported on
    stdout and skipped.

    An entry counts as successful only when its ``status`` is "success" and
    its ``total_api_time`` is at least 1; entries with a ``total_api_time``
    below 1 (or without one) are tallied as "short time" entries.

    Args:
        filepath: Path to the UTF-8 encoded JSONL file.
        category_map: Optional mapping from base instruction key (the part of
            ``instruction_key`` before the first "-") to a category name.
            Defaults to the module-level ``CATEGORY_MAP``.

    Returns:
        A ``(results, overall_stats)`` tuple. ``results`` maps each category
        found in the log to a dict with the keys ``total_entries``,
        ``successful_entries``, ``timeout_entries``, ``short_time_entries``,
        ``success_rate``, ``avg_input_tokens``, ``avg_output_tokens``,
        ``avg_price``, ``avg_execution_time`` and ``time_variance``.
        ``overall_stats`` has the same keys and the same formatting, computed
        over every parsed record, including records whose key maps to no
        category.
    """
    if category_map is None:
        category_map = CATEGORY_MAP

    def number(record, field):
        # A missing or null value counts as 0 so comparisons and sums never
        # see None.
        return record.get(field) or 0

    def mean(values):
        return sum(values) / len(values) if values else 0

    def summarize(records):
        # Shared by the per-category and the overall report so both are
        # always computed and formatted the same way.
        total = len(records)
        timeouts = sum(1 for record in records if record.get('status') == "timeout")
        short_runs = sum(1 for record in records if number(record, 'total_api_time') < 1)
        successes = sum(
            1 for record in records
            if record.get('status') == "success" and number(record, 'total_api_time') >= 1
        )
        success_rate = (successes / total) * 100 if total > 0 else 0

        execution_times = [number(record, 'execution_time') for record in records]
        # Sample variance is undefined for fewer than two values; report 0.
        time_variance = 0
        if len(execution_times) > 1:
            time_variance = statistics.variance(execution_times)

        avg_input_tokens = mean([number(record, 'input_token') for record in records])
        avg_output_tokens = mean([number(record, 'output_token') for record in records])
        avg_price = mean([number(record, 'price') for record in records])
        avg_execution_time = mean(execution_times)

        return {
            'total_entries': total,
            'successful_entries': successes,
            'timeout_entries': timeouts,
            'short_time_entries': short_runs,
            'success_rate': f"{success_rate:.2f}%",
            'avg_input_tokens': f"{avg_input_tokens:.2f}",
            'avg_output_tokens': f"{avg_output_tokens:.2f}",
            'avg_price': f"{avg_price:.6f}",
            'avg_execution_time': f"{avg_execution_time:.2f}s",
            'time_variance': f"{time_variance:.2f}"
        }

    # Read and parse the JSONL file
    entries = []
    with open(filepath, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                entry = json.loads(line)
            except json.JSONDecodeError:
                print(f"Error parsing line: {line[:50]}...")
                continue
            if not isinstance(entry, dict):
                # Valid JSON but not an object (e.g. a bare number or list):
                # it has no fields to aggregate.
                print(f"Skipping non-object line: {line[:50]}...")
                continue
            entries.append(entry)

    # Group entries by category
    entries_by_category = defaultdict(list)
    for entry in entries:
        instruction_key = entry.get('instruction_key')
        if instruction_key is None:
            # No key means no category; the entry still counts overall.
            continue

        # Keys may carry a variant suffix (like "6-2") and may be stored as
        # JSON numbers; normalise to the string before the first dash.
        base_key = str(instruction_key).split("-")[0]

        category = category_map.get(base_key)
        if category:
            entries_by_category[category].append(entry)

    results = {
        category: summarize(records)
        for category, records in entries_by_category.items()
    }
    overall_stats = summarize(entries)

    return results, overall_stats

def print_results(category_results, overall_stats):
    """Write the per-category table and the overall summary to stdout.

    Args:
        category_results: Mapping of category name to its statistics dict,
            as returned by ``analyze_jsonl``. Rows are printed in sorted
            category order.
        overall_stats: Statistics dict covering all entries.
    """
    # (column title, key in the statistics dict, column width)
    table_columns = (
        ('Success Rate', 'success_rate', 15),
        ('Timeouts', 'timeout_entries', 10),
        ('Short Time', 'short_time_entries', 12),
        ('Avg Input', 'avg_input_tokens', 12),
        ('Avg Output', 'avg_output_tokens', 12),
        ('Avg Price', 'avg_price', 12),
        ('Avg Time', 'avg_execution_time', 12),
        ('Time Var', 'time_variance', 12),
    )
    # (label, key in the overall statistics dict)
    summary_fields = (
        ('Total Entries', 'total_entries'),
        ('Successful Entries', 'successful_entries'),
        ('Timeout Entries', 'timeout_entries'),
        ('Short Time Entries (<1s)', 'short_time_entries'),
        ('Overall Success Rate', 'success_rate'),
        ('Average Input Tokens', 'avg_input_tokens'),
        ('Average Output Tokens', 'avg_output_tokens'),
        ('Average Price', 'avg_price'),
        ('Average Execution Time', 'avg_execution_time'),
        ('Time Variance', 'time_variance'),
    )

    # Category table: every cell is left-aligned and padded to its width.
    print("\n===== STATISTICS BY CATEGORY =====")
    header_cells = [f"{'Category':<25}"]
    header_cells.extend(f"{title:<{width}}" for title, _, width in table_columns)
    print(" ".join(header_cells))
    print("-" * 130)

    for name in sorted(category_results):
        row = category_results[name]
        cells = [f"{name:<25}"]
        cells.extend(f"{row[key]:<{width}}" for _, key, width in table_columns)
        print(" ".join(cells))

    # Overall summary, one "label: value" line per field.
    print("\n===== OVERALL STATISTICS =====")
    for label, key in summary_fields:
        print(f"{label}: {overall_stats[key]}")

# Execute analysis: aggregate the experiment log into per-category and
# overall statistics, then print both reports.
if __name__ == "__main__":
    # Path to the JSONL log, relative to the current working directory.
    filepath = "result_ufo_evaluation/ufo_experiment_log.jsonl"
    category_results, overall_stats = analyze_jsonl(filepath)
    print_results(category_results, overall_stats)


===== STATISTICS BY CATEGORY =====
Category                  Success Rate    Timeouts   Short Time   Avg Input    Avg Output   Avg Price    Avg Time     Time Var    
----------------------------------------------------------------------------------------------------------------------------------
LayoutAndImageAdjustment  65.26%          21         12           108480.35    2505.09      0.017775     128.23s      8820.34     
SlideStructure            82.22%          5          3            72463.44     1827.64      0.011966     99.18s       3943.77     
TextEditing               66.38%          22         17           102040.36    2266.05      0.016666     117.85s      11112.04    
VisualFormatting          86.18%          15         2            93271.30     2155.72      0.015284     122.25s      5516.07     

===== OVERALL STATISTICS =====
Total Entries: 379
Successful Entries: 282
Timeout Entries: 63
Short Time Entries (<1s): 34
Overall Success Rate: 74.41%
Average Input Tokens: 972

: 