In [13]:
import json
import os
from pathlib import Path

# Folder-name aliases used by "transfer_without_answer" runs. Each entry is
# (lowercase substring of the source model name, alias used in the folder).
# Order matters: the first substring found in the source model name wins.
_WITHOUT_ANSWER_SOURCE_ALIASES = (
    ("dapo", "dapo"),
    ("oss", "oss"),
    ("qwq", "qwq"),
    ("open-thoughts", "opent"),
    ("nemotron", "nrr"),
)


def _resolve_predictions_folder(model_name, thought_type, source_model):
    """Return ``(folder_path, description)`` for one evaluation configuration."""
    if thought_type in ("transfer", "transfer_without_answer"):
        if source_model is None:
            # Name the thought type that was actually requested.
            raise ValueError(
                f"source_model must be provided when thought_type='{thought_type}'"
            )
        description = f"transfer from {source_model} to {model_name}"

        if thought_type == "transfer_without_answer":
            lowered_source = source_model.lower()
            for needle, alias in _WITHOUT_ANSWER_SOURCE_ALIASES:
                if needle in lowered_source:
                    folder = Path(
                        f"predictions_without_answer_{alias}_thoughts_to_{model_name}"
                    )
                    return folder, description

        # Plain transfers, and "without answer" sources with no known alias,
        # use the full source model name in the folder.
        return Path(f"predictions_{source_model}_thoughts_to_{model_name}"), description

    thought_suffix = f"_{thought_type}" if thought_type else ""
    folder = Path(f"predictions_{model_name}{thought_suffix}")
    description = f"{model_name} with {thought_type if thought_type else 'default'} thoughts"
    return folder, description


def calculate_average_induction_scores(model_name, thought_type="", source_model=None):
    """
    Calculate the total average score across all induction tasks for a given model.

    For every induction task, ``<task>_with_scores.json`` is read from the
    predictions folder (resolved relative to the current working directory)
    and its ``weighted_task_score`` value is collected. The overall score is
    the unweighted mean of the collected task scores. Progress and problems
    are printed to stdout.

    Args:
        model_name (str): Name of the model (used in folder name)
        thought_type (str): Type of thought process ("", "empty", "transfer",
            "transfer_without_answer", etc.). Any value other than the two
            transfer types is appended to the folder name as a suffix.
        source_model (str): Source model name (required when thought_type is
            "transfer" or "transfer_without_answer")

    Returns:
        dict | None: ``None`` when the predictions folder does not exist.
        Otherwise a dict with the keys ``model_name``, ``source_model``,
        ``thought_type``, ``task_scores`` (task -> score), ``total_average``
        (``None`` when no task produced a valid score), ``tasks_processed``,
        ``total_tasks``, ``missing_files`` and ``folder_path``.

    Raises:
        ValueError: If a transfer thought type is requested without
            ``source_model``.
    """

    INDUCTION_TASKS = [
        'cause_and_effect', 'larger_animal', 'num_to_verbal', 'orthography_starts_with',
        'rhymes', 'synonyms', 'taxonomy_animal', 'translation_en-fr',
        'reverse_from_middle', 'smallest_item_length', 'smallest_even_no_sqrt', 'most_vowel_return_consonant',
        'detect_rhyme_and_rewrite', 'rank_by_protein', 'multi_lang_to_english', 'square_of_zodiac_animal',
        'alternate_synonym_antonym', 'most_consonant_return_vowel', 'least_unique_word_count', 'first_word_alphabetically_return_reverse'
    ]

    predictions_folder, folder_description = _resolve_predictions_folder(
        model_name, thought_type, source_model
    )

    task_scores = {}
    missing_files = []

    print(f"Calculating scores for: {folder_description}")
    print(f"Looking in folder: {predictions_folder}")
    print("-" * 50)

    # Check if folder exists
    if not predictions_folder.exists():
        print(f"Error: Folder '{predictions_folder}' does not exist!")
        return None

    # Process each task
    for task in INDUCTION_TASKS:
        file_path = predictions_folder / f"{task}_with_scores.json"

        if not file_path.exists():
            print(f"{task:<35}: File not found")
            missing_files.append(task)
            continue

        try:
            # JSON is UTF-8 by specification; do not rely on the locale default.
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except json.JSONDecodeError:
            print(f"{task:<35}: Error reading JSON file")
            missing_files.append(f"{task} (JSON error)")
            continue
        except Exception as e:
            # Best-effort report: one unreadable file must not abort the
            # whole run, so the error is printed and recorded instead.
            print(f"{task:<35}: Error - {str(e)}")
            missing_files.append(f"{task} (error)")
            continue

        # Extract weighted task score
        if not isinstance(data, dict) or 'weighted_task_score' not in data:
            print(f"{task:<35}: Missing 'weighted_task_score' key")
            missing_files.append(f"{task} (missing key)")
            continue

        score = data['weighted_task_score']
        if not isinstance(score, (int, float)):
            # A null/string score would otherwise crash the averaging below.
            print(f"{task:<35}: Non-numeric 'weighted_task_score' ({score!r})")
            missing_files.append(f"{task} (invalid score)")
            continue

        task_scores[task] = score

    print("-" * 50)

    # Calculate average
    if task_scores:
        total_average = sum(task_scores.values()) / len(task_scores)
        print(f"Total tasks processed: {len(task_scores)}/{len(INDUCTION_TASKS)}")
        print(f"TOTAL AVERAGE SCORE: {total_average:.4f}")

        if missing_files:
            print(f"\nMissing/Error files ({len(missing_files)}):")
            for missing in missing_files:
                print(f"  - {missing}")
    else:
        print("No valid scores found!")
        total_average = None

    return {
        'model_name': model_name,
        'source_model': source_model,
        'thought_type': thought_type,
        'task_scores': task_scores,
        'total_average': total_average,
        'tasks_processed': len(task_scores),
        'total_tasks': len(INDUCTION_TASKS),
        'missing_files': missing_files,
        'folder_path': str(predictions_folder)
    }

def compare_multiple_models(model_names, thought_type="", source_model=None):
    """
    Score several models under the same configuration and print a summary.

    Each model is passed to ``calculate_average_induction_scores``; models for
    which that call returns a falsy value are left out of the result. A
    summary table is printed only when at least two models produced a result.

    Args:
        model_names (list): List of model names to compare
        thought_type (str): Type of thought process
        source_model (str): Source model name (required when thought_type="transfer")

    Returns:
        dict: Maps each model name to its result dict.
    """
    banner = "=" * 60
    collected = {}

    for name in model_names:
        print(f"\n{banner}")
        outcome = calculate_average_induction_scores(name, thought_type, source_model)
        if not outcome:
            continue
        collected[name] = outcome

    # Nothing to compare with fewer than two results.
    if len(collected) <= 1:
        return collected

    print(f"\n{banner}")
    print("SUMMARY COMPARISON")
    print(banner)

    is_transfer = thought_type == "transfer"
    thought_label = thought_type or 'default'

    for name, outcome in collected.items():
        average = outcome['total_average']
        if average is None:
            print(f"{name:<35}: No valid scores")
            continue

        if is_transfer:
            label = f"{outcome['source_model']}→{name}"
        else:
            label = f"{name}({thought_label})"
        print(f"{label:<35}: {average:.4f} ({outcome['tasks_processed']}/{outcome['total_tasks']} tasks)")

    return collected

def compare_transfer_scenarios(source_models, target_models, thought_type="transfer"):
    """
    Compare multiple transfer scenarios (source → target combinations).

    Every (source, target) pair is evaluated, including pairs where the
    source and the target are the same model. A summary sorted by average
    score (highest first) is printed when at least two pairs produced a
    result.

    Args:
        source_models (list): List of source model names
        target_models (list): List of target model names
        thought_type (str): Thought type forwarded to
            ``calculate_average_induction_scores``

    Returns:
        dict: Maps ``"<source>_to_<target>"`` to the result dict of that pair.
    """
    def _sort_score(entry):
        # Results without a usable average sort as 0.
        average = entry[1]['total_average']
        return average if average else 0

    transfers = {}

    for src, tgt in ((s, t) for s in source_models for t in target_models):
        print(f"\n{'='*60}")
        print(f"TRANSFER: {src} → {tgt}")
        print("="*60)

        outcome = calculate_average_induction_scores(tgt, thought_type, src)
        if outcome:
            transfers[f"{src}_to_{tgt}"] = outcome

    # Summary of all transfers (needs at least two results).
    if len(transfers) <= 1:
        return transfers

    print(f"\n{'='*80}")
    print("TRANSFER SUMMARY")
    print("="*80)

    ranked = sorted(transfers.items(), key=_sort_score, reverse=True)

    for key, outcome in ranked:
        average = outcome['total_average']
        if average is None:
            print(f"{key:<45}: No valid scores")
            continue
        pair_label = f"{outcome['source_model']} → {outcome['model_name']}"
        print(f"{pair_label:<45}: {average:.4f} ({outcome['tasks_processed']}/{outcome['total_tasks']} tasks)")

    return transfers

# Example usage:
if __name__ == "__main__":
    # Example 1: Regular (non-transfer) evaluation of several models.
    # The thought type is appended to the folder name, so each model is read
    # from "predictions_<model>_<thought_type>".
    model_names = ["nvidia_Nemotron-Research-Reasoning-Qwen-1.5B", 
                   "open-thoughts_OpenThinker-7B",
                   "openai_gpt-oss-20b",
                   "Qwen_QwQ-32B", 
                   "BytedTsinghua-SIA_DAPO-Qwen-32B"]
    
    # Alternative thought type (disabled):
    # print("EMPTY THOUGHT EVALUATION:")
    #results_empty = compare_multiple_models(model_names, "ensemble_without_answer_gen_qwq_opent_eval_oss")
    # NOTE: despite its name, results_empty holds the "with_sampling_without_answer" run.
    results_empty = compare_multiple_models(model_names, "with_sampling_without_answer")
    
    # Example 2: Single transfer evaluation (disabled)
    # print(f"\n{'='*80}")
    # print("TRANSFER EVALUATION EXAMPLE:")
    # source_model = "Qwen_QwQ-32B"
    # target_model = "openai_gpt-oss-20b"
    # transfer_result = calculate_average_induction_scores(target_model, "transfer", source_model)
    
    # Example 3: Multiple transfer scenarios (disabled)
    # NOTE(review): the thought type below is written with spaces
    # ("transfer without answer"), which does not match the
    # "transfer_without_answer" branch of calculate_average_induction_scores;
    # as written it would be treated as a plain folder suffix and the source
    # model would be ignored. Confirm the intended value before re-enabling.
    # print(f"\n{'='*80}")
    # print("MULTIPLE TRANSFER SCENARIOS:")
    # source_models = ["BytedTsinghua-SIA_DAPO-Qwen-32B"]
    # target_models = ["nvidia_Nemotron-Research-Reasoning-Qwen-1.5B", 
    #                "open-thoughts_OpenThinker-7B",
    #                "openai_gpt-oss-20b",
    #                "Qwen_QwQ-32B", 
    #                "BytedTsinghua-SIA_DAPO-Qwen-32B"]
    # transfer_results = compare_transfer_scenarios(source_models, target_models, "transfer without answer")


Calculating scores for: nvidia_Nemotron-Research-Reasoning-Qwen-1.5B with with_sampling_without_answer thoughts
Looking in folder: predictions_nvidia_Nemotron-Research-Reasoning-Qwen-1.5B_with_sampling_without_answer
--------------------------------------------------
--------------------------------------------------
Total tasks processed: 20/20
TOTAL AVERAGE SCORE: 0.5593

Calculating scores for: open-thoughts_OpenThinker-7B with with_sampling_without_answer thoughts
Looking in folder: predictions_open-thoughts_OpenThinker-7B_with_sampling_without_answer
--------------------------------------------------
--------------------------------------------------
Total tasks processed: 20/20
TOTAL AVERAGE SCORE: 0.5648

Calculating scores for: openai_gpt-oss-20b with with_sampling_without_answer thoughts
Looking in folder: predictions_openai_gpt-oss-20b_with_sampling_without_answer
--------------------------------------------------
--------------------------------------------------
Total task