# Model Evaluation Framework

## 1. Setup: Imports and Configuration

In [None]:
%pip install datasets evaluate transformers accelerate bitsandbytes torch pandas matplotlib seaborn tqdm ipywidgets -q

In [None]:
import os
import json
import importlib

# Import refactored components
from config_manager import ConfigManager
from evaluation_framework import EvaluationFramework
from results_visualizer import ResultsVisualizer

# --- Configuration ---

# The ConfigManager provides the base configuration; the base model name
# is read from its 'MODEL_NAME' entry.
config_manager = ConfigManager()
base_config = config_manager.get_base_config()
BASE_MODEL_NAME = base_config['MODEL_NAME']

# Final saved model directories written by the training runs. Each entry is
# either an absolute path or a path relative to the project root.
MODEL_PATHS_TO_EVALUATE = [
    # --- Register your trained model directories here ---
    # Expected shape of an entry:
    # "training_outputs/finetune_deepseek_original_lila/final_model",
    # "training_outputs/gradient_ascent_deepseek_lila/final_model",

    # Fine-tuning runs:
    "training_outputs/finetune_DeepSeek-R1-Distill-Qwen-1.5B_original_lila_MATH_algebra_crowdsourced/final_model",
    "training_outputs/finetune_DeepSeek-R1-Distill-Qwen-1.5B_scrambled_lila_MATH_algebra_crowdsourced/final_model",
    "training_outputs/finetune_DeepSeek-R1-Distill-Qwen-1.5B_val_modified_lila_MATH_algebra_crowdsourced/final_model",
    "training_outputs/finetune_DeepSeek-R1-Distill-Qwen-1.5B_length_val_modified_lila_MATH_algebra_crowdsourced/final_model",
    # Gradient-ascent runs:
    "training_outputs/gradient_ascent_DeepSeek-R1-Distill-Qwen-1.5B/final_model",
    "training_outputs/gradient_ascent_reduced_EOS_DeepSeek-R1-Distill-Qwen-1.5B/final_model",
]

# The base model is listed first, followed by every trained checkpoint.
all_model_identifiers = [BASE_MODEL_NAME, *MODEL_PATHS_TO_EVALUATE]

# Evaluation parameters. The batch size and token budget fall back to the
# literal values below when the base config has no matching entry.
# NUM_EXAMPLES_PER_SPLIT: None evaluates everything; use an integer
# (e.g., 100) for a quicker test run.
NUM_EXAMPLES_PER_SPLIT = None
INFERENCE_BATCH_SIZE = base_config.get('DEFAULT_INFERENCE_BATCH_SIZE', 4)
MAX_NEW_TOKENS = base_config.get('MAX_NEW_TOKENS_MATH', 1024)
# 'think' or 'no_think'; should match how the models were trained / the
# prompts they expect.
INFERENCE_STYLE = 'think'

# Output file names for the aggregate and the detailed results.
RESULTS_FILE = "evaluation_results_aggregate.json"
DETAIL_RESULTS_FILE = "evaluation_results_detailed.json"

# Echo the effective settings so they are visible in the cell output.
print(f"Base model: {BASE_MODEL_NAME}")
print(f"Models to evaluate: {all_model_identifiers}")
print(f"Dataset: {base_config['BASE_DATASET_NAME']} ({base_config['BASE_DATASET_CONFIG']})")
print(f"Num examples per split: {NUM_EXAMPLES_PER_SPLIT if NUM_EXAMPLES_PER_SPLIT is not None else 'Full'}")
print(f"Inference Batch Size: {INFERENCE_BATCH_SIZE}")
print(f"Max new tokens: {MAX_NEW_TOKENS}")
print(f"Inference Style: {INFERENCE_STYLE}")

## 2. Run Evaluation

In [None]:
# Build the evaluation framework on top of the shared configuration manager.
eval_framework = EvaluationFramework(config_manager)

# One call covers every model identifier; the return value is unpacked into
# the aggregate results and the detailed results.
(aggregate_results, detailed_results) = eval_framework.run_evaluation(
    model_identifiers=all_model_identifiers,
    num_examples=NUM_EXAMPLES_PER_SPLIT,
    inference_batch_size=INFERENCE_BATCH_SIZE,
    max_new_tokens=MAX_NEW_TOKENS,
    inference_style=INFERENCE_STYLE,
)

## 3. Save Results

In [None]:
# Aggregate results: nothing is written when the run produced none.
if not aggregate_results:
    print("Skipping saving aggregate results as none were generated.")
else:
    eval_framework.save_results(aggregate_results, RESULTS_FILE)

# Detailed results are optional and can be large; same rule applies.
if not detailed_results:
    print("Skipping saving detailed results as none were generated.")
else:
    eval_framework.save_results(detailed_results, DETAIL_RESULTS_FILE)

## 4. Visualize Results

In [None]:
# Read the aggregate results back from the results file before plotting.
loaded_agg_results = eval_framework.load_results(RESULTS_FILE)

if not loaded_agg_results:
    # Nothing usable came back from the file, so there is nothing to plot.
    print("Cannot visualize results as aggregate results file could not be loaded.")
else:
    # Hand the loaded data to the visualizer.
    visualizer = ResultsVisualizer(loaded_agg_results)

    # Comparison bar chart, titled with the dataset configuration in use.
    plot_title = f'Model Performance on LILA ({base_config["BASE_DATASET_CONFIG"]})'
    visualizer.plot_comparison_bar_chart(title=plot_title)

    # Optional per-split charts; uncomment to draw them:
    # visualizer.plot_split_bar_chart('validation', title=f'Validation Accuracy on LILA ({base_config["BASE_DATASET_CONFIG"]})')
    # visualizer.plot_split_bar_chart('test', title=f'Test Accuracy on LILA ({base_config["BASE_DATASET_CONFIG"]})', palette='Greens_d')