# Explainer Model Performance Analysis

This notebook analyzes the performance of different explainer models in generating explanations for SAE latent features. We compare models across multiple metrics including accuracy, F1 scores, token usage, and execution time.

## Recent Updates
- **Updated to 400 latents**: Changed from 100 to 400 latents to match the experiment configuration
- **Fixed directory pattern matching**: Now handles both old (with suffix) and new (without suffix) directory naming conventions
- **Simplified loading process**: Now uses functions from `result_analysis.py` to reduce redundancy
- **Enhanced model support**: Updated model name mapping for better display names and added GPT-OSS-20B

## Analysis Overview

- **Accuracy Distribution**: Density plots showing accuracy distribution for each model
- **Mean Performance**: Bar charts comparing mean accuracy across models
- **Token Usage**: Analysis of computational efficiency and resource consumption
- **Performance Summary**: Comprehensive comparison tables

## 1. Setup and Configuration

Import required libraries, define model name mapping for prettier display names, and set up output directories.

In [6]:
import sys
import os
import json
from pathlib import Path
import pandas as pd
import numpy as np

# Add the parent directory to the path to import delphi modules
sys.path.append(str(Path.cwd().parent))

from delphi.log.result_analysis import (
    import_plotly,
    load_data,
    get_agg_metrics,
    add_latent_f1,
    compute_confusion,
    compute_classification_metrics,
    frequency_weighted_f1
)

# Import plotly for plotting
px = import_plotly()

# Configuration - Model prefix for experiment naming
MODEL_PREFIX = "pythiaST"  # Change this to match your base model (e.g., "pythia", "gemma", etc.)
LATENTS_COUNT = 400  # Number of latents used in experiments
COMPARE_THINKING_MODES = False  # Set to True to compare both thinking and non-thinking modes (Currently broken if set to True)

# Define model name mapping for prettier display - updated to match actual directory names
MODEL_NAME_MAPPING = {
    "gemma_3_4b_it_quantized_w4a16": "Gemma-3-4B-IT",
    "Qwen3_4B_quantized_w4a16": "Qwen3-4B", 
    "gemma_3_12b_it_quantized_w4a16": "Gemma-3-12B-IT",
    "gemma_3_27b_it_quantized_w4a16": "Gemma-3-27B-IT",
    "Qwen3_14B_quantized_w4a16": "Qwen3-14B",
    "Qwen3_32B_quantized_w4a16": "Qwen3-32B",
    "Qwen3_235B_A22B_GPTQ_Int4": "Qwen3-235B",
    "Llama_3_3_70B_Instruct_quantized_w4a16": "Llama-3.3-70B-Instruct",
    "Llama_3_1_70B_Instruct_NVFP4": "Llama-3.1-70B-Instruct",
    "Llama_4_Scout_17B_16E_Instruct_quantized_w4a16": "Llama-4-Scout-17B",
    "Llama_4_Maverick_17B_128E_Instruct_quantized_w4a16": "Llama-4-Maverick-17B",
    "llama_8b_explainer": "Transluce-Llama-8B (Qwen 32B Scorer)",
    "Meta_Llama_3_1_8B_Instruct_GPTQ_INT4": "Llama-8B (Qwen 32B Scorer)",
    "gpt_oss_20b": "GPT-OSS-20B"
}

def load_model_results(results_dir: Path, model_mapping: dict, model_prefix: str, latents_count: int, compare_modes: bool = False):
    """Load results and statistics for all explainer models using functions from result_analysis.py."""
    model_results = {}
    model_stats = {}

    # Base directory for experiments with this latent count
    experiments_base = results_dir / model_prefix / f"{latents_count}latents"

    if not experiments_base.exists():
        print(f"Warning: Experiments directory not found: {experiments_base}")
        return model_results, model_stats

    # Pattern for experiment directories (exclude thinking mode and hide directories)
    # Handle both naming conventions: with and without latents suffix
    pattern_with_suffix = f"{model_prefix}_*_{latents_count}latents"
    pattern_without_suffix = f"{model_prefix}_*"
    
    all_exp_dirs = []
    # First try directories with latents suffix (older format)
    for exp_dir in experiments_base.glob(pattern_with_suffix):
        if not ("thinking" in exp_dir.name or "hide" in exp_dir.name):
            all_exp_dirs.append(exp_dir)
    
    # Then try directories without latents suffix (newer format)
    if not all_exp_dirs:
        for exp_dir in experiments_base.glob(pattern_without_suffix):
            if (not ("thinking" in exp_dir.name or "hide" in exp_dir.name) and 
                exp_dir.name != "cache"):  # Exclude cache directory
                all_exp_dirs.append(exp_dir)

    for exp_dir in all_exp_dirs:
        # Skip directories with "thinking" or "hide" in the name
        if "thinking" in exp_dir.name or "hide" in exp_dir.name:
            continue

        # Extract model name from directory: handle both naming conventions
        dir_name = exp_dir.name
        if dir_name.endswith(f"_{latents_count}latents"):
            # Old format: pythiaST_model_400latents
            model_key = dir_name.replace(f"{model_prefix}_", "").replace(f"_{latents_count}latents", "")
        else:
            # New format: pythiaST_model
            model_key = dir_name.replace(f"{model_prefix}_", "")
        
        base_display_name = model_mapping.get(model_key, model_key)
        display_name = base_display_name

        scores_path = exp_dir / "scores"
        if scores_path.exists():
            # Load scoring results using functions from result_analysis.py
            try:
                # All experiments now use the shared cache
                cache_dir_name = "cache"
                latents_path = experiments_base.parent / cache_dir_name / "latents"
                if not latents_path.exists():
                    latents_path = exp_dir / "latents"  # Fallback to local latents

                if latents_path.exists():
                    # Extract module names from the actual files
                    sample_score_dir = next(scores_path.iterdir())
                    sample_files = list(sample_score_dir.glob("*.txt"))
                    if sample_files:
                        # Extract module name from filename pattern (e.g., "layers.32_latent0.txt" -> "layers.32")
                        sample_filename = sample_files[0].stem
                        module_name = sample_filename.split('_latent')[0]
                        modules = [module_name]
                    else:
                        print(f"No score files found in {sample_score_dir}")
                        continue

                    # Use load_data from result_analysis.py
                    latent_df, counts = load_data(scores_path, latents_path, modules)

                    if latent_df.empty:
                        print(f"No data found for {display_name}")
                        continue

                    # Use add_latent_f1 and get_agg_metrics from result_analysis.py
                    latent_df = add_latent_f1(latent_df)
                    processed_df = get_agg_metrics(latent_df, counts)

                    model_results[display_name] = {
                        'latent_df': latent_df,
                        'processed_df': processed_df,
                        'counts': counts
                    }
                else:
                    print(f"Latents path not found for {display_name}")

            except Exception as e:
                print(f"Error loading results for {model_key}: {e}")
                continue

        # Load explainer statistics
        stats_file = exp_dir / "explainer_stats.json"
        if stats_file.exists():
            try:
                with open(stats_file, 'r') as f:
                    stats = json.load(f)
                    model_stats[display_name] = stats
            except Exception as e:
                print(f"Error loading stats for {model_key}: {e}")
                model_stats[display_name] = None
        else:
            model_stats[display_name] = None

    return model_results, model_stats

# Set up directories
results_dir = Path.cwd().parent / "results"
visualizations_dir = results_dir / "visualizations"
visualizations_dir.mkdir(exist_ok=True, parents=True)

print(f"Model prefix: {MODEL_PREFIX}")
print(f"Latents count: {LATENTS_COUNT}")
print(f"Compare thinking modes: {COMPARE_THINKING_MODES}")
print(f"Results directory: {results_dir}")
print(f"Visualizations output: {visualizations_dir}")

# Check if the experiments directory exists
experiments_dir = results_dir / MODEL_PREFIX / f"{LATENTS_COUNT}latents"
print(f"Experiments directory: {experiments_dir}")

if experiments_dir.exists():
    print(f"Available experiments ({LATENTS_COUNT} latents):")
    # Handle both naming conventions: with and without latents suffix
    pattern_with_suffix = f"{MODEL_PREFIX}_*_{LATENTS_COUNT}latents"
    pattern_without_suffix = f"{MODEL_PREFIX}_*"
    
    all_exp_dirs = []
    # First try directories with latents suffix (older format)
    for exp_dir in sorted(experiments_dir.glob(pattern_with_suffix)):
        if not ("thinking" in exp_dir.name or "hide" in exp_dir.name):
            all_exp_dirs.append(exp_dir)
    
    # Then try directories without latents suffix (newer format)
    if not all_exp_dirs:
        for exp_dir in sorted(experiments_dir.glob(pattern_without_suffix)):
            if (not ("thinking" in exp_dir.name or "hide" in exp_dir.name) and 
                exp_dir.name != "cache"):  # Exclude cache directory
                all_exp_dirs.append(exp_dir)
    
    for exp_dir in all_exp_dirs:
        print(f"  - {exp_dir.name}")
else:
    print(f"Warning: Experiments directory not found: {experiments_dir}")
    print("Make sure experiments have been run with the specified latent count.")

Model prefix: pythiaST
Latents count: 400
Compare thinking modes: False
Results directory: /home/jeremias/projects/delphi-explanations/results
Visualizations output: /home/jeremias/projects/delphi-explanations/results/visualizations
Experiments directory: /home/jeremias/projects/delphi-explanations/results/pythiaST/400latents
Available experiments (400 latents):
  - pythiaST_Llama_3_3_70B_Instruct_quantized_w4a16
  - pythiaST_Llama_4_Scout_17B_16E_Instruct_quantized_w4a16
  - pythiaST_Meta_Llama_3_1_8B_Instruct_GPTQ_INT4
  - pythiaST_Qwen3_14B_quantized_w4a16
  - pythiaST_Qwen3_32B_quantized_w4a16
  - pythiaST_Qwen3_4B_quantized_w4a16
  - pythiaST_gemma_3_12b_it_quantized_w4a16
  - pythiaST_gemma_3_27b_it_quantized_w4a16
  - pythiaST_gemma_3_4b_it_quantized_w4a16
  - pythiaST_gpt_oss_20b


## 2. Load Model Results

Load explanation comparison results from all models and extract performance metrics.

In [7]:
# Load all model results and statistics
print("Loading model results...")
model_results, model_stats = load_model_results(results_dir, MODEL_NAME_MAPPING, MODEL_PREFIX, LATENTS_COUNT, COMPARE_THINKING_MODES)

print(f"\nLoaded results for {len(model_results)} models:")
for model_name in model_results.keys():
    print(f"  - {model_name}")

print(f"\nToken usage statistics available for {len([k for k, v in model_stats.items() if v is not None])} models:")
for model_name, stats in model_stats.items():
    if stats:
        print(f"  - {model_name}: {list(stats.keys())}")
    else:
        print(f"  - {model_name}: No stats available")

# Display sample metrics for the first model
if model_results:
    sample_model = list(model_results.keys())[0]
    sample_data = model_results[sample_model]['processed_df']
    print(f"\nSample metrics from {sample_model}:")
    print(sample_data[['score_type', 'accuracy', 'f1_score', 'precision', 'recall']].round(3))

Loading model results...











































Loaded results for 10 models:
  - Qwen3-4B
  - Gemma-3-4B-IT
  - Gemma-3-12B-IT
  - Gemma-3-27B-IT
  - Qwen3-14B
  - Qwen3-32B
  - Llama-3.3-70B-Instruct
  - GPT-OSS-20B
  - Llama-8B (Qwen 32B Scorer)
  - Llama-4-Scout-17B

Token usage statistics available for 10 models:
  - Qwen3-4B: ['DefaultExplainer']
  - Gemma-3-4B-IT: ['DefaultExplainer']
  - Gemma-3-12B-IT: ['DefaultExplainer']
  - Gemma-3-27B-IT: ['DefaultExplainer']
  - Qwen3-14B: ['DefaultExplainer']
  - Qwen3-32B: ['DefaultExplainer']
  - Llama-3.3-70B-Instruct: ['DefaultExplainer']
  - GPT-OSS-20B: ['DefaultExplainer']
  - Llama-8B (Qwen 32B Scorer): ['DefaultExplainer']
  - Llama-4-Scout-17B: ['DefaultExplainer']

Sample metrics from Qwen3-4B:
  score_type  accuracy  f1_score  precision  recall
0  detection     0.757     0.755      0.763   0.747
1       fuzz     0.818     0.827      0.787   0.871


## 3. Generate Accuracy Distribution Plots

Create density plots showing accuracy distribution for each model and score type.

In [8]:
# Generate KDE density distribution plots for accuracy
print("Generating accuracy density distribution plots...")

# Import required libraries
import plotly.graph_objects as go
from scipy import stats

# First, let's check what data we have available
sample_model = list(model_results.keys())[0]
sample_latent_df = model_results[sample_model]['latent_df']
print(f"\nColumns in latent_df: {sample_latent_df.columns.tolist()}")
print(f"Sample data shape: {sample_latent_df.shape}")
print(f"Score types: {sample_latent_df['score_type'].unique()}")

# Define color and line style mapping by model family and mode
def get_model_style(model_name):
    """Get color and line style based on model family and thinking mode"""
    # Extract base model name (remove mode suffix)
    is_thinking = '(thinking)' in model_name
    base_name = model_name.replace(' (thinking)', '').replace(' (regular)', '')
    
    # Get base color by model family
    if 'Gemma' in base_name:
        base_color = 'green'
        # Different line styles for different Gemma models
        if '4B' in base_name:
            line_style = 'solid'
        elif '12B' in base_name:
            line_style = 'dash'
        elif '27B' in base_name:
            line_style = 'dot'
        else:
            line_style = 'dashdot'
    elif 'Qwen' in base_name:
        base_color = 'red'
        # Different line styles for different Qwen models
        if '4B' in base_name:
            line_style = 'solid'
        elif '14B' in base_name:
            line_style = 'dash'
        elif '32B' in base_name:
            line_style = 'dot'
        elif '235B' in base_name:
            line_style = 'longdash'
        else:
            line_style = 'dashdot'
    elif 'Llama' in base_name:
        base_color = 'blue'
        # Different line styles for different Llama models
        if '3.3' in base_name:
            line_style = 'solid'
        elif '3.1' in base_name:
            line_style = 'dash'
        elif 'Scout' in base_name:
            line_style = 'dot'
        elif 'Maverick' in base_name:
            line_style = 'dashdot'
        else:
            line_style = 'longdash'
    else:
        base_color = 'gray'
        line_style = 'solid'
    
    # Adjust color intensity based on thinking mode
    if is_thinking:
        # Darker colors for thinking mode
        color_map = {
            'green': 'darkgreen',
            'red': 'darkred', 
            'blue': 'darkblue',
            'gray': 'darkgray'
        }
        color = color_map.get(base_color, 'darkgray')
    else:
        # Lighter colors for regular mode
        color_map = {
            'green': 'lightgreen',
            'red': 'lightcoral',
            'blue': 'lightblue', 
            'gray': 'lightgray'
        }
        color = color_map.get(base_color, 'lightgray')
    
    return color, line_style

# Create accuracy_df for bar charts (aggregate data)
all_data = []
for model_name, data in model_results.items():
    for score_type in data['processed_df']['score_type'].unique():
        score_data = data['processed_df'][data['processed_df']['score_type'] == score_type].iloc[0]
        all_data.append({
            'model': model_name,
            'score_type': score_type,
            'accuracy': score_data['accuracy'],
            'f1_score': score_data['f1_score'],
            'precision': score_data['precision'],
            'recall': score_data['recall']
        })

accuracy_df = pd.DataFrame(all_data)
print("\nAccuracy data summary:")
print(accuracy_df.groupby('score_type')[['accuracy', 'f1_score']].describe().round(3))

# Check if we have individual accuracy measurements for KDE plots
if 'correct' in sample_latent_df.columns:
    print("Found 'correct' column - will calculate accuracy per latent for KDE plots")
    
    # Create KDE density plots for each score type
    for score_type in sample_latent_df['score_type'].unique():
        print(f"\nCreating KDE density plot for {score_type}...")
        
        # Initialize counters for data quality tracking
        nan_counter = 0
        inf_counter = 0
        total_points = 0
        
        # Collect accuracy data for all models for this score type
        plot_data = []
        for model_name, data in model_results.items():
            latent_df = data['latent_df']
            score_subset = latent_df[latent_df['score_type'] == score_type]
            
            if len(score_subset) > 0:
                # Calculate accuracy per latent (group by latent_idx if available, or use individual rows)
                if 'latent_idx' in score_subset.columns:
                    # Group by latent and calculate accuracy
                    latent_accuracies = score_subset.groupby('latent_idx')['correct'].mean()
                    for accuracy in latent_accuracies:
                        total_points += 1
                        if pd.isna(accuracy):
                            nan_counter += 1
                        elif not np.isfinite(accuracy):
                            inf_counter += 1
                        else:
                            plot_data.append({
                                'model': model_name,
                                'accuracy': accuracy
                            })
                else:
                    # Use individual correct values as accuracy (0 or 1)
                    for _, row in score_subset.iterrows():
                        total_points += 1
                        accuracy = float(row['correct'])
                        if pd.isna(accuracy):
                            nan_counter += 1
                        elif not np.isfinite(accuracy):
                            inf_counter += 1
                        else:
                            plot_data.append({
                                'model': model_name,
                                'accuracy': accuracy
                            })
        
        if plot_data:
            plot_df = pd.DataFrame(plot_data)
            
            # Create KDE line density plot
            fig_line = go.Figure()
            
            for model in sorted(plot_df['model'].unique()):
                model_data = plot_df[plot_df['model'] == model]['accuracy']
                
                # Clean the data: remove NaN and infinite values
                model_data_clean = model_data.dropna()
                model_data_clean = model_data_clean[np.isfinite(model_data_clean)]
                
                if len(model_data_clean) > 1:  # Need multiple points for KDE
                    # Check if data has sufficient variance for KDE
                    if model_data_clean.std() > 1e-10:  # Check for non-zero variance
                        # Get color and line style based on model family
                        color, line_style = get_model_style(model)
                        
                        try:
                            # Create kernel density estimation
                            kde = stats.gaussian_kde(model_data_clean)
                            x_range = np.linspace(0, 1, 100)
                            density = kde(x_range)
                            
                            fig_line.add_trace(go.Scatter(
                                x=x_range,
                                y=density,
                                mode='lines',
                                name=model,
                                line=dict(
                                    width=3,
                                    color=color,
                                    dash=line_style
                                )
                            ))
                        except Exception as e:
                            print(f"Warning: Could not create KDE for {model}: {e}")
                            print(f"Data summary for {model}: min={model_data_clean.min():.3f}, max={model_data_clean.max():.3f}, std={model_data_clean.std():.6f}")
                    else:
                        print(f"Warning: {model} has constant accuracy values, skipping KDE plot")
                        print(f"Constant value: {model_data_clean.iloc[0]:.3f}")
                elif len(model_data_clean) == 1:
                    print(f"Warning: {model} has only one data point, skipping KDE plot")
                else:
                    print(f"Warning: {model} has no valid data points after cleaning")
            
            fig_line.update_layout(
                title=f'Accuracy Density Distribution - {score_type.title()}',
                xaxis_title="Accuracy",
                yaxis_title="Density",
                height=500,
                legend=dict(
                    yanchor="top",
                    y=0.99,
                    xanchor="left",
                    x=0.01
                ),
                font=dict(size=12)
            )
            
            # Show KDE density plot
            fig_line.show()
            
            # Print data quality summary
            valid_points = len(plot_data)
            print(f"\nData Quality Summary for {score_type}:")
            print(f"  Total data points processed: {total_points}")
            print(f"  Valid data points used: {valid_points}")
            print(f"  NaN values found: {nan_counter}")
            print(f"  Infinite values found: {inf_counter}")
            print(f"  Data quality rate: {(valid_points/total_points*100):.1f}%" if total_points > 0 else "  No data processed")
            
            # Save KDE density plot in both PDF and PNG formats
            output_file_pdf = visualizations_dir / f"accuracy_density_{score_type}.pdf"
            output_file_png = visualizations_dir / f"accuracy_density_{score_type}.png"
            fig_line.write_image(str(output_file_pdf))
            fig_line.write_image(str(output_file_png))
            print(f"Saved KDE density plot: {output_file_pdf}")
            print(f"Saved KDE density plot: {output_file_png}")

else:
    print("No 'correct' column found - cannot create density plots with individual measurements")
    print("Need individual accuracy measurements to create meaningful density distributions")

Generating accuracy density distribution plots...

Columns in latent_df: ['text', 'distance', 'activating', 'prediction', 'probability', 'correct', 'activations', 'latent_idx', 'score_type', 'module', 'firing_count', 'f1_score']
Sample data shape: (118400, 12)
Score types: ['fuzz' 'detection']

Accuracy data summary:
           accuracy                                                  f1_score  \
              count   mean    std    min    25%    50%    75%    max    count   
score_type                                                                      
detection      10.0  0.746  0.067  0.600  0.738  0.770  0.792  0.804     10.0   
fuzz           10.0  0.804  0.102  0.567  0.812  0.833  0.859  0.892     10.0   

                                                             
             mean    std    min    25%    50%    75%    max  
score_type                                                   
detection   0.752  0.055  0.614  0.739  0.765  0.786  0.807  
fuzz        0.806  0.084  0


Data Quality Summary for fuzz:
  Total data points processed: 2960
  Valid data points used: 2838
  NaN values found: 122
  Infinite values found: 0
  Data quality rate: 95.9%




Support for Kaleido versions less than 1.0.0 is deprecated and will be removed after September 2025.
Please upgrade Kaleido to version 1.0.0 or greater (`pip install 'kaleido>=1.0.0'` or `pip install 'plotly[kaleido]'`).




Support for Kaleido versions less than 1.0.0 is deprecated and will be removed after September 2025.
Please upgrade Kaleido to version 1.0.0 or greater (`pip install 'kaleido>=1.0.0'` or `pip install 'plotly[kaleido]'`).




Support for Kaleido versions less than 1.0.0 is deprecated and will be removed after September 2025.
Please upgrade Kaleido to version 1.0.0 or greater (`pip install 'kaleido>=1.0.0'` or `pip install 'plotly[kaleido]'`).




Saved KDE density plot: /home/jeremias/projects/delphi-explanations/results/visualizations/accuracy_density_fuzz.pdf
Saved KDE density plot: /home/jeremias/projects/delphi-explanations/results/visualizations/accuracy_density_fuzz.png

Creating KDE density plot for detection...



Data Quality Summary for detection:
  Total data points processed: 2960
  Valid data points used: 2849
  NaN values found: 111
  Infinite values found: 0
  Data quality rate: 96.2%
Saved KDE density plot: /home/jeremias/projects/delphi-explanations/results/visualizations/accuracy_density_detection.pdf
Saved KDE density plot: /home/jeremias/projects/delphi-explanations/results/visualizations/accuracy_density_detection.png


## 4. Generate Mean F1 Score Bar Charts

Create bar charts displaying mean F1 scores with error bars for each model and score type.

In [9]:
# Generate inline bar charts for frequency-weighted F1 scores with error bars and save to files
print("Generating frequency-weighted F1 score bar charts with error bars...")

# Create inline bar charts for each score type
for score_type in accuracy_df['score_type'].unique():
    error_data = []
    for model_name in accuracy_df[accuracy_df['score_type'] == score_type]['model'].unique():
        if model_name in model_results:
            # Get the frequency-weighted F1 from the processed data (already calculated by get_agg_metrics)
            processed_df = model_results[model_name]['processed_df']
            score_data = processed_df[processed_df['score_type'] == score_type]
            if len(score_data) > 0:
                freq_weighted_f1 = score_data['weighted_f1'].iloc[0]
                # Calculate standard deviation of F1 scores per latent for error bars
                latent_df = model_results[model_name]['latent_df']
                score_subset = latent_df[latent_df['score_type'] == score_type]
                if len(score_subset) > 0 and 'f1_score' in score_subset.columns:
                    f1_std = score_subset['f1_score'].std()
                else:
                    f1_std = 0
                error_data.append({
                    'model': model_name,
                    'f1_std': f1_std,
                    'frequency_weighted_f1': freq_weighted_f1
                })
    error_df = pd.DataFrame(error_data)
    # Sort by frequency-weighted F1
    error_df = error_df.sort_values('frequency_weighted_f1', ascending=False)
    # Create inline bar chart with error bars
    fig = px.bar(
        error_df,
        x='model',
        y='frequency_weighted_f1',
        title=f'Frequency-Weighted F1 Score by Model - {score_type.title()}',
        text='frequency_weighted_f1',
        error_y='f1_std'
    )
    fig.update_layout(
        yaxis_range=[0, 1],
        xaxis_title="Model",
        yaxis_title="Frequency-Weighted F1 Score",
        xaxis={'tickangle': 45},
        height=500
    )
    fig.update_traces(texttemplate='%{text:.3f}', textposition='outside')
    fig.show()
    output_file_pdf = visualizations_dir / f"freq_weighted_f1_bar_{score_type}.pdf"
    output_file_png = visualizations_dir / f"freq_weighted_f1_bar_{score_type}.png"
    fig.write_image(str(output_file_pdf))
    fig.write_image(str(output_file_png))
    print(f"Saved Frequency-Weighted F1 bar chart: {output_file_pdf}")
    print(f"Saved Frequency-Weighted F1 bar chart: {output_file_png}")

print("\nBar charts saved to", visualizations_dir)

Generating frequency-weighted F1 score bar charts with error bars...




Support for Kaleido versions less than 1.0.0 is deprecated and will be removed after September 2025.
Please upgrade Kaleido to version 1.0.0 or greater (`pip install 'kaleido>=1.0.0'` or `pip install 'plotly[kaleido]'`).




Support for Kaleido versions less than 1.0.0 is deprecated and will be removed after September 2025.
Please upgrade Kaleido to version 1.0.0 or greater (`pip install 'kaleido>=1.0.0'` or `pip install 'plotly[kaleido]'`).




Saved Frequency-Weighted F1 bar chart: /home/jeremias/projects/delphi-explanations/results/visualizations/freq_weighted_f1_bar_detection.pdf
Saved Frequency-Weighted F1 bar chart: /home/jeremias/projects/delphi-explanations/results/visualizations/freq_weighted_f1_bar_detection.png


Saved Frequency-Weighted F1 bar chart: /home/jeremias/projects/delphi-explanations/results/visualizations/freq_weighted_f1_bar_fuzz.pdf
Saved Frequency-Weighted F1 bar chart: /home/jeremias/projects/delphi-explanations/results/visualizations/freq_weighted_f1_bar_fuzz.png

Bar charts saved to /home/jeremias/projects/delphi-explanations/results/visualizations




Support for Kaleido versions less than 1.0.0 is deprecated and will be removed after September 2025.
Please upgrade Kaleido to version 1.0.0 or greater (`pip install 'kaleido>=1.0.0'` or `pip install 'plotly[kaleido]'`).




Support for Kaleido versions less than 1.0.0 is deprecated and will be removed after September 2025.
Please upgrade Kaleido to version 1.0.0 or greater (`pip install 'kaleido>=1.0.0'` or `pip install 'plotly[kaleido]'`).




## 5. Performance Summary

Display a summary table of F1 scores for each model and score type, including mean and standard deviation.

In [10]:
# Create comprehensive performance summary using data from get_agg_metrics
print("Creating comprehensive performance summary...")

# Create frequency-weighted F1 score summary using processed data
summary_rows = []
for model_name, model_data in model_results.items():
    processed_df = model_data['processed_df']
    # Use the frequency-weighted F1 from the processed data (averaged across score types)
    freq_weighted_f1 = processed_df['weighted_f1'].mean() if 'weighted_f1' in processed_df.columns else None
    summary_rows.append({
        'model': model_name,
        'frequency_weighted_f1': freq_weighted_f1,
        'accuracy': processed_df['accuracy'].mean(),
        'precision': processed_df['precision'].mean(),
        'recall': processed_df['recall'].mean()
    })
summary_df = pd.DataFrame(summary_rows).set_index('model').round(3)

print("\nModel Performance Summary (Frequency-Weighted F1 Focus):")
print("=" * 60)
print(summary_df.sort_values('frequency_weighted_f1', ascending=False))

# Save summary to CSV
summary_file = visualizations_dir / "model_frequency_weighted_f1_summary.csv"
summary_df.to_csv(summary_file)
print(f"\nSummary saved to: {summary_file}")

# Best performing models by category
print("\nBest Performing Models by Category:")
print("=" * 50)
print(f"Highest Frequency-Weighted F1: {summary_df['frequency_weighted_f1'].idxmax()} ({summary_df['frequency_weighted_f1'].max():.3f})")
print(f"Highest Accuracy: {summary_df['accuracy'].idxmax()} ({summary_df['accuracy'].max():.3f})")
print(f"Highest Precision: {summary_df['precision'].idxmax()} ({summary_df['precision'].max():.3f})")
print(f"Highest Recall: {summary_df['recall'].idxmax()} ({summary_df['recall'].max():.3f})")

print(f"\nAll visualizations and summaries saved to: {visualizations_dir}")
print("\nGenerated files:")
for file in sorted(visualizations_dir.glob("*")):
    print(f"  - {file.name}")

Creating comprehensive performance summary...

Model Performance Summary (Frequency-Weighted F1 Focus):
                            frequency_weighted_f1  accuracy  precision  recall
model                                                                         
Qwen3-32B                                   0.719     0.845      0.855   0.831
Gemma-3-27B-IT                              0.715     0.833      0.817   0.856
Qwen3-4B                                    0.686     0.787      0.775   0.809
Llama-4-Scout-17B                           0.674     0.813      0.826   0.793
Llama-3.3-70B-Instruct                      0.651     0.842      0.870   0.806
Gemma-3-12B-IT                              0.642     0.788      0.816   0.750
Qwen3-14B                                   0.627     0.813      0.877   0.731
Llama-8B (Qwen 32B Scorer)                  0.607     0.775      0.808   0.735
Gemma-3-4B-IT                               0.596     0.584      0.567   0.733
GPT-OSS-20B                