# Explainer Model Performance Analysis

This notebook analyzes the performance of different explainer models in generating explanations for SAE latent features. We compare models across multiple metrics including accuracy, F1 scores, token usage, and execution time.

## Analysis Overview

- **Accuracy Distribution**: Density plots showing accuracy distribution for each model
- **Mean Performance**: Bar charts comparing mean accuracy across models
- **Token Usage**: Analysis of computational efficiency and resource consumption
- **Performance Summary**: Comprehensive comparison tables

## 1. Setup and Configuration

Import required libraries, define model name mapping for prettier display names, and set up output directories.

In [26]:
import sys
import os
import json
from pathlib import Path
import pandas as pd
import numpy as np

# Add the parent directory to the path to import delphi modules
sys.path.append(str(Path.cwd().parent))

from delphi.log.result_analysis import (
    import_plotly,
    load_data
)

# Import plotly for plotting
px = import_plotly()

# Configuration - Model prefix for experiment naming
MODEL_PREFIX = "pythiaST"  # Change this to match your base model (e.g., "pythia", "gemma", etc.)
COMPARE_THINKING_MODES = False  # Set to True to compare both thinking and non-thinking modes

# Define model name mapping for prettier display - updated to match actual directory names
MODEL_NAME_MAPPING = {
    "gemma_3_4b_it_quantized_w4a16": "Gemma-3-4B-IT",
    "Qwen3_4B_quantized_w4a16": "Qwen3-4B", 
    "gemma_3_12b_it_quantized_w4a16": "Gemma-3-12B-IT",
    "gemma_3_27b_it_quantized_w4a16": "Gemma-3-27B-IT",
    "Qwen3_14B_quantized_w4a16": "Qwen3-14B",
    "Qwen3_32B_quantized_w4a16": "Qwen3-32B",
    "Qwen3_235B_A22B_GPTQ_Int4": "Qwen3-235B",
    "Llama_3_3_70B_Instruct_quantized_w4a16": "Llama-3.3-70B-Instruct",
    "Llama_3_1_70B_Instruct_NVFP4": "Llama-3.1-70B-Instruct",
    "Llama_4_Scout_17B_16E_Instruct_quantized_w4a16": "Llama-4-Scout-17B",
    "Llama_4_Maverick_17B_128E_Instruct_quantized_w4a16": "Llama-4-Maverick-17B",
    "llama_8b_explainer": "Transluce-Explainer-Llama-8B"
}

def load_model_results(results_dir: Path, model_mapping: dict, model_prefix: str, compare_modes: bool = False):
    """Load results and statistics for all explainer models."""
    model_results = {}
    model_stats = {}
    
    # If comparing modes, load both thinking and non-thinking experiments
    modes_to_load = []
    if compare_modes:
        modes_to_load = [
            ("thinking", f"{model_prefix}ST_*_thinking_explanation_comparison", f"{model_prefix}ST_", "_thinking_explanation_comparison"),
            ("regular", f"{model_prefix}ST_*_explanation_comparison", f"{model_prefix}_", "_explanation_comparison")
        ]
    else:
        # Legacy single mode support
        modes_to_load = [
            ("single", f"{model_prefix}_*_explanation_comparison", f"{model_prefix}_", "_explanation_comparison")
        ]
    
    for mode_name, pattern, prefix_to_remove, suffix_to_remove in modes_to_load:
        for exp_dir in results_dir.glob(pattern):
            # Extract model name from directory: remove prefix and suffix
            dir_name = exp_dir.name
            model_key = dir_name.replace(prefix_to_remove, "").replace(suffix_to_remove, "")
            base_display_name = model_mapping.get(model_key, model_key)
            
            # Add mode suffix to distinguish between thinking and regular modes
            if compare_modes:
                display_name = f"{base_display_name} ({mode_name})"
            else:
                display_name = base_display_name
            
            scores_path = exp_dir / "scores"
            if scores_path.exists():
                # Load scoring results
                try:
                    # All experiments now use the "st" cache
                    cache_dir_name = "pythia-160m-st"  # All experiments use thinking mode cache
                    latents_path = exp_dir.parent / cache_dir_name / "latents"
                    if not latents_path.exists():
                        latents_path = exp_dir / "latents"  # Fallback to local latents
                    
                    if latents_path.exists():
                        # Extract module names from the actual files
                        sample_score_dir = next(scores_path.iterdir())
                        sample_files = list(sample_score_dir.glob("*.txt"))
                        if sample_files:
                            # Extract module name from filename pattern (e.g., "layers.32_latent0.txt" -> "layers.32")
                            sample_filename = sample_files[0].stem
                            module_name = sample_filename.split('_latent')[0]
                            modules = [module_name]
                        else:
                            print(f"No score files found in {sample_score_dir}")
                            continue
                        
                        latent_df, counts = load_data(scores_path, latents_path, modules)
                        
                        # Calculate aggregate metrics similar to get_agg_metrics
                        processed_data = []
                        for score_type in latent_df["score_type"].unique():
                            score_subset = latent_df[latent_df["score_type"] == score_type]
                            
                            # Calculate metrics
                            accuracy = score_subset["correct"].mean()
                            
                            # Calculate F1, precision, recall
                            true_pos = ((score_subset["prediction"] == True) & (score_subset["activating"] == True)).sum()
                            false_pos = ((score_subset["prediction"] == True) & (score_subset["activating"] == False)).sum()
                            false_neg = ((score_subset["prediction"] == False) & (score_subset["activating"] == True)).sum()
                            
                            precision = true_pos / (true_pos + false_pos) if (true_pos + false_pos) > 0 else 0
                            recall = true_pos / (true_pos + false_neg) if (true_pos + false_neg) > 0 else 0
                            f1_score = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
                            
                            processed_data.append({
                                'score_type': score_type,
                                'accuracy': accuracy,
                                'f1_score': f1_score,
                                'precision': precision,
                                'recall': recall
                            })
                        
                        model_results[display_name] = {
                            'latent_df': latent_df,
                            'processed_df': pd.DataFrame(processed_data),
                            'counts': counts
                        }
                    
                except Exception as e:
                    print(f"Error loading results for {model_key}: {e}")
                    continue
            
            # Load explainer statistics
            stats_file = exp_dir / "explainer_stats.json"
            if stats_file.exists():
                try:
                    with open(stats_file, 'r') as f:
                        stats = json.load(f)
                        model_stats[display_name] = stats
                except Exception as e:
                    print(f"Error loading stats for {model_key}: {e}")
                    model_stats[display_name] = None
            else:
                model_stats[display_name] = None
    
    return model_results, model_stats

# Set up directories
results_dir = Path.cwd().parent / "results"
visualizations_dir = results_dir / "visualizations"
visualizations_dir.mkdir(exist_ok=True, parents=True)

print(f"Model prefix: {MODEL_PREFIX}")
print(f"Compare thinking modes: {COMPARE_THINKING_MODES}")
print(f"Results directory: {results_dir}")
print(f"Visualizations output: {visualizations_dir}")
print(f"Available result directories:")
if COMPARE_THINKING_MODES:
    print("  Thinking mode experiments:")
    for d in results_dir.glob(f"{MODEL_PREFIX}ST_*_thinking_explanation_comparison"):
        print(f"    - {d.name}")
    print("  Regular mode experiments:")
    for d in results_dir.glob(f"{MODEL_PREFIX}ST_*_explanation_comparison"):
        print(f"    - {d.name}")
else:
    pattern = f"{MODEL_PREFIX}_*_explanation_comparison"
    for d in results_dir.glob(pattern):
        print(f"  - {d.name}")

Model prefix: pythiaST
Compare thinking modes: False
Results directory: /home/jeremias/projects/delphi-explanations/results
Visualizations output: /home/jeremias/projects/delphi-explanations/results/visualizations
Available result directories:
  - pythiaST_gemma_3_4b_it_quantized_w4a16_explanation_comparison
  - pythiaST_Qwen3_4B_quantized_w4a16_explanation_comparison
  - pythiaST_gemma_3_12b_it_quantized_w4a16_explanation_comparison
  - pythiaST_gemma_3_27b_it_quantized_w4a16_explanation_comparison
  - pythiaST_Qwen3_14B_quantized_w4a16_explanation_comparison
  - pythiaST_Qwen3_32B_quantized_w4a16_explanation_comparison
  - pythiaST_Llama_3_3_70B_Instruct_quantized_w4a16_explanation_comparison
  - pythiaST_Llama_3_1_70B_Instruct_NVFP4_explanation_comparison
  - pythiaST_Llama_4_Scout_17B_16E_Instruct_quantized_w4a16_explanation_comparison
  - pythiaST_Llama_4_Maverick_17B_128E_Instruct_quantized_w4a16_explanation_comparison
  - pythiaST_Qwen3_235B_A22B_GPTQ_Int4_explanation_comparison

## 2. Load Model Results

Load explanation comparison results from all models and extract performance metrics.

In [27]:
# Load all model results and statistics
print("Loading model results...")
model_results, model_stats = load_model_results(results_dir, MODEL_NAME_MAPPING, MODEL_PREFIX, COMPARE_THINKING_MODES)

print(f"\nLoaded results for {len(model_results)} models:")
for model_name in model_results.keys():
    print(f"  - {model_name}")

print(f"\nToken usage statistics available for {len([k for k, v in model_stats.items() if v is not None])} models:")
for model_name, stats in model_stats.items():
    if stats:
        print(f"  - {model_name}: {list(stats.keys())}")
    else:
        print(f"  - {model_name}: No stats available")

# Display sample metrics for the first model
if model_results:
    sample_model = list(model_results.keys())[0]
    sample_data = model_results[sample_model]['processed_df']
    print(f"\nSample metrics from {sample_model}:")
    print(sample_data[['score_type', 'accuracy', 'f1_score', 'precision', 'recall']].round(3))

Loading model results...



Loaded results for 12 models:
  - Gemma-3-4B-IT
  - Qwen3-4B
  - Gemma-3-12B-IT
  - Gemma-3-27B-IT
  - Qwen3-14B
  - Qwen3-32B
  - Llama-3.3-70B-Instruct
  - Llama-3.1-70B-Instruct
  - Llama-4-Scout-17B
  - Llama-4-Maverick-17B
  - Qwen3-235B
  - Transluce-Explainer-Llama-8B

Token usage statistics available for 12 models:
  - Gemma-3-4B-IT: ['DefaultExplainer']
  - Qwen3-4B: ['DefaultExplainer']
  - Gemma-3-12B-IT: ['DefaultExplainer']
  - Gemma-3-27B-IT: ['DefaultExplainer']
  - Qwen3-14B: ['DefaultExplainer']
  - Qwen3-32B: ['DefaultExplainer']
  - Llama-3.3-70B-Instruct: ['DefaultExplainer']
  - Llama-3.1-70B-Instruct: ['DefaultExplainer']
  - Llama-4-Scout-17B: ['DefaultExplainer']
  - Llama-4-Maverick-17B: ['DefaultExplainer']
  - Qwen3-235B: ['DefaultExplainer']
  - Transluce-Explainer-Llama-8B: ['DefaultExplainer']

Sample metrics from Gemma-3-4B-IT:
  score_type  accuracy  f1_score  precision  recall
0       fuzz     0.559     0.654      0.537   0.835
1  detection     0.595  

## 3. Generate Accuracy Distribution Plots

Create density plots showing accuracy distribution for each model and score type.

In [28]:
# Generate KDE density distribution plots for accuracy
print("Generating accuracy density distribution plots...")

# Import required libraries
import plotly.graph_objects as go
from scipy import stats

# First, let's check what data we have available
sample_model = list(model_results.keys())[0]
sample_latent_df = model_results[sample_model]['latent_df']
print(f"\nColumns in latent_df: {sample_latent_df.columns.tolist()}")
print(f"Sample data shape: {sample_latent_df.shape}")
print(f"Score types: {sample_latent_df['score_type'].unique()}")

# Define color and line style mapping by model family and mode
def get_model_style(model_name):
    """Get color and line style based on model family and thinking mode"""
    # Extract base model name (remove mode suffix)
    is_thinking = '(thinking)' in model_name
    base_name = model_name.replace(' (thinking)', '').replace(' (regular)', '')
    
    # Get base color by model family
    if 'Gemma' in base_name:
        base_color = 'green'
        # Different line styles for different Gemma models
        if '4B' in base_name:
            line_style = 'solid'
        elif '12B' in base_name:
            line_style = 'dash'
        elif '27B' in base_name:
            line_style = 'dot'
        else:
            line_style = 'dashdot'
    elif 'Qwen' in base_name:
        base_color = 'red'
        # Different line styles for different Qwen models
        if '4B' in base_name:
            line_style = 'solid'
        elif '14B' in base_name:
            line_style = 'dash'
        elif '32B' in base_name:
            line_style = 'dot'
        elif '235B' in base_name:
            line_style = 'longdash'
        else:
            line_style = 'dashdot'
    elif 'Llama' in base_name:
        base_color = 'blue'
        # Different line styles for different Llama models
        if '3.3' in base_name:
            line_style = 'solid'
        elif '3.1' in base_name:
            line_style = 'dash'
        elif 'Scout' in base_name:
            line_style = 'dot'
        elif 'Maverick' in base_name:
            line_style = 'dashdot'
        else:
            line_style = 'longdash'
    else:
        base_color = 'gray'
        line_style = 'solid'
    
    # Adjust color intensity based on thinking mode
    if is_thinking:
        # Darker colors for thinking mode
        color_map = {
            'green': 'darkgreen',
            'red': 'darkred', 
            'blue': 'darkblue',
            'gray': 'darkgray'
        }
        color = color_map.get(base_color, 'darkgray')
    else:
        # Lighter colors for regular mode
        color_map = {
            'green': 'lightgreen',
            'red': 'lightcoral',
            'blue': 'lightblue', 
            'gray': 'lightgray'
        }
        color = color_map.get(base_color, 'lightgray')
    
    return color, line_style

# Create accuracy_df for bar charts (aggregate data)
all_data = []
for model_name, data in model_results.items():
    for score_type in data['processed_df']['score_type'].unique():
        score_data = data['processed_df'][data['processed_df']['score_type'] == score_type].iloc[0]
        all_data.append({
            'model': model_name,
            'score_type': score_type,
            'accuracy': score_data['accuracy'],
            'f1_score': score_data['f1_score'],
            'precision': score_data['precision'],
            'recall': score_data['recall']
        })

accuracy_df = pd.DataFrame(all_data)
print("\nAccuracy data summary:")
print(accuracy_df.groupby('score_type')[['accuracy', 'f1_score']].describe().round(3))

# Check if we have individual accuracy measurements for KDE plots
if 'correct' in sample_latent_df.columns:
    print("Found 'correct' column - will calculate accuracy per latent for KDE plots")
    
    # Create KDE density plots for each score type
    for score_type in sample_latent_df['score_type'].unique():
        print(f"\nCreating KDE density plot for {score_type}...")
        
        # Initialize counters for data quality tracking
        nan_counter = 0
        inf_counter = 0
        total_points = 0
        
        # Collect accuracy data for all models for this score type
        plot_data = []
        for model_name, data in model_results.items():
            latent_df = data['latent_df']
            score_subset = latent_df[latent_df['score_type'] == score_type]
            
            if len(score_subset) > 0:
                # Calculate accuracy per latent (group by latent_idx if available, or use individual rows)
                if 'latent_idx' in score_subset.columns:
                    # Group by latent and calculate accuracy
                    latent_accuracies = score_subset.groupby('latent_idx')['correct'].mean()
                    for accuracy in latent_accuracies:
                        total_points += 1
                        if pd.isna(accuracy):
                            nan_counter += 1
                        elif not np.isfinite(accuracy):
                            inf_counter += 1
                        else:
                            plot_data.append({
                                'model': model_name,
                                'accuracy': accuracy
                            })
                else:
                    # Use individual correct values as accuracy (0 or 1)
                    for _, row in score_subset.iterrows():
                        total_points += 1
                        accuracy = float(row['correct'])
                        if pd.isna(accuracy):
                            nan_counter += 1
                        elif not np.isfinite(accuracy):
                            inf_counter += 1
                        else:
                            plot_data.append({
                                'model': model_name,
                                'accuracy': accuracy
                            })
        
        if plot_data:
            plot_df = pd.DataFrame(plot_data)
            
            # Create KDE line density plot
            fig_line = go.Figure()
            
            for model in sorted(plot_df['model'].unique()):
                model_data = plot_df[plot_df['model'] == model]['accuracy']
                
                # Clean the data: remove NaN and infinite values
                model_data_clean = model_data.dropna()
                model_data_clean = model_data_clean[np.isfinite(model_data_clean)]
                
                if len(model_data_clean) > 1:  # Need multiple points for KDE
                    # Check if data has sufficient variance for KDE
                    if model_data_clean.std() > 1e-10:  # Check for non-zero variance
                        # Get color and line style based on model family
                        color, line_style = get_model_style(model)
                        
                        try:
                            # Create kernel density estimation
                            kde = stats.gaussian_kde(model_data_clean)
                            x_range = np.linspace(0, 1, 100)
                            density = kde(x_range)
                            
                            fig_line.add_trace(go.Scatter(
                                x=x_range,
                                y=density,
                                mode='lines',
                                name=model,
                                line=dict(
                                    width=3,
                                    color=color,
                                    dash=line_style
                                )
                            ))
                        except Exception as e:
                            print(f"Warning: Could not create KDE for {model}: {e}")
                            print(f"Data summary for {model}: min={model_data_clean.min():.3f}, max={model_data_clean.max():.3f}, std={model_data_clean.std():.6f}")
                    else:
                        print(f"Warning: {model} has constant accuracy values, skipping KDE plot")
                        print(f"Constant value: {model_data_clean.iloc[0]:.3f}")
                elif len(model_data_clean) == 1:
                    print(f"Warning: {model} has only one data point, skipping KDE plot")
                else:
                    print(f"Warning: {model} has no valid data points after cleaning")
            
            fig_line.update_layout(
                title=f'Accuracy Density Distribution - {score_type.title()}',
                xaxis_title="Accuracy",
                yaxis_title="Density",
                height=500,
                legend=dict(
                    yanchor="top",
                    y=0.99,
                    xanchor="left",
                    x=0.01
                ),
                font=dict(size=12)
            )
            
            # Show KDE density plot
            fig_line.show()
            
            # Print data quality summary
            valid_points = len(plot_data)
            print(f"\nData Quality Summary for {score_type}:")
            print(f"  Total data points processed: {total_points}")
            print(f"  Valid data points used: {valid_points}")
            print(f"  NaN values found: {nan_counter}")
            print(f"  Infinite values found: {inf_counter}")
            print(f"  Data quality rate: {(valid_points/total_points*100):.1f}%" if total_points > 0 else "  No data processed")
            
            # Save KDE density plot in both PDF and PNG formats
            output_file_pdf = visualizations_dir / f"accuracy_density_{score_type}.pdf"
            output_file_png = visualizations_dir / f"accuracy_density_{score_type}.png"
            fig_line.write_image(str(output_file_pdf))
            fig_line.write_image(str(output_file_png))
            print(f"Saved KDE density plot: {output_file_pdf}")
            print(f"Saved KDE density plot: {output_file_png}")

else:
    print("No 'correct' column found - cannot create density plots with individual measurements")
    print("Need individual accuracy measurements to create meaningful density distributions")

Generating accuracy density distribution plots...

Columns in latent_df: ['text', 'distance', 'activating', 'prediction', 'probability', 'correct', 'activations', 'latent_idx', 'score_type', 'module', 'firing_count']
Sample data shape: (27200, 11)
Score types: ['fuzz' 'detection']

Accuracy data summary:
           accuracy                                                  f1_score  \
              count   mean    std    min    25%    50%    75%    max    count   
score_type                                                                      
detection      12.0  0.746  0.054  0.595  0.748  0.767  0.776  0.790     12.0   
fuzz           12.0  0.823  0.091  0.559  0.820  0.852  0.867  0.894     12.0   

                                                             
             mean    std    min    25%    50%    75%    max  
score_type                                                   
detection   0.738  0.053  0.604  0.739  0.755  0.763  0.793  
fuzz        0.824  0.065  0.654  0.793  


Data Quality Summary for fuzz:
  Total data points processed: 816
  Valid data points used: 814
  NaN values found: 2
  Infinite values found: 0
  Data quality rate: 99.8%




Support for Kaleido versions less than 1.0.0 is deprecated and will be removed after September 2025.
Please upgrade Kaleido to version 1.0.0 or greater (`pip install 'kaleido>=1.0.0'` or `pip install 'plotly[kaleido]'`).




Support for Kaleido versions less than 1.0.0 is deprecated and will be removed after September 2025.
Please upgrade Kaleido to version 1.0.0 or greater (`pip install 'kaleido>=1.0.0'` or `pip install 'plotly[kaleido]'`).




Saved KDE density plot: /home/jeremias/projects/delphi-explanations/results/visualizations/accuracy_density_fuzz.pdf
Saved KDE density plot: /home/jeremias/projects/delphi-explanations/results/visualizations/accuracy_density_fuzz.png

Creating KDE density plot for detection...



Data Quality Summary for detection:
  Total data points processed: 816
  Valid data points used: 814
  NaN values found: 2
  Infinite values found: 0
  Data quality rate: 99.8%
Saved KDE density plot: /home/jeremias/projects/delphi-explanations/results/visualizations/accuracy_density_detection.pdf
Saved KDE density plot: /home/jeremias/projects/delphi-explanations/results/visualizations/accuracy_density_detection.png


## 4. Generate Mean Accuracy Bar Charts

Create bar charts displaying mean accuracy for each model and score type.

In [29]:
# Generate inline bar charts for mean accuracy and save to files
print("Generating mean accuracy bar charts...")

# Create inline bar charts for each score type
for score_type in accuracy_df['score_type'].unique():
    score_df = accuracy_df[accuracy_df['score_type'] == score_type].sort_values('accuracy', ascending=False)
    
    # Create inline bar chart
    fig = px.bar(
        score_df,
        x='model',
        y='accuracy',
        title=f'Mean Accuracy by Model - {score_type.title()}',
        text='accuracy'
    )
    fig.update_layout(
        yaxis_range=[0, 1],
        xaxis_title="Model",
        yaxis_title="Accuracy",
        xaxis={'tickangle': 45},
        height=500
    )
    fig.update_traces(texttemplate='%{text:.3f}', textposition='outside')
    
    # Show inline
    fig.show()
    
    # Save to file in both PDF and PNG formats
    output_file_pdf = visualizations_dir / f"accuracy_bar_{score_type}.pdf"
    output_file_png = visualizations_dir / f"accuracy_bar_{score_type}.png"
    fig.write_image(str(output_file_pdf))
    fig.write_image(str(output_file_png))
    print(f"Saved bar chart: {output_file_pdf}")
    print(f"Saved bar chart: {output_file_png}")

# Display accuracy rankings
print("\nModel accuracy rankings:")
for score_type in accuracy_df['score_type'].unique():
    score_df = accuracy_df[accuracy_df['score_type'] == score_type].sort_values('accuracy', ascending=False)
    print(f"\n{score_type.title()} Accuracy Rankings:")
    for i, (_, row) in enumerate(score_df.iterrows(), 1):
        print(f"  {i}. {row['model']}: {row['accuracy']:.3f}")

print(f"\nBar charts saved to {visualizations_dir}")

Generating mean accuracy bar charts...


Saved bar chart: /home/jeremias/projects/delphi-explanations/results/visualizations/accuracy_bar_fuzz.pdf
Saved bar chart: /home/jeremias/projects/delphi-explanations/results/visualizations/accuracy_bar_fuzz.png




Support for Kaleido versions less than 1.0.0 is deprecated and will be removed after September 2025.
Please upgrade Kaleido to version 1.0.0 or greater (`pip install 'kaleido>=1.0.0'` or `pip install 'plotly[kaleido]'`).




Support for Kaleido versions less than 1.0.0 is deprecated and will be removed after September 2025.
Please upgrade Kaleido to version 1.0.0 or greater (`pip install 'kaleido>=1.0.0'` or `pip install 'plotly[kaleido]'`).






Support for Kaleido versions less than 1.0.0 is deprecated and will be removed after September 2025.
Please upgrade Kaleido to version 1.0.0 or greater (`pip install 'kaleido>=1.0.0'` or `pip install 'plotly[kaleido]'`).




Saved bar chart: /home/jeremias/projects/delphi-explanations/results/visualizations/accuracy_bar_detection.pdf
Saved bar chart: /home/jeremias/projects/delphi-explanations/results/visualizations/accuracy_bar_detection.png

Model accuracy rankings:

Fuzz Accuracy Rankings:
  1. Qwen3-32B: 0.894
  2. Llama-4-Maverick-17B: 0.887
  3. Qwen3-235B: 0.868
  4. Llama-3.1-70B-Instruct: 0.867
  5. Gemma-3-27B-IT: 0.859
  6. Llama-3.3-70B-Instruct: 0.853
  7. Llama-4-Scout-17B: 0.852
  8. Qwen3-14B: 0.846
  9. Transluce-Explainer-Llama-8B: 0.824
  10. Gemma-3-12B-IT: 0.807
  11. Qwen3-4B: 0.760
  12. Gemma-3-4B-IT: 0.559

Detection Accuracy Rankings:
  1. Gemma-3-27B-IT: 0.790
  2. Qwen3-32B: 0.782
  3. Qwen3-235B: 0.777
  4. Llama-4-Maverick-17B: 0.776
  5. Qwen3-14B: 0.768
  6. Llama-3.1-70B-Instruct: 0.768
  7. Llama-4-Scout-17B: 0.767
  8. Gemma-3-12B-IT: 0.757
  9. Llama-3.3-70B-Instruct: 0.755
  10. Transluce-Explainer-Llama-8B: 0.726
  11. Qwen3-4B: 0.697
  12. Gemma-3-4B-IT: 0.595

Bar ch



Support for Kaleido versions less than 1.0.0 is deprecated and will be removed after September 2025.
Please upgrade Kaleido to version 1.0.0 or greater (`pip install 'kaleido>=1.0.0'` or `pip install 'plotly[kaleido]'`).




## 5. Create Comprehensive Performance Summary

Generate summary tables and statistics comparing model performance across all metrics.

In [30]:
# Create comprehensive performance summary
print("Creating comprehensive performance summary...")

# Create accuracy summary
accuracy_summary = accuracy_df.groupby('model').agg({
    'accuracy': 'mean',
    'f1_score': 'mean',
    'precision': 'mean',
    'recall': 'mean'
}).round(3)

print("\nModel Performance Summary (Accuracy Metrics):")
print("=" * 60)
print(accuracy_summary)

# Save summary to CSV
summary_file = visualizations_dir / "model_accuracy_summary.csv"
accuracy_summary.to_csv(summary_file)
print(f"\nSummary saved to: {summary_file}")

# Best performing models by category
print("\nBest Performing Models by Category:")
print("=" * 50)
print(f"Highest Accuracy: {accuracy_summary['accuracy'].idxmax()} ({accuracy_summary['accuracy'].max():.3f})")
print(f"Highest F1 Score: {accuracy_summary['f1_score'].idxmax()} ({accuracy_summary['f1_score'].max():.3f})")
print(f"Highest Precision: {accuracy_summary['precision'].idxmax()} ({accuracy_summary['precision'].max():.3f})")
print(f"Highest Recall: {accuracy_summary['recall'].idxmax()} ({accuracy_summary['recall'].max():.3f})")

# Create detailed per-score-type analysis
print("\nDetailed Analysis by Score Type:")
print("=" * 40)
for score_type in accuracy_df['score_type'].unique():
    score_subset = accuracy_df[accuracy_df['score_type'] == score_type]
    print(f"\n{score_type.title()} Results:")
    print(score_subset[['model', 'accuracy', 'f1_score']].sort_values('accuracy', ascending=False))

print(f"\nAll visualizations and summaries saved to: {visualizations_dir}")
print("\nGenerated files:")
for file in sorted(visualizations_dir.glob("*")):
    print(f"  - {file.name}")

Creating comprehensive performance summary...

Model Performance Summary (Accuracy Metrics):
                              accuracy  f1_score  precision  recall
model                                                              
Gemma-3-12B-IT                   0.782     0.774      0.810   0.744
Gemma-3-27B-IT                   0.825     0.828      0.812   0.845
Gemma-3-4B-IT                    0.577     0.629      0.562   0.729
Llama-3.1-70B-Instruct           0.818     0.806      0.859   0.759
Llama-3.3-70B-Instruct           0.804     0.798      0.828   0.770
Llama-4-Maverick-17B             0.832     0.827      0.852   0.803
Llama-4-Scout-17B                0.809     0.806      0.818   0.795
Qwen3-14B                        0.807     0.790      0.869   0.725
Qwen3-235B                       0.823     0.806      0.893   0.736
Qwen3-32B                        0.838     0.838      0.842   0.833
Qwen3-4B                         0.728     0.739      0.711   0.769
Transluce-Explainer-Lla