# Prompt Optimization Results Visualization

This notebook visualizes the results from the three prompt optimization algorithms:
- **APE**: Automatic Prompt Engineering (paraphrasing)
  In which candidate prompts are paraphrased N times from a base prompt using different mutators and then evaluated on the benchmark dataset
- **Evolution**: Binary Tournament Genetic Algorithm
  In which a pool of candidate prompts is generated using different mutators; in each tournament, the winning prompt is mutated and the mutated prompt replaces the losing prompt
- **Thompson Sampling**: NIG Multi-Armed Bandit
  In which candidates are randomly sampled from a distribution of unknown mean and unknown variance; the candidates are pulled one at a time and evaluated, and each evaluation updates the posteriors


Dataset benchmarks: PIQA, HellaSwag, BoolQ, GSM8K

In [None]:
import json
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pathlib import Path

# Global plot defaults: seaborn-like dark grid, default figure size and font size
plt.style.use('seaborn-v0_8-darkgrid')
plt.rcParams.update({
    'figure.figsize': (12, 8),
    'font.size': 12,
})

## 1. Load Results

In [None]:
def load_results(dataset_name, results_dir='.'):
    """Load the saved optimization results for one dataset.

    Reads ``results_<dataset_name>.json`` from ``results_dir``.

    Args:
        dataset_name: Dataset identifier used in the file name (e.g. ``'piqa'``).
        results_dir: Directory that holds the results files. Defaults to the
            current working directory, which is where the original code looked.

    Returns:
        The parsed JSON content, or ``None`` (after printing a warning) when
        the file is missing or does not contain valid JSON.
    """
    results_path = Path(results_dir) / f'results_{dataset_name}.json'
    try:
        # Be explicit about the encoding instead of relying on the platform default.
        with open(results_path, 'r', encoding='utf-8') as f:
            return json.load(f)
    except FileNotFoundError:
        print(f"Warning: results_{dataset_name}.json not found")
        return None
    except json.JSONDecodeError as err:
        # A corrupt file is treated like a missing one so that the remaining
        # datasets can still be loaded.
        print(f"Warning: results_{dataset_name}.json is not valid JSON ({err})")
        return None

# Load all datasets; a dataset whose results could not be loaded is skipped.
datasets = ['piqa', 'hellaswag', 'boolq', 'gsm8k']
results = {}

for dataset in datasets:
    data = load_results(dataset)
    if not data:
        # Nothing usable was returned (load_results returns None on failure).
        continue
    results[dataset] = data
    # numExamples is only reported here, so a results file that lacks it
    # should still be loaded rather than abort the loop with a KeyError.
    print(f"Loaded {dataset}: {data.get('numExamples', 'unknown')} examples")

print(f"\nLoaded {len(results)} datasets")

## 2. Helper Functions

In [None]:
def extract_history(algorithm_data):
    """Return the token counts and scores recorded in an algorithm's history.

    The history is read from the ``'bestPrompts'`` entry of ``algorithm_data``.
    The result is a ``(tokens, scores)`` pair of parallel lists; both are empty
    when no history is present.
    """
    records = algorithm_data.get('bestPrompts', [])
    if records:
        # Collect each field as its own column, tokens first and then scores.
        columns = {
            field: [record[field] for record in records]
            for field in ('tokens', 'score')
        }
        return columns['tokens'], columns['score']
    return [], []

def calculate_efficiency(tokens, score):
    """Return the score obtained per 1000 tokens, or 0 when ``tokens`` is not positive."""
    if tokens > 0:
        return score / tokens * 1000
    return 0

## 3. Individual Dataset Plots

In [None]:
def _label_bar_heights(ax, bars, fmt):
    """Write each bar's height just above it, formatted with the format spec ``fmt``."""
    for bar in bars:
        height = bar.get_height()
        ax.text(bar.get_x() + bar.get_width() / 2., height,
                format(height, fmt),
                ha='center', va='bottom')


def plot_dataset_comparison(dataset_name, data):
    """Create a 2x2 comparison figure for a single dataset.

    The panels show the score against tokens used, the final score, the final
    token count and the efficiency (score per 1K tokens) of each algorithm
    ('ape', 'evo', 'ts') found in ``data``. The figure is saved as
    ``plots_<dataset_name>.png`` and shown.

    Args:
        dataset_name: Name used in the figure title and the output file name.
        data: Results for one dataset, keyed by algorithm.

    Returns:
        The matplotlib figure.
    """
    fig, axes = plt.subplots(2, 2, figsize=(15, 12))
    fig.suptitle(f'Prompt Optimization Results: {dataset_name.upper()}',
                 fontsize=16, fontweight='bold')

    # Extract data for each algorithm
    algorithms = ['ape', 'evo', 'ts']
    colors = {'ape': '#e74c3c', 'evo': '#3498db', 'ts': '#2ecc71'}
    labels = {'ape': 'APE', 'evo': 'Evolution', 'ts': 'Thompson Sampling'}

    histories = {}
    for alg in algorithms:
        if alg in data:
            tokens, scores = extract_history(data[alg])
            histories[alg] = {'tokens': tokens, 'scores': scores}

    # Score vs Tokens
    ax1 = axes[0, 0]
    for alg, h in histories.items():
        ax1.plot(h['tokens'], h['scores'],
                 marker='o', label=labels[alg],
                 color=colors[alg], linewidth=2, markersize=4)

    ax1.set_xlabel('Tokens Used')
    ax1.set_ylabel('Best Score')
    ax1.set_title('Score vs Token Usage')
    if histories:
        # Calling legend() with nothing plotted only produces a warning.
        ax1.legend()
    ax1.grid(True, alpha=0.3)

    # Only algorithms with a non-empty history have a final value to chart.
    # Labels and colors are derived from this same list so that every bar keeps
    # its own algorithm's color even when another algorithm has no history.
    charted = [
        alg for alg in algorithms
        if alg in histories and histories[alg]['scores'] and histories[alg]['tokens']
    ]
    bar_labels = [labels[alg] for alg in charted]
    bar_colors = [colors[alg] for alg in charted] or None
    final_scores = [histories[alg]['scores'][-1] for alg in charted]
    final_tokens = [histories[alg]['tokens'][-1] for alg in charted]

    # Final scores bar chart
    ax2 = axes[0, 1]
    bars = ax2.bar(bar_labels, final_scores, color=bar_colors)
    ax2.set_ylabel('Final Score')
    ax2.set_title('Final Performance')
    ax2.set_ylim(0, 1)
    _label_bar_heights(ax2, bars, '.3f')

    # Token usage
    ax3 = axes[1, 0]
    ax3.bar(bar_labels, final_tokens, color=bar_colors)
    ax3.set_ylabel('Total Tokens')
    ax3.set_title('Token Consumption')

    # Efficiency (score per 1k tokens)
    ax4 = axes[1, 1]
    efficiency = [
        calculate_efficiency(tokens, score)
        for tokens, score in zip(final_tokens, final_scores)
    ]
    bars = ax4.bar(bar_labels, efficiency, color=bar_colors)
    ax4.set_ylabel('Score per 1K Tokens')
    ax4.set_title('Efficiency')
    _label_bar_heights(ax4, bars, '.2f')

    plt.tight_layout()
    # Save through the figure object so the right figure is always written.
    fig.savefig(f'plots_{dataset_name}.png', dpi=300, bbox_inches='tight')
    plt.show()

    return fig

# Render and save one comparison figure per loaded dataset.
for dataset_name, data in results.items():
    plot_dataset_comparison(dataset_name, data)

## 4. Cross-Dataset Comparison

In [None]:
def _best_prompt_history(dataset_results, alg):
    """Return the ``bestPrompts`` list recorded for ``alg`` (empty if absent)."""
    return (dataset_results.get(alg) or {}).get('bestPrompts') or []


def plot_cross_dataset_comparison(results_dict):
    """Compare algorithm performance across all datasets in a 2x2 figure.

    The panels are a heatmap of final scores, grouped bars of token efficiency,
    grouped bars of the number of recorded iterations, and a pie chart of how
    often each algorithm had the best final score. An algorithm without
    results for a dataset is shown as 0. The figure is saved as
    ``plots_cross_dataset.png`` and shown.

    Args:
        results_dict: Mapping of dataset name to that dataset's results, each
            keyed by algorithm ('ape', 'evo', 'ts').

    Returns:
        The matplotlib figure.
    """
    fig, axes = plt.subplots(2, 2, figsize=(16, 12))
    fig.suptitle('Cross-Dataset Algorithm Comparison',
                 fontsize=18, fontweight='bold')

    algorithms = ['ape', 'evo', 'ts']
    colors = {'ape': '#e74c3c', 'evo': '#3498db', 'ts': '#2ecc71'}
    labels = {'ape': 'APE', 'evo': 'Evolution', 'ts': 'Thompson'}

    dataset_names = list(results_dict.keys())
    tick_labels = [d.upper() for d in dataset_names]

    # Look up every (algorithm, dataset) history once; all four panels use it.
    histories = {
        alg: [_best_prompt_history(results_dict[name], alg) for name in dataset_names]
        for alg in algorithms
    }

    # Final scores heatmap
    ax1 = axes[0, 0]
    score_matrix = [
        [history[-1]['score'] if history else 0 for history in histories[alg]]
        for alg in algorithms
    ]

    im = ax1.imshow(score_matrix, cmap='YlOrRd', aspect='auto')
    ax1.set_xticks(range(len(dataset_names)))
    ax1.set_xticklabels(tick_labels)
    ax1.set_yticks(range(len(algorithms)))
    ax1.set_yticklabels([labels[alg] for alg in algorithms])
    ax1.set_title('Final Scores Heatmap')

    # Add annotations
    for i, row in enumerate(score_matrix):
        for j, score in enumerate(row):
            ax1.text(j, i, f'{score:.3f}',
                     ha='center', va='center', color='black')

    fig.colorbar(im, ax=ax1)

    # Token efficiency comparison
    ax2 = axes[0, 1]
    x = np.arange(len(dataset_names))
    width = 0.25

    for i, alg in enumerate(algorithms):
        efficiencies = [
            calculate_efficiency(history[-1]['tokens'], history[-1]['score'])
            if history else 0
            for history in histories[alg]
        ]
        ax2.bar(x + i * width, efficiencies, width,
                label=labels[alg], color=colors[alg])

    ax2.set_xlabel('Dataset')
    ax2.set_ylabel('Efficiency (Score/1K tokens)')
    ax2.set_title('Token Efficiency by Dataset')
    # Center the tick under the middle bar of each three-bar group.
    ax2.set_xticks(x + width)
    ax2.set_xticklabels(tick_labels)
    ax2.legend()
    ax2.grid(axis='y', alpha=0.3)

    # Iterations/rounds comparison
    ax3 = axes[1, 0]
    for i, alg in enumerate(algorithms):
        iterations = [len(history) for history in histories[alg]]
        ax3.bar(x + i * width, iterations, width,
                label=labels[alg], color=colors[alg])

    ax3.set_xlabel('Dataset')
    ax3.set_ylabel('Number of Iterations')
    ax3.set_title('Iterations by Algorithm')
    ax3.set_xticks(x + width)
    ax3.set_xticklabels(tick_labels)
    ax3.legend()
    ax3.grid(axis='y', alpha=0.3)

    # Win rate (best algorithm per dataset)
    ax4 = axes[1, 1]
    wins = {alg: 0 for alg in algorithms}

    for column in range(len(dataset_names)):
        best_alg = None
        best_score = -1

        for alg in algorithms:
            history = histories[alg][column]
            # Strict comparison: on a tie the algorithm listed first keeps the win.
            if history and history[-1]['score'] > best_score:
                best_score = history[-1]['score']
                best_alg = alg

        if best_alg:
            wins[best_alg] += 1

    # Draw wedges only for algorithms that won at least once: zero-sized
    # wedges just stack their labels on top of each other, and a pie whose
    # values are all zero has nothing to draw.
    winners = [alg for alg in algorithms if wins[alg] > 0]
    if winners:
        ax4.pie([wins[alg] for alg in winners],
                labels=[labels[alg] for alg in winners],
                colors=[colors[alg] for alg in winners],
                autopct='%1.0f%%',
                startangle=90)
    else:
        ax4.text(0.5, 0.5, 'No results available',
                 ha='center', va='center', transform=ax4.transAxes)
        ax4.set_axis_off()
    ax4.set_title('Win Rate (Best Performance per Dataset)')

    plt.tight_layout()
    fig.savefig('plots_cross_dataset.png', dpi=300, bbox_inches='tight')
    plt.show()

    return fig

# Skip the cross-dataset figure when no results were loaded.
if results:
    plot_cross_dataset_comparison(results)

## 5. Summary Statistics Table

In [None]:
def create_summary_table(results_dict):
    """Build, print and save a summary table of final results.

    One row is produced per (dataset, algorithm) pair that has a non-empty
    ``bestPrompts`` history; the row is taken from the last history entry.
    The table is printed and written to ``summary_statistics.csv``.

    Note: this sets pandas' global ``display.float_format`` to three decimals,
    which also affects DataFrames printed afterwards.

    Args:
        results_dict: Mapping of dataset name to that dataset's results, each
            keyed by algorithm ('ape', 'evo', 'ts').

    Returns:
        A DataFrame with the columns Dataset, Algorithm, Final Score,
        Tokens Used, Iterations and Efficiency. It has these columns even
        when no rows were produced.
    """
    columns = ['Dataset', 'Algorithm', 'Final Score',
               'Tokens Used', 'Iterations', 'Efficiency']
    algorithm_labels = {'ape': 'APE', 'evo': 'Evolution', 'ts': 'Thompson'}
    rows = []

    for dataset_name, data in results_dict.items():
        for alg, alg_label in algorithm_labels.items():
            # .get() so an algorithm entry without 'bestPrompts' is skipped
            # instead of raising KeyError.
            history = (data.get(alg) or {}).get('bestPrompts')
            if not history:
                continue

            bp = history[-1]

            rows.append({
                'Dataset': dataset_name.upper(),
                'Algorithm': alg_label,
                'Final Score': bp['score'],
                'Tokens Used': bp['tokens'],
                'Iterations': len(history),
                'Efficiency': calculate_efficiency(bp['tokens'], bp['score'])
            })

    # Name the columns explicitly so an empty table still has its schema;
    # otherwise code that groups by 'Algorithm' fails and the CSV has no header.
    df = pd.DataFrame(rows, columns=columns)

    pd.options.display.float_format = '{:.3f}'.format

    print("SUMMARY STATISTICS")
    print(df.to_string(index=False))
    print("\n")

    # Save to CSV
    df.to_csv('summary_statistics.csv', index=False)
    return df

# Keep the table: statistical_analysis() is called with summary_df further down.
summary_df = create_summary_table(results)

## 6. Best Prompts Display

In [None]:
def display_best_prompts(results_dict, max_chars=150):
    """Print the best prompt of each algorithm for every dataset.

    For each algorithm entry that has a ``'best'`` prompt, prints the
    algorithm name, its final score (the score of the last ``bestPrompts``
    entry, or 0 when there is no history) and the start of the prompt's
    instruction text.

    Args:
        results_dict: Mapping of dataset name to that dataset's results, each
            keyed by algorithm ('ape', 'evo', 'ts').
        max_chars: Maximum number of instruction characters to print.
            ``None`` prints the instruction in full.
    """
    algorithm_labels = {'ape': 'APE', 'evo': 'Evolution', 'ts': 'Thompson'}

    print("BEST PROMPTS BY ALGORITHM AND DATASET")

    for dataset_name, data in results_dict.items():
        print(f"Dataset: {dataset_name.upper()}")

        for alg, alg_name in algorithm_labels.items():
            alg_data = data.get(alg)
            if not alg_data or 'best' not in alg_data:
                continue

            # .get() so a best prompt without any recorded history is still
            # shown (with score 0) instead of raising KeyError.
            history = alg_data.get('bestPrompts')
            score = history[-1]['score'] if history else 0

            print(f"\n{alg_name} (Score: {score:.3f}):")
            print(f"  {alg_data['best']['instruction'][:max_chars]}")
            print()

# Print the best prompt stored for each algorithm in every loaded dataset.
display_best_prompts(results)

## 7. Learning Curves

In [None]:
def plot_learning_curves(results_dict):
    """Plot the best-score progression of Evolution and Thompson per dataset.

    Each dataset gets one panel showing the score of every ``bestPrompts``
    entry against its index in the history. Panels are laid out two per row;
    the grid has at least two rows and grows when there are more than four
    datasets. Panels without a dataset are hidden. The figure is saved as
    ``learning_curves.png`` and shown.

    Args:
        results_dict: Mapping of dataset name to that dataset's results, each
            keyed by algorithm; only 'evo' and 'ts' are plotted.
    """
    datasets = list(results_dict.keys())
    colors = {'evo': '#3498db', 'ts': '#2ecc71'}
    curve_labels = {'evo': 'Evolution', 'ts': 'Thompson'}

    # Keep the original 2x2 layout for up to four datasets and add rows
    # beyond that instead of silently dropping the extra datasets.
    n_cols = 2
    n_rows = max(2, (len(datasets) + n_cols - 1) // n_cols)
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(16, 6 * n_rows))
    fig.suptitle('Score Progression Over Time',
                 fontsize=16, fontweight='bold')

    panels = axes.ravel()  # row-major order: left to right, top to bottom
    for ax, dataset in zip(panels, datasets):
        data = results_dict[dataset]
        has_curve = False

        for alg in ('evo', 'ts'):
            history = (data.get(alg) or {}).get('bestPrompts')
            if not history:
                continue

            scores = [bp['score'] for bp in history]
            ax.plot(range(len(scores)), scores,
                    marker='o', label=curve_labels[alg],
                    color=colors[alg], linewidth=2, markersize=3)
            has_curve = True

        ax.set_xlabel('Iteration')
        ax.set_ylabel('Best Score')
        ax.set_title(f'{dataset.upper()}')
        if has_curve:
            # Calling legend() with nothing plotted only produces a warning.
            ax.legend()
        ax.grid(True, alpha=0.3)

    # Hide panels that have no dataset rather than leaving empty axes.
    for unused in panels[len(datasets):]:
        unused.set_visible(False)

    plt.tight_layout()
    fig.savefig('learning_curves.png', dpi=300, bbox_inches='tight')
    plt.show()

# Draw the Evolution / Thompson score progression for the loaded datasets.
plot_learning_curves(results)

## 8. Statistical Analysis

In [None]:
def statistical_analysis(summary_df):
    """Print per-algorithm statistics for the summary table.

    Prints the mean, standard deviation and median of 'Final Score',
    'Tokens Used' and 'Efficiency' grouped by 'Algorithm', followed by the
    algorithm with the highest mean score, the lowest mean token usage and
    the highest mean efficiency.

    Args:
        summary_df: DataFrame with the columns 'Algorithm', 'Final Score',
            'Tokens Used' and 'Efficiency'. When it is empty or has no
            'Algorithm' column, a short notice is printed instead.
    """
    print("STATISTICAL ANALYSIS")

    # With no rows there is nothing to group (and possibly no 'Algorithm'
    # column to group by), so report that instead of raising KeyError.
    if summary_df.empty or 'Algorithm' not in summary_df.columns:
        print("\nNo results available for analysis.")
        return

    metrics = ['Final Score', 'Tokens Used', 'Efficiency']

    # Group by algorithm
    grouped = summary_df.groupby('Algorithm')[metrics]
    means = grouped.mean()  # computed once; reused for the "best" lines below

    print("\nMean Performance by Algorithm:")
    print(means)

    print("\nStandard Deviation:")
    print(grouped.std())

    print("\nMedian Performance:")
    print(grouped.median())

    print("\nBest Algorithm by Metric:")
    print(f"  Highest Average Score: {means['Final Score'].idxmax()}")
    print(f"  Lowest Token Usage: {means['Tokens Used'].idxmin()}")
    print(f"  Highest Efficiency: {means['Efficiency'].idxmax()}")

# Print per-algorithm statistics for the table returned by create_summary_table().
statistical_analysis(summary_df)