In [None]:
"""
Visualization script for Gemma benchmarking results.

This script can be run directly or converted to a Jupyter notebook using jupytext
(nbconvert only accepts .ipynb notebooks as input, so it cannot read a .py file):
jupytext --to notebook --execute visualize_results.py
"""

# Gemma Benchmarking Results

This notebook visualizes the results from the Gemma benchmarking framework, comparing Gemma 2B, Gemma 7B, and Mistral 7B models on the MMLU dataset.

In [None]:
import os
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# The project root is taken to be the parent of the current working directory.
# NOTE(review): this assumes the kernel is started from a folder directly under
# the project root (e.g. a notebooks/ directory) - confirm.
project_root = Path.cwd().parent

# Make project-local modules importable. The membership check keeps this cell
# idempotent: re-running it does not pile duplicate entries onto sys.path.
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

In [None]:
def _read_csv_if_present(csv_path, label):
    """Load ``csv_path`` with pandas, or report that it is missing.

    Args:
        csv_path: ``pathlib.Path`` of the CSV file to read.
        label: Lower-case name of the data set ("metrics", "predictions"),
            used only in the printed status message.

    Returns:
        The loaded ``pandas.DataFrame``, or ``None`` when the file does not exist.
    """
    if not csv_path.exists():
        print(f"{label.capitalize()} file not found: {csv_path}")
        return None
    frame = pd.read_csv(csv_path)
    print(f"Loaded {label} data with {len(frame)} rows")
    return frame


results_dir = project_root / "results"

# Reset on every run so the names always exist and a re-run never leaves
# data from a previous execution behind when no results are found.
latest_results = None
metrics_df = None
predictions_df = None

# Sub-directories of results/, most recently modified first.
result_dirs = sorted(
    (entry for entry in results_dir.glob("*") if entry.is_dir()),
    key=lambda entry: entry.stat().st_mtime,
    reverse=True,
)

if not result_dirs:
    print("No results found. Please run the benchmark first.")
else:
    latest_results = result_dirs[0]
    print(f"Loading results from: {latest_results}")

    metrics_df = _read_csv_if_present(latest_results / "metrics.csv", "metrics")
    predictions_df = _read_csv_if_present(latest_results / "predictions.csv", "predictions")

## Model Comparison

Let's visualize each model's overall accuracy, then break the accuracy down by category.

In [None]:
# Bar chart of each model's accuracy on the rows whose category is 'overall'.
if 'metrics_df' in locals() and metrics_df is not None:
    # An explicit copy gives `overall` its own data, so cells that add columns
    # to it never write into a slice of metrics_df (the pattern behind pandas'
    # SettingWithCopyWarning).
    overall = metrics_df[metrics_df['category'] == 'overall'].copy()

    fig, ax = plt.subplots(figsize=(12, 6))
    sns.barplot(x='model', y='accuracy', data=overall, ax=ax)
    ax.set_title('Overall Accuracy by Model')
    # Fixed 0-1 axis; assumes accuracy is stored as a fraction - TODO confirm.
    ax.set_ylim(0, 1.0)
    ax.grid(axis='y', linestyle='--', alpha=0.7)
    fig.tight_layout()
    plt.show()

In [None]:
# Grouped bar chart: accuracy per category, one bar colour per model.
if 'metrics_df' in locals() and metrics_df is not None:
    # Keep every row except the aggregate 'overall' ones.
    categories = metrics_df.loc[metrics_df['category'].ne('overall')]

    fig, ax = plt.subplots(figsize=(14, 8))
    sns.barplot(data=categories, x='category', y='accuracy', hue='model', ax=ax)
    ax.set_title('Accuracy by Category and Model')
    ax.set_ylim(0, 1.0)
    ax.grid(axis='y', linestyle='--', alpha=0.7)
    fig.tight_layout()
    plt.show()

## Model Size vs. Performance

Let's analyze the relationship between model size and performance.

In [None]:
# Scatter plot of overall accuracy against model size, one labelled point series per model.
if 'metrics_df' in locals() and metrics_df is not None and 'overall' in locals():
    # Model sizes in billions of parameters, keyed by the values expected in
    # the 'model' column.
    model_sizes = {
        'gemma-2b': 2,
        'gemma-7b': 7,
        'mistral-7b': 7
    }

    # assign() builds a new frame instead of writing a column into a slice of
    # metrics_df, which avoids SettingWithCopyWarning and keeps the cell
    # safe to re-run.
    overall = overall.assign(model_size=overall['model'].map(model_sizes))

    # Models missing from model_sizes map to NaN and cannot be placed on the
    # x-axis; report them instead of silently drawing nothing.
    unsized_models = overall.loc[overall['model_size'].isna(), 'model'].unique()
    if len(unsized_models):
        print(f"No size known for: {', '.join(map(str, unsized_models))} (omitted from the plot)")

    sized = overall.dropna(subset=['model_size'])

    fig, ax = plt.subplots(figsize=(10, 6))

    # sort=False keeps models in their order of first appearance.
    for model, model_data in sized.groupby('model', sort=False):
        ax.scatter(model_data['model_size'], model_data['accuracy'],
                   label=model, s=100)

    ax.set_title('Model Size vs. Accuracy')
    ax.set_xlabel('Model Size (Billions of Parameters)')
    ax.set_ylabel('Accuracy')
    ax.grid(linestyle='--', alpha=0.7)
    # Only draw a legend when something was plotted; an empty legend just
    # produces a matplotlib warning.
    if ax.get_legend_handles_labels()[0]:
        ax.legend()
    fig.tight_layout()
    plt.show()

## Conclusion

This analysis compares the performance of Gemma 2B, Gemma 7B, and Mistral 7B models on the MMLU benchmark. The visualizations show how these models perform across different categories, highlighting their relative strengths and weaknesses. 