# 📊 Performance Benchmarking - Generalized

This notebook provides comprehensive benchmarking of trained models.

**Configuration-driven approach:** All settings are loaded from `../config/pipeline_config.json`.

In [None]:
# Import configuration system and benchmarking utilities
import sys
import os
sys.path.append("../")

from src.pipeline_utils import ConfigManager, StateManager, LoggingManager
import torch
import numpy as np
import pandas as pd
from pathlib import Path
from datetime import datetime
import json
import time
import gc
import platform
from typing import Dict, List, Optional, Tuple, Any
from dataclasses import dataclass, asdict
from collections import defaultdict
import warnings

# Machine Learning imports
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix

# ONNX and optimization imports
try:
    import onnxruntime as ort
    onnx_available = True
except ImportError:
    print("⚠️ ONNXRuntime not installed. Install with: pip install onnxruntime")
    onnx_available = False

# Hugging Face transformers
try:
    from transformers import AutoTokenizer, AutoModelForSequenceClassification
    import pickle
    transformers_available = True
except ImportError:
    print("⚠️ Transformers not installed")
    transformers_available = False

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Keep the notebook output readable: ignore every warning for this session.
warnings.filterwarnings(action='ignore')

# Pipeline plumbing. Both managers are built from JSON files under ../config;
# the logger is obtained from a LoggingManager created with the 'benchmarking' label.
_CONFIG_FILE = "../config/pipeline_config.json"
_STATE_FILE = "../config/pipeline_state.json"

config = ConfigManager(_CONFIG_FILE)
state = StateManager(_STATE_FILE)
logger_manager = LoggingManager(config, 'benchmarking')
logger = logger_manager.get_logger()

logger.info("📊 Starting Performance Benchmarking - Generalized Pipeline")
print(f"📋 Configuration loaded from {_CONFIG_FILE}")

In [None]:
# Benchmarking Data Structures and Configuration
@dataclass
class BenchmarkResult:
    """One benchmark run: what was measured, how fast it was, and how good it was.

    The fields up to and including ``provider`` are required. The quality
    fields that follow all default to ``None``.
    """
    # --- identity of the run ---
    model_name: str
    model_type: str  # 'pytorch' or 'onnx'
    batch_size: int
    # --- latency distribution (milliseconds) and derived throughput ---
    avg_latency_ms: float
    p50_latency_ms: float
    p95_latency_ms: float
    p99_latency_ms: float
    throughput_samples_per_sec: float
    # --- footprint ---
    peak_memory_mb: float
    model_size_mb: float
    provider: str
    # --- optional quality metrics ---
    accuracy: Optional[float] = None
    f1_score: Optional[float] = None
    weighted_accuracy: Optional[float] = None
    weighted_f1_score: Optional[float] = None
    avg_confidence_correct: Optional[float] = None
    avg_confidence_incorrect: Optional[float] = None
    per_class_metrics: Optional[Dict] = None
    validation_samples: Optional[int] = None

    def to_dict(self) -> Dict:
        """Return the record as a single flat dictionary.

        ``per_class_metrics`` is removed from the output; when it is non-empty,
        each ``{class_name: {metric_name: value}}`` entry is re-keyed as
        ``"<class_name>_<metric_name>"`` and appended after the regular fields.
        """
        record = asdict(self)
        nested = record.pop("per_class_metrics", {})
        if not nested:
            return record

        record.update({
            f"{label}_{metric}": score
            for label, scores in nested.items()
            for metric, score in scores.items()
        })
        return record

class ExecutionProviderManager:
    """Chooses the ONNX Runtime execution providers the benchmarks will use."""

    @staticmethod
    def get_execution_providers() -> List[str]:
        """Return the providers to benchmark, accelerator (if any) first.

        At most one accelerator is selected: CoreML when running on macOS and
        it is available, otherwise CUDA when it is available. The CPU provider
        follows. Any provider ONNX Runtime does not report as available is
        dropped, so the result is empty when ONNX Runtime is not installed.
        """
        if onnx_available:
            installed = ort.get_available_providers()
        else:
            installed = []

        on_macos = platform.system() == "Darwin"
        if on_macos and "CoreMLExecutionProvider" in installed:
            candidates = ['CoreMLExecutionProvider', 'CPUExecutionProvider']
        elif "CUDAExecutionProvider" in installed:
            candidates = ['CUDAExecutionProvider', 'CPUExecutionProvider']
        else:
            # CPU only: the common baseline for consistent comparisons.
            candidates = ['CPUExecutionProvider']

        return [name for name in candidates if name in installed]

# Pull the relevant sections out of the pipeline configuration
benchmark_config = config.get('benchmarking', {})
models_config = config.get('models', {})
data_config = config.get('data', {})

# Top-level switches (each falls back to its default when absent from the config)
FAST_MODE = benchmark_config.get('fast_mode', True)
INCLUDE_ACCURACY = benchmark_config.get('include_accuracy', True)
TARGET_MODELS = benchmark_config.get('target_models')  # None means all models

print("📊 Benchmarking Configuration:")
print(f"   ⚡ Fast mode: {FAST_MODE}")
print(f"   🎯 Include accuracy: {INCLUDE_ACCURACY}")
print(f"   🔄 Target models: {TARGET_MODELS or 'All available'}")

# Performance parameters: every section below is a {'fast': ..., 'thorough': ...}
# mapping in the config; the active profile selects which entry is read.
_PROFILE = 'fast' if FAST_MODE else 'thorough'
_PROFILE_DEFAULTS = {
    'iterations': {'fast': 15, 'thorough': 30},
    'warmup_iterations': {'fast': 5, 'thorough': 10},
    'batch_sizes': {'fast': [1, 8], 'thorough': [1, 4, 8, 16]},
    'accuracy_sample_size': {'fast': 300, 'thorough': 500},
}


def _profile_setting(section):
    """Read `section` for the active profile, falling back to the built-in default."""
    fallback = _PROFILE_DEFAULTS[section][_PROFILE]
    return benchmark_config.get(section, {}).get(_PROFILE, fallback)


ITERATIONS = _profile_setting('iterations')
WARMUP_ITERATIONS = _profile_setting('warmup_iterations')
BATCH_SIZES = _profile_setting('batch_sizes')
ACCURACY_SAMPLE_SIZE = _profile_setting('accuracy_sample_size')

_accuracy_label = ACCURACY_SAMPLE_SIZE if INCLUDE_ACCURACY else 'Disabled'
print(f"   🔄 Iterations: {ITERATIONS} (warmup: {WARMUP_ITERATIONS})")
print(f"   📊 Batch sizes: {BATCH_SIZES}")
print(f"   🎯 Accuracy samples: {_accuracy_label}")

logger.info("Benchmarking configuration loaded successfully")

In [None]:
# Model Discovery and Loading
logger.info("🔍 Discovering available models...")

# Both runtimes are required by the benchmarking cells; fail early if either
# optional import at the top of the notebook did not succeed.
if not onnx_available or not transformers_available:
    raise ImportError("Required libraries missing. Install with: pip install onnxruntime transformers")

# ONNX artifacts are only trusted once the pipeline state marks the conversion
# step as complete. When it is not, ONNX files are ignored during discovery so
# the "will be skipped" message below is actually honoured.
onnx_conversion_completed = bool(state.is_step_complete('onnx_conversion_completed'))
if not onnx_conversion_completed:
    print("⚠️ ONNX conversion not completed. ONNX benchmarks will be skipped.")

models_dir = Path(f"../{models_config.get('output_dir', 'models')}")
print(f"📂 Models directory: {models_dir}")

# Maps model name -> info dict (name, path, config_file, has_pytorch, has_onnx,
# pytorch_files, onnx_files).
available_models = {}

if models_dir.is_dir():
    # Sorted so discovery (and therefore benchmark) order is reproducible.
    for model_path in sorted(models_dir.iterdir()):
        if not model_path.is_dir() or model_path.name.startswith('.'):
            continue

        model_name = model_path.name

        # Honour the optional allow-list from the configuration.
        if TARGET_MODELS and model_name not in TARGET_MODELS:
            print(f"   ⏭️ Skipping {model_name} (not in target list)")
            continue

        # PyTorch weights: any *.safetensors file or a pytorch_model.bin
        config_file = model_path / "config.json"
        pytorch_files = list(model_path.glob("*.safetensors")) + list(model_path.glob("pytorch_model.bin"))

        # ONNX graphs live in an "onnx" subdirectory of the model directory
        onnx_dir = model_path / "onnx"
        onnx_on_disk = list(onnx_dir.glob("*.onnx")) if onnx_dir.exists() else []
        onnx_files = onnx_on_disk if onnx_conversion_completed else []

        if config_file.exists() and (pytorch_files or onnx_files):
            model_info = {
                'name': model_name,
                'path': model_path,
                'config_file': config_file,
                'has_pytorch': bool(pytorch_files),
                'has_onnx': bool(onnx_files),
                'pytorch_files': pytorch_files,
                'onnx_files': onnx_files
            }
            available_models[model_name] = model_info

            status = []
            if model_info['has_pytorch']:
                status.append("PyTorch")
            if model_info['has_onnx']:
                status.append("ONNX")

            print(f"   ✅ Found: {model_name} ({', '.join(status)})")
        elif config_file.exists() and onnx_on_disk:
            # Only ONNX files are present, and they are being ignored.
            print(f"   ⏭️ Skipping {model_name} (ONNX only, conversion not marked complete)")
        else:
            print(f"   ⚠️ Invalid model directory: {model_name}")

print(f"\n📊 Discovery Summary:")
print(f"   🤖 Total models found: {len(available_models)}")
pytorch_count = sum(1 for m in available_models.values() if m['has_pytorch'])
onnx_count = sum(1 for m in available_models.values() if m['has_onnx'])
print(f"   🔥 PyTorch models: {pytorch_count}")
print(f"   ⚡ ONNX models: {onnx_count}")

if not available_models:
    logger.error("No models found for benchmarking")
    raise RuntimeError("No models found. Please run model training and/or ONNX conversion first.")

In [None]:
# Core Benchmarking Engine
class ModelBenchmarker:
    """Latency, throughput and memory benchmarks for PyTorch and ONNX models.

    Iteration counts are read from the module-level ``WARMUP_ITERATIONS`` and
    ``ITERATIONS`` settings at call time. Both benchmark methods share the same
    warm-up / timing loop and the same statistics so results are comparable.
    """

    def __init__(self, logger):
        """Store the logger and resolve the ONNX execution providers to benchmark."""
        self.logger = logger
        self.execution_providers = ExecutionProviderManager.get_execution_providers()

    def measure_memory_usage(self) -> float:
        """Return the resident set size (RSS) of the current process in MB."""
        import psutil
        process = psutil.Process()
        return process.memory_info().rss / 1024 / 1024  # bytes -> MB

    def get_model_size(self, model_path: Path, model_type: str) -> float:
        """Return the on-disk size of the model files in MB.

        Args:
            model_path: Model directory.
            model_type: ``'pytorch'`` sums ``*.safetensors`` and
                ``pytorch_model.bin``; ``'onnx'`` sums ``onnx/*.onnx``.

        Returns:
            Total size in MB, or ``0.0`` for any other ``model_type``.
        """
        if model_type == 'pytorch':
            model_files = list(model_path.glob("*.safetensors")) + list(model_path.glob("pytorch_model.bin"))
        elif model_type == 'onnx':
            model_files = list((model_path / "onnx").glob("*.onnx"))
        else:
            return 0.0
        return sum(f.stat().st_size for f in model_files) / (1024 * 1024)

    def create_sample_batch(self, tokenizer, batch_size: int, max_length: int = 128) -> Tuple[Any, Any]:
        """Tokenize ``batch_size`` sample sentences padded/truncated to ``max_length``.

        Returns:
            ``(input_ids, attention_mask)`` as produced by the tokenizer:
            PyTorch tensors when the tokenizer has a ``model_max_length``
            attribute, otherwise whatever the tokenizer returns by default.
        """
        # Use diverse sample texts for more realistic benchmarking
        base_texts = [
            "The company's quarterly earnings exceeded expectations by 15%.",
            "Market volatility has increased due to geopolitical tensions.",
            "The Federal Reserve announced a rate hike of 0.25 basis points.",
            "Tech stocks declined following regulatory concerns.",
            "Oil prices surged on supply chain disruptions."
        ]
        # Repeat the pool often enough to cover the batch, then trim to size.
        repeats = batch_size // len(base_texts) + 1
        sample_texts = (base_texts * repeats)[:batch_size]

        inputs = tokenizer(
            sample_texts,
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt' if hasattr(tokenizer, 'model_max_length') else None
        )

        return inputs['input_ids'], inputs['attention_mask']

    @staticmethod
    def _latency_stats(latencies_ms, batch_size: int) -> Dict[str, float]:
        """Summarize per-call latencies (ms) into the BenchmarkResult timing fields.

        Raises:
            ValueError: If no latency samples were recorded.
        """
        samples = np.asarray(latencies_ms, dtype=float)
        if samples.size == 0:
            raise ValueError("No latency samples recorded; ITERATIONS must be at least 1")

        avg = float(samples.mean())
        return {
            'avg_latency_ms': avg,
            'p50_latency_ms': float(np.percentile(samples, 50)),
            'p95_latency_ms': float(np.percentile(samples, 95)),
            'p99_latency_ms': float(np.percentile(samples, 99)),
            # samples per second = batch size / (average seconds per call)
            'throughput_samples_per_sec': (batch_size * 1000) / avg if avg > 0 else float('inf'),
        }

    @staticmethod
    def _build_onnx_feed(session, input_ids, attention_mask) -> Dict[str, Any]:
        """Build the ONNX input dict restricted to the inputs the graph declares.

        ONNX Runtime rejects unknown input names and requires every declared
        one, so the feed is derived from ``session.get_inputs()``. A declared
        ``token_type_ids`` input is filled with zeros (single-segment input).
        """
        def as_int64(value):
            if hasattr(value, 'numpy'):
                value = value.numpy()
            return np.asarray(value, dtype=np.int64)

        ids = as_int64(input_ids)
        candidates = {
            'input_ids': ids,
            'attention_mask': as_int64(attention_mask),
            'token_type_ids': np.zeros_like(ids),
        }
        declared = [node.name for node in session.get_inputs()]
        return {name: candidates[name] for name in declared if name in candidates}

    def _collect_latencies(self, run_once) -> Tuple[List[float], float]:
        """Warm up, then time ``ITERATIONS`` calls of ``run_once``.

        Returns:
            ``(latencies_ms, rss_delta_mb)`` where ``rss_delta_mb`` is the
            change in process RSS across the timed loop (not a true peak).
        """
        for _ in range(WARMUP_ITERATIONS):
            run_once()

        latencies = []
        rss_before = self.measure_memory_usage()
        for _ in range(ITERATIONS):
            started = time.perf_counter()
            run_once()
            latencies.append((time.perf_counter() - started) * 1000)  # s -> ms
        rss_delta = self.measure_memory_usage() - rss_before

        return latencies, rss_delta

    def benchmark_pytorch_model(self, model_info: Dict, batch_size: int) -> "BenchmarkResult":
        """Benchmark the PyTorch model described by ``model_info``.

        Args:
            model_info: Discovery record; ``'name'`` and ``'path'`` are read.
            batch_size: Number of sample sentences per forward pass.

        Returns:
            A ``BenchmarkResult`` with ``model_type='pytorch'`` and
            ``provider='PyTorch'``. ``peak_memory_mb`` is the RSS growth across
            the timed loop, clamped at zero.
        """
        model_name = model_info['name']
        model_path = model_info['path']

        # Load model and tokenizer
        tokenizer = AutoTokenizer.from_pretrained(str(model_path))
        model = AutoModelForSequenceClassification.from_pretrained(str(model_path))
        model.eval()

        # Move to appropriate device
        use_cuda = torch.cuda.is_available()
        device = torch.device('cuda' if use_cuda else 'cpu')
        model = model.to(device)

        input_ids, attention_mask = self.create_sample_batch(tokenizer, batch_size)
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)

        def run_once():
            # Keyword arguments: the position of attention_mask in forward()
            # is not the same for every architecture.
            model(input_ids=input_ids, attention_mask=attention_mask)
            if use_cuda:
                # CUDA kernels are asynchronous; wait so the timing is real.
                torch.cuda.synchronize()

        try:
            with torch.no_grad():
                latencies, rss_delta = self._collect_latencies(run_once)
        finally:
            # Release the model even when a run fails so the next benchmark
            # does not start with this one still resident.
            del model, tokenizer, input_ids, attention_mask
            gc.collect()
            if use_cuda:
                torch.cuda.empty_cache()

        stats = self._latency_stats(latencies, batch_size)
        model_size = self.get_model_size(model_path, 'pytorch')

        return BenchmarkResult(
            model_name=model_name,
            model_type='pytorch',
            batch_size=batch_size,
            peak_memory_mb=float(max(0, rss_delta)),
            model_size_mb=float(model_size),
            provider='PyTorch',
            **stats
        )

    def benchmark_onnx_model(self, model_info: Dict, batch_size: int, provider: str = 'CPUExecutionProvider') -> "BenchmarkResult":
        """Benchmark the first ONNX file of the model on one execution provider.

        Args:
            model_info: Discovery record; ``'name'``, ``'path'`` and
                ``'onnx_files'`` are read.
            batch_size: Number of sample sentences per inference call.
            provider: ONNX Runtime execution provider name.

        Returns:
            A ``BenchmarkResult`` with ``model_type='onnx'``.

        Raises:
            ValueError: If ``model_info['onnx_files']`` is empty.
        """
        model_name = model_info['name']
        model_path = model_info['path']

        onnx_files = model_info['onnx_files']
        if not onnx_files:
            raise ValueError(f"No ONNX files found for {model_name}")

        onnx_model_path = onnx_files[0]  # Use first ONNX file

        # Load tokenizer and create ONNX session
        tokenizer = AutoTokenizer.from_pretrained(str(model_path))
        session = ort.InferenceSession(str(onnx_model_path), providers=[provider])

        input_ids, attention_mask = self.create_sample_batch(tokenizer, batch_size)
        onnx_inputs = self._build_onnx_feed(session, input_ids, attention_mask)

        def run_once():
            session.run(None, onnx_inputs)

        try:
            latencies, rss_delta = self._collect_latencies(run_once)
        finally:
            del tokenizer, session
            gc.collect()

        stats = self._latency_stats(latencies, batch_size)
        model_size = self.get_model_size(model_path, 'onnx')

        return BenchmarkResult(
            model_name=model_name,
            model_type='onnx',
            batch_size=batch_size,
            peak_memory_mb=float(max(0, rss_delta)),
            model_size_mb=float(model_size),
            provider=provider,
            **stats
        )

# Build the benchmarker instance used by the benchmarking cells and report
# which ONNX execution providers it resolved.
benchmarker = ModelBenchmarker(logger=logger)
_selected_providers = benchmarker.execution_providers
print(f"🔧 Benchmarker initialized with providers: {_selected_providers}")

logger.info("Benchmarking engine ready")

In [None]:
# Main Benchmarking Execution
#
# Runs every (model, batch size, backend) combination, collecting results in
# `all_results` and run statistics in `benchmark_summary`. A failing run is
# reported and logged but does not stop the remaining runs.
logger.info("🚀 Starting performance benchmarking...")

all_results = []
benchmark_summary = {
    'benchmark_timestamp': datetime.now().isoformat(),
    'configuration': {
        'fast_mode': FAST_MODE,
        'iterations': ITERATIONS,
        'warmup_iterations': WARMUP_ITERATIONS,
        'batch_sizes': BATCH_SIZES,
        'include_accuracy': INCLUDE_ACCURACY,
        'accuracy_sample_size': ACCURACY_SAMPLE_SIZE if INCLUDE_ACCURACY else None
    },
    'system_info': {
        'platform': platform.platform(),
        'python_version': platform.python_version(),
        'pytorch_version': torch.__version__,
        'onnxruntime_version': ort.__version__ if onnx_available else None,
        'available_providers': benchmarker.execution_providers if onnx_available else []
    },
    'models_benchmarked': 0,
    'total_benchmarks': 0
}

print(f"\n⚡ Performance Benchmarking:")
print(f"{'='*80}")

# Per batch size, a model contributes one PyTorch run (if it has PyTorch
# weights) plus one ONNX run per execution provider (if it has ONNX files).
onnx_runs_per_batch = len(benchmarker.execution_providers)
total_benchmarks = len(BATCH_SIZES) * sum(
    int(info['has_pytorch']) + (onnx_runs_per_batch if info['has_onnx'] else 0)
    for info in available_models.values()
)

print(f"📊 Total benchmarks to run: {total_benchmarks}")
# Rough guess only: assumes about 0.1 s per timed iteration.
print(f"⏱️ Estimated time: {total_benchmarks * ITERATIONS * 0.1:.1f} seconds")

current_benchmark = 0
successful_benchmarks = 0
models_with_results = 0  # models for which at least one run succeeded

for model_name, model_info in available_models.items():
    print(f"\n🤖 Benchmarking {model_name}:")
    print(f"   📁 Path: {model_info['path']}")

    model_results = []

    for batch_size in BATCH_SIZES:
        print(f"\n   📊 Batch size: {batch_size}")

        # Benchmark PyTorch model
        if model_info['has_pytorch']:
            current_benchmark += 1
            print(f"   🔥 PyTorch [{current_benchmark}/{total_benchmarks}]", end=" ... ")

            try:
                result = benchmarker.benchmark_pytorch_model(model_info, batch_size)
            except Exception as e:
                # Best effort: record the failure and carry on with the next run.
                print(f"❌ Failed: {str(e)}")
                logger.error(f"PyTorch benchmark failed for {model_name} (batch={batch_size}): {str(e)}")
            else:
                model_results.append(result)
                all_results.append(result)
                successful_benchmarks += 1

                print(f"✅ {result.avg_latency_ms:.2f}ms avg, {result.throughput_samples_per_sec:.1f} samples/sec")
                logger.info(f"PyTorch benchmark completed for {model_name} (batch={batch_size})")

        # Benchmark ONNX model with different providers
        if model_info['has_onnx']:
            for provider in benchmarker.execution_providers:
                current_benchmark += 1
                provider_name = provider.replace('ExecutionProvider', '')
                print(f"   ⚡ ONNX-{provider_name} [{current_benchmark}/{total_benchmarks}]", end=" ... ")

                try:
                    result = benchmarker.benchmark_onnx_model(model_info, batch_size, provider)
                except Exception as e:
                    print(f"❌ Failed: {str(e)}")
                    logger.error(f"ONNX-{provider_name} benchmark failed for {model_name} (batch={batch_size}): {str(e)}")
                else:
                    model_results.append(result)
                    all_results.append(result)
                    successful_benchmarks += 1

                    print(f"✅ {result.avg_latency_ms:.2f}ms avg, {result.throughput_samples_per_sec:.1f} samples/sec")
                    logger.info(f"ONNX-{provider_name} benchmark completed for {model_name} (batch={batch_size})")

    # Display model summary
    if model_results:
        models_with_results += 1

        best_latency_config = min(model_results, key=lambda r: r.avg_latency_ms)
        best_throughput_config = max(model_results, key=lambda r: r.throughput_samples_per_sec)
        best_latency = best_latency_config.avg_latency_ms
        best_throughput = best_throughput_config.throughput_samples_per_sec

        print(f"   📈 Best latency: {best_latency:.2f}ms ({best_latency_config.model_type}-{best_latency_config.provider}, batch={best_latency_config.batch_size})")
        print(f"   🚀 Best throughput: {best_throughput:.1f} samples/sec ({best_throughput_config.model_type}-{best_throughput_config.provider}, batch={best_throughput_config.batch_size})")

# Update summary. Only models that produced at least one result count as
# benchmarked; a model whose every run failed is not included.
failed_benchmarks = total_benchmarks - successful_benchmarks
benchmark_summary['models_benchmarked'] = models_with_results
benchmark_summary['total_benchmarks'] = total_benchmarks
benchmark_summary['successful_benchmarks'] = successful_benchmarks
benchmark_summary['failed_benchmarks'] = failed_benchmarks

print(f"\n{'='*80}")
print(f"⚡ Performance Benchmarking Summary:")
print(f"   🤖 Models benchmarked: {benchmark_summary['models_benchmarked']}")
print(f"   ✅ Successful benchmarks: {successful_benchmarks}")
print(f"   ❌ Failed benchmarks: {failed_benchmarks}")

if successful_benchmarks == 0:
    logger.error("No benchmarks completed successfully")
    raise RuntimeError("All benchmarks failed")

logger.info(f"Performance benchmarking completed: {successful_benchmarks}/{total_benchmarks} successful")

In [None]:
# Results Analysis and Visualization
#
# Builds `df_results` (one row per successful benchmark), prints the best
# performers and a per-model-type summary, then draws a 2x2 figure (`fig`).
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

print(f"\n📊 Results Analysis:")
print(f"{'='*80}")

# Create DataFrame from results. The column names 'memory_usage_mb' and
# 'accuracy_score' are what the rest of the notebook expects; they are filled
# from the BenchmarkResult fields `peak_memory_mb` and `accuracy`.
results_data = [
    {
        'model_name': result.model_name,
        'model_type': result.model_type,
        'provider': result.provider,
        'batch_size': result.batch_size,
        'avg_latency_ms': result.avg_latency_ms,
        'p50_latency_ms': result.p50_latency_ms,
        'p95_latency_ms': result.p95_latency_ms,
        'p99_latency_ms': result.p99_latency_ms,
        'throughput_samples_per_sec': result.throughput_samples_per_sec,
        'memory_usage_mb': result.peak_memory_mb,
        'accuracy_score': result.accuracy
    }
    for result in all_results
]

df_results = pd.DataFrame(results_data)
# Accuracy is None when it was not measured; an all-None column is object
# dtype, which breaks numeric aggregation. Coerce it to float (None -> NaN).
df_results['accuracy_score'] = pd.to_numeric(df_results['accuracy_score'], errors='coerce')
print(f"📋 Created results DataFrame with {len(df_results)} entries")

# Display best performers
print(f"\n🏆 Best Performers:")
print(f"{'='*50}")

# Best latency
best_latency_row = df_results.loc[df_results['avg_latency_ms'].idxmin()]
print(f"⚡ Lowest Latency: {best_latency_row['avg_latency_ms']:.2f}ms")
print(f"   Model: {best_latency_row['model_name']} ({best_latency_row['model_type']}-{best_latency_row['provider']})")
print(f"   Batch: {best_latency_row['batch_size']}, Throughput: {best_latency_row['throughput_samples_per_sec']:.1f} samples/sec")

# Best throughput
best_throughput_row = df_results.loc[df_results['throughput_samples_per_sec'].idxmax()]
print(f"\n🚀 Highest Throughput: {best_throughput_row['throughput_samples_per_sec']:.1f} samples/sec")
print(f"   Model: {best_throughput_row['model_name']} ({best_throughput_row['model_type']}-{best_throughput_row['provider']})")
print(f"   Batch: {best_throughput_row['batch_size']}, Latency: {best_throughput_row['avg_latency_ms']:.2f}ms")

# Best accuracy (only when at least one run recorded an accuracy)
if df_results['accuracy_score'].notna().any():
    best_accuracy_row = df_results.loc[df_results['accuracy_score'].idxmax()]
    print(f"\n🎯 Highest Accuracy: {best_accuracy_row['accuracy_score']:.4f}")
    print(f"   Model: {best_accuracy_row['model_name']} ({best_accuracy_row['model_type']}-{best_accuracy_row['provider']})")
    print(f"   Batch: {best_accuracy_row['batch_size']}, Latency: {best_accuracy_row['avg_latency_ms']:.2f}ms")

# Performance by model type
print(f"\n📊 Performance by Model Type:")
print(f"{'='*50}")
type_summary = df_results.groupby('model_type').agg({
    'avg_latency_ms': ['mean', 'min', 'max'],
    'throughput_samples_per_sec': ['mean', 'min', 'max'],
    'accuracy_score': 'mean'
}).round(3)

for model_type in type_summary.index:
    print(f"{model_type.upper()}:")
    print(f"   Latency: {type_summary.loc[model_type, ('avg_latency_ms', 'mean')]:.2f}ms avg "
          f"({type_summary.loc[model_type, ('avg_latency_ms', 'min')]:.2f}-{type_summary.loc[model_type, ('avg_latency_ms', 'max')]:.2f}ms)")
    print(f"   Throughput: {type_summary.loc[model_type, ('throughput_samples_per_sec', 'mean')]:.1f} samples/sec avg "
          f"({type_summary.loc[model_type, ('throughput_samples_per_sec', 'min')]:.1f}-{type_summary.loc[model_type, ('throughput_samples_per_sec', 'max')]:.1f})")
    if not pd.isna(type_summary.loc[model_type, ('accuracy_score', 'mean')]):
        print(f"   Accuracy: {type_summary.loc[model_type, ('accuracy_score', 'mean')]:.4f}")

# Create visualizations. The 'seaborn-v0_8' style name is not present in every
# matplotlib release, so fall back to the current style when it is missing.
if 'seaborn-v0_8' in plt.style.available:
    plt.style.use('seaborn-v0_8')
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
fig.suptitle('Performance Benchmarking Results', fontsize=16, fontweight='bold')

# 1. Latency comparison
ax1 = axes[0, 0]
df_pivot_latency = df_results.pivot_table(values='avg_latency_ms', index='model_name',
                                          columns=['model_type', 'provider'], aggfunc='mean')
df_pivot_latency.plot(kind='bar', ax=ax1, rot=45)
ax1.set_title('Average Latency by Model')
ax1.set_ylabel('Latency (ms)')
ax1.legend(bbox_to_anchor=(1.05, 1), loc='upper left')

# 2. Throughput comparison
ax2 = axes[0, 1]
df_pivot_throughput = df_results.pivot_table(values='throughput_samples_per_sec', index='model_name',
                                             columns=['model_type', 'provider'], aggfunc='mean')
df_pivot_throughput.plot(kind='bar', ax=ax2, rot=45)
ax2.set_title('Throughput by Model')
ax2.set_ylabel('Samples/sec')
ax2.legend(bbox_to_anchor=(1.05, 1), loc='upper left')

# 3. Batch size impact on latency (one line per model type, averaged over models)
ax3 = axes[1, 0]
for model_type in df_results['model_type'].unique():
    subset = df_results[df_results['model_type'] == model_type]
    batch_latency = subset.groupby('batch_size')['avg_latency_ms'].mean()
    ax3.plot(batch_latency.index, batch_latency.values, marker='o', label=model_type)
ax3.set_title('Batch Size Impact on Latency')
ax3.set_xlabel('Batch Size')
ax3.set_ylabel('Average Latency (ms)')
ax3.legend()
ax3.set_xscale('log')

# 4. Memory usage vs performance
ax4 = axes[1, 1]
scatter = ax4.scatter(df_results['memory_usage_mb'], df_results['avg_latency_ms'],
                      c=df_results['throughput_samples_per_sec'], cmap='viridis', alpha=0.7)
ax4.set_title('Memory vs Latency (colored by throughput)')
ax4.set_xlabel('Memory Usage (MB)')
ax4.set_ylabel('Average Latency (ms)')
plt.colorbar(scatter, ax=ax4, label='Throughput (samples/sec)')

plt.tight_layout()
plt.show()

logger.info("Results visualization completed")

In [None]:
# Save Results
#
# Writes the CSV, JSON, text report and plot for this run into ../results and
# records the artifact paths in the pipeline state (best effort).
import json
from pathlib import Path

print(f"\n💾 Saving Results:")
print(f"{'='*80}")

# Create results directory next to the '../config' and '../models' directories
# this notebook already uses.
base_path = Path("..")
results_dir = base_path / 'results'
results_dir.mkdir(parents=True, exist_ok=True)

# Generate filename with timestamp
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
base_filename = f"benchmark_results_generalized_{timestamp}"

# The accuracy sections are written only when at least one run recorded one.
has_accuracy = bool(df_results['accuracy_score'].notna().any())

# 1. Save CSV results
csv_path = results_dir / f"{base_filename}.csv"
df_results.to_csv(csv_path, index=False)
print(f"📊 CSV results saved: {csv_path}")

# 2. Save detailed JSON results
json_results = {
    'benchmark_summary': benchmark_summary,
    'detailed_results': [asdict(result) for result in all_results],
    'performance_summary': {
        'best_latency': {
            'value': float(best_latency_row['avg_latency_ms']),
            'model': best_latency_row['model_name'],
            'config': f"{best_latency_row['model_type']}-{best_latency_row['provider']}",
            'batch_size': int(best_latency_row['batch_size'])
        },
        'best_throughput': {
            'value': float(best_throughput_row['throughput_samples_per_sec']),
            'model': best_throughput_row['model_name'],
            'config': f"{best_throughput_row['model_type']}-{best_throughput_row['provider']}",
            'batch_size': int(best_throughput_row['batch_size'])
        }
    }
}

# Add accuracy summary if available
if has_accuracy:
    json_results['performance_summary']['best_accuracy'] = {
        'value': float(best_accuracy_row['accuracy_score']),
        'model': best_accuracy_row['model_name'],
        'config': f"{best_accuracy_row['model_type']}-{best_accuracy_row['provider']}",
        'batch_size': int(best_accuracy_row['batch_size'])
    }

json_path = results_dir / f"{base_filename}.json"
with open(json_path, 'w', encoding='utf-8') as f:
    # default=str stringifies anything json cannot encode natively.
    json.dump(json_results, f, indent=2, default=str)
print(f"📋 JSON results saved: {json_path}")

# 3. Save performance summary report
report_path = results_dir / f"{base_filename}_summary.txt"
with open(report_path, 'w', encoding='utf-8') as f:
    f.write("Performance Benchmarking Summary Report\n")
    f.write("=" * 50 + "\n\n")
    f.write(f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
    f.write(f"Configuration: {json_results['benchmark_summary']['configuration']}\n\n")

    f.write("Best Performers:\n")
    f.write("-" * 20 + "\n")
    f.write(f"Lowest Latency: {best_latency_row['avg_latency_ms']:.2f}ms\n")
    f.write(f"  Model: {best_latency_row['model_name']} ({best_latency_row['model_type']}-{best_latency_row['provider']})\n")
    f.write(f"  Batch: {best_latency_row['batch_size']}\n\n")

    f.write(f"Highest Throughput: {best_throughput_row['throughput_samples_per_sec']:.1f} samples/sec\n")
    f.write(f"  Model: {best_throughput_row['model_name']} ({best_throughput_row['model_type']}-{best_throughput_row['provider']})\n")
    f.write(f"  Batch: {best_throughput_row['batch_size']}\n\n")

    if has_accuracy:
        f.write(f"Highest Accuracy: {best_accuracy_row['accuracy_score']:.4f}\n")
        f.write(f"  Model: {best_accuracy_row['model_name']} ({best_accuracy_row['model_type']}-{best_accuracy_row['provider']})\n")
        f.write(f"  Batch: {best_accuracy_row['batch_size']}\n\n")

    f.write("Performance by Model Type:\n")
    f.write("-" * 30 + "\n")
    for model_type in type_summary.index:
        f.write(f"{model_type.upper()}:\n")
        f.write(f"  Avg Latency: {type_summary.loc[model_type, ('avg_latency_ms', 'mean')]:.2f}ms\n")
        f.write(f"  Avg Throughput: {type_summary.loc[model_type, ('throughput_samples_per_sec', 'mean')]:.1f} samples/sec\n")
        if not pd.isna(type_summary.loc[model_type, ('accuracy_score', 'mean')]):
            f.write(f"  Avg Accuracy: {type_summary.loc[model_type, ('accuracy_score', 'mean')]:.4f}\n")
        f.write("\n")

print(f"📄 Summary report saved: {report_path}")

# 4. Save plot. Use the Figure object created by the visualization cell:
# after plt.show() the pyplot "current figure" is a new, empty one, so
# plt.savefig() here would write a blank image.
plot_path = results_dir / f"{base_filename}_plots.png"
fig.savefig(plot_path, dpi=300, bbox_inches='tight')
print(f"📈 Plots saved: {plot_path}")

# 5. Update state manager with latest results (best effort: a failure here is
# logged as a warning and does not invalidate the files already written).
# NOTE(review): assumes StateManager exposes save_state(key, value) — confirm
# against src.pipeline_utils.
try:
    state.save_state('benchmark_results', {
        'latest_run': {
            'timestamp': timestamp,
            'csv_path': str(csv_path),
            'json_path': str(json_path),
            'report_path': str(report_path),
            'plot_path': str(plot_path)
        },
        'summary': json_results['performance_summary'],
        'models_benchmarked': list(available_models.keys()),
        'successful_benchmarks': successful_benchmarks,
        'total_benchmarks': total_benchmarks
    })
    print(f"💾 State updated with benchmark results")
except Exception as e:
    logger.warning(f"Failed to update state: {e}")

# Display completion summary
print(f"\n🎉 Benchmarking Complete!")
print(f"{'='*80}")
print(f"📊 Results available in: {results_dir}")
print(f"📈 CSV: {csv_path.name}")
print(f"📋 JSON: {json_path.name}")
print(f"📄 Report: {report_path.name}")
print(f"🖼️ Plots: {plot_path.name}")

logger.info(f"Comprehensive benchmarking completed successfully. Results saved to {results_dir}")
print(f"\n✅ All benchmarking tasks completed successfully!")