<a href="https://colab.research.google.com/github/MMillward2012/deepmind_internship/blob/main/notebooks/7_benchmarks.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Import packages

In [1]:
%cd ..
!ls

/Users/matthew/Documents/deepmind_internship
README.md         [34mfigures[m[m           requirements.txt  [34mvenv-py311[m[m
[34mbenchmark_results[m[m [34mmodels[m[m            [34mresults[m[m
[34mdata[m[m              [34mnotebooks[m[m         [34msrc[m[m


In [2]:
import os
import time
import numpy as np
import pandas as pd
from pathlib import Path
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoConfig
from transformers.onnx import export
from transformers.onnx.features import FeaturesManager
import onnxruntime as ort
from onnxruntime.quantization import quantize_dynamic, QuantType
import torch


In [3]:
BASE_DIR = Path("models")
ONNX_OPSET = 13

In [4]:
def is_valid_model_dir(d):
    return (d / "config.json").exists() and ((d / "pytorch_model.bin").exists() or (d / "model.safetensors").exists())

In [5]:
model_dirs = [d for d in BASE_DIR.iterdir() if d.is_dir() and is_valid_model_dir(d)]
print("Found valid models:", [m.name for m in model_dirs])

Found valid models: ['all-MiniLM-L6-v2-financial-sentiment', 'distilbert-financial-sentiment', 'finbert-tone-financial-sentiment', 'SmolLM2-360M-Instruct-financial-sentiment', 'tinybert-financial-classifier', 'mobilebert-uncased-financial-sentiment']


In [6]:
class ONNXExportWrapper(torch.nn.Module):
    def __init__(self, model):
        super().__init__()
        self.model = model

    def forward(self, input_ids, attention_mask):
        # Call model with return_dict=False to get a tuple output
        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask, return_dict=False)
        # Return only the logits tensor (usually first element)
        return outputs[0]


In [7]:
def export_to_onnx(model_dir, onnx_path):
    print("🔍 Loading model and tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(model_dir)
    model = AutoModelForSequenceClassification.from_pretrained(model_dir)
    model.eval()

    wrapped_model = ONNXExportWrapper(model)  # Wrap the model here

    dummy_input = tokenizer("This company is doing great!", return_tensors="pt")

    print("🚀 Exporting to ONNX...")
    torch.onnx.export(
        wrapped_model,
        (dummy_input["input_ids"], dummy_input["attention_mask"]),
        str(onnx_path),
        input_names=["input_ids", "attention_mask"],
        output_names=["output"],
        dynamic_axes={
            "input_ids": {0: "batch_size", 1: "sequence_length"},
            "attention_mask": {0: "batch_size", 1: "sequence_length"},
            "output": {0: "batch_size"},
        },
        opset_version=17,  # Use >=14 due to scaled_dot_product_attention operator support
        do_constant_folding=True,
    )
    print(f"✅ Exported to {onnx_path}")


In [8]:
results = []

for model_dir in model_dirs:
    print(f"\n⏳ Processing {model_dir.name}...")
    
    onnx_dir = model_dir / "onnx"
    onnx_dir.mkdir(exist_ok=True)
    onnx_model_path = onnx_dir / "model.onnx"
    quantised_model_path = onnx_dir / "model-int8.onnx"

    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_dir)

    # Export ONNX if not already done
    if not onnx_model_path.exists():
        print("📦 Exporting to ONNX...")
        export_to_onnx(model_dir, onnx_model_path)
    else:
        print("✅ ONNX already exists.")


⏳ Processing all-MiniLM-L6-v2-financial-sentiment...
✅ ONNX already exists.

⏳ Processing distilbert-financial-sentiment...
✅ ONNX already exists.

⏳ Processing finbert-tone-financial-sentiment...
✅ ONNX already exists.

⏳ Processing SmolLM2-360M-Instruct-financial-sentiment...
✅ ONNX already exists.

⏳ Processing tinybert-financial-classifier...
✅ ONNX already exists.

⏳ Processing mobilebert-uncased-financial-sentiment...
✅ ONNX already exists.


In [9]:
import os
from onnxruntime.quantization import quantize_dynamic, QuantType

models_dir = "models"  # root directory containing model subfolders

def quantize_all_models(models_root):
    for model_name in os.listdir(models_root):
        model_path = os.path.join(models_root, model_name, "onnx", "model.onnx")
        
        if not os.path.isfile(model_path):
            print(f"[SKIP] No ONNX model found for {model_name} at expected path: {model_path}")
            continue
        
        quantized_model_path = os.path.join(models_root, model_name, "onnx", "model_quantized.onnx")
        print(f"[PROCESSING] Quantizing model '{model_name}'")
        
        try:
            quantize_dynamic(
                model_input=model_path,
                model_output=quantized_model_path,
                weight_type=QuantType.QInt8
            )
            print(f"[SUCCESS] Saved quantized model: {quantized_model_path}")
        except Exception as e:
            print(f"[ERROR] Failed to quantize {model_name}: {e}")

if __name__ == "__main__":
    quantize_all_models(models_dir)


[SKIP] No ONNX model found for .DS_Store at expected path: models/.DS_Store/onnx/model.onnx
[PROCESSING] Quantizing model 'all-MiniLM-L6-v2-financial-sentiment'




[SUCCESS] Saved quantized model: models/all-MiniLM-L6-v2-financial-sentiment/onnx/model_quantized.onnx
[PROCESSING] Quantizing model 'distilbert-financial-sentiment'




[SUCCESS] Saved quantized model: models/distilbert-financial-sentiment/onnx/model_quantized.onnx
[PROCESSING] Quantizing model 'finbert-tone-financial-sentiment'




[SUCCESS] Saved quantized model: models/finbert-tone-financial-sentiment/onnx/model_quantized.onnx
[SKIP] No ONNX model found for .gitkeep at expected path: models/.gitkeep/onnx/model.onnx
[PROCESSING] Quantizing model 'SmolLM2-360M-Instruct-financial-sentiment'




[SUCCESS] Saved quantized model: models/SmolLM2-360M-Instruct-financial-sentiment/onnx/model_quantized.onnx
[PROCESSING] Quantizing model 'tinybert-financial-classifier'




[SUCCESS] Saved quantized model: models/tinybert-financial-classifier/onnx/model_quantized.onnx
[PROCESSING] Quantizing model 'mobilebert-uncased-financial-sentiment'


  elem_type: 7
  shape {
    dim {
      dim_value: 3
    }
    dim {
      dim_value: 2
    }
  }
}
.
  elem_type: 7
  shape {
    dim {
      dim_value: 3
    }
    dim {
      dim_value: 2
    }
  }
}
.


[SUCCESS] Saved quantized model: models/mobilebert-uncased-financial-sentiment/onnx/model_quantized.onnx


In [10]:
import time
import onnxruntime as ort
import psutil
from transformers import AutoTokenizer

EXAMPLE_INPUT = "Stocks surged after the company reported record earnings."
MAX_LENGTH = 128
BENCHMARK_ITERATIONS = 100

def benchmark_onnx_model(onnx_path, tokenizer):
    # Load ONNX model session
    sess = ort.InferenceSession(str(onnx_path), providers=["CPUExecutionProvider"])

    # Measure memory usage after session creation
    process = psutil.Process()
    memory_mb = process.memory_info().rss / 1024 / 1024

    # Prepare input tokens
    inputs = tokenizer(EXAMPLE_INPUT, return_tensors="np", max_length=MAX_LENGTH, padding="max_length", truncation=True)

    # Warm-up
    for _ in range(10):
        sess.run(None, {"input_ids": inputs["input_ids"], "attention_mask": inputs["attention_mask"]})

    # Measure latency over multiple iterations
    times = []
    for _ in range(BENCHMARK_ITERATIONS):
        start = time.time()
        sess.run(None, {"input_ids": inputs["input_ids"], "attention_mask": inputs["attention_mask"]})
        times.append((time.time() - start) * 1000)  # milliseconds

    avg_latency = sum(times) / len(times)
    p99_latency = sorted(times)[int(len(times) * 0.99) - 1]

    # Calculate throughput: predictions per second (using avg latency)
    throughput = 1000 / avg_latency

    # Model size in MB
    model_size_mb = onnx_path.stat().st_size / (1024 * 1024)

    return {
        "avg_latency_ms": avg_latency,
        "p99_latency_ms": p99_latency,
        "memory_mb": memory_mb,
        "model_size_mb": model_size_mb,
        "throughput_preds_per_sec": throughput
    }


In [11]:
results = []

for model_dir in BASE_DIR.iterdir():
    if not model_dir.is_dir() or model_dir.name == ".gitkeep":
        continue

    onnx_path = model_dir / "onnx" / "model_quantized.onnx"
    if onnx_path.exists():
        print(f"Benchmarking model: {model_dir.name}")
        tokenizer = AutoTokenizer.from_pretrained(model_dir)
        result = benchmark_onnx_model(onnx_path, tokenizer)
        result["model"] = model_dir.name
        results.append(result)
    else:
        print(f"ONNX model not found for {model_dir.name}")

import pandas as pd
df = pd.DataFrame(results)
df = df.sort_values("avg_latency_ms").reset_index(drop=True)

print(df)


Benchmarking model: all-MiniLM-L6-v2-financial-sentiment
Benchmarking model: distilbert-financial-sentiment
Benchmarking model: finbert-tone-financial-sentiment
Benchmarking model: SmolLM2-360M-Instruct-financial-sentiment
Benchmarking model: tinybert-financial-classifier
Benchmarking model: mobilebert-uncased-financial-sentiment
   avg_latency_ms  p99_latency_ms    memory_mb  model_size_mb  \
0        9.503901       15.094757   930.500000      13.909289   
1       27.855840       46.684980   520.015625      21.980877   
2       55.561740       63.303947   941.390625      25.459671   
3       73.846185       98.513842   593.937500      64.228925   
4      159.092190      243.785858   686.375000     105.492896   
5      446.434224      538.583994  1168.546875     347.381740   

   throughput_preds_per_sec                                      model  
0                105.219951              tinybert-financial-classifier  
1                 35.899115       all-MiniLM-L6-v2-financial-senti

In [None]:
import time
import gc
import statistics
from contextlib import contextmanager
from dataclasses import dataclass, asdict
from typing import Dict, List, Optional, Tuple, Union
import onnxruntime as ort
import psutil
from transformers import AutoTokenizer
from pathlib import Path
import numpy as np
import pandas as pd
import platform
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.model_selection import train_test_split
import logging

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

@dataclass
class BenchmarkConfig:
    """Configuration for benchmarking"""
    max_length: int = 128
    benchmark_iterations: int = 100
    warmup_iterations: int = 20
    batch_sizes: List[int] = None
    test_csv_path: Optional[str] = None
    device_mode: str = "auto"  # auto, cpu, gpu, coreml
    enable_logging: bool = False
    
    def __post_init__(self):
        if self.batch_sizes is None:
            self.batch_sizes = [1, 2, 4]

@dataclass
class BenchmarkResult:
    """Results from benchmarking a model"""
    model: str
    batch_size: int
    avg_latency_ms: float
    p50_latency_ms: float
    p95_latency_ms: float
    p99_latency_ms: float
    std_latency_ms: float
    min_latency_ms: float
    max_latency_ms: float
    memory_delta_mb: float
    peak_memory_mb: float
    model_size_mb: float
    throughput_samples_per_sec: float
    tokens_per_sec: float
    cpu_utilization_avg: float
    gpu_available: bool
    provider: str
    session_creation_time_ms: float
    accuracy: Optional[float] = None
    f1_score: Optional[float] = None
    
    def to_dict(self) -> Dict:
        return asdict(self)

class ExecutionProviderManager:
    """Manages ONNX execution providers based on platform and preferences"""
    
    @staticmethod
    def get_available_providers() -> List[str]:
        return ort.get_available_providers()
    
    @staticmethod
    def get_execution_providers(mode: str = "auto") -> List[str]:
        available = ort.get_available_providers()
        
        if mode == "cpu":
            return ["CPUExecutionProvider"]
        elif mode == "gpu":
            gpu_providers = ["CUDAExecutionProvider", "ROCMExecutionProvider"]
            for provider in gpu_providers:
                if provider in available:
                    return [provider]
            logger.warning("No GPU providers available, falling back to CPU")
            return ["CPUExecutionProvider"]
        elif mode == "coreml":
            if "CoreMLExecutionProvider" in available:
                return ["CoreMLExecutionProvider"]
            logger.warning("CoreML not available, falling back to CPU")
            return ["CPUExecutionProvider"]
        else:  # auto mode
            system = platform.system()
            if system == "Darwin":
                # For macOS, prioritize CPU over CoreML as you mentioned
                return ["CPUExecutionProvider"]
            else:
                # For other systems, prefer GPU if available
                preferences = [
                    "CUDAExecutionProvider", 
                    "ROCMExecutionProvider", 
                    "OpenVINOExecutionProvider", 
                    "CPUExecutionProvider"
                ]
                return [p for p in preferences if p in available]

class ModelLoader:
    """Handles ONNX model loading and session creation"""
    
    @staticmethod
    def load_onnx_session(
        onnx_path: Path, 
        providers: List[str], 
        enable_logging: bool = False
    ) -> Tuple[ort.InferenceSession, float]:
        start = time.perf_counter()
        
        opts = ort.SessionOptions()
        opts.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
        opts.enable_mem_pattern = True
        opts.enable_cpu_mem_arena = True
        
        if not enable_logging:
            opts.log_severity_level = 3
        
        try:
            session = ort.InferenceSession(str(onnx_path), providers=providers, sess_options=opts)
            creation_time = (time.perf_counter() - start) * 1000
            
            # Verify the session was created with expected provider
            actual_providers = session.get_providers()
            logger.info(f"Session created with providers: {actual_providers}")
            
            return session, creation_time
            
        except Exception as e:
            # Fallback for provider errors
            if any("CoreML" in p for p in providers):
                logger.warning(f"CoreML fallback error, switching to CPU: {e}")
                providers = ["CPUExecutionProvider"]
                session = ort.InferenceSession(str(onnx_path), providers=providers, sess_options=opts)
                creation_time = (time.perf_counter() - start) * 1000
                return session, creation_time
            else:
                raise

class DataProcessor:
    """Handles data preprocessing and batch preparation"""
    
    def __init__(self, tokenizer, max_length: int = 128):
        self.tokenizer = tokenizer
        self.max_length = max_length
        
        # Example inputs for benchmarking
        self.example_inputs = [
            "Stocks surged after the company reported record earnings.",
            "The weather forecast predicts heavy rain throughout the weekend.", 
            "Scientists have discovered a new species of deep-sea creature.",
            "Technology companies are investing heavily in artificial intelligence research.",
            "The local community center will host a charity fundraising event next month."
        ]
    
    def prepare_batch_inputs(self, texts: List[str]) -> Dict[str, np.ndarray]:
        """Prepare batch inputs for ONNX model"""
        encoding = self.tokenizer(
            texts,
            return_tensors="np",
            max_length=self.max_length,
            padding="max_length",
            truncation=True
        )
        
        return {
            "input_ids": encoding["input_ids"].astype(np.int64),
            "attention_mask": encoding["attention_mask"].astype(np.int64)
        }
    
    def get_benchmark_texts(self, batch_size: int) -> List[str]:
        """Get texts for benchmarking with specified batch size"""
        multiplier = (batch_size // len(self.example_inputs)) + 1
        texts = (self.example_inputs * multiplier)[:batch_size]
        return texts
    
    def load_test_dataset(self, csv_path: Path) -> Tuple[List[str], List[int]]:
        """Load test dataset for accuracy evaluation"""
        try:
            df = pd.read_csv(csv_path, names=["label", "text"], encoding="latin1")
            
            # Handle different label formats
            if df["label"].dtype == "object":
                # String labels - map to integers
                unique_labels = df["label"].str.strip().unique()
                if set(unique_labels).issubset({"positive", "neutral", "negative"}):
                    label_map = {"positive": 0, "neutral": 1, "negative": 2}
                    df["label"] = df["label"].str.strip().map(label_map)
                else:
                    # Create mapping for any string labels
                    label_map = {label: idx for idx, label in enumerate(sorted(unique_labels))}
                    df["label"] = df["label"].str.strip().map(label_map)
            
            # Drop any rows with missing labels
            df = df.dropna(subset=["label"])
            
            # Split dataset
            _, test_df = train_test_split(
                df, 
                test_size=0.25, 
                random_state=42, 
                stratify=df["label"]
            )
            
            return test_df["text"].tolist(), test_df["label"].astype(int).tolist()
            
        except Exception as e:
            logger.error(f"Error loading test dataset: {e}")
            raise

class PerformanceMonitor:
    """Monitors system performance during benchmarking"""
    
    @staticmethod
    @contextmanager
    def cpu_monitor():
        """Context manager for monitoring CPU usage"""
        process = psutil.Process()
        cpu_samples = []
        # Initial call to initialize measurement
        process.cpu_percent(interval=None)
        try:
            yield cpu_samples
        finally:
            # Take final sample
            final_sample = process.cpu_percent(interval=None)
            if final_sample > 0:  # Only add if we got a valid reading
                cpu_samples.append(final_sample)
    
    @staticmethod
    def measure_memory_usage() -> float:
        """Get current memory usage in MB"""
        return psutil.Process().memory_info().rss / (1024**2)
    
    @staticmethod
    def get_model_size_mb(onnx_path: Path) -> float:
        """Get model file size in MB"""
        return onnx_path.stat().st_size / (1024**2)

class AccuracyEvaluator:
    """Handles model accuracy evaluation"""
    
    def __init__(self, session: ort.InferenceSession, data_processor: DataProcessor):
        self.session = session
        self.data_processor = data_processor
    
    def can_evaluate_classification(self) -> bool:
        """Check if the model can be evaluated for classification"""
        output_names = [output.name for output in self.session.get_outputs()]
        output_shapes = [output.shape for output in self.session.get_outputs()]
        
        # Check if we have logits output or single output that could be logits
        has_logits = "logits" in output_names
        has_single_output = len(output_names) == 1
        
        # Check if output shape suggests classification (batch_size, num_classes)
        has_classification_shape = any(
            len(shape) == 2 and (shape[1] is None or shape[1] > 1) 
            for shape in output_shapes if shape is not None
        )
        
        return has_logits or (has_single_output and has_classification_shape)
    
    def evaluate_classification(
        self, 
        texts: List[str], 
        labels: List[int]
    ) -> Tuple[float, float]:
        """Evaluate classification accuracy and F1 score"""
        if not self.can_evaluate_classification():
            raise ValueError("Model doesn't appear to be a classification model")
        
        # Prepare inputs
        inputs = self.data_processor.prepare_batch_inputs(texts)
        
        # Run inference
        outputs = self.session.run(None, inputs)
        
        # Get logits (assume first output contains logits)
        logits = outputs[0]
        
        # Handle different output shapes
        if len(logits.shape) == 3:
            # If 3D, take the first token (CLS token for BERT-like models)
            logits = logits[:, 0, :]
        elif len(logits.shape) == 1:
            # If 1D, reshape to 2D
            logits = logits.reshape(1, -1)
        
        # Get predictions
        predictions = np.argmax(logits, axis=1)
        
        # Calculate metrics
        accuracy = accuracy_score(labels, predictions)
        f1 = f1_score(labels, predictions, average="weighted")
        
        return accuracy, f1

class LatencyBenchmarker:
    """Handles latency benchmarking"""
    
    def __init__(self, config: BenchmarkConfig):
        self.config = config
    
    def warmup_session(self, session: ort.InferenceSession, inputs: Dict[str, np.ndarray]):
        """Warm up the session"""
        logger.info(f"Warming up for {self.config.warmup_iterations} iterations...")
        for i in range(self.config.warmup_iterations):
            session.run(None, inputs)
            if i % 5 == 0:
                gc.collect()
    
    def measure_latency_and_cpu(
        self, 
        session: ort.InferenceSession, 
        inputs: Dict[str, np.ndarray]
    ) -> Tuple[List[float], float]:
        """Measure latency and CPU usage"""
        times = []
        gc.collect()
        
        with PerformanceMonitor.cpu_monitor() as cpu_samples:
            for i in range(self.config.benchmark_iterations):
                if i and i % 25 == 0:
                    gc.collect()
                
                start_time = time.perf_counter()
                session.run(None, inputs)
                end_time = time.perf_counter()
                
                times.append((end_time - start_time) * 1000)
                
                # Sample CPU usage periodically
                if i and i % 10 == 0:
                    cpu_sample = psutil.Process().cpu_percent(interval=None)
                    if cpu_sample > 0:
                        cpu_samples.append(cpu_sample)
        
        avg_cpu = statistics.mean(cpu_samples) if cpu_samples else 0.0
        return times, avg_cpu
    
    def calculate_stats(self, times: List[float], batch_size: int, total_tokens: int) -> Dict[str, float]:
        """Calculate latency statistics"""
        sorted_times = sorted(times)
        n = len(times)
        mean_time = statistics.mean(times)
        
        return {
            "avg_latency_ms": mean_time,
            "p50_latency_ms": sorted_times[n // 2],
            "p95_latency_ms": sorted_times[min(int(n * 0.95), n-1)],
            "p99_latency_ms": sorted_times[min(int(n * 0.99), n-1)],
            "std_latency_ms": statistics.stdev(times) if n > 1 else 0.0,
            "min_latency_ms": sorted_times[0],
            "max_latency_ms": sorted_times[-1],
            "throughput_samples_per_sec": (1000 * batch_size) / mean_time,
            "tokens_per_sec": (1000 * total_tokens) / mean_time
        }

class ONNXModelBenchmarker:
    """Main benchmarking class that orchestrates all components"""
    
    def __init__(self, config: BenchmarkConfig, tokenizer):
        self.config = config
        self.tokenizer = tokenizer
        self.data_processor = DataProcessor(tokenizer, config.max_length)
        self.latency_benchmarker = LatencyBenchmarker(config)
        self.provider_manager = ExecutionProviderManager()
        
    def benchmark_model(self, onnx_path: Path, batch_size: int) -> Optional[BenchmarkResult]:
        """Benchmark a single model with given batch size"""
        try:
            logger.info(f"Benchmarking {onnx_path.name} with batch size {batch_size}")
            
            # Get execution providers
            providers = self.provider_manager.get_execution_providers(self.config.device_mode)
            logger.info(f"Using providers: {providers}")
            
            # Load model
            session, creation_time = ModelLoader.load_onnx_session(
                onnx_path, providers, self.config.enable_logging
            )
            
            # Prepare benchmark data
            texts = self.data_processor.get_benchmark_texts(batch_size)
            inputs = self.data_processor.prepare_batch_inputs(texts)
            
            # Initial test run
            session.run(None, inputs)
            
            # Measure memory before benchmarking
            memory_before = PerformanceMonitor.measure_memory_usage()
            
            # Warm up
            self.latency_benchmarker.warmup_session(session, inputs)
            
            # Run benchmark
            logger.info(f"Running {self.config.benchmark_iterations} benchmark iterations...")
            times, cpu_avg = self.latency_benchmarker.measure_latency_and_cpu(session, inputs)
            
            # Measure memory after benchmarking
            memory_after = PerformanceMonitor.measure_memory_usage()
            
            # Calculate performance stats
            stats = self.latency_benchmarker.calculate_stats(
                times, batch_size, inputs["input_ids"].size
            )
            
            # Get model info
            model_size = PerformanceMonitor.get_model_size_mb(onnx_path)
            gpu_available = any(
                gpu in providers for gpu in ["CUDAExecutionProvider", "ROCMExecutionProvider"]
            )
            
            # Evaluate accuracy if possible and dataset provided
            accuracy = f1 = None
            if self.config.test_csv_path:
                try:
                    evaluator = AccuracyEvaluator(session, self.data_processor)
                    if evaluator.can_evaluate_classification():
                        test_texts, test_labels = self.data_processor.load_test_dataset(
                            Path(self.config.test_csv_path)
                        )
                        # Use subset for evaluation to match batch size
                        eval_texts = test_texts[:batch_size]
                        eval_labels = test_labels[:batch_size]
                        accuracy, f1 = evaluator.evaluate_classification(eval_texts, eval_labels)
                        logger.info(f"Accuracy: {accuracy:.2%}, F1 score: {f1:.2%}")
                    else:
                        logger.warning("Model doesn't appear to support classification evaluation")
                except Exception as e:
                    logger.warning(f"Accuracy evaluation failed: {e}")
            
            # Create result
            result = BenchmarkResult(
                model=onnx_path.parent.parent.name,  # Get model name from parent.parent since structure is models/modelName/onnx/model_quantized.onnx
                batch_size=batch_size,
                memory_delta_mb=memory_after - memory_before,
                peak_memory_mb=max(memory_before, memory_after),
                model_size_mb=model_size,
                cpu_utilization_avg=cpu_avg,
                gpu_available=gpu_available,
                provider=providers[0],
                session_creation_time_ms=creation_time,
                accuracy=accuracy,
                f1_score=f1,
                **stats
            )
            
            return result
            
        except Exception as e:
            logger.error(f"Benchmark failed for {onnx_path}: {e}")
            return None

class ResultsManager:
    """Manages benchmark results and reporting"""
    
    @staticmethod
    def save_results(results: List[BenchmarkResult], output_dir: Path = Path("results")):
        """Save results to CSV and JSON files"""
        output_dir.mkdir(exist_ok=True)
        
        # Convert to DataFrame
        df = pd.DataFrame([result.to_dict() for result in results])
        
        # Save files
        df.to_csv(output_dir / "benchmark_results.csv", index=False)
        df.to_json(output_dir / "benchmark_results.json", indent=2)
        
        logger.info(f"Results saved to {output_dir}")
        
        return df
    
    @staticmethod
    def print_summary(results: List[BenchmarkResult]):
        """Print benchmark summary"""
        if not results:
            logger.warning("No results to summarize")
            return
        
        df = pd.DataFrame([result.to_dict() for result in results])
        
        # Select columns for summary
        summary_cols = [
            "model", "batch_size", "avg_latency_ms", "p99_latency_ms",
            "throughput_samples_per_sec", "memory_delta_mb", "provider"
        ]
        
        # Add accuracy columns if available
        if df["accuracy"].notna().any():
            summary_cols.extend(["accuracy", "f1_score"])
        
        summary = df[summary_cols].sort_values(["model", "batch_size"])
        
        print("\n" + "="*80)
        print(" BENCHMARK SUMMARY")
        print("="*80)
        print(summary.to_string(index=False, float_format="%.3f"))
        
        # Print best performers by batch size
        for batch_size in sorted(df["batch_size"].unique()):
            batch_df = df[df["batch_size"] == batch_size]
            fastest = batch_df.loc[batch_df["avg_latency_ms"].idxmin()]
            highest_throughput = batch_df.loc[batch_df["throughput_samples_per_sec"].idxmax()]
            
            print(f"\nBatch Size {batch_size}:")
            print(f"  Fastest: {fastest['model']} ({fastest['avg_latency_ms']:.1f} ms)")
            print(f"  Highest throughput: {highest_throughput['model']} "
                  f"({highest_throughput['throughput_samples_per_sec']:.1f} samples/s)")

# Convenience function to discover models in your directory structure
def discover_models(models_dir: Union[str, Path]) -> List[Tuple[str, Path]]:
    """Discover all available models in the directory structure"""
    models_dir = Path(models_dir)
    discovered = []
    
    for model_dir in models_dir.glob("*"):
        if not model_dir.is_dir():
            continue
            
        model_name = model_dir.name
        onnx_path = model_dir / "onnx" / "model_quantized.onnx"
        
        if onnx_path.exists():
            discovered.append((model_name, onnx_path))
        else:
            # Check for alternative ONNX files
            onnx_dir = model_dir / "onnx"
            if onnx_dir.exists():
                onnx_files = list(onnx_dir.glob("*.onnx"))
                if onnx_files:
                    discovered.append((model_name, onnx_files[0]))
    
    return discovered
    """Create a benchmark configuration with custom parameters"""
    return BenchmarkConfig(**kwargs)

# Convenience functions for notebook use
def create_benchmark_config(**kwargs) -> BenchmarkConfig:
    """Create a benchmark configuration with custom parameters"""
    return BenchmarkConfig(**kwargs)

def run_single_benchmark(
    onnx_path: Union[str, Path], 
    tokenizer, 
    batch_size: int = 1,
    config: Optional[BenchmarkConfig] = None
) -> Optional[BenchmarkResult]:
    """Run benchmark on a single model - convenient for notebook use"""
    if config is None:
        config = BenchmarkConfig()
    
    benchmarker = ONNXModelBenchmarker(config, tokenizer)
    return benchmarker.benchmark_model(Path(onnx_path), batch_size)

def run_full_benchmark(
    models_dir: Union[str, Path],
    tokenizer,
    config: Optional[BenchmarkConfig] = None,
    save_results: bool = True
) -> List[BenchmarkResult]:
    """Run full benchmark suite - convenient for notebook use"""
    if config is None:
        config = BenchmarkConfig()
    
    models_dir = Path(models_dir)
    benchmarker = ONNXModelBenchmarker(config, tokenizer)
    
    all_results = []
    
    # Look for models in the specific structure: models/___modelName___/onnx/model_quantized.onnx
    model_dirs = [d for d in models_dir.glob("*") if d.is_dir()]
    
    for model_dir in model_dirs:
        model_name = model_dir.name
        logger.info(f"Looking for model: {model_name}")
        
        # Check for the specific path structure
        onnx_path = model_dir / "onnx" / "model_quantized.onnx"
        
        if not onnx_path.exists():
            # Fallback: look for any .onnx files in onnx subdirectory
            onnx_dir = model_dir / "onnx"
            if onnx_dir.exists():
                onnx_files = list(onnx_dir.glob("*.onnx"))
                if onnx_files:
                    onnx_path = onnx_files[0]
                    logger.info(f"Found alternative ONNX file: {onnx_path.name}")
                else:
                    logger.warning(f"No ONNX files found in {onnx_dir}")
                    continue
            else:
                logger.warning(f"No onnx directory found in {model_dir}")
                continue
        else:
            logger.info(f"Found model_quantized.onnx for {model_name}")
        
        # Benchmark across all batch sizes
        for batch_size in config.batch_sizes:
            logger.info(f"Benchmarking {model_name} with batch size {batch_size}")
            result = benchmarker.benchmark_model(onnx_path, batch_size)
            if result:
                all_results.append(result)
    
    if save_results and all_results:
        ResultsManager.save_results(all_results)
    
    ResultsManager.print_summary(all_results)
    return all_results

# Example usage functions for notebook
def quick_test(models_dir: str = "models", batch_size: int = 1):
    """Quick test run with minimal iterations - useful for testing setup"""
    from transformers import AutoTokenizer
    
    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    
    config = create_benchmark_config(
        benchmark_iterations=5,  # Very fast for testing
        warmup_iterations=2,
        batch_sizes=[batch_size],
        device_mode="cpu"
    )
    
    return run_full_benchmark(models_dir, tokenizer, config)

def full_benchmark(models_dir: str = "models", include_accuracy: bool = True):
    """Complete benchmark with all metrics"""
    from transformers import AutoTokenizer
    
    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    
    config = create_benchmark_config(
        max_length=128,
        benchmark_iterations=100,
        warmup_iterations=20,
        batch_sizes=[1, 2, 4, 8],
        device_mode="cpu",
        test_csv_path="data/FinancialPhraseBank/all-data.csv" if include_accuracy else None
    )
    
    return run_full_benchmark(models_dir, tokenizer, config)

if __name__ == "__main__":
    # Quick discovery of available models
    print("Discovering models...")
    models = discover_models("models")
    for name, path in models:
        print(f"  ✓ {name}: {path}")
    
    if models:
        # Alternatively run a quick test using quick_test()
        print(f"\nFound {len(models)} models. Running full benchmark...")
        results = full_benchmark()
    else:
        print("No models found!")

Discovering models...
  ✓ all-MiniLM-L6-v2-financial-sentiment: models/all-MiniLM-L6-v2-financial-sentiment/onnx/model_quantized.onnx
  ✓ distilbert-financial-sentiment: models/distilbert-financial-sentiment/onnx/model_quantized.onnx
  ✓ finbert-tone-financial-sentiment: models/finbert-tone-financial-sentiment/onnx/model_quantized.onnx
  ✓ SmolLM2-360M-Instruct-financial-sentiment: models/SmolLM2-360M-Instruct-financial-sentiment/onnx/model_quantized.onnx
  ✓ tinybert-financial-classifier: models/tinybert-financial-classifier/onnx/model_quantized.onnx
  ✓ mobilebert-uncased-financial-sentiment: models/mobilebert-uncased-financial-sentiment/onnx/model_quantized.onnx

Found 6 models. Running full benchmark...
