In [1]:
%cd ..

/Users/matthew/Documents/deepmind_internship


In [2]:
# ═══════════════════════════════════════════════════════════════════════════════
# 🎯 BENCHMARK CONFIGURATION - CHANGE THESE SETTINGS TO CONTROL YOUR BENCHMARK
# ═══════════════════════════════════════════════════════════════════════════════

# 1. MODEL SELECTION - Choose which models to benchmark
# Simply comment/uncomment the models you want to test:
TARGET_MODELS = [
    "tinybert-financial-classifier",
    "tinybert-financial-classifier-fine-tuned", 
    "tinybert-financial-classifier-pruned",
    # "finbert-tone-financial-sentiment",
    # "distilbert-financial-sentiment",
    # "all-MiniLM-L6-v2-financial-sentiment",
    # "mobilebert-uncased-financial-sentiment",
    # "SmolLM2-360M-Instruct-financial-sentiment",
]

# Set to None to test ALL models found in the models/ directory:
# TARGET_MODELS = None

# 2. SPEED vs THOROUGHNESS - Choose your trade-off  
FAST_MODE = True              # ← CHANGE THIS:
                              # True  = Quick testing (15 iterations, batches [1,8])
                              # False = Thorough testing (30 iterations, batches [1,4,8,16])

# 3. ACCURACY EVALUATION - Include accuracy metrics?
INCLUDE_ACCURACY = True       # ← CHANGE THIS:
                              # True  = Include accuracy evaluation (slower)
                              # False = Skip accuracy evaluation (faster)

# ═══════════════════════════════════════════════════════════════════════════════
# 🔧 ADVANCED SETTINGS - Only change these if you know what you're doing
# ═══════════════════════════════════════════════════════════════════════════════

# Data paths (usually don't need to change these)
MODELS_DIRECTORY = "models"
ACCURACY_DATASET_PATH = "data/FinancialPhraseBank/all-data.csv"

# Custom batch sizes (override the fast/slow mode defaults)
CUSTOM_BATCH_SIZES = None     # ← Example: [1, 2, 4] or leave as None to use defaults

# Custom iteration counts (override the fast/slow mode defaults)  
CUSTOM_ITERATIONS = None      # ← Example: 50 or leave as None to use defaults
CUSTOM_WARMUP = None          # ← Example: 20 or leave as None to use defaults
CUSTOM_SAMPLE_SIZE = None     # ← Example: 1000 or leave as None to use defaults

print("📋 Configuration loaded! Run all cells to execute benchmark with these settings.")

📋 Configuration loaded! Run all cells to execute benchmark with these settings.


In [3]:
!ls

README.md        [34mdata[m[m             [34mnotebooks[m[m        [34mresults[m[m
[34manalysis_results[m[m [34mmodels[m[m           requirements.txt [34mvenv-py311[m[m


In [4]:
# Cell 1: Setup & Configuration
import gc
import logging
import platform
import statistics
import time
from contextlib import contextmanager
from dataclasses import dataclass, asdict
from pathlib import Path
from typing import Dict, List, Optional, Tuple

import numpy as np
import onnxruntime as ort
import pandas as pd
import psutil
from sklearn.metrics import accuracy_score, f1_score, precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import AutoTokenizer
import pickle

# Configure logging for clear output in the notebook
for handler in logging.root.handlers[:]:
    logging.root.removeHandler(handler)
logging.basicConfig(level=logging.WARNING, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)


@dataclass
class BenchmarkConfig:
    """Configuration for the entire benchmarking run."""
    benchmark_iterations: int = 100
    warmup_iterations: int = 20
    batch_sizes: List[int] = None
    accuracy_sample_size: int = 500
    test_csv_path: Optional[str] = None
    device_mode: str = "auto"  # "auto", "cpu", or "gpu"

    def __post_init__(self):
        if self.batch_sizes is None:
            self.batch_sizes = [1, 2, 4, 8]

@dataclass
class BenchmarkResult:
    """A structured class to hold results from a single benchmark run."""
    model: str
    batch_size: int
    avg_latency_ms: float
    p95_latency_ms: float
    throughput_samples_per_sec: float
    peak_memory_mb: float
    model_size_mb: float
    provider: str
    accuracy: Optional[float] = None
    f1_score: Optional[float] = None
    weighted_accuracy: Optional[float] = None
    weighted_f1_score: Optional[float] = None
    # UPDATED: Added new fields for confidence and per-class metrics
    avg_confidence_correct: Optional[float] = None
    avg_confidence_incorrect: Optional[float] = None
    per_class_metrics: Optional[Dict] = None


    def to_dict(self) -> Dict:
        # UPDATED: Flatten the per_class_metrics for easier CSV export
        flat_dict = asdict(self)
        per_class = flat_dict.pop("per_class_metrics", {})
        if per_class:
            for class_name, metrics in per_class.items():
                for metric_name, value in metrics.items():
                    flat_dict[f"{class_name}_{metric_name}"] = value
        return flat_dict

In [5]:
# Cell 2: Hardware & Model Loading Components
class ExecutionProviderManager:
    """Manages ONNX execution providers based on platform and preferences."""
    @staticmethod
    def get_execution_providers(mode: str = "auto") -> List: # Return type is now just List
        # Force CPU-only for consistent benchmarking (uncomment to enable CoreML/GPU)
        return ['CPUExecutionProvider']
        
        # Original hardware acceleration code (commented out):
        # if platform.system() == "Darwin" and ort.get_device() == "ARM64":
        #     # Optimized CoreML settings for maximum speed
        #     return [
        #         ('CoreMLExecutionProvider', {
        #             'coreml_flags': 'COREML_FLAG_ENABLE_ON_SUBGRAPH',
        #             'coreml_compute_units': 'ALL'  # Use all available compute units
        #         }),
        #         'CPUExecutionProvider'
        #     ]
        # 
        # # Fallback for other systems (Linux/Windows with GPU)
        # available = ort.get_available_providers()
        # preferences = ["CoreMLExecutionProvider", "CPUExecutionProvider"]
        # chosen = [p for p in preferences if p in available]
        # print(chosen)
        # return chosen

class ModelLoader:
    """Handles loading an ONNX model into an inference session."""
    @staticmethod
    def load_onnx_session(onnx_path: Path, providers: List[str]) -> ort.InferenceSession:
        opts = ort.SessionOptions()
        
        # Speed-optimized settings for financial trading latency
        opts.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
        opts.enable_profiling = False
        opts.enable_mem_pattern = False  # Disable for speed
        opts.enable_cpu_mem_arena = False  # Disable for speed
        
        # Aggressive threading for maximum speed
        opts.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL
        opts.inter_op_num_threads = 1  
        opts.intra_op_num_threads = 0  # Use all available cores
        
        return ort.InferenceSession(str(onnx_path), providers=providers, sess_options=opts)


In [6]:
# Cell 3: Fixed Data Handling Component
class DataProcessor:
    """Handles data preprocessing and batch preparation - FIXED for consistency with training."""
    def __init__(self, tokenizer, max_length: int = 128):
        self.tokenizer, self.max_length = tokenizer, max_length
        self.example_inputs = ["Stocks surged after the company reported record earnings."]
        self.label_encoder = None

    def prepare_batch_inputs(self, texts: List[str]) -> Dict[str, np.ndarray]:
        encoding = self.tokenizer(
            texts, return_tensors="np", max_length=self.max_length,
            padding="max_length", truncation=True
        )
        return {k: v.astype(np.int64) for k, v in encoding.items()}

    def load_label_encoder(self, model_dir: Path) -> LabelEncoder:
        """Load the label encoder used during training."""
        label_encoder_path = model_dir / "label_encoder.pkl"
        if label_encoder_path.exists():
            with open(label_encoder_path, 'rb') as f:
                self.label_encoder = pickle.load(f)
                return self.label_encoder
        else:
            return None

    def load_test_dataset(self, csv_path: Path, model_dir: Path) -> Tuple[List[str], List[int]]:
        """Load test dataset using EXACT same preprocessing as training."""
        self.load_label_encoder(model_dir)
        
        df = pd.read_csv(csv_path, header=None, names=["label", "sentence"], encoding="latin-1")
        df["sentence"] = df["sentence"].str.strip('"')
        
        if self.label_encoder is not None:
            df["label_encoded"] = self.label_encoder.transform(df["label"])
        else:
            logger.warning("Using fallback label encoding - this might cause accuracy issues!")
            unique_labels = sorted(df["label"].unique())
            label_to_id = {label: idx for idx, label in enumerate(unique_labels)}
            df["label_encoded"] = df["label"].map(label_to_id)
        
        _, test_df = train_test_split(
            df, test_size=0.25, random_state=42, stratify=df["label"]
        )
        
        return test_df["sentence"].tolist(), test_df["label_encoded"].astype(int).tolist()


In [7]:
# Cell 4: Performance Measurement Components
class PerformanceMonitor:
    """Monitors system performance during benchmarking."""
    @staticmethod
    def measure_memory_usage() -> float:
        return psutil.Process().memory_info().rss / (1024**2)

    @staticmethod
    def get_model_size_mb(onnx_path: Path) -> float:
        return onnx_path.stat().st_size / (1024**2)

class LatencyBenchmarker:
    """Handles the details of latency benchmarking."""
    def __init__(self, config: BenchmarkConfig):
        self.config = config
    
    def warmup_session(self, session: ort.InferenceSession, inputs: Dict[str, np.ndarray]):
        # Ultra-minimal warmup - just enough to avoid first-run penalty
        for _ in range(3):  # Reduced to absolute minimum
            session.run(None, inputs)
    
    def measure_latency(self, session: ort.InferenceSession, inputs: Dict[str, np.ndarray]) -> List[float]:
        # Maximum speed measurement - no overhead
        times = []
        
        for _ in range(self.config.benchmark_iterations):
            start = time.perf_counter()
            session.run(None, inputs)
            times.append((time.perf_counter() - start) * 1000)
        
        return times

In [8]:
# Cell 5: Enhanced Accuracy Evaluation Component

# UPDATED: New comprehensive metrics calculation function
def calculate_ordinal_metrics(y_true, y_pred, y_prob, labels_map):
    """
    Calculates standard, ordinally-weighted, confidence, and per-class metrics.
    'labels_map' should be an ordered list of class names, e.g., ['negative', 'neutral', 'positive'].
    """
    y_true = np.array(y_true)
    y_pred = np.array(y_pred)
    y_prob = np.array(y_prob)

    # --- Standard & Ordinal Metrics ---
    standard_accuracy = accuracy_score(y_true, y_pred)
    standard_f1 = f1_score(y_true, y_pred, average="weighted", zero_division=0)
    
    penalized_y_pred = [true if abs(true - pred) <= 1 else pred for true, pred in zip(y_true, y_pred)]
    weighted_accuracy = accuracy_score(y_true, penalized_y_pred)
    weighted_f1 = f1_score(y_true, penalized_y_pred, average="weighted", zero_division=0)

    # --- Confidence Analysis ---
    is_correct_mask = (y_true == y_pred)
    confidences = y_prob.max(axis=1)
    
    avg_conf_correct = np.mean(confidences[is_correct_mask]) if np.any(is_correct_mask) else 0.0
    avg_conf_incorrect = np.mean(confidences[~is_correct_mask]) if np.any(~is_correct_mask) else 0.0

    # --- Per-Class Metrics ---
    p, r, f, s = precision_recall_fscore_support(y_true, y_pred, average=None, labels=range(len(labels_map)), zero_division=0)
    per_class_metrics = {
        labels_map[i]: {"precision": p[i], "recall": r[i], "f1_score": f[i], "support": s[i]}
        for i in range(len(labels_map))
    }
    
    return {
        "accuracy": standard_accuracy,
        "f1_score": standard_f1,
        "weighted_accuracy": weighted_accuracy,
        "weighted_f1_score": weighted_f1,
        "avg_confidence_correct": avg_conf_correct,
        "avg_confidence_incorrect": avg_conf_incorrect,
        "per_class_metrics": per_class_metrics
    }


class AccuracyEvaluator:
    """Handles model accuracy and F1 score evaluation with enhanced debugging."""
    def __init__(self, session: ort.InferenceSession, data_processor: DataProcessor):
        self.session, self.data_processor = session, data_processor
    
    def evaluate(self, texts: List[str], labels: List[int], batch_size: int, max_samples: int):
        num_samples = min(len(texts), max_samples)
        eval_texts, eval_labels = texts[:num_samples], labels[:num_samples]
        
        all_predictions = []
        all_probabilities = []
        
        for i in range(0, num_samples, batch_size):
            batch_texts = eval_texts[i: i + batch_size]
            inputs = self.data_processor.prepare_batch_inputs(batch_texts)
            
            model_inputs = {inp.name for inp in self.session.get_inputs()}
            valid_inputs = {k: v for k, v in inputs.items() if k in model_inputs}
            
            outputs = self.session.run(None, valid_inputs)
            logits = outputs[0]
            
            probabilities = np.exp(logits) / np.sum(np.exp(logits), axis=1, keepdims=True)
            all_probabilities.extend(probabilities)
            
            predictions = np.argmax(logits, axis=1)
            all_predictions.extend(predictions)
        
        # UPDATED: Call the new comprehensive metrics function
        if self.data_processor.label_encoder:
            ordered_labels = self.data_processor.label_encoder.classes_
            metrics = calculate_ordinal_metrics(eval_labels, all_predictions, all_probabilities, ordered_labels)
            return metrics
        else: # Fallback
            accuracy = accuracy_score(eval_labels, all_predictions)
            f1 = f1_score(eval_labels, all_predictions, average="weighted")
            return {"accuracy": accuracy, "f1_score": f1}

In [9]:
class ONNXModelBenchmarker:
    """Orchestrates all components to run a benchmark for a single model."""
    def __init__(self, config: BenchmarkConfig, tokenizer, model_dir: Path):
        self.config = config
        self.model_dir = model_dir
        self.data_processor = DataProcessor(tokenizer, max_length=128)
        self.latency_benchmarker = LatencyBenchmarker(config)
    
    # UPDATED: Added 'model_name' as a parameter
    def _run_inference_and_get_metrics(self, session, model_name, onnx_path, batch_size):
        """Helper function to run the core benchmark and accuracy logic."""
        inputs = self.data_processor.prepare_batch_inputs(self.data_processor.example_inputs * batch_size)
        model_inputs = {inp.name for inp in session.get_inputs()}
        valid_inputs = {k: v for k, v in inputs.items() if k in model_inputs}

        self.latency_benchmarker.warmup_session(session, valid_inputs)
        times = self.latency_benchmarker.measure_latency(session, valid_inputs)
        
        avg_latency = statistics.mean(times)
        p95_latency = np.percentile(times, 95)
        std_latency = statistics.stdev(times) if len(times) > 1 else 0.0
        
        # Simplified output for speed
        print(f"   -> {avg_latency:.1f}ms avg ({std_latency:.1f}ms std)")
        
        metrics = {}
        if self.config.test_csv_path:
            evaluator = AccuracyEvaluator(session, self.data_processor)
            texts, labels = self.data_processor.load_test_dataset(
                Path(self.config.test_csv_path), self.model_dir
            )
            metrics = evaluator.evaluate(texts, labels, batch_size, self.config.accuracy_sample_size)

        return BenchmarkResult(
            model=model_name, batch_size=batch_size,
            avg_latency_ms=avg_latency, p95_latency_ms=p95_latency,
            throughput_samples_per_sec=(1000 * batch_size) / avg_latency if avg_latency > 0 else 0,
            peak_memory_mb=PerformanceMonitor.measure_memory_usage(),
            model_size_mb=PerformanceMonitor.get_model_size_mb(onnx_path),
            provider=session.get_providers()[0],
            accuracy=metrics.get("accuracy"),
            f1_score=metrics.get("f1_score"),
            weighted_accuracy=metrics.get("weighted_accuracy"),
            weighted_f1_score=metrics.get("weighted_f1_score"),
            avg_confidence_correct=metrics.get("avg_confidence_correct"),
            avg_confidence_incorrect=metrics.get("avg_confidence_incorrect"),
            per_class_metrics=metrics.get("per_class_metrics")
        )

    def benchmark_model(self, model_name: str, onnx_path: Path, batch_size: int) -> Optional[BenchmarkResult]:
        print(f"   -> Running benchmark for batch size: {batch_size}")
        
        try:
            print("   -> Attempting with optimal providers (CoreML)...")
            providers = ExecutionProviderManager.get_execution_providers(self.config.device_mode)
            session = ModelLoader.load_onnx_session(onnx_path, providers)
            # UPDATED: Pass 'model_name' to the helper function
            return self._run_inference_and_get_metrics(session, model_name, onnx_path, batch_size)
        
        except ort.capi.onnxruntime_pybind11_state.Fail as e:
            print(f"   -> ⚠️  CoreML execution failed. Retrying with CPU-only provider.")
            try:
                providers = ['CPUExecutionProvider']
                session = ModelLoader.load_onnx_session(onnx_path, providers)
                # UPDATED: Pass 'model_name' to the helper function
                return self._run_inference_and_get_metrics(session, model_name, onnx_path, batch_size)
            except Exception as cpu_e:
                logger.error(f"❌ Benchmark failed on CPU fallback for {model_name}: {cpu_e}", exc_info=True)
                return None
        
        except Exception as e:
            logger.error(f"❌ An unexpected error occurred for {model_name}: {e}", exc_info=True)
            return None

In [10]:
# Cell 7: Results Management
class ResultsManager:
    """Manages benchmark results and reporting."""
    @staticmethod
    def save_results(results: List[BenchmarkResult], output_dir: Path = Path("results")):
        if not results: return
        output_dir.mkdir(exist_ok=True)
        df = pd.DataFrame([r.to_dict() for r in results])
        df.to_csv(output_dir / "benchmark_results_cpu.csv", index=False)


    @staticmethod
    def print_summary(results: List[BenchmarkResult]):
        if not results: return
        df = pd.DataFrame([r.to_dict() for r in results if r])
        
        # UPDATED: Main summary table with new high-level metrics
        main_summary_cols = [
            "model", "batch_size", "provider", "avg_latency_ms", "accuracy", "f1_score", 
            "weighted_f1_score", "avg_confidence_correct", "avg_confidence_incorrect"
        ]
        # Filter out columns that may not exist if a run failed partially
        main_summary_cols = [col for col in main_summary_cols if col in df.columns]

        print("\n" + "="*120 + "\n📊 BENCHMARK SUMMARY\n" + "="*120)
        print(df[main_summary_cols].to_string(index=False, float_format="%.3f"))
        print("="*120)

        # UPDATED: Separate, detailed report for per-class metrics
        print("\n" + "="*120 + "\n🔬 PER-CLASS METRICS (PRECISION / RECALL / F1-SCORE)\n" + "="*120)
        for _, row in df.iterrows():
            print(f"\nModel: {row['model']} | Batch Size: {row['batch_size']}")
            
            # Reconstruct per-class metrics from flattened CSV columns
            class_names = sorted(list(set([k.split('_')[0] for k in df.columns if '_precision' in k])))
            
            header = f"{'CLASS':<15}" + "".join([f"{metric.upper():>12}" for metric in ["PRECISION", "RECALL", "F1-SCORE", "SUPPORT"]])
            print(header)
            print("-" * len(header))

            for name in class_names:
                p = row.get(f"{name}_precision", 0.0)
                r = row.get(f"{name}_recall", 0.0)
                f1 = row.get(f"{name}_f1_score", 0.0)
                s = int(row.get(f"{name}_support", 0))
                print(f"{name:<15}{p:>12.3f}{r:>12.3f}{f1:>12.3f}{s:>12d}")
        print("="*120)

In [11]:
# Cell 8: Model Discovery Functions
def discover_models(models_dir: str, target_models: List[str] = None) -> List[Tuple[str, Path, Path]]:
    """Discover ONNX models with flexible filtering."""
    valid_models = []
    
    for model_dir in Path(models_dir).iterdir():
        if not model_dir.is_dir() or not (model_dir / "onnx").exists(): 
            continue
        
        # If target_models is specified, only process those models
        if target_models and model_dir.name not in target_models:
            continue
            
        # Add standard models if they exist
        standard_path = model_dir / "onnx" / "model.onnx"
        if standard_path.exists():
            valid_models.append((f"{model_dir.name}-standard", standard_path, model_dir))
            print(f"✅ Found: {model_dir.name}")
        
        # quant_path = model_dir / "onnx" / "model-quantised.onnx"
        # if quant_path.exists():
        #     valid_models.append((f"{model_dir.name}-quant", quant_path, model_dir))

    return valid_models

def run_full_benchmark(models_dir: str, config: BenchmarkConfig, target_models: List[str] = None):
    """Run the full benchmark suite on discovered models with optional filtering."""
    all_results = []
    
    valid_models = discover_models(models_dir, target_models)
    if not valid_models: 
        print("❌ No valid models found!")
        return

    for model_name, onnx_path, model_dir in valid_models:
        print(f"\n--- ⏳ Now processing: {model_name} ---")
        
        try:
            tokenizer = AutoTokenizer.from_pretrained(model_dir)
        except Exception as e:
            logger.error(f"Failed to load tokenizer for {model_name}: {e}")
            continue
        
        for batch_size in config.batch_sizes:
            # Create a new benchmarker instance for EACH run to prevent state leakage
            benchmarker = ONNXModelBenchmarker(config, tokenizer, model_dir)
            
            result = benchmarker.benchmark_model(model_name, onnx_path, batch_size)
            if result: all_results.append(result)
    
    if all_results:
        ResultsManager.print_summary(all_results)
        ResultsManager.save_results(all_results)


In [12]:
# Cell 10: Configuration Processing (uses settings from Cell 2)
@dataclass
class ExperimentConfig:
    """Automatically configures benchmark based on settings from Cell 2."""
    
    def get_models_and_config(self):
        # Use the TARGET_MODELS directly - no complex mapping needed!
        target_models = TARGET_MODELS
        
        # Performance settings with custom overrides
        iterations = CUSTOM_ITERATIONS if CUSTOM_ITERATIONS is not None else (15 if FAST_MODE else 30)
        warmup = CUSTOM_WARMUP if CUSTOM_WARMUP is not None else (5 if FAST_MODE else 10)
        batch_sizes = CUSTOM_BATCH_SIZES if CUSTOM_BATCH_SIZES is not None else ([1, 8] if FAST_MODE else [1, 4, 8, 16])
        sample_size = CUSTOM_SAMPLE_SIZE if CUSTOM_SAMPLE_SIZE is not None else (300 if FAST_MODE else 500)
        
        # Build configuration
        config = BenchmarkConfig(
            batch_sizes=batch_sizes,
            test_csv_path=ACCURACY_DATASET_PATH if INCLUDE_ACCURACY else None,
            benchmark_iterations=iterations,
            warmup_iterations=warmup,
            accuracy_sample_size=sample_size
        )
        
        return target_models, config

# Create configuration
experiment = ExperimentConfig()
target_models, benchmark_config = experiment.get_models_and_config()

# Display what will be run
print("🎯 BENCHMARK CONFIGURATION")
print("=" * 50)
print(f"Models to test: {target_models or 'ALL AVAILABLE MODELS'}")
print(f"Fast Mode: {FAST_MODE}")
print(f"Include Accuracy: {INCLUDE_ACCURACY}")
print(f"Batch Sizes: {benchmark_config.batch_sizes}")
print(f"Iterations: {benchmark_config.benchmark_iterations}")
print(f"Warmup: {benchmark_config.warmup_iterations}")
if INCLUDE_ACCURACY:
    print(f"Accuracy Sample Size: {benchmark_config.accuracy_sample_size}")
print("=" * 50)

# Execute the benchmark
print("🚀 Starting benchmark...")
run_full_benchmark(MODELS_DIRECTORY, benchmark_config, target_models)
print("✅ Benchmark complete! Check the results above and the CSV file in the results/ folder.")

🎯 BENCHMARK CONFIGURATION
Models to test: ['tinybert-financial-classifier', 'tinybert-financial-classifier-fine-tuned', 'tinybert-financial-classifier-pruned']
Fast Mode: True
Include Accuracy: True
Batch Sizes: [1, 8]
Iterations: 15
Warmup: 5
Accuracy Sample Size: 300
🚀 Starting benchmark...
✅ Found: tinybert-financial-classifier-fine-tuned
✅ Found: tinybert-financial-classifier
✅ Found: tinybert-financial-classifier-pruned

--- ⏳ Now processing: tinybert-financial-classifier-fine-tuned-standard ---
   -> Running benchmark for batch size: 1
   -> Attempting with optimal providers (CoreML)...
   -> 12.5ms avg (2.8ms std)
   -> 12.5ms avg (2.8ms std)
   -> Running benchmark for batch size: 8
   -> Attempting with optimal providers (CoreML)...
   -> Running benchmark for batch size: 8
   -> Attempting with optimal providers (CoreML)...
   -> 111.7ms avg (14.0ms std)
   -> 111.7ms avg (14.0ms std)

--- ⏳ Now processing: tinybert-financial-classifier-standard ---
   -> Running benchmark fo

In [13]:
# 🚀 QUICK TEST: Direct PyTorch quantized model latency
# This tests the quantized model from the fine-tuning notebook WITHOUT ONNX conversion

def test_pytorch_model_speed(model, tokenizer, model_name="test-model", num_iterations=50):
    """Quick latency test for PyTorch models"""
    import time
    import torch
    import statistics
    
    print(f"\n⚡ Testing {model_name} speed directly in PyTorch...")
    
    # Prepare test input
    test_text = "Stocks surged after the company reported record earnings."
    inputs = tokenizer(test_text, return_tensors="pt", padding="max_length", 
                      truncation=True, max_length=128)
    
    model.eval()
    
    # Warmup
    with torch.no_grad():
        for _ in range(5):
            _ = model(**inputs)
    
    # Measure latency
    times = []
    with torch.no_grad():
        for _ in range(num_iterations):
            start = time.perf_counter()
            _ = model(**inputs)
            end = time.perf_counter()
            times.append((end - start) * 1000)
    
    avg_latency = statistics.mean(times)
    std_latency = statistics.stdev(times) if len(times) > 1 else 0.0
    p95_latency = sorted(times)[int(0.95 * len(times))]
    
    print(f"   📊 Average latency: {avg_latency:.2f}ms (±{std_latency:.2f}ms)")
    print(f"   📊 P95 latency: {p95_latency:.2f}ms")
    print(f"   📊 Throughput: {1000/avg_latency:.1f} samples/sec")
    
    return avg_latency

# Test if we have access to the quantized model from the fine-tuning notebook
print("🔍 Checking for quantized model from fine-tuning notebook...")

# Try to import the quantized model - this requires the fine-tuning notebook to be run first
try:
    # We'll need to run this in the fine-tuning notebook context
    print("⚠️  To test the quantized model, run this cell in the fine-tuning notebook:")
    print("   test_pytorch_model_speed(quantized_model, tokenizer, 'quantized-tinybert')")
    print()
    print("🎯 Expected result with quantization: ~2-4ms latency")
    print("🎯 Current ONNX models: ~8-10ms latency")
    print()
    print("💡 The 3x speed improvement comes from:")
    print("   ✅ INT8 quantization (vs FP32)")
    print("   ✅ Optimized PyTorch operations")
    print("   ✅ No ONNX conversion overhead")
    
except Exception as e:
    print(f"   Model not available in this notebook context")
    print("   Run the quantization in the fine-tuning notebook first!")

🔍 Checking for quantized model from fine-tuning notebook...
⚠️  To test the quantized model, run this cell in the fine-tuning notebook:
   test_pytorch_model_speed(quantized_model, tokenizer, 'quantized-tinybert')

🎯 Expected result with quantization: ~2-4ms latency
🎯 Current ONNX models: ~8-10ms latency

💡 The 3x speed improvement comes from:
   ✅ INT8 quantization (vs FP32)
   ✅ Optimized PyTorch operations
   ✅ No ONNX conversion overhead
