## Create a Lightweight Inference Class & Perform Benchmarking

In [11]:

import os
import re
import time

import numpy as np
import onnxruntime as ort
import pandas as pd
import torch
from sklearn.metrics import classification_report, confusion_matrix, f1_score, accuracy_score
from tqdm import tqdm
from transformers import AutoTokenizer



In [2]:
# Preprocessing function
def preprocess_text(text):
    """Normalize raw text before it is tokenized.

    The input is coerced to ``str`` and lowercased, every ``$`` is dropped
    (ticker prefixes), URLs are removed, and any character that is not a word
    character, whitespace, or one of ``. , ! ?`` is deleted. Leading and
    trailing whitespace is stripped last; whitespace inside the text is left
    untouched, so removed tokens can leave double spaces behind.

    Args:
        text: Any object; it is converted with ``str()`` first.

    Returns:
        The cleaned, lowercased string.
    """
    cleaned = str(text).lower().replace('$', '')
    # Order matters: URLs must go before the punctuation filter strips '/' and ':'.
    for pattern in (r'https?://\S+|www\.\S+', r'[^\w\s.,!?]'):
        cleaned = re.sub(pattern, '', cleaned)
    return cleaned.strip()


In [None]:
class SentimentAnalyzer:
    """Single-text sentiment classifier backed by an ONNX Runtime session.

    Attributes are set in ``__init__``:
        session: ``onnxruntime.InferenceSession`` loaded from
            ``<model_path>/model.onnx``.
        tokenizer: tokenizer loaded from ``model_path``.
        labels: class names, indexed by the position of the winning logit.
    """

    def __init__(self, model_path="./tinybert-sentiment-onnx"):
        """Load the ONNX model and its tokenizer from ``model_path``."""
        self.session = ort.InferenceSession(f"{model_path}/model.onnx")
        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
        # Hard-coded class names; position i corresponds to logit column i.
        self.labels = ["negative", "neutral", "positive"]

    def predict(self, text):
        """Classify one piece of text.

        Args:
            text: Raw input; it is cleaned with ``preprocess_text`` first.

        Returns:
            dict with keys
                ``"sentiment"``: label of the highest logit,
                ``"confidence"``: softmax probability of that label,
                ``"probabilities"``: mapping of every label to its probability.
        """
        cleaned = preprocess_text(text)

        # Always pad/truncate to exactly 128 tokens and request NumPy arrays.
        encoded = self.tokenizer(
            cleaned,
            padding="max_length",
            truncation=True,
            max_length=128,
            return_tensors="np",
            return_token_type_ids=True,
        )

        # The session is fed int64 arrays under these three input names.
        feed = {
            name: encoded[name].astype(np.int64)
            for name in ("input_ids", "attention_mask", "token_type_ids")
        }
        logits = self.session.run(None, feed)[0]

        # Softmax over the class axis; argmax is taken on the raw logits.
        probabilities = torch.nn.functional.softmax(torch.from_numpy(logits), dim=-1)
        predicted_class = np.argmax(logits, axis=-1)[0]  # type: ignore

        first_row = probabilities[0]
        return {
            "sentiment": self.labels[predicted_class],
            "confidence": float(first_row[predicted_class]),
            "probabilities": dict(zip(self.labels, map(float, first_row))),
        }


In [4]:
# Example usage: build the analyzer with its default model path and classify
# one headline. `analyzer` is reused by the benchmarking and evaluation cells.
analyzer = SentimentAnalyzer()
result = analyzer.predict("$BYND - JPMorgan reels in expectations on Beyond Meat")
print(result)

{'sentiment': 'negative', 'confidence': 0.6044331192970276, 'probabilities': {'negative': 0.6044331192970276, 'neutral': 0.045562803745269775, 'positive': 0.35000404715538025}}


## Performance Benchmarking

In [5]:
def benchmark_model(analyzer, dataset, n_runs=100):
    """Measure single-prediction latency of ``analyzer`` and print a summary.

    Five untimed warmup predictions are made on the first row, then ``n_runs``
    timed predictions cycle through the rows of ``dataset`` in order.

    Args:
        analyzer: Object exposing ``predict(text)``.
        dataset: DataFrame with a 'text' column.
        n_runs: Number of timed predictions.

    Returns:
        List of per-call latencies in milliseconds.
    """
    texts = dataset["text"]
    n_rows = len(dataset)

    # Warmup: untimed calls so one-off startup cost does not skew the numbers.
    first_text = texts.iloc[0]
    for _ in range(5):
        analyzer.predict(first_text)

    # Timed runs; the index wraps around when n_runs exceeds the row count.
    latencies = []
    for run_idx in tqdm(range(n_runs)):
        sample = texts.iloc[run_idx % n_rows]
        t0 = time.perf_counter()
        analyzer.predict(sample)
        elapsed = time.perf_counter() - t0
        latencies.append(elapsed * 1000)  # seconds -> ms

    avg_latency = np.mean(latencies)
    p95_latency = np.percentile(latencies, 95)

    print(f"Average latency: {avg_latency:.2f}ms")
    print(f"95th percentile latency: {p95_latency:.2f}ms")
    return latencies


In [6]:
# Load the test split from CSV; later cells read its 'text' and 'label' columns.
test_df = pd.read_csv('../processed-datasets/test.csv')

In [7]:
# Display (rows, columns) of the loaded test set as a quick sanity check.
test_df.shape

(5834, 2)

In [8]:
# Run the latency benchmark with the default n_runs=100 timed predictions;
# the per-call latencies (ms) are kept in `latencies`.
print("Benchmarking ONNX model...")
latencies = benchmark_model(analyzer, test_df)

Benchmarking ONNX model...


100%|██████████| 100/100 [00:00<00:00, 205.30it/s]

Average latency: 4.79ms
95th percentile latency: 5.62ms





In [9]:
# Check model size: on-disk size of the exported ONNX file, converted from
# bytes to MB (1 MB = 1024 * 1024 bytes here). `os` is imported in the
# notebook's top import cell so the notebook has a single import block.
model_size = os.path.getsize("./tinybert-sentiment-onnx/model.onnx") / (1024 * 1024)
print(f"Model size: {model_size:.2f}MB")

Model size: 54.88MB


In [15]:

def evaluate_model(analyzer, test_dataset, n_runs=100):
    """
    Comprehensive evaluation of SentimentAnalyzer

    Every row of ``test_dataset`` is passed through ``analyzer.predict`` once,
    with each call timed, and classification, confidence and latency metrics
    are derived from the results. ``test_dataset`` itself is never modified.

    Args:
        analyzer: Your SentimentAnalyzer instance (must expose ``predict(text)``
            and a ``labels`` list of class names)
        test_dataset: DataFrame with 'text' and 'label' columns. Numeric labels
            are translated to class names by their position in
            ``analyzer.labels``; other labels are compared as lowercase strings.
        n_runs: Number of iterations for latency benchmark. Only has an effect
            when it exceeds ``len(test_dataset)``: the difference is run as
            extra, latency-only predictions on randomly sampled rows.
    Returns:
        Dictionary containing all metrics:
            accuracy, f1_score (weighted), class_metrics (classification report
            as a dict), confusion_matrix, confusion_matrix_percent (row
            percentages), avg_confidence, correct_confidence,
            incorrect_confidence, avg_latency_ms, p95_latency_ms,
            throughput_rps, predictions (list of (true, predicted, confidence))
            and latencies (ms).
    """
    # Initialize containers
    y_true = []
    y_pred = []
    confidences = []
    latencies = []

    texts = test_dataset["text"]
    class_names = list(analyzer.labels)

    # Warmup
    for _ in range(5):
        analyzer.predict(texts.iloc[0])

    # Convert numeric labels to strings if needed. The mapping is applied to a
    # local Series so the caller's DataFrame keeps its original label column.
    true_labels = test_dataset["label"]
    if pd.api.types.is_numeric_dtype(true_labels):
        true_labels = true_labels.map(dict(enumerate(class_names)))

    # Main evaluation loop
    for i in tqdm(range(len(test_dataset)), desc="Evaluating"):
        text = texts.iloc[i]
        true_label = true_labels.iloc[i]

        # Time prediction
        start = time.perf_counter()
        result = analyzer.predict(text)
        latency = (time.perf_counter() - start) * 1000  # ms
        latencies.append(latency)

        # Store results (ensure lowercase comparison)
        y_true.append(str(true_label).lower())
        y_pred.append(result["sentiment"].lower())
        confidences.append(result["confidence"])

    # Calculate metrics
    accuracy = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred, average="weighted")
    cm = confusion_matrix(y_true, y_pred, labels=class_names)
    # Row-normalize to percentages. A class with no true samples has a zero
    # row total; its row is reported as 0.0 instead of dividing by zero.
    row_totals = cm.sum(axis=1)[:, np.newaxis]
    cm_percent = np.divide(
        cm.astype('float'),
        row_totals,
        out=np.zeros(cm.shape, dtype=float),
        where=row_totals > 0,
    ) * 100

    # Confidence analysis
    correct_conf = [c for c, t, p in zip(confidences, y_true, y_pred) if t == p]
    incorrect_conf = [c for c, t, p in zip(confidences, y_true, y_pred) if t != p]

    # Latency stats (additional runs for stability)
    if n_runs > len(test_dataset):
        extra_latencies = []
        for _ in tqdm(range(n_runs - len(test_dataset)), desc="Benchmarking"):
            # Unseeded random row: only the timing is used, not the prediction.
            text = texts.sample(1).iloc[0]
            start = time.perf_counter()
            analyzer.predict(text)
            extra_latencies.append((time.perf_counter() - start) * 1000)
        latencies.extend(extra_latencies)

    avg_latency = np.mean(latencies)
    p95_latency = np.percentile(latencies, 95)

    return {
        # Core metrics
        "accuracy": accuracy,
        "f1_score": f1,
        # `labels` pins the report rows to the analyzer's classes so that
        # `target_names` cannot be paired with a different label ordering.
        "class_metrics": classification_report(
            y_true, y_pred,
            labels=class_names,
            target_names=class_names,
            output_dict=True
        ),

        # Confusion matrices
        "confusion_matrix": cm,
        "confusion_matrix_percent": cm_percent,

        # Confidence analysis
        "avg_confidence": np.mean(confidences),
        "correct_confidence": np.mean(correct_conf) if correct_conf else 0,
        "incorrect_confidence": np.mean(incorrect_conf) if incorrect_conf else 0,

        # Performance
        "avg_latency_ms": avg_latency,
        "p95_latency_ms": p95_latency,
        "throughput_rps": 1000 / avg_latency,

        # Raw data
        "predictions": list(zip(y_true, y_pred, confidences)),
        "latencies": latencies
    }

In [16]:
def print_metrics(metrics):
    """Pretty print evaluation results

    Args:
        metrics: Dictionary as returned by ``evaluate_model``. The keys read
            here are 'accuracy', 'f1_score', 'avg_confidence',
            'correct_confidence', 'incorrect_confidence', 'avg_latency_ms',
            'p95_latency_ms', 'throughput_rps', 'confusion_matrix_percent'
            (3x3, rows/columns in negative, neutral, positive order) and
            'predictions' (sequence of (true, predicted, confidence)).

    Returns:
        None; everything is written to stdout.
    """
    # Class order used for both the confusion matrix rows and the report.
    class_names = ["negative", "neutral", "positive"]

    print(f"\n{' Model Evaluation Results ':=^60}")
    print(f"Accuracy: {metrics['accuracy']*100:.2f}%")
    print(f"F1 Score: {metrics['f1_score']:.4f}")
    print(f"Avg Confidence: {metrics['avg_confidence']*100:.1f}%")
    print(f"  - Correct predictions: {metrics['correct_confidence']*100:.1f}%")
    print(f"  - Incorrect predictions: {metrics['incorrect_confidence']*100:.1f}%")

    print("\nLatency:")
    print(f"Average: {metrics['avg_latency_ms']:.2f}ms")
    print(f"95th percentile: {metrics['p95_latency_ms']:.2f}ms")
    print(f"Throughput: {metrics['throughput_rps']:.1f} requests/second")

    print("\nConfusion Matrix (%):")
    cm = metrics['confusion_matrix_percent']
    print(f"{'':<10}{'Negative':<10}{'Neutral':<10}{'Positive':<10}")
    for i, label in enumerate(["Negative", "Neutral", "Positive"]):
        print(f"{label:<10}{cm[i][0]:<10.1f}{cm[i][1]:<10.1f}{cm[i][2]:<10.1f}")

    print("\nClassification Report:")
    # Passing `labels` fixes which class each target name refers to; without
    # it the names are matched against the sorted labels present in the data.
    print(classification_report(
        [p[0] for p in metrics['predictions']],
        [p[1] for p in metrics['predictions']],
        labels=class_names,
        target_names=class_names
    ))

In [17]:
# Evaluate on the full test set and print the summary. n_runs=100 is smaller
# than the number of test rows, so no extra latency-only runs are performed.
metrics = evaluate_model(analyzer, test_df, n_runs=100)
print_metrics(metrics)

Evaluating: 100%|██████████| 5834/5834 [00:25<00:00, 232.96it/s]



Accuracy: 89.58%
F1 Score: 0.8962
Avg Confidence: 92.5%
  - Correct predictions: 94.2%
  - Incorrect predictions: 77.9%

Latency:
Average: 4.21ms
95th percentile: 5.64ms
Throughput: 237.5 requests/second

Confusion Matrix (%):
          Negative  Neutral   Positive  
Negative  83.8      12.6      3.7       
Neutral   2.8       91.4      5.8       
Positive  2.7       9.3       88.0      

Classification Report:
              precision    recall  f1-score   support

    negative       0.82      0.84      0.83       764
     neutral       0.93      0.91      0.92      3589
    positive       0.85      0.88      0.86      1481

    accuracy                           0.90      5834
   macro avg       0.87      0.88      0.87      5834
weighted avg       0.90      0.90      0.90      5834

