In [4]:
!ls

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


README.md
[34mSmolLM2-360M-Instruct-financial-sentiment[m[m
[34mbenchmark_results[m[m
[34mdata[m[m
[34mfigures[m[m
[34mmodels[m[m
[34mnotebooks[m[m
requirements.txt
[34mresults[m[m
[34mresults2[m[m
[34mresults23[m[m
[34mresults_moreee[m[m
[34msrc[m[m
[34mvenv-py311[m[m


In [2]:
%cd ..

/Users/matthew/Documents/deepmind_internship


In [None]:
# ONNX Conversion Code - Run This First!

# Cell 1: Imports
import gc
from pathlib import Path
import pandas as pd
from sklearn.model_selection import train_test_split
from typing import List

# Data & ML
import numpy as np
import torch
import onnx
import onnxruntime as ort
from onnxruntime.quantization import quantize_static, CalibrationDataReader, QuantType

# Hugging Face
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Suppress ONNX Runtime logging
import logging
logging.getLogger("onnxruntime").setLevel(logging.ERROR)

# Cell 2: Configuration & Model Discovery
# All paths below are relative to the current working directory.

# --- Data & split settings ---
# CSV that prepare_calibration_data() reads (latin-1, no header row).
DATA_FILE_PATH = Path("data/FinancialPhraseBank/all-data.csv")
# Fraction of the rows held out as the test split (25%).
TEST_SIZE = 0.25
# Seed shared by the train/test split and the calibration sampling.
RANDOM_SEED = 42

# --- Model & ONNX settings ---
# Directory whose sub-directories are scanned for saved models.
BASE_DIR = Path("models")
# Opset version handed to torch.onnx.export.
ONNX_OPSET_VERSION = 17

def is_valid_model_dir(d: Path) -> bool:
    """Return True when ``d`` looks like a saved Hugging Face model directory.

    A directory qualifies when it contains ``config.json`` together with at
    least one weights file: ``pytorch_model.bin`` or ``model.safetensors``.
    """
    # Without a config there is nothing to load, whatever weights are present.
    if not (d / "config.json").exists():
        return False

    weight_filenames = ("pytorch_model.bin", "model.safetensors")
    return any((d / filename).exists() for filename in weight_filenames)

def prepare_calibration_data(data_path, test_size, random_seed, num_samples=100):
    """Load the dataset, split off the test set and sample calibration rows.

    Args:
        data_path: CSV file to read. It is parsed without a header row, as two
            columns named ``sentiment`` and ``text``, using latin-1 encoding.
        test_size: Forwarded to ``train_test_split`` as ``test_size``.
        random_seed: Seed used for both the split and the row sampling.
        num_samples: Maximum number of calibration rows to draw. When the test
            split holds fewer rows, every row of the split is used instead of
            raising.

    Returns:
        A DataFrame of sampled rows with ``sentiment`` and ``text`` columns.
    """
    print(f"Loading data from {data_path}...")
    df = pd.read_csv(
        data_path,
        header=None,
        names=['sentiment', 'text'],
        encoding='latin-1')

    # Split data to get the test set (stratified on the sentiment label)
    _, test_df = train_test_split(
        df, test_size=test_size, random_state=random_seed, stratify=df['sentiment'])

    # DataFrame.sample raises when asked for more rows than exist (without
    # replacement), so never request more than the split actually holds.
    sample_count = min(num_samples, len(test_df))
    if sample_count < num_samples:
        print(
            f"⚠️  Requested {num_samples} calibration samples but the test split "
            f"only has {len(test_df)}; using {sample_count}."
        )

    # NOTE(review): calibration rows are drawn from the test split. If that
    # same split is later used to score the quantised models, calibration and
    # evaluation data overlap — confirm this is intended.
    calibration_df = test_df.sample(n=sample_count, random_state=random_seed)
    print(f"✅ Created a calibration dataset with {len(calibration_df)} samples.")
    return calibration_df

# Find all valid model directories. Path.iterdir() yields entries in arbitrary
# order, so sort them to keep the processing order stable across runs.
model_dirs = sorted(
    d for d in BASE_DIR.iterdir() if d.is_dir() and is_valid_model_dir(d)
)
print(f"✅ Found {len(model_dirs)} valid models.")

# Build the calibration set once; it is shared by every model in the loop below.
calibration_df = prepare_calibration_data(DATA_FILE_PATH, TEST_SIZE, RANDOM_SEED)


# NEW - Cell 3: Automated Node Finder
def find_final_nodes_to_exclude(onnx_model_path: Path) -> List[str]:
    """Find the MatMul/Add nodes that directly produce the graph outputs.

    The ONNX model is loaded and, for every declared graph output, the node
    producing it is looked up. Nodes whose op type is ``MatMul`` or ``Add``
    are reported by name so they can be excluded from quantisation.

    Args:
        onnx_model_path: Path of the ONNX model file to inspect.

    Returns:
        Unique node names in graph-output order. The list is empty when no
        matching node is found, when a matching node has no name, or when the
        model cannot be analysed (the error is printed, not raised).
    """
    nodes_to_exclude: List[str] = []
    try:
        model = onnx.load(str(onnx_model_path))

        # Map every produced tensor name to the node that produces it
        producer_by_output = {
            output_name: node
            for node in model.graph.node
            for output_name in node.output
        }

        for graph_output in model.graph.output:
            final_node = producer_by_output.get(graph_output.name)
            if final_node is None or final_node.op_type not in ('MatMul', 'Add'):
                continue

            # Exclusion works by node name; an empty name cannot single out
            # this node, so it must not be added to the list.
            if not final_node.name:
                print(f"   -> ⚠️  Final {final_node.op_type} node has no name and cannot be excluded by name.")
                continue

            # One node may feed several graph outputs; list it only once.
            if final_node.name in nodes_to_exclude:
                continue

            nodes_to_exclude.append(final_node.name)
            print(f"   -> 🎯 Automatically identified final node to exclude: '{final_node.name}' ({final_node.op_type})")

        if not nodes_to_exclude:
            print("   -> ⚠️  Could not automatically identify a final MatMul/Add node to exclude.")

    except Exception as e:
        # Deliberately best-effort: quantisation can still run without exclusions.
        print(f"   -> ❌ Error analyzing ONNX graph: {e}. Proceeding without exclusions.")
        # Match the message above: discard anything collected before the error.
        return []

    return nodes_to_exclude


# Cell 4: ONNX Helper Classes
class ONNXExportWrapper(torch.nn.Module):
    """Adapter that makes the wrapped model return a single value.

    The wrapped model is invoked with ``return_dict=False`` and only the first
    element of what it returns is passed on, so the ONNX exporter sees one
    plain output.
    """

    def __init__(self, model):
        super().__init__()
        # Kept under the attribute name ``model``.
        self.model = model

    def forward(self, input_ids, attention_mask):
        """Call the wrapped model and return the first element of its output."""
        call_kwargs = {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "return_dict": False,
        }
        return self.model(**call_kwargs)[0]

class TextCalibrationDataReader(CalibrationDataReader):
    """Calibration reader that feeds tokenised text one row at a time.

    All texts are tokenised once in the constructor. Only the tokenizer outputs
    whose names match an input of the ONNX model are kept, so the feed adapts
    to whatever inputs the exported model declares.
    """

    def __init__(self, data_df: pd.DataFrame, tokenizer, onnx_model_path: Path,
                 max_length: int = 128):
        """Tokenise ``data_df["text"]`` and prepare the per-row feed.

        Args:
            data_df: DataFrame with a ``text`` column holding the sentences.
            tokenizer: Callable tokenizer; called with the list of texts and
                ``return_tensors="np"``.
            onnx_model_path: ONNX model whose input names select which
                tokenizer outputs are fed.
            max_length: Length every sequence is padded/truncated to.

        Raises:
            ValueError: If the model declares an input that the tokenizer did
                not produce (calibration could not feed the model).
        """
        self.tokenizer = tokenizer
        self.data_list = data_df["text"].tolist()
        self.index = 0

        # Open the model only long enough to read its input names
        session = ort.InferenceSession(str(onnx_model_path), providers=["CPUExecutionProvider"])
        model_inputs = {model_input.name for model_input in session.get_inputs()}
        del session

        # Tokenize all data and filter to only include the model's inputs
        tokenized_data = self.tokenizer(
            self.data_list, padding="max_length", truncation=True,
            max_length=max_length, return_tensors="np"
        )
        self.feed = {
            key: tokenized_data[key] for key in tokenized_data if key in model_inputs
        }
        self.input_names = list(self.feed.keys())

        # Fail here with a clear message rather than part-way through calibration.
        missing_inputs = sorted(model_inputs - set(self.input_names))
        if missing_inputs:
            raise ValueError(
                f"ONNX model expects inputs {missing_inputs} that the tokenizer did not produce."
            )

    def get_next(self):
        """Return the next single-row feed dict, or None once all rows are used."""
        if self.index >= len(self.data_list):
            return None

        # Slice (not index) so each array keeps its leading batch dimension of 1
        row = slice(self.index, self.index + 1)
        item = {name: self.feed[name][row] for name in self.input_names}
        self.index += 1
        return item

    def rewind(self):
        """Restart iteration from the first row."""
        self.index = 0

# Cell 5: Main Processing & Export Loop
def export_model_to_onnx(model, tokenizer, onnx_path: Path, opset_version: int):
    """Export a PyTorch model to an ONNX file.

    The model is wrapped in ``ONNXExportWrapper``, switched to eval mode and
    traced with one tokenised sample sentence. The exported graph takes
    ``input_ids`` and ``attention_mask`` (dynamic batch and sequence axes) and
    produces ``output`` (dynamic batch axis).

    Args:
        model: Model to export; called through ``ONNXExportWrapper``.
        tokenizer: Tokenizer used to build the dummy input.
        onnx_path: Destination file for the exported model.
        opset_version: ONNX opset passed to ``torch.onnx.export``.

    Raises:
        Whatever ``torch.onnx.export`` raises. Any file left at ``onnx_path``
        is removed first, so a failed export is never mistaken for a finished
        one by callers that check whether the file exists.
    """
    print("   - Wrapping model for ONNX export...")
    wrapped_model = ONNXExportWrapper(model)
    wrapped_model.eval()
    dummy_input = tokenizer("This is a sample sentence.", return_tensors="pt")
    print(f"   - 🚀 Exporting to ONNX (Opset {opset_version})...")
    try:
        torch.onnx.export(
            model=wrapped_model,
            args=(dummy_input["input_ids"], dummy_input["attention_mask"]),
            f=str(onnx_path), input_names=["input_ids", "attention_mask"], output_names=["output"],
            dynamic_axes={
                "input_ids": {0: "batch_size", 1: "sequence_length"},
                "attention_mask": {0: "batch_size", 1: "sequence_length"},
                "output": {0: "batch_size"},
            },
            opset_version=opset_version, do_constant_folding=True,
        )
    except BaseException:
        # Also covers interrupts: drop the incomplete file, then re-raise unchanged.
        onnx_path.unlink(missing_ok=True)
        raise
    print(f"   - ✅ Model successfully exported to {onnx_path.name}")

# For every discovered model: export it to ONNX (unless already exported) and
# then statically quantise it (unless already quantised). Failures are reported
# per model and do not stop the remaining models from being processed.
for model_dir in model_dirs:
    print("-" * 70)
    print(f"⏳ Processing model: {model_dir.name}")

    onnx_dir = model_dir / "onnx"
    onnx_dir.mkdir(exist_ok=True)
    onnx_model_path = onnx_dir / "model.onnx"
    quantised_model_path = onnx_dir / "model-quantised.onnx"

    # --- Step 1: Export to ONNX if needed ---
    if not onnx_model_path.exists():
        print("   - 📦 ONNX model not found. Starting export...")
        try:
            tokenizer = AutoTokenizer.from_pretrained(model_dir)
            model = AutoModelForSequenceClassification.from_pretrained(model_dir)
            export_model_to_onnx(model, tokenizer, onnx_model_path, ONNX_OPSET_VERSION)
            # Release the PyTorch model before quantisation starts
            del model, tokenizer
            gc.collect()
        except Exception as e:
            print(f"   - ❌ Export failed for {model_dir.name}: {e}")
            continue
    else:
        print("   - ✅ Standard ONNX model already exists.")

    # --- Step 2: Perform Static Quantisation if needed ---
    if onnx_model_path.exists() and not quantised_model_path.exists():
        print(f"   - ⚖️ Performing static quantisation for {onnx_model_path.name}...")
        try:
            # Automatically find the final nodes to keep out of quantisation
            nodes_to_exclude = find_final_nodes_to_exclude(onnx_model_path)

            tokenizer = AutoTokenizer.from_pretrained(model_dir)
            calibration_data_reader = TextCalibrationDataReader(calibration_df, tokenizer, onnx_model_path)

            quantize_static(
                model_input=onnx_model_path,
                model_output=quantised_model_path,
                calibration_data_reader=calibration_data_reader,
                nodes_to_exclude=nodes_to_exclude,
                op_types_to_quantize=['MatMul', 'Add'],
                extra_options={'ActivationSymmetric': True},
                weight_type=QuantType.QInt8,
            )

            print(f"   - ✅ Statically quantised model saved to {quantised_model_path.name}")
        except Exception as e:
            print(f"   - ❌ Static quantisation failed for {model_dir.name}: {e}")
            # Remove any incomplete output so the next run retries instead of
            # reporting that the quantised model "already exists".
            quantised_model_path.unlink(missing_ok=True)
    elif quantised_model_path.exists():
        print("   - ✅ Statically quantised model already exists.")

print("-" * 70)
print("🎉 All models have been processed.")

✅ Found 5 valid models.
Loading data from data/FinancialPhraseBank/all-data.csv...
✅ Created a calibration dataset with 100 samples.
----------------------------------------------------------------------
⏳ Processing model: all-MiniLM-L6-v2-financial-sentiment
   - ✅ Standard ONNX model already exists.
   - ⚖️ Performing static quantisation for model.onnx...


  dr = numpy.array(rmax - rmin, dtype=numpy.float64)


   - ✅ Statically quantised model saved to model-quantised.onnx
----------------------------------------------------------------------
⏳ Processing model: distilbert-financial-sentiment
   - ✅ Standard ONNX model already exists.
   - ⚖️ Performing static quantisation for model.onnx...




   - ✅ Statically quantised model saved to model-quantised.onnx
----------------------------------------------------------------------
⏳ Processing model: finbert-tone-financial-sentiment
   - ✅ Standard ONNX model already exists.
   - ⚖️ Performing static quantisation for model.onnx...




   - ✅ Statically quantised model saved to model-quantised.onnx
----------------------------------------------------------------------
⏳ Processing model: tinybert-financial-classifier
   - ✅ Standard ONNX model already exists.
   - ⚖️ Performing static quantisation for model.onnx...




   - ✅ Statically quantised model saved to model-quantised.onnx
----------------------------------------------------------------------
⏳ Processing model: mobilebert-uncased-financial-sentiment
   - ✅ Standard ONNX model already exists.
   - ⚖️ Performing static quantisation for model.onnx...




   - ✅ Statically quantised model saved to model-quantised.onnx
----------------------------------------------------------------------
🎉 All models have been processed.
