In [1]:
# %% [markdown]
### Task 6 – Merge LoRA, export ONNX, INT8-quantise
# Run the helper script.
# Sanity-check fp32 vs INT8 outputs differ < 1 % on a random batch.

# %% [code] ▸ 1  Run the export script
# !python ../src/export_quant.py --lora_dir models/codebert_mini_lora --out_dir  models/quantised

# %% [code] ▸ 2  Quick functional test
import numpy as np, onnxruntime as ort, torch

from transformers import AutoTokenizer, AutoModelForSequenceClassification
tok = AutoTokenizer.from_pretrained("microsoft/codebert-base")

text = "char buf[8]; memcpy(buf, input, len);"          # dummy code
ids  = tok(text, return_tensors="pt")

# fp32 HF model (merged)
hf = AutoModelForSequenceClassification.from_pretrained("../models/quantised/fp32")
hf.eval()
with torch.no_grad():
    logits_fp32 = hf(**{k:v for k,v in ids.items()})[0].softmax(-1)[0,1].item()

# INT8 ONNX
sess = ort.InferenceSession("../models/quantised/codebert_int8.onnx",
                            providers=["CPUExecutionProvider"])
# inputs = {
#     "input_ids":       ids["input_ids"].numpy(),
#     "attention_mask":  ids["attention_mask"].numpy(),
#     "token_type_ids":  torch.zeros_like(ids["input_ids"]).numpy()  # ← NEW
# }
inputs = {
    "input_ids":      ids["input_ids"].numpy(),
    "attention_mask": ids["attention_mask"].numpy(),
    "token_type_ids": torch.zeros_like(ids["input_ids"]).numpy()   # all-zeros feed
}


logits_int8 = sess.run(None, inputs)[0]
prob_int8   = torch.softmax(torch.tensor(logits_int8), -1)[0,1].item()

print(f"fp32 prob={logits_fp32:.4f}   int8 prob={prob_int8:.4f}   Δ={abs(prob_int8-logits_fp32):.4f}")


  from .autonotebook import tqdm as notebook_tqdm


fp32 prob=0.4631   int8 prob=0.4474   Δ=0.0157
