In [None]:
! pip install transformers
! pip install torch
! pip install evaluate

In [None]:
# Prerequisite: authenticate with Hugging Face from a terminal before running the cells below:
#     huggingface-cli login

In [None]:
import json
import random
import time
from tqdm import tqdm
import evaluate
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

In [None]:
# === Load model and tokenizer ===
model_name = "meta-llama/Llama-3.1-8B-Instruct"

# Set device (GPU if available, otherwise CPU). This is decided first so the
# weight dtype below can be chosen to match it.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)

# Half precision is only used on GPU; on the CPU fallback the weights are kept
# in float32, since float16 kernels on CPU are slow or missing for some ops.
model_dtype = torch.float16 if device.type == "cuda" else torch.float32
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=model_dtype,
)

# Assigning the result keeps the notebook from printing the full module repr.
model = model.to(device).eval()

In [None]:
import json
import random
import time
from tqdm import tqdm
import torch
import evaluate

# === Load QA data ===
SAMPLE_SIZE = 10   # number of QA pairs used by the benchmarks below
SAMPLE_SEED = 42   # fixed so both benchmarks and every re-run score the same subset

with open("qa_from_commits_formatted.json", "r", encoding="utf-8") as f:
    all_data = json.load(f)

# A dedicated Random instance keeps the draw reproducible without touching the
# global RNG; min() avoids the ValueError random.sample raises when the file
# holds fewer than SAMPLE_SIZE entries.
qa_data = random.Random(SAMPLE_SEED).sample(all_data, min(SAMPLE_SIZE, len(all_data)))


In [None]:


# === Initialize metric ===
squad_metric = evaluate.load("squad")

# === Run inference ===
# Per-sample wall-clock generation time (seconds) and the running count of
# newly generated tokens across all samples.
latencies = []
total_tokens = 0

print("\nRunning inference on QA pairs...\n")
for idx, item in enumerate(tqdm(qa_data)):
    context = item["context"]
    question = item["question"]
    expected = item["answer"]
    qid = str(idx)

    # LLaMA 3 prompt format with extractive QA instruction
    prompt = (
        "<|begin_of_text|>"
        "<|start_header_id|>system<|end_header_id|>\n"
        "You are a concise assistant. Answer only using direct quotes from the context. Do not explain.\n"
        "<|eot_id|>"
        "<|start_header_id|>user<|end_header_id|>\n"
        f"Context:\n{context}\n\nQuestion: {question}\n"
        "<|eot_id|>"
        "<|start_header_id|>assistant<|end_header_id|>"
    )

    # Tokenize and move to device. The prompt already begins with
    # <|begin_of_text|>, so add_special_tokens=False stops the tokenizer from
    # prepending a second BOS token.
    inputs = tokenizer(prompt, return_tensors="pt", add_special_tokens=False).to(device)

    # CUDA kernels run asynchronously; synchronizing on both sides of the timed
    # region makes the measured interval cover the actual GPU work.
    on_gpu = inputs["input_ids"].is_cuda
    if on_gpu:
        torch.cuda.synchronize()

    # Generate (greedy decoding, at most 50 new tokens)
    start_time = time.perf_counter()
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=50,
            do_sample=False,
            pad_token_id=tokenizer.eos_token_id
        )
    if on_gpu:
        torch.cuda.synchronize()
    end_time = time.perf_counter()

    # Measure latency
    latency = end_time - start_time
    latencies.append(latency)

    # Decode only newly generated tokens (generate() returns prompt + continuation)
    input_len = inputs["input_ids"].shape[-1]
    generated_tokens = outputs[0][input_len:]
    generated_answer = tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()
    total_tokens += generated_tokens.shape[-1]

    # Print each QA pair
    print(f"\n--- QA Pair {idx} ---")
    print(f"Question: {question}")
    print(f"Generated Answer: {generated_answer}")
    print(f"Reference Answer: {expected}")

    # Format for squad metric; answer_start is a placeholder value here
    squad_metric.add(
        prediction={"id": qid, "prediction_text": generated_answer},
        reference={"id": qid, "answers": {"text": [expected], "answer_start": [0]}}
    )

# === Compute metrics ===
results = squad_metric.compute()
em_score = results["exact_match"]
f1_score = results["f1"]
avg_latency = sum(latencies) / len(latencies)
# Nearest-rank style estimate: the element at index floor(0.95 * n) of the sorted latencies
p95_latency = sorted(latencies)[int(0.95 * len(latencies))]
throughput = len(qa_data) / sum(latencies)
token_throughput = total_tokens / sum(latencies)

# === Print results ===
print(f"\n--- Benchmark Results (n={len(qa_data)}) ---")
print(f"Accuracy (EM):           {em_score:.2f}")
print(f"F1 Score:                {f1_score:.2f}")
print(f"Avg Latency:             {avg_latency:.3f} sec")
print(f"P95 Latency:             {p95_latency:.3f} sec")
print(f"Throughput:              {throughput:.2f} samples/sec")
print(f"Token Throughput:        {token_throughput:.2f} tokens/sec")


In [None]:
#### ONNX

In [None]:
! python -m pip install onnx onnxruntime-gpu

In [None]:
class Wrapper(torch.nn.Module):
    """Adapter exposing a model as ``(input_ids, attention_mask) -> logits``.

    The wrapped model is called with keyword arguments and only the ``logits``
    attribute of whatever it returns is passed on to the caller.
    """

    def __init__(self, model):
        """Store ``model`` as the module to delegate to."""
        super().__init__()
        self.model = model

    def forward(self, input_ids, attention_mask):
        """Call the wrapped model and return the ``logits`` of its output."""
        model_output = self.model(
            attention_mask=attention_mask,
            input_ids=input_ids,
        )
        return model_output.logits

In [None]:
# Save as ONNX

# Disable caching to avoid DynamicCache errors.
# NOTE: this mutates the shared model config, so any later model.generate()
# call on this model also runs without the KV cache.
model.config.use_cache = False

# Define dummy input. It is only used to trace the graph, but it has to live on
# the same device as the model weights, otherwise tracing fails with a device
# mismatch when the model was moved to the GPU.
inputs = tokenizer("Hello", return_tensors="pt").to(model.device)
input_ids = inputs["input_ids"]
attention_mask = inputs["attention_mask"]

# Export. `Wrapper` (defined in the previous cell) narrows the model output to
# the logits tensor so the exported graph has exactly one output.
torch.onnx.export(
    Wrapper(model),
    (input_ids, attention_mask),
    "Llama-Instruct.onnx",
    input_names=["input_ids", "attention_mask"],
    output_names=["logits"],
    # Batch and sequence length are left symbolic so the exported graph accepts
    # prompts of any size.
    dynamic_axes={
        "input_ids": {0: "batch", 1: "seq"},
        "attention_mask": {0: "batch", 1: "seq"},
        "logits": {0: "batch", 1: "seq"}
    },
    opset_version=14
)

In [None]:
def onnx_generate(prompt, max_new_tokens=50):
    """Greedily extend ``prompt`` using the ONNX ``session`` and return the decoded text.

    Relies on the notebook-level ``tokenizer``, ``session`` and ``np`` names.

    Args:
        prompt: Text to tokenize and continue.
        max_new_tokens: Upper bound on the number of tokens appended.

    Returns:
        The decoded string for the whole sequence (prompt tokens followed by
        the appended tokens), decoded with ``skip_special_tokens=True``.
    """
    tokenized = tokenizer(prompt, return_tensors="np")
    token_ids = tokenized["input_ids"].astype(np.int64)
    mask = tokenized["attention_mask"].astype(np.int64)

    for _step in range(max_new_tokens):
        # The whole sequence built so far is fed back in on every step.
        feeds = {"input_ids": token_ids, "attention_mask": mask}
        logits = session.run(["logits"], feeds)[0]

        # Greedy choice: arg-max over the vocabulary at the last position.
        best = logits[:, -1, :].argmax(axis=-1).reshape(1, 1)

        # Grow both the ids and the mask by one column.
        token_ids = np.hstack((token_ids, best))
        mask = np.hstack((mask, np.ones_like(best)))

        # The EOS token is appended before stopping.
        if best.item() == tokenizer.eos_token_id:
            break

    return tokenizer.decode(token_ids[0], skip_special_tokens=True)

In [None]:

import onnxruntime
from transformers import AutoTokenizer
import numpy as np

onnx_path = "Llama-Instruct.onnx"

tokenizer = AutoTokenizer.from_pretrained(model_name)

# Providers are listed in order of preference: CUDA first, CPU as an explicit
# fallback so the session can still be created when the CUDA provider cannot load.
session = onnxruntime.InferenceSession(
    onnx_path,
    providers=["CUDAExecutionProvider", "CPUExecutionProvider"],
)
# Show what was actually selected, so a CPU fallback does not go unnoticed.
print(f"Active ONNX Runtime providers: {session.get_providers()}")

# Smoke test: a short greedy continuation through the ONNX graph.
output_text = onnx_generate("Once upon a time", 20)
print(output_text)

In [None]:
# Sanity check: display the device string reported by the installed onnxruntime package.
onnxruntime.get_device()

In [None]:
# Sanity check: display the execution providers this onnxruntime install offers;
# the sessions in this notebook request "CUDAExecutionProvider".
onnxruntime.get_available_providers()

In [None]:
## USE CUDAExecutionProvider

# === Initialize metric ===
squad_metric = evaluate.load("squad")

# === Initialize ONNX session with CUDAExecutionProvider ===
import onnxruntime as ort

# Must be the file written by the torch.onnx.export cell above.
ONNX_MODEL_PATH = "Llama-Instruct.onnx"
# Same generation budget as the PyTorch benchmark so the two runs are comparable.
MAX_NEW_TOKENS = 50

session = ort.InferenceSession(ONNX_MODEL_PATH, providers=["CUDAExecutionProvider"])

# Decoding stops on the tokenizer's EOS token or on the <|eot_id|> turn
# terminator used in the prompt below.
stop_token_ids = {tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids("<|eot_id|>")}

# === Run inference ===
# Per-sample wall-clock generation time (seconds) and the running count of
# newly generated tokens across all samples.
latencies = []
total_tokens = 0

print("\nRunning inference on QA pairs using ONNX model...\n")
for idx, item in enumerate(tqdm(qa_data)):
    context = item["context"]
    question = item["question"]
    expected = item["answer"]
    qid = str(idx)

    # LLaMA 3 extractive QA prompt
    prompt = (
        "<|begin_of_text|>"
        "<|start_header_id|>system<|end_header_id|>\n"
        "You are a concise assistant. Answer only using direct quotes from the context. Do not explain.\n"
        "<|eot_id|>"
        "<|start_header_id|>user<|end_header_id|>\n"
        f"Context:\n{context}\n\nQuestion: {question}\n"
        "<|eot_id|>"
        "<|start_header_id|>assistant<|end_header_id|>"
    )

    # Tokenize input to track input length. The prompt already begins with
    # <|begin_of_text|>, so add_special_tokens=False stops the tokenizer from
    # prepending a second BOS token.
    inputs = tokenizer(prompt, return_tensors="np", add_special_tokens=False)
    input_ids = inputs["input_ids"].astype(np.int64)
    attention_mask = inputs["attention_mask"].astype(np.int64)
    input_length = input_ids.shape[-1]

    # Greedy decoding through the ONNX graph. The exported graph has no KV
    # cache, so the whole sequence is fed back in on every step.
    start_time = time.perf_counter()
    for _ in range(MAX_NEW_TOKENS):
        logits = session.run(
            ["logits"],
            {"input_ids": input_ids, "attention_mask": attention_mask},
        )[0]
        next_token_id = int(logits[0, -1].argmax())
        input_ids = np.concatenate(
            [input_ids, np.array([[next_token_id]], dtype=np.int64)], axis=1
        )
        attention_mask = np.concatenate(
            [attention_mask, np.ones((1, 1), dtype=np.int64)], axis=1
        )
        if next_token_id in stop_token_ids:
            break
    end_time = time.perf_counter()

    # Extract only the generated tokens (everything after the prompt)
    output_ids = input_ids[0]
    generated_ids = output_ids[input_length:]
    generated_answer = tokenizer.decode(generated_ids, skip_special_tokens=True).strip()
    latency = end_time - start_time
    latencies.append(latency)
    total_tokens += len(generated_ids)

    # Print QA pair
    print(f"\n--- QA Pair {idx} ---")
    print(f"Question: {question}")
    print(f"Generated Answer: {generated_answer}")
    print(f"Reference Answer: {expected}")

    # Evaluate; answer_start is a placeholder value here
    squad_metric.add(
        prediction={"id": qid, "prediction_text": generated_answer},
        reference={"id": qid, "answers": {"text": [expected], "answer_start": [0]}}
    )

# === Compute metrics ===
results = squad_metric.compute()
em_score = results["exact_match"]
f1_score = results["f1"]
avg_latency = sum(latencies) / len(latencies)
# Nearest-rank style estimate: the element at index floor(0.95 * n) of the sorted latencies
p95_latency = sorted(latencies)[int(0.95 * len(latencies))]
throughput = len(qa_data) / sum(latencies)
token_throughput = total_tokens / sum(latencies)

# === Print results ===
print(f"\n--- ONNX Benchmark Results (n={len(qa_data)}) ---")
print(f"Accuracy (EM):           {em_score:.2f}")
print(f"F1 Score:                {f1_score:.2f}")
print(f"Avg Latency:             {avg_latency:.3f} sec")
print(f"P95 Latency:             {p95_latency:.3f} sec")
print(f"Throughput:              {throughput:.2f} samples/sec")
print(f"Token Throughput:        {token_throughput:.2f} tokens/sec")