In [1]:
# quantization_impact.ipynb

# ---------------------
# 1. Setup and Imports
# ---------------------
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import time
import os
import gc

# Pin the default floating-point dtype to float32 (no AMP/FP16 for now).
torch.set_default_dtype(torch.float32)

# --------------------------
# 2. Model Loading (Change this line to use your own model)
# --------------------------
model_name = "EleutherAI/pythia-410m"  # Swap this with Mistral or any custom model

# Everything in this notebook runs on the CPU, which is where the
# quantization work below is done.
device = torch.device("cpu")

tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load the checkpoint, switch it to inference mode and place it on the CPU.
# Both .eval() and .to() hand back the module itself, so the calls chain.
model_fp32 = AutoModelForCausalLM.from_pretrained(model_name).eval().to(device)

# --------------------------
# 3. Define Inference Helper
# --------------------------
def run_inference(model, prompt, max_new_tokens=20):
    """Generate a continuation of ``prompt`` and time the generation call.

    Relies on the notebook-level ``tokenizer`` and ``device`` objects.

    Args:
        model: Model exposing ``generate`` (the FP32 or the quantized one).
        prompt: Text to tokenize and feed to the model.
        max_new_tokens: Forwarded to ``model.generate`` as the cap on newly
            generated tokens.

    Returns:
        A ``(text, seconds)`` tuple: the decoded first output sequence with
        special tokens skipped, and the elapsed time of the ``generate``
        call in seconds. Tokenization and decoding are not included in the
        timing.
    """
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    # perf_counter() is a monotonic, high-resolution clock meant for
    # measuring short durations; time.time() follows the system wall clock,
    # which can be adjusted mid-run and may have coarse resolution.
    start = time.perf_counter()
    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)
    elapsed = time.perf_counter() - start
    return tokenizer.decode(outputs[0], skip_special_tokens=True), elapsed

# --------------------------
# 4. Baseline FP32 Inference
# --------------------------
prompt = "The singularity is near because"

# Untimed single-token warm-up: the first generate() call on a model can
# include one-time setup work, which would otherwise be charged to the
# measured run and skew the latency comparison.
run_inference(model_fp32, prompt, max_new_tokens=1)
output_fp32, time_fp32 = run_inference(model_fp32, prompt)

print("📦 FP32 Inference Output:", output_fp32)
print("⏱️ FP32 Inference Time: {:.3f}s".format(time_fp32))

# --------------------------
# 5. Apply Dynamic Quantization
# --------------------------
# Quantize only the nn.Linear layers to int8. quantize_dynamic is called
# without inplace=True, so model_fp32 stays available for the size
# comparison later in the notebook.
model_dynamic = torch.quantization.quantize_dynamic(
    model_fp32, {torch.nn.Linear}, dtype=torch.qint8
)

# Same untimed warm-up for the quantized model so both timings are taken
# under the same conditions.
run_inference(model_dynamic, prompt, max_new_tokens=1)
output_dynamic, time_dynamic = run_inference(model_dynamic, prompt)

print("📦 Dynamic Quantized Output:", output_dynamic)
print("⏱️ Dynamic Quantized Time: {:.3f}s".format(time_dynamic))

# --------------------------
# 6. Compare Model Sizes
# --------------------------
def get_model_size(model, name="temp.pt"):
    """Return the on-disk size, in MB, of the model's serialized state dict.

    The state dict is written to ``name`` with ``torch.save``, measured, and
    the file is removed again before returning.

    Args:
        model: Object exposing ``state_dict()``.
        name: Path of the scratch file used for the measurement. Any
            existing file at this path is overwritten and then deleted.

    Returns:
        File size in megabytes (bytes / 1e6) as a float.

    Raises:
        Whatever ``model.state_dict()``, ``torch.save`` or
        ``os.path.getsize`` raise; the scratch file is still cleaned up.
    """
    try:
        torch.save(model.state_dict(), name)
        return os.path.getsize(name) / 1e6
    finally:
        # Always clean up, so a failed or interrupted save does not leave a
        # partial scratch file behind. The existence check keeps the cleanup
        # itself from masking the original error when nothing was written.
        if os.path.exists(name):
            os.remove(name)

# Measure both models the same way: serialize the state dict to a scratch
# file and read back its size in MB.
size_fp32 = get_model_size(model_fp32)
size_dynamic = get_model_size(model_dynamic)

print("📉 Model Size (FP32): {:.2f} MB".format(size_fp32))
print("📉 Model Size (Dynamic Q): {:.2f} MB".format(size_dynamic))

# --------------------------
# 7. Optional: Post-Training Static Quantization (Experimental)
# --------------------------
# Not implemented in this notebook.
# Static quantization requires module fusion plus a calibration dataset, so it
# is recommended only after the model has been fully prepared.
# Placeholder for future QAT or calibration steps.

# --------------------------
# 8. Summary Report
# --------------------------
# Both percentages are relative to the FP32 baseline; a negative value means
# the quantized model was slower or larger than FP32.
print("\n🧠 Summary")
# Exact string equality of the two decoded generations for the single prompt
# above -- a coarse proxy, not an accuracy metric.
print(f"Accuracy proxy (output match): {'Same' if output_fp32 == output_dynamic else 'Different'}")
print(f"Latency reduction: {((time_fp32 - time_dynamic) / time_fp32) * 100:.2f}%")
print(f"Model size reduction: {((size_fp32 - size_dynamic) / size_fp32) * 100:.2f}%")

# --------------------------
# 9. Cleanup
# --------------------------
# Drop the notebook's references to both models and run a garbage-collection
# pass. After this, model_fp32 and model_dynamic are undefined, so earlier
# sections must be re-run before they can be used again.
del model_fp32, model_dynamic
gc.collect()


KeyboardInterrupt: 