In [2]:
# ---

# # Practical Demo: Quantizing a Transformer Model
#
# In this notebook, we will walk through the process of applying Post-Training Dynamic Quantization to a Hugging Face Transformer model.
#
# **Our Goal:** To demonstrate that we can make a model significantly **smaller** and **faster** without a major drop in **accuracy**.
#
# **Our Process:**
# 1.  **Establish Baseline:** Load a standard FP32 (32-bit float) model and benchmark its Size, Latency, and Accuracy.
# 2.  **Apply Quantization:** Use PyTorch's built-in tools to convert the model to INT8 (8-bit integer).
# 3.  **Compare Results:** Benchmark the new INT8 model and compare its performance to the baseline.

# ## 1. Setup
#
# First, we'll install the necessary libraries and import them. We will use `torch` for the model and quantization, and `transformers` and `datasets` to fetch a pre-trained model and the evaluation data.

# +
# !pip install transformers datasets torch -q

In [3]:
import torch
import torch.quantization
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from datasets import load_dataset
import os
import time
import numpy as np
from tqdm.notebook import tqdm

# Ensure we're using the CPU for this demo, as dynamic quantization is a CPU optimization.
# The baseline model is moved onto this device right after it is loaded.
device = torch.device("cpu")
# -

# ## 2. Load Baseline FP32 Model
#
# We will use `distilbert-base-uncased-finetuned-sst-2-english`, a small but effective model for sentiment analysis. This is our full-precision, 32-bit floating point model.

# +
model_name = "distilbert-base-uncased-finetuned-sst-2-english"

# Fetch the tokenizer that matches the checkpoint, then the FP32 classifier itself.
# The model is placed on `device` and switched to evaluation mode in one chain.
tokenizer = AutoTokenizer.from_pretrained(model_name)
fp32_model = (
    AutoModelForSequenceClassification.from_pretrained(model_name)
    .to(device)
    .eval()
)

print("Baseline FP32 model loaded.")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Baseline FP32 model loaded.


In [4]:
# ## 3. Benchmark the Baseline (FP32) Model
#
# We'll now measure size, latency, and accuracy.

# ### 3.1. Model Size

# +
# Serialize the FP32 weights to disk, then read the file size back in mebibytes
torch.save(fp32_model.state_dict(), "fp32_model.pth")
fp32_size_mb = os.stat("fp32_model.pth").st_size / 2**20

print(f"Baseline Model Size (FP32): {fp32_size_mb:.2f} MB")
# -

# ### 3.2. Inference Latency
# We'll create a helper function to measure the average time it takes to perform inference on a sample sentence. We run it multiple times to get a stable result.

# +
def measure_latency(model, tokenizer, sentence, num_runs=100):
    """Measures the average inference latency of a model.

    Args:
        model: A ``torch.nn.Module`` that accepts the tokenizer output as
            keyword arguments. It is switched to evaluation mode.
        tokenizer: Callable that converts ``sentence`` into model inputs. It is
            called with ``return_tensors="pt"``, ``truncation=True`` and
            ``max_length=512`` (the same settings ``evaluate_model`` uses), and
            its result must support ``.to(device)`` and ``**`` unpacking.
        sentence: The text to run inference on.
        num_runs: Number of timed forward passes to average over (at least 1).

    Returns:
        The mean wall-clock time of a single forward pass, in milliseconds.

    Raises:
        ValueError: If ``num_runs`` is smaller than 1.
    """
    if num_runs < 1:
        # Averaging zero timings would silently yield NaN.
        raise ValueError(f"num_runs must be at least 1, got {num_runs}")

    model.eval()

    # Send the inputs to wherever the model lives rather than relying on a
    # notebook-level global; fall back to the CPU for parameter-free models.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")

    # Truncate so an over-long sentence cannot exceed the model's input limit.
    tokens = tokenizer(
        sentence, return_tensors="pt", truncation=True, max_length=512
    ).to(target_device)

    timings = []
    # A single no_grad context keeps context-manager overhead out of the timings.
    with torch.no_grad():
        # Warm-up run (not timed)
        _ = model(**tokens)

        # Timed runs
        for _ in range(num_runs):
            start_time = time.perf_counter()
            _ = model(**tokens)
            timings.append(time.perf_counter() - start_time)

    return np.mean(timings) * 1000 # Return average latency in milliseconds

# One short review-style sentence serves as the fixed input for every latency benchmark
sample_sentence = "This is a great course, I'm learning a lot!"
fp32_latency_ms = measure_latency(
    model=fp32_model,
    tokenizer=tokenizer,
    sentence=sample_sentence,
)

print(f"Average Latency (FP32): {fp32_latency_ms:.2f} ms")
# -

# ### 3.3. Model Accuracy
# Finally, let's check the model's accuracy on the SST-2 validation dataset to ensure it's performing well.

# +
def evaluate_model(model, tokenizer, dataset):
    """Evaluates the accuracy of a model on a given dataset.

    Args:
        model: A ``torch.nn.Module`` classifier whose output exposes ``.logits``.
            It is switched to evaluation mode.
        tokenizer: Callable that converts one sentence into model inputs. It is
            called with ``return_tensors="pt"``, ``truncation=True`` and
            ``max_length=512``, and its result must support ``.to(device)`` and
            ``**`` unpacking.
        dataset: Iterable of items providing a ``'sentence'`` and a ``'label'``
            entry. Examples are scored one at a time.

    Returns:
        The share of correctly classified examples, as a percentage (0-100).

    Raises:
        ValueError: If ``dataset`` yields no examples.
    """
    model.eval()

    # Send the inputs to wherever the model lives rather than relying on a
    # notebook-level global; fall back to the CPU for parameter-free models.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")

    correct = 0
    total = 0

    # One no_grad context for the whole pass instead of re-entering it per sample.
    with torch.no_grad():
        for item in tqdm(dataset):
            tokens = tokenizer(
                item['sentence'], return_tensors="pt", truncation=True, max_length=512
            ).to(target_device)

            outputs = model(**tokens)
            prediction = torch.argmax(outputs.logits, dim=1).item()

            if prediction == item['label']:
                correct += 1
            total += 1

    if total == 0:
        # Accuracy is undefined without examples; fail with a clear message
        # instead of a bare ZeroDivisionError.
        raise ValueError("Cannot compute accuracy on an empty dataset.")

    return (correct / total) * 100

Baseline Model Size (FP32): 255.45 MB
Average Latency (FP32): 62.21 ms


In [5]:
# Pull only the validation portion of SST-2 and score the baseline model on it
sst2_validation = load_dataset(path="sst2", split="validation")

fp32_accuracy = evaluate_model(
    model=fp32_model,
    tokenizer=tokenizer,
    dataset=sst2_validation,
)

print(f"Accuracy (FP32): {fp32_accuracy:.2f}%")


README.md: 0.00B [00:00, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/3.11M [00:00<?, ?B/s]

data/validation-00000-of-00001.parquet:   0%|          | 0.00/72.8k [00:00<?, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/148k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/67349 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/872 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1821 [00:00<?, ? examples/s]

  0%|          | 0/872 [00:00<?, ?it/s]

Accuracy (FP32): 91.06%


In [6]:
# ## 4. Apply Post-Training Dynamic Quantization
#
# Now for the core step. We use `torch.quantization.quantize_dynamic` to convert our FP32 model to a quantized INT8 model. We specify that we only want to quantize the `Linear` layers, which is a standard practice for Transformer models.

# +
# Build the INT8 model: only the nn.Linear layers are targeted for dynamic quantization
quantized_model = torch.quantization.quantize_dynamic(
    model=fp32_model,
    qconfig_spec={torch.nn.Linear},
    dtype=torch.qint8,
)

# Print both module trees so the two architectures can be compared
print("Model has been quantized.")
print("\nOriginal Model Architecture:\n", fp32_model)
print("\nQuantized Model Architecture:\n", quantized_model)
# -

# ## 5. Benchmark the Quantized (INT8) Model
#
# Let's run the exact same benchmarks and see the improvements.

# ### 5.1. Model Size

# +
# Serialize the INT8 weights to disk, then read the file size back in mebibytes
torch.save(quantized_model.state_dict(), "quantized_model.pth")
quantized_size_mb = os.stat("quantized_model.pth").st_size / 2**20

print(f"Quantized Model Size (INT8): {quantized_size_mb:.2f} MB")
print(f"Size Reduction: {fp32_size_mb / quantized_size_mb:.1f}x smaller")
# -

# ### 5.2. Inference Latency

# +
# Time the quantized model on the same sentence used for the FP32 baseline
int8_latency_ms = measure_latency(
    model=quantized_model,
    tokenizer=tokenizer,
    sentence=sample_sentence,
)

print(f"Average Latency (INT8): {int8_latency_ms:.2f} ms")
print(f"Speedup: {fp32_latency_ms / int8_latency_ms:.1f}x faster")
# -

# ### 5.3. Model Accuracy

# +
# Score the quantized model on the same validation split as the FP32 baseline
int8_accuracy = evaluate_model(
    model=quantized_model,
    tokenizer=tokenizer,
    dataset=sst2_validation,
)

print(f"Accuracy (INT8): {int8_accuracy:.2f}%")
print(f"Accuracy Drop: {fp32_accuracy - int8_accuracy:.2f}%")

For migrations of users: 
1. Eager mode quantization (torch.ao.quantization.quantize, torch.ao.quantization.quantize_dynamic), please migrate to use torchao eager mode quantize_ API instead 
2. FX graph mode quantization (torch.ao.quantization.quantize_fx.prepare_fx,torch.ao.quantization.quantize_fx.convert_fx, please migrate to use torchao pt2e quantization API instead (prepare_pt2e, convert_pt2e) 
3. pt2e quantization has been migrated to torchao (https://github.com/pytorch/ao/tree/main/torchao/quantization/pt2e) 
see https://github.com/pytorch/ao/issues/2259 for more details
  quantized_model = torch.quantization.quantize_dynamic(


Model has been quantized.

Original Model Architecture:
 DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): F

  0%|          | 0/872 [00:00<?, ?it/s]

Accuracy (INT8): 89.79%
Accuracy Drop: 1.26%


In [7]:
# ## 6. Final Results Comparison
#
# Let's put all our measurements into a single table to see the final results.

from IPython.display import display, Markdown

# Render all three benchmarks as one Markdown table.
# The accuracy change is formatted as a signed delta (INT8 minus FP32) so the
# sign is always right: a drop shows as "-1.26%", a gain as "+0.50%". A
# hard-coded "-" prefix would render a gain as "--0.50%".
summary = f"""
| Metric          | Baseline (FP32)   | Quantized (INT8)  | Improvement      |
|-----------------|-------------------|-------------------|------------------|
| **Model Size**  | {fp32_size_mb:.2f} MB      | **{quantized_size_mb:.2f} MB**     | **{fp32_size_mb/quantized_size_mb:.1f}x smaller** |
| **Latency**     | {fp32_latency_ms:.2f} ms      | **{int8_latency_ms:.2f} ms**      | **{fp32_latency_ms/int8_latency_ms:.1f}x faster** |
| **Accuracy**    | {fp32_accuracy:.2f}%       | **{int8_accuracy:.2f}%**      | **{int8_accuracy - fp32_accuracy:+.2f}%**         |
"""

display(Markdown(summary))


| Metric          | Baseline (FP32)   | Quantized (INT8)  | Improvement      |
|-----------------|-------------------|-------------------|------------------|
| **Model Size**  | 255.45 MB      | **132.29 MB**     | **1.9x smaller** |
| **Latency**     | 62.21 ms      | **35.36 ms**      | **1.8x faster** |
| **Accuracy**    | 91.06%       | **89.79%**      | **-1.26%**         |
