In [1]:
# quantize_ptq_int8_large.py

import torch
from transformers import AutoModelForSequenceClassification
import os # <-- 1. Import the 'os' library

# --- 1. Configuration ---
# The pruned FP32 checkpoint directory is written relative to the current
# working directory and then resolved to an absolute path before use.
RELATIVE_FP32_MODEL_PATH = "./models/bert_pruned_16_layers_finetuned/best_model"
PRUNED_FP32_MODEL_PATH = os.path.abspath(RELATIVE_FP32_MODEL_PATH)

# Stop right away, with the resolved path in the message, if the checkpoint
# directory is not there.
if not os.path.isdir(PRUNED_FP32_MODEL_PATH):
    raise FileNotFoundError(
        "The model directory does not exist at the specified path: "
        f"{PRUNED_FP32_MODEL_PATH}"
    )

# Destination file for the INT8-quantized model's state_dict.
QUANTIZED_INT8_SAVE_PATH = "./models/bert_pruned_16L_int8_ptq.pt"

# Echo the input and output locations, one per line.
print(
    f"Input model (FP32) from absolute path: {PRUNED_FP32_MODEL_PATH}",
    f"Output state_dict file for INT8 model: {QUANTIZED_INT8_SAVE_PATH}",
    sep="\n",
)

# --- 2. Load the FP32 model ---
# from_pretrained is given the absolute checkpoint directory.
model_fp32 = AutoModelForSequenceClassification.from_pretrained(
    PRUNED_FP32_MODEL_PATH
)
# Quantization here is meant for CPU deployment, so place the model on the CPU
# and switch it to eval mode; both calls hand back the module itself.
model_fp32 = model_fp32.to("cpu").eval()

# --- 3. Apply dynamic quantization ---
# Make sure PyTorch's quantization submodule is loaded.
import torch.quantization

# Request dynamic quantization for torch.nn.Linear modules only, with qint8 as
# the target dtype.
# NOTE: the PyTorch build this was run with emits a deprecation warning for
# this eager-mode API and points to torchao's quantize_ API as the replacement.
model_int8 = torch.quantization.quantize_dynamic(
    model=model_fp32,
    qconfig_spec={torch.nn.Linear},
    dtype=torch.qint8,
)

print("\nBERT-Large model successfully quantized to INT8.")

# --- 4. Save the quantized model weights ---
# Create the output file's parent directory when it has one. os.path.dirname()
# returns "" for a bare filename and os.makedirs("") raises FileNotFoundError,
# so the call is skipped when there is no directory component.
save_dir = os.path.dirname(QUANTIZED_INT8_SAVE_PATH)
if save_dir:
    os.makedirs(save_dir, exist_ok=True)

# Write only the quantized model's state_dict, not the full module object.
torch.save(model_int8.state_dict(), QUANTIZED_INT8_SAVE_PATH)

print(f"\n--- INT8 Model State Dict for 16L Bert-Large Saved to: {QUANTIZED_INT8_SAVE_PATH} ---")

Input model (FP32) from absolute path: /root/zscloud-tmp/demo/models/bert_pruned_16_layers_finetuned/best_model
Output state_dict file for INT8 model: ./models/bert_pruned_16L_int8_ptq.pt


For migrations of users: 
1. Eager mode quantization (torch.ao.quantization.quantize, torch.ao.quantization.quantize_dynamic), please migrate to use torchao eager mode quantize_ API instead 
2. FX graph mode quantization (torch.ao.quantization.quantize_fx.prepare_fx,torch.ao.quantization.quantize_fx.convert_fx, please migrate to use torchao pt2e quantization API instead (prepare_pt2e, convert_pt2e) 
3. pt2e quantization has been migrated to torchao (https://github.com/pytorch/ao/tree/main/torchao/quantization/pt2e) 
see https://github.com/pytorch/ao/issues/2259 for more details
  model_int8 = torch.quantization.quantize_dynamic(



BERT-Large model successfully quantized to INT8.

--- INT8 Model State Dict for 16L Bert-Large Saved to: ./models/bert_pruned_16L_int8_ptq.pt ---
