In [1]:

import torch
from transformers import AutoModelForSequenceClassification
import os

# --- 1. Configuration ---
# Input: checkpoint directory of the pruned and fine-tuned 8-layer FP32 BERT-Base model.
PRUNED_FP32_MODEL_PATH = "./models/bert_pruned_8_layers_finetuned/best_model"

# Output: file that receives the state_dict of the INT8-quantized model.
QUANTIZED_INT8_SAVE_PATH = "./models/bert_pruned_8L_int8_ptq.pt"

print(f"Input model (FP32): {PRUNED_FP32_MODEL_PATH}")
print(f"Output state_dict file for INT8 model: {QUANTIZED_INT8_SAVE_PATH}")

# --- 2. Load the FP32 model ---
# Keep the model on the CPU, because the quantized model is intended for CPU use.
model_fp32 = AutoModelForSequenceClassification.from_pretrained(PRUNED_FP32_MODEL_PATH)
model_fp32.to("cpu")
model_fp32.eval()  # switch to evaluation mode before quantizing

# --- 3. Apply dynamic quantization ---
# PyTorch's eager-mode quantization namespace.
import torch.quantization

# Dynamically quantize every torch.nn.Linear module in the model; these layers
# are the main compute-heavy part of a Transformer.
# NOTE: recent PyTorch releases emit a deprecation warning for this eager-mode
# API and recommend migrating to torchao's `quantize_` API
# (see https://github.com/pytorch/ao/issues/2259). torchao is a separate
# package, so the call is kept as-is here.
model_int8 = torch.quantization.quantize_dynamic(
    model_fp32,
    {torch.nn.Linear},  # module types to quantize
    dtype=torch.qint8,  # target weight dtype: int8
)

print("\nModel successfully quantized to INT8.")
print("Original FP32 model sample (first layer weight):")
print(model_fp32.bert.encoder.layer[0].attention.self.query.weight)
print("\nQuantized INT8 model sample (first layer weight):")
# On the quantized module, `weight` is called as a method rather than read as an attribute.
print(model_int8.bert.encoder.layer[0].attention.self.query.weight())

# --- 4. Save the quantized model weights ---
# Create the target directory if needed. A bare filename such as "model.pt"
# has an empty dirname, and os.makedirs("") raises FileNotFoundError, so the
# directory is only created when the path actually has one.
save_dir = os.path.dirname(QUANTIZED_INT8_SAVE_PATH)
if save_dir:
    os.makedirs(save_dir, exist_ok=True)

# Only the state_dict (weights and parameters) is saved, not the module object.
torch.save(model_int8.state_dict(), QUANTIZED_INT8_SAVE_PATH)

print(f"\n--- INT8 Model State Dict for 8L Bert-Base Saved to: {QUANTIZED_INT8_SAVE_PATH} ---")

Input model (FP32): ./models/bert_pruned_8_layers_finetuned/best_model
Output state_dict file for INT8 model: ./models/bert_pruned_8L_int8_ptq.pt


For migrations of users: 
1. Eager mode quantization (torch.ao.quantization.quantize, torch.ao.quantization.quantize_dynamic), please migrate to use torchao eager mode quantize_ API instead 
2. FX graph mode quantization (torch.ao.quantization.quantize_fx.prepare_fx,torch.ao.quantization.quantize_fx.convert_fx, please migrate to use torchao pt2e quantization API instead (prepare_pt2e, convert_pt2e) 
3. pt2e quantization has been migrated to torchao (https://github.com/pytorch/ao/tree/main/torchao/quantization/pt2e) 
see https://github.com/pytorch/ao/issues/2259 for more details
  model_int8 = torch.quantization.quantize_dynamic(



Model successfully quantized to INT8.
Original FP32 model sample (first layer weight):
Parameter containing:
tensor([[-0.0173,  0.0278, -0.0232,  ...,  0.0160,  0.0754,  0.0555],
        [-0.0367,  0.0362, -0.0440,  ..., -0.0525,  0.1387,  0.0082],
        [ 0.0123,  0.0343,  0.0110,  ..., -0.0253,  0.0251, -0.0463],
        ...,
        [-0.0071,  0.0507,  0.0551,  ...,  0.0298,  0.0533, -0.0514],
        [-0.0174,  0.0945,  0.0594,  ..., -0.1067,  0.0603,  0.0472],
        [ 0.0054, -0.0985,  0.0102,  ..., -0.0205, -0.0550, -0.0120]],
       requires_grad=True)

Quantized INT8 model sample (first layer weight):
tensor([[-0.0164,  0.0287, -0.0246,  ...,  0.0164,  0.0739,  0.0574],
        [-0.0369,  0.0369, -0.0451,  ..., -0.0533,  0.1395,  0.0082],
        [ 0.0123,  0.0328,  0.0123,  ..., -0.0246,  0.0246, -0.0451],
        ...,
        [-0.0082,  0.0492,  0.0533,  ...,  0.0287,  0.0533, -0.0533],
        [-0.0164,  0.0944,  0.0574,  ..., -0.1067,  0.0615,  0.0492],
        [ 0.004