In [1]:
import os
import time

import evaluate
import numpy as np
import torch
from datasets import load_dataset
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# --- Configuration ---
# PyTorch dynamic quantization is optimized for and runs on the CPU, so the
# device is pinned to CPU here. This also lets the FP32 model be measured on
# the same device for a fair comparison.
DEVICE = torch.device("cpu")
TOKENIZER_NAME = "bert-base-uncased"
FP32_MODEL_PATH = "./my_bert_sst2_finetuned/checkpoint-1800"

print("所有库已导入，配置完成！", f"将使用设备: {DEVICE}", sep="\n")

所有库已导入，配置完成！
将使用设备: cpu


In [2]:
print(f"正在从 '{FP32_MODEL_PATH}' 加载FP32基线模型...")
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME)
# Load the fine-tuned checkpoint, place it on DEVICE and switch it to
# evaluation mode in a single chained expression.
model_fp32 = (
    AutoModelForSequenceClassification.from_pretrained(FP32_MODEL_PATH)
    .to(DEVICE)
    .eval()
)
print("FP32模型加载完成！")

正在从 './my_bert_sst2_finetuned/checkpoint-1800' 加载FP32基线模型...
FP32模型加载完成！


In [3]:
print("正在应用后训练动态量化 (PTQ)...")
# quantize_dynamic returns the quantized model, bound here to model_int8.
#   qconfig_spec: the layer types to quantize (only nn.Linear here)
#   dtype:        the target quantized data type (INT8)
model_int8 = torch.quantization.quantize_dynamic(
    model_fp32, qconfig_spec={torch.nn.Linear}, dtype=torch.qint8
)
print("INT8模型创建成功！")

正在应用后训练动态量化 (PTQ)...
INT8模型创建成功！


In [4]:
# Model size is measured by temporarily serializing each model's state_dict
# to disk and reading back the resulting file size.
def _state_dict_size_mb(model, tmp_path):
    """Return the size in MiB of ``model.state_dict()`` when saved to ``tmp_path``.

    The temporary file is removed before returning, including when saving or
    measuring raises, so a failed run does not leave weight files behind in
    the working directory.
    """
    try:
        torch.save(model.state_dict(), tmp_path)
        return os.path.getsize(tmp_path) / (1024 * 1024)
    finally:
        # The file may not exist if torch.save failed before creating it.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)


fp32_temp_path = "fp32_weights.pt"
int8_temp_path = "int8_weights.pt"

# File sizes in MB; the temporary files are already deleted at this point.
fp32_mb = _state_dict_size_mb(model_fp32, fp32_temp_path)
int8_mb = _state_dict_size_mb(model_int8, int8_temp_path)

print("="*30)
print("模型大小对比:")
print(f"  - FP32 模型大小: {fp32_mb:.2f} MB")
print(f"  - INT8 模型大小: {int8_mb:.2f} MB")
print(f"  - 体积减小了: {(1 - int8_mb / fp32_mb) * 100:.2f}%")
print("="*30)

模型大小对比:
  - FP32 模型大小: 417.72 MB
  - INT8 模型大小: 173.09 MB
  - 体积减小了: 58.56%


In [5]:
print("正在测量INT8模型的推理延迟 (在CPU上)...")
TEST_SENTENCE = "This is a great movie, I really enjoyed it."
WARMUP_RUNS, MEASURE_RUNS = 10, 100

# Tokenize the probe sentence once; the same tensors feed every forward pass.
inputs = tokenizer(TEST_SENTENCE, return_tensors="pt").to(DEVICE)

timings = []
with torch.no_grad():
    # Untimed warm-up passes.
    for _ in range(WARMUP_RUNS):
        _ = model_int8(**inputs)

    # Timed passes: one wall-clock sample, in seconds, per forward call.
    for _ in range(MEASURE_RUNS):
        tick = time.perf_counter()
        _ = model_int8(**inputs)
        timings.append(time.perf_counter() - tick)

# Summarize the samples and convert seconds to milliseconds.
avg_latency_ms = 1000 * np.mean(timings)
std_latency_ms = 1000 * np.std(timings)

print(
    "="*30,
    f"INT8 模型推理延迟 (在 {DEVICE} 上):",
    f"  - 平均: {avg_latency_ms:.2f} ms",
    f"  - 标准差: {std_latency_ms:.2f} ms",
    "="*30,
    sep="\n",
)

正在测量INT8模型的推理延迟 (在CPU上)...
INT8 模型推理延迟 (在 cpu 上):
  - 平均: 70.02 ms
  - 标准差: 47.48 ms


In [12]:
print("正在测量FP32模型的推理延迟 (在CPU上)...")
TEST_SENTENCE = "This is a great movie, I really enjoyed it."
WARMUP_RUNS, MEASURE_RUNS = 10, 100

# Build the model inputs once so tokenization is not part of the timed region.
inputs = tokenizer(TEST_SENTENCE, return_tensors="pt").to(DEVICE)

timings = []
with torch.no_grad():
    # Warm-up: forward passes whose duration is discarded.
    for _ in range(WARMUP_RUNS):
        _ = model_fp32(**inputs)

    # Measurement: record the elapsed seconds of each forward pass.
    for _ in range(MEASURE_RUNS):
        began = time.perf_counter()
        _ = model_fp32(**inputs)
        timings.append(time.perf_counter() - began)

# Mean and standard deviation of the samples, reported in milliseconds.
avg_latency_ms = 1000 * np.mean(timings)
std_latency_ms = 1000 * np.std(timings)

print(
    "="*30,
    f"fp32 模型推理延迟 (在 {DEVICE} 上):",
    f"  - 平均: {avg_latency_ms:.2f} ms",
    f"  - 标准差: {std_latency_ms:.2f} ms",
    "="*30,
    sep="\n",
)

正在测量FP32模型的推理延迟 (在CPU上)...
fp32 模型推理延迟 (在 cpu 上):
  - 平均: 172.98 ms
  - 标准差: 59.00 ms


In [8]:
print("\n正在加载'glue', 'sst2'数据集...")
# Note: if this machine has trouble reaching the Hugging Face Hub, you may
# need to set the HF_ENDPOINT environment variable to use a mirror.
raw_datasets = load_dataset("glue", "sst2")
print("数据集加载成功！")

print("正在加载分词器...")
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME)


def tokenize_function(examples):
    """Tokenize the "sentence" field of a batch of examples.

    Called by ``Dataset.map(..., batched=True)``. Sequences are tokenized with
    ``padding="max_length"`` and ``truncation=True``.

    NOTE(review): with no explicit ``max_length`` this presumably pads every
    sentence to the tokenizer's model maximum, which makes CPU evaluation much
    slower than necessary -- confirm. Changing the padding strategy would
    also require a padding collator wherever these examples are batched.
    """
    return tokenizer(examples["sentence"], padding="max_length", truncation=True)


print("正在对数据集进行分词预处理...")
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
print("预处理完成！")

# Validation split used for the accuracy evaluation.
eval_dataset = tokenized_datasets["validation"]


正在加载'glue', 'sst2'数据集...
数据集加载成功！
正在加载分词器...
正在对数据集进行分词预处理...
预处理完成！


In [11]:
# --- Accuracy evaluation on the validation split ---


def evaluate_accuracy(model, dataloader, metric, device, desc="Evaluating"):
    """Run ``model`` over ``dataloader`` and return ``metric.compute()``.

    Args:
        model: model called as ``model(input_ids=..., attention_mask=...)``
            whose output exposes ``.logits``.
        dataloader: iterable of dict batches that contain ``input_ids``,
            ``attention_mask`` and ``label`` tensors.
        metric: object providing ``add_batch(predictions=, references=)`` and
            ``compute()``.
        device: device the selected batch tensors are moved to.
        desc: label shown on the progress bar.

    Returns:
        The value returned by ``metric.compute()`` after all batches were added.
    """
    wanted_keys = ("input_ids", "attention_mask", "label")
    model.eval()
    for batch in tqdm(dataloader, desc=desc):
        # Keep only the fields used below; any other columns in the batch are dropped.
        batch = {k: v.to(device) for k, v in batch.items() if k in wanted_keys}

        with torch.no_grad():
            outputs = model(
                input_ids=batch["input_ids"],
                attention_mask=batch["attention_mask"],
            )

        predictions = torch.argmax(outputs.logits, dim=-1)
        # The reference column in the batch is named "label" (singular).
        metric.add_batch(predictions=predictions, references=batch["label"])

    return metric.compute()


print("正在评估INT8模型的准确率...")

# Requires the tokenized validation split to be loaded already.
eval_dataset = tokenized_datasets["validation"]
eval_dataloader = DataLoader(eval_dataset.with_format("torch"), batch_size=64)
metric = evaluate.load("accuracy")

# The same helper can be reused for other models (e.g. the FP32 baseline) by
# passing a different model together with a freshly loaded metric.
eval_metric = evaluate_accuracy(
    model_int8, eval_dataloader, metric, DEVICE, desc="Evaluating INT8 Model"
)
accuracy = eval_metric["accuracy"]

print("\n" + "="*40)
print("       INT8 模型性能评估结果")
print("="*40)
print(f"  - 在验证集上的准确率: {accuracy * 100:.2f}%")
print("="*40)

正在评估INT8模型的准确率...


Evaluating INT8 Model:   0%|          | 0/14 [00:00<?, ?it/s]


       INT8 模型性能评估结果
  - 在验证集上的准确率: 91.51%


In [13]:
# --- Save the INT8 model ---

# 1. Destination directory and file name for the quantized weights.
save_directory = "./saved_models"
int8_model_filename = "bert_sst2_int8_ptq.pt"
int8_model_save_path = os.path.join(save_directory, int8_model_filename)

# 2. Make sure the directory exists (no error if it is already there).
os.makedirs(save_directory, exist_ok=True)

# 3. Save the state_dict of model_int8, the quantized model object created
#    earlier in the notebook.
# NOTE(review): only the state_dict is stored, not the model object. Reloading
# presumably requires rebuilding the quantized architecture (load the FP32
# model, apply quantize_dynamic) before calling load_state_dict -- confirm.
torch.save(model_int8.state_dict(), int8_model_save_path)

print(f"INT8 模型已成功保存到: {int8_model_save_path}")

INT8 模型已成功保存到: ./saved_models/bert_sst2_int8_ptq.pt
