In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, BitsAndBytesConfig
from datasets import load_dataset
import evaluate
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
import time
import numpy as np
import os

# --- Configuration ---
# Tokenizer vocabulary name and the fine-tuned FP32 checkpoint directory.
TOKENIZER_NAME = "bert-base-uncased"
FP32_MODEL_PATH = "./my_bert_sst2_finetuned/checkpoint-1800"

# bitsandbytes quantization mainly runs and is optimized on GPU, so prefer
# CUDA whenever it is available and fall back to CPU otherwise.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")

print("所有库已导入，配置完成！")
print(f"将使用设备: {DEVICE}")

所有库已导入，配置完成！
将使用设备: cuda


In [2]:
# Step 1: define the 4-bit quantization settings (NF4 quant type, float16 compute dtype).
quantization_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    load_in_4bit=True,
)

# Step 2: load the tokenizer.
print(f"正在加载分词器: {TOKENIZER_NAME}")
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME)

# Step 3: load the model, applying the quantization settings at load time.
print(f"正在从 '{FP32_MODEL_PATH}' 加载模型并应用INT4量化...")
model_int4 = AutoModelForSequenceClassification.from_pretrained(
    FP32_MODEL_PATH,
    device_map="auto",  # let accelerate handle device placement automatically
    quantization_config=quantization_config,
)

print("INT4模型创建并加载成功！")


正在加载分词器: bert-base-uncased
正在从 './my_bert_sst2_finetuned/checkpoint-1800' 加载模型并应用INT4量化...
INT4模型创建并加载成功！


In [3]:
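# Dependency note: this notebook requires bitsandbytes.
# If it is missing, install it into the kernel environment with: %pip install bitsandbytes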

In [4]:
# Directory that receives the quantized model files.
int4_model_save_path = "./saved_models/bert_sst2_int4_bnb"

print(f"正在将INT4模型保存到: {int4_model_save_path}")
model_int4.save_pretrained(save_directory=int4_model_save_path)
print("INT4模型保存成功！")

正在将INT4模型保存到: ./saved_models/bert_sst2_int4_bnb
INT4模型保存成功！


In [5]:
print("正在测量INT4模型的推理延迟 (在GPU上)...")
TEST_SENTENCE = "This is a great movie, I really enjoyed it."
WARMUP_RUNS = 10    # untimed forward passes executed before measuring
MEASURE_RUNS = 100  # timed forward passes averaged into the reported latency


def _sync_if_cuda():
    """Wait for queued CUDA work so the wall-clock timing covers the full forward pass.

    Does nothing when DEVICE is not a CUDA device: the configuration cell lets
    DEVICE fall back to CPU, and torch.cuda.synchronize() is not usable there.
    """
    if DEVICE.type == "cuda":
        torch.cuda.synchronize()


# Prepare a single-sentence input on the target device.
inputs = tokenizer(TEST_SENTENCE, return_tensors="pt").to(DEVICE)

model_int4.eval()
# Warm-up: run the model a few times without recording anything.
with torch.no_grad():
    for _ in range(WARMUP_RUNS):
        _ = model_int4(**inputs)

# Measurement: synchronize on both sides of each forward pass so every
# sample in `timings` (seconds) reflects completed work only.
timings = []
with torch.no_grad():
    for _ in range(MEASURE_RUNS):
        _sync_if_cuda()
        start_time = time.perf_counter()
        _ = model_int4(**inputs)
        _sync_if_cuda()
        end_time = time.perf_counter()
        timings.append(end_time - start_time)

# Compute and report the mean latency in milliseconds.
avg_latency_ms = np.mean(timings) * 1000

print("="*30)
print(f"INT4 模型推理延迟 (在 {DEVICE} 上):")
print(f"  - 平均: {avg_latency_ms:.2f} ms")
print("="*30)

正在测量INT4模型的推理延迟 (在GPU上)...
INT4 模型推理延迟 (在 cuda 上):
  - 平均: 8.57 ms


In [7]:
from datasets import load_dataset
# Load the GLUE SST-2 dataset.
print("\n正在加载'glue', 'sst2'数据集...")
raw_datasets = load_dataset(path="glue", name="sst2")
print("数据集加载成功！")

# Load the tokenizer named in the configuration cell.
print("正在加载分词器...")
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME)

def tokenize_function(examples):
    """Tokenize the "sentence" field of a batch with the notebook-level tokenizer.

    Args:
        examples: Mapping that provides a "sentence" entry; it is passed to
            the tokenizer as-is.

    Returns:
        Whatever the tokenizer returns when called with truncation enabled
        and padding="max_length".
    """
    sentences = examples["sentence"]
    return tokenizer(sentences, truncation=True, padding="max_length")

# Apply the tokenizer to the dataset in batched mode.
print("正在对数据集进行分词预处理...")
tokenized_datasets = raw_datasets.map(function=tokenize_function, batched=True)
print("预处理完成！")

# Keep a handle on the validation split.
eval_dataset = tokenized_datasets["validation"]


正在加载'glue', 'sst2'数据集...
数据集加载成功！
正在加载分词器...
正在对数据集进行分词预处理...
预处理完成！


In [8]:
print("正在评估INT4模型的准确率...")

# Serve the validation split as torch tensors in fixed-size batches.
eval_dataset = tokenized_datasets["validation"]
eval_dataloader = DataLoader(
    eval_dataset.with_format("torch"),
    batch_size=32,  # batch size can be adjusted to fit GPU memory
)
metric = evaluate.load("accuracy")

model_int4.eval()
for batch in tqdm(eval_dataloader, desc="Evaluating INT4 Model"):
    # device_map="auto" already placed the model on the GPU, so the data
    # has to be moved to the GPU as well; only the needed fields are kept.
    batch = {
        key: value.to(DEVICE)
        for key, value in batch.items()
        if key in ("input_ids", "attention_mask", "label")
    }

    with torch.no_grad():
        outputs = model_int4(
            input_ids=batch["input_ids"],
            attention_mask=batch["attention_mask"],
        )

    # Predicted class is the index of the largest logit along the last axis.
    logits = outputs.logits
    predictions = logits.argmax(dim=-1)
    metric.add_batch(predictions=predictions, references=batch["label"])

eval_metric = metric.compute()
accuracy = eval_metric["accuracy"]

print("\n" + "=" * 40)
print("       INT4 模型性能评估结果")
print("=" * 40)
print(f"  - 在验证集上的准确率: {accuracy * 100:.2f}%")
print("=" * 40)

正在评估INT4模型的准确率...


Evaluating INT4 Model:   0%|          | 0/28 [00:00<?, ?it/s]


       INT4 模型性能评估结果
  - 在验证集上的准确率: 93.00%
