In [2]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import time
import numpy as np

# --- Configuration ---

# Benchmark shape: untimed iterations first, then the timed ones.
WARMUP_RUNS = 10
MEASURE_RUNS = 100

# The single sentence pushed through the model on every iteration.
TEST_SENTENCE = "This is a great movie, I really enjoyed it."

# Key point: the model weights and the tokenizer are located separately.
# MODEL_PATH points at the fine-tuned checkpoint directory, while
# TOKENIZER_NAME is the identifier of the original tokenizer that was used
# during fine-tuning.
MODEL_PATH = "./my_bert_sst2_finetuned/checkpoint-1800"
TOKENIZER_NAME = "bert-base-uncased"

# Use the GPU when PyTorch reports one, otherwise run on the CPU.
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# --- 1. Load the tokenizer and the model ---

# The tokenizer is loaded by the name of the original pretrained model, so it
# is fetched from the Hugging Face Hub or taken from the local cache.
print(f"Loading tokenizer from: {TOKENIZER_NAME}")
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME)

# The model is loaded from the saved checkpoint directory that holds the
# fine-tuned weights, then moved onto the benchmark device.
print(f"Loading model from: {MODEL_PATH}")
model = AutoModelForSequenceClassification.from_pretrained(MODEL_PATH).to(DEVICE)

# Switch to evaluation mode before running inference.
model.eval()

# --- 2. Prepare the input ---
# Tokenize the test sentence into PyTorch tensors, then move the encoded
# batch onto the same device as the model.
inputs = tokenizer(TEST_SENTENCE, return_tensors="pt")
inputs = inputs.to(DEVICE)

# --- 3. Warmup ---
# Untimed forward passes executed before the measurement loop; gradients are
# disabled because only inference is being exercised.
print(f"Running {WARMUP_RUNS} warmup runs...")
with torch.no_grad():
    for _ in range(WARMUP_RUNS):
        _ = model(**inputs)

# --- 4. Measurement ---
# Times MEASURE_RUNS individual forward passes and collects the per-run
# wall-clock durations (in seconds) in `timings`.
print(f"Running {MEASURE_RUNS} measurement runs...")

# The clock is read only after torch.cuda.synchronize() so that pending GPU
# work is included in the measured interval. That barrier is CUDA-only:
# DEVICE falls back to "cpu" when CUDA is unavailable, and calling
# torch.cuda.synchronize() there raises instead of being a no-op, so it is
# issued only when the benchmark actually runs on a CUDA device.
needs_cuda_sync = DEVICE.type == "cuda"

timings = []
with torch.no_grad():
    for _ in range(MEASURE_RUNS):
        # Drain any previously queued GPU work so it is not billed to this run.
        if needs_cuda_sync:
            torch.cuda.synchronize()
        start_time = time.perf_counter()

        _ = model(**inputs)

        # Wait for this run's GPU work to finish before stopping the clock.
        if needs_cuda_sync:
            torch.cuda.synchronize()
        end_time = time.perf_counter()

        timings.append(end_time - start_time)

# --- 5. Compute and report the results ---
# `timings` holds per-run durations in seconds; scale the statistics to
# milliseconds for display.
avg_latency_ms = 1000 * np.mean(timings)
std_latency_ms = 1000 * np.std(timings)

# Emit the whole report in one call, one item per line.
print(
    "-" * 30,
    f"Inference Latency on {DEVICE}:",
    f"  Average: {avg_latency_ms:.2f} ms",
    f"  Std Dev: {std_latency_ms:.2f} ms",
    "-" * 30,
    sep="\n",
)

Loading tokenizer from: bert-base-uncased
Loading model from: ./my_bert_sst2_finetuned/checkpoint-1800
Running 10 warmup runs...
Running 100 measurement runs...
------------------------------
Inference Latency on cuda:
  Average: 2.76 ms
  Std Dev: 0.08 ms
------------------------------
