In [1]:
# quantize_bnb_int4.py

import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, BitsAndBytesConfig
import os

In [2]:
# --- 1. Configuration ---
# Source checkpoint: the pruned and fine-tuned 16-layer model, stored in FP32.
PRUNED_FP32_MODEL_PATH = "./models/bert_pruned_16_layers_finetuned/best_model"

# Destination directory for everything the INT4 quantization run writes out.
QUANTIZED_INT4_SAVE_PATH = "./models/bert_pruned_16L_int4_bnb"

# Echo both locations so the run log records which paths were used.
print(
    f"Input model (FP32): {PRUNED_FP32_MODEL_PATH}",
    f"Output path for INT4 model: {QUANTIZED_INT4_SAVE_PATH}",
    sep="\n",
)

Input model (FP32): ./models/bert_pruned_16_layers_finetuned/best_model
Output path for INT4 model: ./models/bert_pruned_16L_int4_bnb


In [3]:
# --- 2. Define the INT4 quantization configuration ---
# Settings handed to BitsAndBytesConfig, collected in one mapping so each
# option can be read (and tuned) on its own line.
_bnb_4bit_kwargs = {
    # Turn on 4-bit quantization.
    "load_in_4bit": True,
    # Use the NF4 data type for the quantized weights.
    "bnb_4bit_quant_type": "nf4",
    # Run the compute in bfloat16.
    "bnb_4bit_compute_dtype": torch.bfloat16,
    # Enable double quantization to save additional space.
    "bnb_4bit_use_double_quant": True,
}
quantization_config = BitsAndBytesConfig(**_bnb_4bit_kwargs)

In [4]:
# --- 3. Load, quantize, and save the model ---
# Purpose: load the checkpoint at PRUNED_FP32_MODEL_PATH with the 4-bit
# quantization config applied, then write the quantized model and its
# tokenizer to QUANTIZED_INT4_SAVE_PATH.
# Raises: RuntimeError when no CUDA device is available.
print("\nLoading model with INT4 quantization...")

# Make sure a GPU is available before attempting the quantized load.
if not torch.cuda.is_available():
    raise RuntimeError("BitsAndBytes INT4 quantization requires a GPU!")

# The quantization config is applied directly while the model is being loaded.
# device_map="auto" lets the loader place the model on the available GPU(s).
model = AutoModelForSequenceClassification.from_pretrained(
    PRUNED_FP32_MODEL_PATH,
    quantization_config=quantization_config,
    device_map="auto",
)

# Load the tokenizer from the same source checkpoint so it is saved alongside.
tokenizer = AutoTokenizer.from_pretrained(PRUNED_FP32_MODEL_PATH)

print("Model loaded in INT4 on GPU.")

# Make sure the output directory exists.
os.makedirs(QUANTIZED_INT4_SAVE_PATH, exist_ok=True)

# Save the model. For a model loaded in 4-bit this serializes the
# already-quantized (packed 4-bit) weights together with the quantization
# config -- it does NOT write FP16 weights.
# NOTE(review): 4-bit serialization needs a sufficiently recent
# transformers/bitsandbytes pair -- confirm against the pinned environment.
print(f"Saving model to {QUANTIZED_INT4_SAVE_PATH}...")
model.save_pretrained(QUANTIZED_INT4_SAVE_PATH)
tokenizer.save_pretrained(QUANTIZED_INT4_SAVE_PATH)

print("\n--- INT4 Model Saved Successfully! ---")
print("Note: The saved files are the already-quantized 4-bit weights + the quantization config.")
print("The stored quantization config is picked up automatically when you load this model.")


Loading model with INT4 quantization...
Model loaded in INT4 on GPU.
Saving model to ./models/bert_pruned_16L_int4_bnb...

--- INT4 Model Saved Successfully! ---
Note: The saved files are FP16 weights + a config file.
The INT4 quantization is applied automatically when you load this model.
