In [1]:

import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, BitsAndBytesConfig
import os

# --- 1. Configuration ---
# Source checkpoint: the pruned and fine-tuned 8-layer BERT-Base model
# (described by the notebook author as FP32).
PRUNED_FP32_MODEL_PATH = "./models/bert_pruned_8_layers_finetuned/best_model"

# Destination directory for the quantized checkpoint.
# NOTE(review): the original note says this holds the INT4 quantization config
# plus FP16 weights -- confirm what is actually serialized with the installed
# transformers / bitsandbytes versions.
QUANTIZED_INT4_SAVE_PATH = "./models/bert_pruned_8L_int4_bnb"

# Echo both paths so the cell output records which artifacts were used.
print(
    f"Input model (FP32): {PRUNED_FP32_MODEL_PATH}",
    f"Output path for INT4 model: {QUANTIZED_INT4_SAVE_PATH}",
    sep="\n",
)

# --- 2. Define the INT4 quantization config ---
# bfloat16 is the preferred compute dtype, but not every CUDA device reports
# support for it. Fall back to float16 in that case so the 4-bit layers compute
# in a dtype the GPU handles. On bf16-capable GPUs this is identical to
# hard-coding torch.bfloat16.
# The is_available() guard comes first so bf16 support is never probed on a
# machine without CUDA.
compute_dtype = (
    torch.bfloat16
    if torch.cuda.is_available() and torch.cuda.is_bf16_supported()
    else torch.float16
)

quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,                     # quantize supported layers to 4 bits at load time
    bnb_4bit_quant_type="nf4",             # NormalFloat4 data type
    bnb_4bit_compute_dtype=compute_dtype,  # dtype used for computation in the 4-bit layers
    bnb_4bit_use_double_quant=True,        # nested quantization of the quantization constants
)

# --- 3. Load, quantize, and save the model ---
print("\nLoading model with INT4 quantization...")

# This workflow is GPU-only: stop before touching the checkpoint when CUDA is absent.
if not torch.cuda.is_available():
    raise RuntimeError("BitsAndBytes INT4 quantization requires a GPU!")

# Quantization is requested through the config passed at load time; device
# placement is delegated to device_map="auto".
load_kwargs = {
    "quantization_config": quantization_config,
    "device_map": "auto",
}
model = AutoModelForSequenceClassification.from_pretrained(
    PRUNED_FP32_MODEL_PATH, **load_kwargs
)

# The tokenizer is loaded from the same source checkpoint as the model.
tokenizer = AutoTokenizer.from_pretrained(PRUNED_FP32_MODEL_PATH)

print("Model loaded in INT4 on GPU.")

# Ensure the destination directory exists before writing into it.
os.makedirs(QUANTIZED_INT4_SAVE_PATH, exist_ok=True)

print(f"Saving model to {QUANTIZED_INT4_SAVE_PATH}...")
# Model first, then tokenizer, both into the same output directory.
for artifact in (model, tokenizer):
    artifact.save_pretrained(QUANTIZED_INT4_SAVE_PATH)

print("\n--- INT4 Model for 8L Bert-Base Saved Successfully! ---")

Input model (FP32): ./models/bert_pruned_8_layers_finetuned/best_model
Output path for INT4 model: ./models/bert_pruned_8L_int4_bnb

Loading model with INT4 quantization...
Model loaded in INT4 on GPU.
Saving model to ./models/bert_pruned_8L_int4_bnb...

--- INT4 Model for 8L Bert-Base Saved Successfully! ---
