In [1]:
# =======================================================
# Cell 1: Imports for Pruning
# =======================================================
import torch
from transformers import AutoConfig, AutoModelForSequenceClassification, AutoTokenizer
import os

print("Libraries for pruning loaded.")
# =======================================================
# Cell 2: Pruning Configuration
# =======================================================
# --- Edit the settings in this section to match your setup ---

# Base model name used to load the tokenizer.
TOKENIZER_NAME = "bert-large-uncased"

# Path of the original, already fine-tuned 24-layer FP32 model
# (taken from the training logs).
ORIGINAL_MODEL_PATH = "./bert-large-sst2-finetuned/checkpoint-300/"

# Where the pruned (not yet fine-tuned) model is written.
PRUNED_MODEL_SAVE_PATH = "./saved_models/bert_pruned_16_layers_unfinetuned"

# Zero-based indices of the encoder layers that will be dropped.
LAYERS_TO_REMOVE = {1, 4, 5, 6, 7, 20, 21, 22}
# -----------------------------------------------------------

# Create the output directory up front; an already existing one is fine.
os.makedirs(PRUNED_MODEL_SAVE_PATH, exist_ok=True)

# Echo the effective configuration, one setting per line.
print(
    f"Original model path: {ORIGINAL_MODEL_PATH}",
    f"Will remove {len(LAYERS_TO_REMOVE)} layers: {sorted(list(LAYERS_TO_REMOVE))}",
    f"Pruned model will be saved to: {PRUNED_MODEL_SAVE_PATH}",
    sep="\n",
)
# =======================================================
# Cell 3: Pruning Logic and Saving
# =======================================================
# Load the fine-tuned source model; its config and weights drive the steps below.
print("Loading original 24-layer model...")
original_model = AutoModelForSequenceClassification.from_pretrained(ORIGINAL_MODEL_PATH)
original_state_dict = original_model.state_dict()
original_config = original_model.config

print("Creating new pruned model configuration...")
# 1. Re-read the checkpoint's configuration, overriding only the encoder
#    depth so that it reflects the layers about to be removed.
pruned_config = AutoConfig.from_pretrained(
    ORIGINAL_MODEL_PATH,
    num_hidden_layers=original_config.num_hidden_layers - len(LAYERS_TO_REMOVE),
)

# 2. Instantiate a model from the reduced configuration. No checkpoint is
#    loaded at this point, so its weights are freshly initialised.
pruned_model = AutoModelForSequenceClassification.from_config(pruned_config)
pruned_state_dict = pruned_model.state_dict()

# 3. Core step: copy weights from the original model into the pruned model.
#
# Every tensor of the original state dict is visited once. Tensors outside the
# encoder stack (embeddings, pooler, classifier) keep their key; tensors of a
# surviving encoder layer are re-keyed to the layer's new position; tensors of
# a removed layer are dropped. Deriving the mapping from the keys themselves
# (rather than from a hand-written suffix list) means no per-layer parameter
# can be forgotten, and the final check guarantees that no tensor of the
# pruned model is left at its random initialisation.
print("Mapping weights from original model to pruned model...")

ENCODER_LAYER_PREFIX = "bert.encoder.layer."
num_original_layers = original_config.num_hidden_layers

# Reject indices that do not name an existing layer. They would still be
# counted by len(LAYERS_TO_REMOVE) when the pruned depth is computed, so the
# pruned model would end up with fewer layers than are actually kept.
out_of_range_layers = sorted(
    idx for idx in LAYERS_TO_REMOVE if not 0 <= idx < num_original_layers
)
if out_of_range_layers:
    raise ValueError(
        f"LAYERS_TO_REMOVE contains indices outside 0..{num_original_layers - 1}: "
        f"{out_of_range_layers}"
    )

# Surviving layers keep their relative order and are renumbered from 0.
kept_layers = [idx for idx in range(num_original_layers) if idx not in LAYERS_TO_REMOVE]
if len(kept_layers) != pruned_config.num_hidden_layers:
    raise ValueError(
        f"{len(kept_layers)} layers are kept but the pruned config expects "
        f"{pruned_config.num_hidden_layers}; check LAYERS_TO_REMOVE for duplicates."
    )
old_to_new_layer = {old_idx: new_idx for new_idx, old_idx in enumerate(kept_layers)}

copied_keys = set()   # pruned-model keys that received an original tensor
skipped_keys = []     # original keys that have no counterpart in the pruned model
for old_key, tensor in original_state_dict.items():
    if old_key.startswith(ENCODER_LAYER_PREFIX):
        # Key layout: "bert.encoder.layer.<index>.<parameter path>"
        layer_str, _, key_suffix = old_key[len(ENCODER_LAYER_PREFIX):].partition(".")
        old_layer_idx = int(layer_str)
        if old_layer_idx not in old_to_new_layer:
            continue  # this layer is being pruned away
        new_key = f"{ENCODER_LAYER_PREFIX}{old_to_new_layer[old_layer_idx]}.{key_suffix}"
    else:
        new_key = old_key

    if new_key in pruned_state_dict:
        pruned_state_dict[new_key] = tensor
        copied_keys.add(new_key)
    else:
        skipped_keys.append(old_key)

if skipped_keys:
    print(f"Warning: {len(skipped_keys)} original tensors had no target and were skipped: {skipped_keys}")

# Any key not overwritten above would silently keep random weights.
missing_keys = sorted(set(pruned_state_dict) - copied_keys)
if missing_keys:
    raise RuntimeError(
        f"{len(missing_keys)} pruned-model tensors received no weights from the "
        f"original model: {missing_keys}"
    )

# 4. Load the remapped weights into the pruned model.
pruned_model.load_state_dict(pruned_state_dict)

# 5. Save the pruned model together with a tokenizer so the directory can be
#    used directly as the starting point for fine-tuning.
print(f"Saving pruned model to {PRUNED_MODEL_SAVE_PATH}...")
pruned_model.save_pretrained(save_directory=PRUNED_MODEL_SAVE_PATH)
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME)
tokenizer.save_pretrained(save_directory=PRUNED_MODEL_SAVE_PATH)

# Final summary: layer counts before and after pruning.
print(
    "\n--- Pruning Complete! ---",
    f"Original model had {original_config.num_hidden_layers} layers.",
    f"Pruned model has {pruned_config.num_hidden_layers} layers.",
    sep="\n",
)

Libraries for pruning loaded.
Original model path: ./bert-large-sst2-finetuned/checkpoint-300/
Will remove 8 layers: [1, 4, 5, 6, 7, 20, 21, 22]
Pruned model will be saved to: ./saved_models/bert_pruned_16_layers_unfinetuned
Loading original 24-layer model...
Creating new pruned model configuration...
Mapping weights from original model to pruned model...
Saving pruned model to ./saved_models/bert_pruned_16_layers_unfinetuned...

--- Pruning Complete! ---
Original model had 24 layers.
Pruned model has 16 layers.


In [3]:
# =======================================================
# Cell 4: Imports for Fine-tuning
# =======================================================
from datasets import load_dataset
from transformers import Trainer, TrainingArguments
import numpy as np
import evaluate

print("Libraries for fine-tuning loaded.")
# =======================================================
# Cell 5: Fine-tuning Config and Data Prep
# =======================================================
# --- Configuration ---
# (Optional) Weights & Biases project name.
WANDB_PROJECT_NAME = "bert_large_pruning_sst2"
# Output directory for the fine-tuned model.
OUTPUT_DIR = "./models/bert_pruned_16_layers_finetuned"
# Starting point: the pruned, not-yet-fine-tuned model saved by the pruning step.
MODEL_CHECKPOINT = PRUNED_MODEL_SAVE_PATH

# --- Data preparation ---
print("Loading and tokenizing dataset...")
# The tokenizer was saved into the pruned model's directory, so it is loaded
# from that same location.
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)
# SST-2 task of the GLUE benchmark.
dataset = load_dataset(path="glue", name="sst2")
# Tokenization helper, applied to batches of examples by `dataset.map`.
def tokenize_function(examples):
    """Tokenize the ``sentence`` field of a batch of examples.

    Sequences are truncated and padded to a fixed length of 128 tokens.

    Args:
        examples: A batch mapping that provides a ``"sentence"`` entry.

    Returns:
        Whatever the module-level ``tokenizer`` returns for that batch.
    """
    sentences = examples["sentence"]
    return tokenizer(
        sentences,
        max_length=128,
        truncation=True,
        padding="max_length",
    )


# Run the tokenizer over the whole dataset object, in batches.
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Accuracy metric used to score the evaluation predictions.
accuracy_metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    Args:
        eval_pred: A pair ``(predictions, labels)``; the predictions are
            reduced to class ids with an arg-max over axis 1.

    Returns:
        The result of ``accuracy_metric.compute`` for those class ids.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=1)
    return accuracy_metric.compute(
        predictions=predicted_classes,
        references=references,
    )


print("Dataset ready and metrics function defined.")
# =======================================================
# Cell 6: Run Fine-tuning
# =======================================================
# 1. Load the pruned model that was saved earlier.
model_to_finetune = AutoModelForSequenceClassification.from_pretrained(MODEL_CHECKPOINT)

# 2. Training hyper-parameters.
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    push_to_hub=False,
    # --- Optimisation ---
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    # Per-device batch sizes. The original note says this value was lowered to
    # avoid OOM with a bert-large sized model; reduce further if memory runs out.
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    # Mixed precision is enabled only when a CUDA device is available.
    fp16=torch.cuda.is_available(),
    # --- Logging ---
    report_to="wandb",
    logging_strategy="steps",
    logging_steps=50,
    # --- Evaluation and checkpointing: once per epoch ---
    eval_strategy="epoch",
    save_strategy="epoch",
    # Cap on the number of checkpoints retained on disk.
    save_total_limit=1,
    # NOTE(review): no `metric_for_best_model` is given, so the Trainer's
    # default selection criterion decides which checkpoint counts as "best"
    # -- confirm that this is the intended one.
    load_best_model_at_end=True,
)

# Name of the Weights & Biases project that receives the run.
os.environ["WANDB_PROJECT"] = WANDB_PROJECT_NAME

# 3. Build the Trainer.
#    The tokenizer is passed as `processing_class`: the older `tokenizer`
#    keyword is deprecated in `Trainer.__init__` and triggers a FutureWarning
#    (visible in this notebook's previously captured output).
trainer = Trainer(
    model=model_to_finetune,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

# 4. Run the fine-tuning.
print("Starting fine-tuning of the pruned model...")
trainer.train()

# 5. Save the final model. `training_args` sets load_best_model_at_end=True,
#    so the model held by the trainer at this point is the best checkpoint.
print("Saving the best fine-tuned model...")
trainer.save_model(os.path.join(OUTPUT_DIR, "best_model"))

print("\n--- Fine-tuning Complete! ---")
print(f"The final fine-tuned pruned model is saved at: {os.path.join(OUTPUT_DIR, 'best_model')}")

Libraries for fine-tuning loaded.
Loading and tokenizing dataset...


Map:   0%|          | 0/872 [00:00<?, ? examples/s]

Dataset ready and metrics function defined.


  trainer = Trainer(


Starting fine-tuning of the pruned model...


Epoch,Training Loss,Validation Loss,Accuracy
1,0.1333,0.200327,0.93922
2,0.0729,0.245418,0.922018
3,0.0502,0.247868,0.930046


Saving the best fine-tuned model...

--- Fine-tuning Complete! ---
The final fine-tuned pruned model is saved at: ./models/bert_pruned_16_layers_finetuned/best_model
