# 测试环境代理是否正常

In [1]:
# 测试代理
import os
import requests

# Point the proxy environment variables at the local proxy ports.
os.environ['HTTP_PROXY'] = 'http://127.0.0.1:7890'
os.environ['HTTPS_PROXY'] = 'http://127.0.0.1:7890'
os.environ['ALL_PROXY'] = 'socks5://127.0.0.1:7891'

# Probe HuggingFace through the proxy. Any failure is reported, not raised,
# so the rest of the cell still runs.
try:
    response = requests.get('https://huggingface.co', timeout=10)
    # An HTTP error status (4xx/5xx) means the proxy or site is not usable, so
    # turn it into an exception instead of reporting it as a success.
    response.raise_for_status()
    print("✅ HuggingFace 连接成功，状态码:", response.status_code)
except Exception as e:
    print("❌ 连接失败:", e)

# HuggingFace cache locations. They are set here, before the later cells
# import datasets / transformers.
# NOTE(review): 'hf_hu' looks like a truncated 'hf_hub' -- confirm the intended
# directory before renaming it (renaming would trigger a fresh download).
os.environ['HF_HOME'] = '/home/KevinLiangX/Codes/LLM-quickstart-main/hf'
os.environ['HF_HUB_CACHE'] = '/home/KevinLiangX/Codes/LLM-quickstart-main/hf_hu'

# Server environment: Ubuntu 22.04, GPU 2080Ti 22G

✅ HuggingFace 连接成功，状态码: 200


# 下载数据集（包含训练集与测试集）

In [2]:
from datasets import load_dataset

# Load the Yelp review dataset; its "train" and "test" splits are used below.
dataset = load_dataset(path="yelp_review_full")

# 预处理数据

In [3]:
# 填充到最大长度的策略，处理整个数据集：
from transformers import AutoTokenizer

# Tokenizer that belongs to the checkpoint being fine-tuned.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")


def tokenize_function(examples):
    """Tokenize the ``text`` field of a batch of examples.

    Sequences are truncated and padded to the maximum length, so every
    encoded example ends up with the same length.

    Args:
        examples: A batch from ``dataset.map`` exposing a ``"text"`` entry.

    Returns:
        The tokenizer output for the batch.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True, padding="max_length")
    return encoded


# batched=True passes a batch of examples to tokenize_function per call.
tokenized_datasets = dataset.map(tokenize_function, batched=True)



In [4]:
# Use the complete tokenized splits: "train" for training, "test" for evaluation.
full_train_dataset, full_eval_dataset = (
    tokenized_datasets["train"],
    tokenized_datasets["test"],
)

# Report the size of each split with thousands separators.
print("📊 数据集信息:")
print(
    f"   训练集: {len(full_train_dataset):,} 样本",
    f"   评估集: {len(full_eval_dataset):,} 样本",
    sep="\n",
)

📊 数据集信息:
   训练集: 650,000 样本
   评估集: 50,000 样本


# 微调配置

In [5]:
from transformers import AutoModelForSequenceClassification

# Pretrained bert-base-cased weights plus a 5-class classification head; the
# head weights are newly initialised and must be trained (see the load warning).
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-cased",
    num_labels=5,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## 训练过程中的评估

In [6]:
import numpy as np
from sklearn.metrics import accuracy_score

def compute_metrics(eval_pred):
    """Compute classification accuracy for the Trainer's evaluation loop.

    Args:
        eval_pred: A ``(predictions, labels)`` pair. ``predictions`` holds the
            per-class scores with the class axis last, or a tuple whose first
            element does. ``labels`` holds the integer class ids.

    Returns:
        dict: ``{"accuracy": fraction_correct}`` with a plain Python float.

    Raises:
        ValueError: If the predicted classes and the labels differ in shape.
    """
    predictions, labels = eval_pred
    # Some models return several outputs; by convention the scores come first.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    predicted_classes = np.argmax(predictions, axis=-1)
    labels = np.asarray(labels)
    # Fail loudly instead of letting numpy broadcast mismatched arrays.
    if predicted_classes.shape != labels.shape:
        raise ValueError(
            f"predictions {predicted_classes.shape} and labels {labels.shape} "
            "must have the same shape"
        )

    accuracy = float(np.mean(predicted_classes == labels))
    return {"accuracy": accuracy}

print("✅ 评估函数已定义")

✅ 评估函数已定义


## 训练超参

In [7]:
from transformers import TrainingArguments, Trainer

model_dir = "models/bert-base-cased-finetune-yelp"

# Step budget: 650,000 training samples / (batch 32 x 2 accumulation steps)
# gives about 10.2K optimizer steps per epoch, about 20.3K over the 2 epochs.
training_args = TrainingArguments(
    output_dir=model_dir,

    # === Evaluation / checkpoint / logging cadence ===
    # Step-based scheduling: an epoch is long, so per-epoch evaluation would
    # give too little feedback.
    # NOTE(review): newer transformers releases rename `evaluation_strategy`
    # to `eval_strategy` -- confirm against the installed version.
    evaluation_strategy="steps",
    # Save on the same schedule as evaluation so the best checkpoint is kept.
    save_strategy="steps",
    logging_strategy="steps",
    # Evaluate and checkpoint every 500 steps (about 20 times per epoch).
    eval_steps=500,
    save_steps=500,
    # Log the training loss every 50 steps for a detailed loss curve.
    logging_steps=50,

    # === Batch sizes and training length ===
    # Training batch per device.
    per_device_train_batch_size=32,
    # Evaluation keeps no gradients, so a larger batch is used.
    per_device_eval_batch_size=64,
    # Accumulate 2 batches per optimizer step (effective batch of 64 per
    # device) to emulate a larger batch within the memory limit.
    gradient_accumulation_steps=2,
    # Two full passes over the training set.
    num_train_epochs=2,

    # === Learning-rate schedule ===
    # Peak learning rate, inside the usual BERT fine-tuning range [1e-5, 5e-5].
    learning_rate=2.5e-5,
    # Weight decay used as regularisation against overfitting.
    weight_decay=0.01,
    # Linear warm-up from 0 to the peak rate over the first 800 steps
    # (about 4% of the ~20.3K total steps).
    warmup_steps=800,
    # Cosine decay after warm-up: smoother than linear decay late in training.
    lr_scheduler_type="cosine",

    # === Regularisation ===
    # Label smoothing softens the one-hot targets.
    label_smoothing_factor=0.1,

    # === Performance ===
    # 16-bit mixed precision, chosen for the 2080Ti GPU.
    fp16=True,
    # A moderate number of loader workers, to avoid process contention.
    dataloader_num_workers=2,
    # Disabled. Enabling it recomputes activations instead of storing them,
    # trading speed for memory when GPU memory runs short.
    gradient_checkpointing=False,
    dataloader_pin_memory=True,
    # Gradient clipping to guard against exploding gradients.
    max_grad_norm=1.0,

    # === Best-model selection ===
    # Reload the best checkpoint when training ends, ranked by evaluation
    # accuracy (higher is better).
    load_best_model_at_end=True,
    metric_for_best_model="eval_accuracy",
    greater_is_better=True,
    # Keep at most 2 checkpoints on disk.
    save_total_limit=2,

    # === Monitoring ===
    logging_dir=f'{model_dir}/logs',

    # === Misc ===
    remove_unused_columns=True,
    # Fixed random seed for reproducible runs.
    seed=42,
)


# 开始训练

In [8]:
# Build the Trainer from the model, the hyperparameters, the full tokenized
# train/eval splits and the accuracy callback.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    eval_dataset=full_eval_dataset,
    train_dataset=full_train_dataset,
)

print("🚀 Trainer已创建，准备开始训练...")

🚀 Trainer已创建，准备开始训练...


In [9]:
# Run fine-tuning. The returned TrainOutput is bound to a name and left as the
# cell's last expression so the summary is still displayed.
train_output = trainer.train()
train_output

Step,Training Loss,Validation Loss,Accuracy
500,1.0988,1.090557,0.58104
1000,1.044,1.073347,0.60546
1500,1.0119,1.004492,0.64146
2000,1.0037,0.972724,0.65896
2500,0.9785,0.966885,0.66268
3000,0.978,0.964803,0.66578
3500,0.9444,0.961881,0.66876
4000,0.9473,0.964928,0.6655
4500,0.966,0.946655,0.67638
5000,0.943,0.959583,0.67092


TrainOutput(global_step=20312, training_loss=0.9252217751687415, metrics={'train_runtime': 31118.6981, 'train_samples_per_second': 41.776, 'train_steps_per_second': 0.653, 'total_flos': 3.420409555323617e+17, 'train_loss': 0.9252217751687415, 'epoch': 2.0})

# 感觉准确率也不高哈（前 5000 步的验证准确率约 0.67），训练了约 8.6 个小时（train_runtime ≈ 31,119 秒），超参数是按目前我自己显卡能承受的范围调整的