## 测试环境代理是否正常

In [1]:
# 测试代理
import os
import requests

# 设置代理环境变量
os.environ['HTTP_PROXY'] = 'http://127.0.0.1:7890'
os.environ['HTTPS_PROXY'] = 'http://127.0.0.1:7890'
os.environ['ALL_PROXY'] = 'socks5://127.0.0.1:7891'

# 测试代理连接
try:
    response = requests.get('https://huggingface.co', timeout=10)
    print("✅ HuggingFace 连接成功，状态码:", response.status_code)
    print("Test server is ubuntu22.04 GPU 2080Ti 22G")
except Exception as e:
    print("连接失败:", e)

# 设置 HuggingFace 缓存路径
os.environ['HF_HOME'] = '/home/KevinLiangX/Codes/LLM-quickstart-main/hf'
os.environ['HF_HUB_CACHE'] = '/home/KevinLiangX/Codes/LLM-quickstart-main/hf_hu'

# 服务器环境 ubuntu22.04 GPU 2080Ti 22G

✅ HuggingFace 连接成功，状态码: 200
Test server is ubuntu22.04 GPU 2080Ti 22G


In [2]:
## 设置环境参数

In [3]:
# 根据你使用的模型和GPU资源情况，调整以下关键参数
squad_v2 = False
model_checkpoint = "distilbert-base-uncased"
batch_size = 32

## 下载数据集-SQuAD 数据集

In [4]:
from datasets import load_dataset
datasets = load_dataset("squad_v2" if squad_v2 else "squad")

## 预处理数据

In [5]:
from transformers import AutoTokenizer
    
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

import transformers
assert isinstance(tokenizer, transformers.PreTrainedTokenizerFast)

# The maximum length of a feature (question and context)
max_length = 384 
# The authorized overlap between two part of the context when splitting it is needed.
doc_stride = 128 

pad_on_right = tokenizer.padding_side == "right"

def prepare_train_features(examples):
    # 一些问题的左侧可能有很多空白字符，这对我们没有用，而且会导致上下文的截断失败
    # （标记化的问题将占用大量空间）。因此，我们删除左侧的空白字符。
    examples["question"] = [q.lstrip() for q in examples["question"]]

    # 使用截断和填充对我们的示例进行标记化，但保留溢出部分，使用步幅（stride）。
    # 当上下文很长时，这会导致一个示例可能提供多个特征，其中每个特征的上下文都与前一个特征的上下文有一些重叠。
    tokenized_examples = tokenizer(
        examples["question" if pad_on_right else "context"],
        examples["context" if pad_on_right else "question"],
        truncation="only_second" if pad_on_right else "only_first",
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # 由于一个示例可能给我们提供多个特征（如果它具有很长的上下文），我们需要一个从特征到其对应示例的映射。这个键就提供了这个映射关系。
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    # 偏移映射将为我们提供从令牌到原始上下文中的字符位置的映射。这将帮助我们计算开始位置和结束位置。
    offset_mapping = tokenized_examples.pop("offset_mapping")

    # 让我们为这些示例进行标记！
    tokenized_examples["start_positions"] = []
    tokenized_examples["end_positions"] = []

    for i, offsets in enumerate(offset_mapping):
        # 我们将使用 CLS 特殊 token 的索引来标记不可能的答案。
        input_ids = tokenized_examples["input_ids"][i]
        cls_index = input_ids.index(tokenizer.cls_token_id)

        # 获取与该示例对应的序列（以了解上下文和问题是什么）。
        sequence_ids = tokenized_examples.sequence_ids(i)

        # 一个示例可以提供多个跨度，这是包含此文本跨度的示例的索引。
        sample_index = sample_mapping[i]
        answers = examples["answers"][sample_index]
        # 如果没有给出答案，则将cls_index设置为答案。
        if len(answers["answer_start"]) == 0:
            tokenized_examples["start_positions"].append(cls_index)
            tokenized_examples["end_positions"].append(cls_index)
        else:
            # 答案在文本中的开始和结束字符索引。
            start_char = answers["answer_start"][0]
            end_char = start_char + len(answers["text"][0])

            # 当前跨度在文本中的开始令牌索引。
            token_start_index = 0
            while sequence_ids[token_start_index] != (1 if pad_on_right else 0):
                token_start_index += 1

            # 当前跨度在文本中的结束令牌索引。
            token_end_index = len(input_ids) - 1
            while sequence_ids[token_end_index] != (1 if pad_on_right else 0):
                token_end_index -= 1

            # 检测答案是否超出跨度（在这种情况下，该特征的标签将使用CLS索引）。
            if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char):
                tokenized_examples["start_positions"].append(cls_index)
                tokenized_examples["end_positions"].append(cls_index)
            else:
                # 否则，将token_start_index和token_end_index移到答案的两端。
                # 注意：如果答案是最后一个单词（边缘情况），我们可以在最后一个偏移之后继续。
                while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
                    token_start_index += 1
                tokenized_examples["start_positions"].append(token_start_index - 1)
                while offsets[token_end_index][1] >= end_char:
                    token_end_index -= 1
                tokenized_examples["end_positions"].append(token_end_index + 1)

    return tokenized_examples

tokenized_datasets = datasets.map(prepare_train_features,
                                  batched=True,
                                  remove_columns=datasets["train"].column_names)



Map:   0%|          | 0/10570 [00:00<?, ? examples/s]

## 微调模型

In [6]:
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer

model = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint)

# 训练超参数 （TrainingArguments）
batch_size=64
model_dir = f"models/{model_checkpoint}-finetuned-squad"

args = TrainingArguments(
    output_dir=model_dir,
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=3,
    weight_decay=0.01,
)
# Data collator(数据整理器)
from transformers import default_data_collator

data_collator = default_data_collator
# 实例化训练器(Trainer)
trainer = Trainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)


Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
trainer.train()

Epoch,Training Loss,Validation Loss
1,1.5312,1.273894
2,1.1486,1.182421
3,1.003,1.172099


TrainOutput(global_step=4152, training_loss=1.3412248688626152, metrics={'train_runtime': 3537.5901, 'train_samples_per_second': 75.071, 'train_steps_per_second': 1.174, 'total_flos': 2.602335381127373e+16, 'train_loss': 1.3412248688626152, 'epoch': 3.0})

## 训练完成后，第一时间保存模型权重文件。

In [8]:
model_to_save = trainer.save_model(model_dir)

## 加载本地模型

In [27]:
# 加载模型和tokenizer
trained_model = AutoModelForQuestionAnswering.from_pretrained(model_dir)
tokenizer = AutoTokenizer.from_pretrained(model_dir)
print(f"模型加载完成: {model_dir}")
print(f"Tokenizer加载完成")
print()

模型加载完成: models/distilbert-base-uncased-finetuned-squad
Tokenizer加载完成



##  模型评估 

In [28]:
import numpy as np
from tqdm.auto import tqdm
import collections

def prepare_validation_features(examples):
    """准备验证集特征，与训练集类似但不需要答案标签"""
    # 清理问题文本
    examples["question"] = [q.lstrip() for q in examples["question"]]

    # Tokenize
    tokenized_examples = tokenizer(
        examples["question" if pad_on_right else "context"],
        examples["context" if pad_on_right else "question"],
        truncation="only_second" if pad_on_right else "only_first",
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # 保存样本映射关系
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    
    # 为每个特征添加example_id
    tokenized_examples["example_id"] = []
    
    # 处理offset_mapping - 修复sequence_ids作用域问题
    for i in range(len(tokenized_examples["input_ids"])):
        # 获取当前特征的sequence_ids
        sequence_ids = tokenized_examples.sequence_ids(i)
        context_index = 1 if pad_on_right else 0
        sample_index = sample_mapping[i]
        
        # 添加example_id
        tokenized_examples["example_id"].append(examples["id"][sample_index])
        
        # 设置不属于context的offset为None
        tokenized_examples["offset_mapping"][i] = [
            (o if sequence_ids[k] == context_index else None)
            for k, o in enumerate(tokenized_examples["offset_mapping"][i])
        ]

    return tokenized_examples


def postprocess_qa_predictions(examples, features, predictions, version_2_with_negative=False, n_best_size=20, max_answer_length=30):
    """后处理QA预测结果"""
    
    assert len(predictions) == 2, "`predictions` should be a tuple with two elements (start_logits, end_logits)."
    all_start_logits, all_end_logits = predictions

    # 构建从example到feature的映射
    example_id_to_index = {k: i for i, k in enumerate(examples["id"])}
    features_per_example = collections.defaultdict(list)
    for i, feature in enumerate(features):
        features_per_example[example_id_to_index[feature["example_id"]]].append(i)

    # 我们需要填充的字典
    predictions = collections.OrderedDict()

    # 日志记录
    print(f"正在后处理 {len(examples)} 个示例的预测，这些预测分散在 {len(features)} 个特征中。")

    # 遍历所有示例
    for example_index, example in enumerate(tqdm(examples)):
        # 这些是与当前示例关联的特征的索引
        feature_indices = features_per_example[example_index]

        min_null_score = None # 仅在squad_v2为True时使用
        valid_answers = []
        
        context = example["context"]
        # 遍历与当前示例关联的所有特征
        for feature_index in feature_indices:
            # 我们获取模型对这个特征的预测
            start_logits = all_start_logits[feature_index]
            end_logits = all_end_logits[feature_index]
            # 这是将token映射到原始上下文中的字符位置的方法
            offset_mapping = features[feature_index]["offset_mapping"]

            # 更新最小空答案分数
            cls_index = features[feature_index]["input_ids"].index(tokenizer.cls_token_id)
            feature_null_score = start_logits[cls_index] + end_logits[cls_index]
            if min_null_score is None or min_null_score < feature_null_score:
                min_null_score = feature_null_score

            # 浏览所有的最佳开始和结束logits，为 `n_best_size` 个最佳选择
            start_indexes = np.argsort(start_logits)[-1 : -n_best_size - 1 : -1].tolist()
            end_indexes = np.argsort(end_logits)[-1 : -n_best_size - 1 : -1].tolist()
            for start_index in start_indexes:
                for end_index in end_indexes:
                    # 不考虑超出范围的答案，原因是索引超出范围或对应于输入ID的部分不在上下文中
                    if (
                        start_index >= len(offset_mapping)
                        or end_index >= len(offset_mapping)
                        or offset_mapping[start_index] is None
                        or offset_mapping[end_index] is None
                    ):
                        continue
                    # 不考虑长度小于0或大于max_answer_length的答案
                    if end_index < start_index or end_index - start_index + 1 > max_answer_length:
                        continue

                    start_char = offset_mapping[start_index][0]
                    end_char = offset_mapping[end_index][1]
                    valid_answers.append(
                        {
                            "score": start_logits[start_index] + end_logits[end_index],
                            "text": context[start_char: end_char]
                        }
                    )
        
        if len(valid_answers) > 0:
            best_answer = sorted(valid_answers, key=lambda x: x["score"], reverse=True)[0]
        else:
            # 在极少数情况下我们没有一个非空预测，我们创建一个假预测以避免失败
            best_answer = {"text": "", "score": 0.0}
        
        # 选择我们的最终答案：最佳答案或空答案（仅适用于squad_v2）
        if not squad_v2:
            predictions[example["id"]] = best_answer["text"]
        else:
            answer = best_answer["text"] if best_answer["score"] > min_null_score else ""
            predictions[example["id"]] = answer

    return predictions

In [29]:
# ## 执行模型评估

# 1. 准备验证集
print("准备验证集特征...")
validation_features = datasets["validation"].map(
    prepare_validation_features,
    batched=True,
    remove_columns=datasets["validation"].column_names
)

# 2. 使用训练好的模型进行预测
print("开始预测...")
raw_predictions = trainer.predict(validation_features)

# 3. 后处理预测结果
print("后处理预测结果...")
final_predictions = postprocess_qa_predictions(
    datasets["validation"], 
    validation_features, 
    raw_predictions.predictions
)

# 4. 计算评估指标
print("计算评估指标...")
from datasets import load_metric
metric = load_metric("squad_v2" if squad_v2 else "squad")

# 格式化预测和标签
formatted_predictions = [{"id": k, "prediction_text": v} for k, v in final_predictions.items()]
references = [{"id": ex["id"], "answers": ex["answers"]} for ex in datasets["validation"]]

# 计算最终评估结果
eval_results = metric.compute(predictions=formatted_predictions, references=references)

print("评估结果:")
print("=" * 40)
for key, value in eval_results.items():
    print(f"{key}: {value:.4f}")

准备验证集特征...
开始预测...


后处理预测结果...
正在后处理 10570 个示例的预测，这些预测分散在 10784 个特征中。


  0%|          | 0/10570 [00:00<?, ?it/s]

计算评估指标...


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


评估结果:
exact_match: 74.7682
f1: 83.4873


In [30]:
# ## 模型推理测试
import torch

def test_qa_model(model, tokenizer, question, context):
    """测试QA模型的推理能力"""
    
    # 准备输入
    inputs = tokenizer(
        question,
        context,
        return_tensors="pt",
        max_length=384,
        truncation=True,
        padding=True
    )
    
    # 模型推理
    with torch.no_grad():
        outputs = model(**inputs)
    
    # 获取答案位置
    start_scores = outputs.start_logits
    end_scores = outputs.end_logits
    
    start_index = torch.argmax(start_scores)
    end_index = torch.argmax(end_scores)
    
    # 提取答案
    input_ids = inputs["input_ids"][0]
    answer_tokens = input_ids[start_index:end_index+1]
    answer = tokenizer.decode(answer_tokens, skip_special_tokens=True)
    
    # 计算置信度
    start_score = start_scores[0][start_index].item()
    end_score = end_scores[0][end_index].item()
    confidence = start_score + end_score
    
    return {
        "answer": answer,
        "confidence": confidence,
        "start_index": start_index.item(),
        "end_index": end_index.item()
    }

# 测试示例
print(" 模型推理测试:")
print("=" * 30)

test_cases = [
    {
        "question": "What is the capital of France?",
        "context": "France is a country in Europe. The capital of France is Paris, which is also the largest city in the country."
    },
    {
        "question": "When was the company founded?",
        "context": "Apple Inc. was founded in 1976 by Steve Jobs, Steve Wozniak, and Ronald Wayne. The company is headquartered in Cupertino, California."
    }
]

for i, test_case in enumerate(test_cases, 1):
    print(f"\n测试 {i}:")
    print(f"问题: {test_case['question']}")
    print(f"上下文: {test_case['context'][:100]}...")
    
    result = test_qa_model(trained_model, tokenizer, test_case['question'], test_case['context'])
    
    print(f"答案: '{result['answer']}'")
    print(f"置信度: {result['confidence']:.4f}")

 模型推理测试:

测试 1:
问题: What is the capital of France?
上下文: France is a country in Europe. The capital of France is Paris, which is also the largest city in the...
答案: 'paris'
置信度: 13.1589

测试 2:
问题: When was the company founded?
上下文: Apple Inc. was founded in 1976 by Steve Jobs, Steve Wozniak, and Ronald Wayne. The company is headqu...
答案: '1976'
置信度: 12.3387


## 调整训练参数

In [31]:
# ## 策略1：F1优化训练 - 调整训练超参数

optimized_args_stable = TrainingArguments(
    output_dir=f"{model_dir}-f1-optimized-stable",
    
    # === 评估方式 ===
    evaluation_strategy="epoch",  
    
    # === 优化的超参数 ===
    learning_rate=1.5e-5,  # 从2e-5降低到1.5e-5，更稳定
    per_device_train_batch_size=16,  # 从32降低到16，更稳定
    per_device_eval_batch_size=32,   # 评估时可以更大
    num_train_epochs=2,  # 额外训练2个epoch
    
    # === 增强正则化 ===
    weight_decay=0.015,  # 从0.01增加到0.015
    
    # === 添加优化技术 ===
    warmup_ratio=0.1,  # 添加warmup
    lr_scheduler_type="cosine",  # 添加学习率调度
    
    # === 性能优化 ===
    fp16=True,  # 混合精度训练
    gradient_accumulation_steps=2,  # 梯度累积
    max_grad_norm=1.0,  # 梯度裁剪
    
    # === 保存策略 ===
    save_strategy="epoch",  # 每epoch保存
    save_total_limit=2,  # 只保留最好的2个
    
    # === 其他设置 ===
    logging_steps=100,
    seed=42,
    report_to=[],
    remove_unused_columns=True,
)

In [32]:
stable_trainer = Trainer(
    model=trained_model,  # 使用已训练的模型继续训练
    args=optimized_args_stable,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,  # 使用原始的data_collator
    tokenizer=tokenizer,
)


In [33]:
# 开始训练
try:
    print("开始训练...")
    stable_train_result = stable_trainer.train()
    
    print("\nF1优化训练完成！")
    
    # 显示训练指标
    if hasattr(stable_train_result, 'metrics'):
        print("训练结果:")
        for key, value in stable_train_result.metrics.items():
            if isinstance(value, float):
                print(f"• {key}: {value:.4f}")
            else:
                print(f"• {key}: {value}")
    
    # 保存模型
    print(f"\n保存优化模型...")
    stable_trainer.save_model()
    print(f"模型已保存到: {optimized_args_stable.output_dir}")
    
    print("\n训练成功完成！")
    print("接下来评估优化效果")
    
except Exception as e:
    print(f"训练失败: {e}")
    print("\n🔧 如果出现内存问题，可以尝试:")
    print("• 减小per_device_train_batch_size到8")
    print("• 移除fp16=True")
    print("• 减小gradient_accumulation_steps到1")

开始训练...


Epoch,Training Loss,Validation Loss
0,0.9794,1.157104
1,0.829,1.165746


Checkpoint destination directory models/distilbert-base-uncased-finetuned-squad-f1-optimized-stable/checkpoint-2766 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory models/distilbert-base-uncased-finetuned-squad-f1-optimized-stable/checkpoint-5532 already exists and is non-empty.Saving will proceed but saved results may be invalid.



F1优化训练完成！
训练结果:
• train_runtime: 1205.5841
• train_samples_per_second: 146.8570
• train_steps_per_second: 4.5890
• total_flos: 17346158825748480.0000
• train_loss: 0.8929
• epoch: 2.0000

保存优化模型...
模型已保存到: models/distilbert-base-uncased-finetuned-squad-f1-optimized-stable

训练成功完成！
接下来评估优化效果


In [34]:
try:
    # === 步骤1：准备验证集特征  ===
    print("步骤1: 准备验证集特征...")
    # 使用原始的prepare_validation_features函数
    stable_validation_features = datasets["validation"].map(
        prepare_validation_features,
        batched=True,
        remove_columns=datasets["validation"].column_names
    )
    print("验证集特征准备完成")
    
    # === 步骤2：使用优化模型进行预测  ===
    print("步骤2: 使用优化模型进行预测...")
    stable_raw_predictions = stable_trainer.predict(stable_validation_features)
    print("模型预测完成")
    
    # === 步骤3：后处理预测结果  ===
    print("步骤3: 后处理预测结果...")
    stable_final_predictions = postprocess_qa_predictions(
        datasets["validation"], 
        stable_validation_features, 
        stable_raw_predictions.predictions
        # 使用原始的默认参数，不添加额外优化
    )
    print("预测结果后处理完成")
    
    # === 步骤4：计算评估指标  ===
    print("步骤4: 计算评估指标...")
    from datasets import load_metric
    metric = load_metric("squad_v2" if squad_v2 else "squad")
    
    # 格式化预测和标签
    formatted_predictions = [{"id": k, "prediction_text": v} for k, v in stable_final_predictions.items()]
    references = [{"id": ex["id"], "answers": ex["answers"]} for ex in datasets["validation"]]
    
    # 计算最终评估结果 
    stable_eval_results = metric.compute(predictions=formatted_predictions, references=references)
    print("评估指标计算完成")
    
    # === 显示对比结果 ===
    print("\n使用原始评估方式的结果对比:")
    print("=" * 70)
    print("第一次微调结果 (原始评估方式):")
    print("   • F1 Score: 83.4873")
    print("   • Exact Match: 74.7682")
    print("   • 训练配置: 3 epochs, lr=2e-5, batch=32")
    print()
    print("优化结果 (相同评估方式):")
    for key, value in stable_eval_results.items():
        print(f"   • {key}: {value:.4f}")
    print("   • 训练配置: +2 epochs, lr=1.5e-5, batch=16, 优化超参数")
    print()
    
    # 计算提升 (使用原始评估结果作为基准)
    f1_improvement = stable_eval_results['f1'] - 83.4873
    em_improvement = stable_eval_results['exact_match'] - 74.7682
    
    print("性能提升 (基于原始评估方式):")
    print(f"   • F1 Score 提升: {f1_improvement:+.4f} 个百分点")
    print(f"   • Exact Match 提升: {em_improvement:+.4f} 个百分点")
    
    # 相对提升百分比
    f1_relative = (f1_improvement / 83.4873) * 100
    em_relative = (em_improvement / 74.7682) * 100
    print(f"   • F1 相对提升: {f1_relative:+.2f}%")
    print(f"   • EM 相对提升: {em_relative:+.2f}%")
   
except Exception as e:
    print(f"评估失败: {e}")
    print("请确保稳定版训练已成功完成")
    import traceback
    traceback.print_exc()

步骤1: 准备验证集特征...
验证集特征准备完成
步骤2: 使用优化模型进行预测...


模型预测完成
步骤3: 后处理预测结果...
正在后处理 10570 个示例的预测，这些预测分散在 10784 个特征中。


  0%|          | 0/10570 [00:00<?, ?it/s]

预测结果后处理完成
步骤4: 计算评估指标...


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


评估指标计算完成

使用原始评估方式的结果对比:
第一次微调结果 (原始评估方式):
   • F1 Score: 83.4873
   • Exact Match: 74.7682
   • 训练配置: 3 epochs, lr=2e-5, batch=32

优化结果 (相同评估方式):
   • exact_match: 76.0170
   • f1: 84.4804
   • 训练配置: +2 epochs, lr=1.5e-5, batch=16, 优化超参数

性能提升 (基于原始评估方式):
   • F1 Score 提升: +0.9931 个百分点
   • Exact Match 提升: +1.2488 个百分点
   • F1 相对提升: +1.19%
   • EM 相对提升: +1.67%
