In [1]:
from datasets import load_dataset

ds = load_dataset("mikasenghaas/wikitext-2")


Using the latest cached version of the dataset since mikasenghaas/wikitext-2 couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'default' at /home/nas511/guandewei/.cache/huggingface/datasets/mikasenghaas___wikitext-2/default/0.0.0/f7836bd4080d244e6507fcf70604c740d73a230a (last modified on Sun May 11 17:42:42 2025).


In [2]:
# 清洗数据：删除空行和标题行（以 " = " 开头）
# def clean_wikitext(examples):
#     cleaned_texts = []
#     for text in examples["text"]:
#         if text.strip() and not text.startswith(" = "):
#             cleaned_texts.append(text)
#     return {"text": cleaned_texts}
import re
import ftfy  # 用于修复常见的Unicode问题
def clean_wikitext(examples):
    cleaned_texts = []
    
    for text in examples["text"]:
        # 1. 基础清洗 删除空行
        text = text.strip()
        if not text:
            continue
            
        # 2. 修复Unicode字符问题（使用ftfy库）
        text = ftfy.fix_text(text)
        
        # 3. 过滤维基百科结构化内容
        # 匹配标题行（包含不同层级的标题 = Title == Subtitle === Section ====）
        if re.match(r'^\s*=+\s.*\s=+\s*$', text):
            continue
        # 过滤列表项、导航模板
        if text.startswith(('* ', '# ', '{{', '}}', '|-')):
            continue
        # 过滤文件链接和分类标记
        if re.search(r'\[\[(File|Category):', text):
            continue
            
        # 4. 清理维基标记语法
        # 移除内部链接标记（保留链接文字）
        text = re.sub(r'\[\[([^\]|]+)\|?([^\]]+)?\]\]', lambda m: m.group(2) or m.group(1), text)
        # 移除模板
        text = re.sub(r'\{\{.*?\}\}', '', text)
        # 移除引用标记
        text = re.sub(r'<ref.*?</ref>', '', text, flags=re.DOTALL)
        
        # 5. 文本规范化
        # 合并多个换行/空格
        text = re.sub(r'\s+', ' ', text)
        # 移除特殊字符（保留常见标点）
        text = re.sub(r'[^\w\s.,!?\'"-—–@$%&*+/:;()]', '', text)
        # 标准化引号
        text = text.replace('“', '"').replace('”', '"').replace("‘", "'").replace("’", "'")
        
        # 6. 内容质量过滤
        # 过滤过短/无意义的句子
        if len(text) < 25:
            continue
        # 过滤纯数字内容
        if re.fullmatch(r'\d+[\d,\.%\s]*', text):
            continue
        
        cleaned_texts.append(text)
    
    return {"text": cleaned_texts}

In [3]:
# 应用清洗
train_data = ds["train"].map(clean_wikitext, batched=True)
val_data = ds["validation"].map(clean_wikitext, batched=True)

In [4]:
from transformers import AutoTokenizer
import pandas as pd

In [5]:
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-1.7B", trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token

In [6]:
# 分词与格式化（输入为前 256 tokens，目标为后 256 tokens）
def tokenize_function(examples):
    tokenized = tokenizer(
        examples["text"],
        truncation=True,
        max_length=512,
        return_tensors="pt",
        padding="max_length"
    )
    # 分割输入和目标（前 256 tokens 为输入，后 256 tokens 为目标）
    inputs = {k: v[:, :256] for k, v in tokenized.items()}
    labels = {k: v[:, 256:] for k, v in tokenized.items()}
    return {"input_ids": inputs["input_ids"], "attention_mask": inputs["attention_mask"], "labels": labels["input_ids"]}

In [7]:
# 应用分词
tokenized_train = train_data.map(tokenize_function, batched=True, remove_columns=["text"])
tokenized_val = val_data.map(tokenize_function, batched=True, remove_columns=["text"])

Map:   0%|          | 0/16958 [00:00<?, ? examples/s]

Map:   0%|          | 0/1792 [00:00<?, ? examples/s]

In [9]:
tokenized_train


Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 16958
})

In [51]:
from transformers import AutoModelForCausalLM, TrainingArguments, Trainer

local_model_path = "./Qwen3-1.7B"
model = AutoModelForCausalLM.from_pretrained(
    local_model_path,
    trust_remote_code=True,
    device_map="balanced_low_0"
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [54]:
print("start")
epochs = 10
lr = 5e-6
batch_size=8
training_args = TrainingArguments(
    output_dir=f"./qwen3-wikitext-epoch{epochs}",
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=epochs,
    logging_steps=50,
    learning_rate=lr,
    fp16=True,
    gradient_accumulation_steps=2,  # 减少显存压力
    weight_decay=0.01,  # 添加权重衰减
    metric_for_best_model="eval_loss",  # 根据验证损失选择最佳模型
)

trainer = Trainer(         
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
)
trainer.train()

# 保存模型
model.save_pretrained(f"./qwen3-wikitext-sft-epoch-{epochs}-lr-{lr}")

start


  trainer = Trainer(


Step,Training Loss
50,6.8414
100,0.3438


KeyboardInterrupt: 

In [4]:
from IPython.display import display, HTML
display(HTML("<script>Jupyter.notebook.config.update({'ServerApp': {'iopub_data_rate_limit': 100000000}})</script>"))

In [3]:
from lm_eval import evaluator
from lm_eval.models.huggingface import HFLM 
from transformers import AutoModelForCausalLM, AutoTokenizer

# 定义评估任务（使用 wikitext 的困惑度）
tasks = ["wikitext"]

# 评估函数
def evaluate_model(model_path):
    model = AutoModelForCausalLM.from_pretrained(model_path, device_map="auto", trust_remote_code=True)
    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
    tokenizer.pad_token = tokenizer.eos_token

    # 包装为 lm_eval 兼容的模型
    lm = HFLM(
        pretrained=model,  # 直接传入已加载的模型
        tokenizer=tokenizer,
        device="cuda"      # 指定设备
    )
    
    results = evaluator.simple_evaluate(
        model=lm,
        tasks=tasks,
        batch_size=2,
        device="cuda",
    )
    #print(f"results: {results}")
    #return results["results"]["wikitext"]["perplexity"]
    return results
# # 微调前评估
base_rst = evaluate_model("./Qwen3-1.7B")
print(f"微调前困惑度: {base_rst}")
# # 微调后评估 
#finetuned_rst = evaluate_model("./qwen3-wikitext-sft")
#print(f"微调后困惑度: {finetuned_rst}")

# # 微调前评估
# base_ppl = evaluate_model("./Qwen3-1.7B")
# # 微调后评估
# finetuned_ppl = evaluate_model("./qwen3-wikitext-sft")

# print(f"微调前困惑度: {base_ppl:.2f}")
# print(f"微调后困惑度: {finetuned_ppl:.2f}")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

`pretrained` model kwarg is not of type `str`. Many other model arguments may be ignored. Please do not launch via accelerate or use `parallelize=True` if passing an existing model this way.
Passed an already-initialized model through `pretrained`, assuming single-process call to evaluate() or custom distributed integration
[Task: wikitext] metric word_perplexity is defined, but aggregation is not. using default aggregation=weighted_perplexity
[Task: wikitext] metric word_perplexity is defined, but higher_is_better is not. using default higher_is_better=False
[Task: wikitext] metric byte_perplexity is defined, but aggregation is not. using default aggregation=weighted_perplexity
[Task: wikitext] metric byte_perplexity is defined, but higher_is_better is not. using default higher_is_better=False
[Task: wikitext] metric bits_per_byte is defined, but aggregation is not. using default aggregation=bits_per_byte
[Task: wikitext] metric bits_per_byte is defined, but higher_is_better is not. u

In [6]:
finetuned_rst = evaluate_model("./qwen3-wikitext-sft")
print(f"微调后困惑度: {finetuned_rst}")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

`pretrained` model kwarg is not of type `str`. Many other model arguments may be ignored. Please do not launch via accelerate or use `parallelize=True` if passing an existing model this way.
Passed an already-initialized model through `pretrained`, assuming single-process call to evaluate() or custom distributed integration
[Task: wikitext] metric word_perplexity is defined, but aggregation is not. using default aggregation=weighted_perplexity
[Task: wikitext] metric word_perplexity is defined, but higher_is_better is not. using default higher_is_better=False
[Task: wikitext] metric byte_perplexity is defined, but aggregation is not. using default aggregation=weighted_perplexity
[Task: wikitext] metric byte_perplexity is defined, but higher_is_better is not. using default higher_is_better=False
[Task: wikitext] metric bits_per_byte is defined, but aggregation is not. using default aggregation=bits_per_byte
[Task: wikitext] metric bits_per_byte is defined, but higher_is_better is not. u

OutOfMemoryError: CUDA out of memory. Tried to allocate 8.84 GiB. GPU 1 has a total capacity of 23.64 GiB of which 6.08 GiB is free. Process 493197 has 854.00 MiB memory in use. Including non-PyTorch memory, this process has 16.70 GiB memory in use. Of the allocated memory 13.98 GiB is allocated by PyTorch, and 2.32 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)