# 因果语言模型训练实例

## Step1 导入相关包

In [2]:
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, DataCollatorForLanguageModeling, TrainingArguments, Trainer, BloomForCausalLM

## Step2 加载数据集

In [3]:
# Read the raw corpus: one training sample per line of the text file.
# The encoding is given explicitly because the file holds Chinese text; relying
# on the platform default codec (e.g. cp936/cp1252 on Windows) would either
# raise UnicodeDecodeError or silently produce mojibake.
with open('短篇章和资治通鉴翻译.txt', 'r', encoding='utf-8') as f:
    lines = f.readlines()
    # Strip the line terminator so each sample is the bare sentence.
    contents = [{'content': line.rstrip("\n")} for line in lines]
# Keep only the first 100000 lines to bound training time.
da = Dataset.from_list(contents[:100000])
da

Dataset({
    features: ['content'],
    num_rows: 100000
})

In [4]:
# Peek at the first raw sample to confirm the 'content' column holds one sentence.
da[0]

{'content': '季路说：请问死是怎么回事？'}

## Step3 数据集处理

In [5]:
# Load the tokenizer that ships with the bigscience/bloom-560m checkpoint.
tokenizer = AutoTokenizer.from_pretrained("bigscience/bloom-560m")

def process_func(examples):
    """Tokenize one batch of raw text for causal language-model training.

    Args:
        examples: A batch mapping whose "content" entry is a list of strings.

    Returns:
        The tokenizer output for the batch, where every string had the
        tokenizer's EOS token appended before encoding and each encoded
        sequence is truncated to at most 128 tokens.
    """
    eos = tokenizer.eos_token
    texts = []
    for text in examples["content"]:
        # Mark the end of each sample so the model can learn where to stop.
        texts.append(text + eos)
    # NOTE(review): truncation is applied after EOS is appended, so lines longer
    # than 128 tokens presumably lose their EOS — confirm this is intended.
    return tokenizer(texts, truncation=True, max_length=128)
# Tokenize the whole dataset in batches. The original 'content' column is
# removed so that only the tokenizer outputs remain as features.
tokenized_da = da.map(process_func, batched=True, remove_columns=da.column_names)
tokenized_da

Map:   0%|          | 0/100000 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 100000
})

In [6]:
# Inspect the first tokenized sample (token ids plus attention mask).
tokenized_da[0]

{'input_ids': [8357, 2758, 147012, 161158, 4549, 644, 99225, 2498, 2],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [7]:
from torch.utils.data import DataLoader

# Small throwaway loader used only to inspect what the collator produces.
# With mlm=False the collator is used in causal-LM mode rather than masked-LM mode.
dl = DataLoader(
    dataset=tokenized_da,
    batch_size=2,
    collate_fn=DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False),
)

In [8]:
# Pull the first (index, batch) pair to check padding and the generated labels.
next(enumerate(dl))

You're using a BloomTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


(0,
 {'input_ids': tensor([[     3,      3,      3,      3,      3,   8357,   2758, 147012, 161158,
            4549,    644,  99225,   2498,      2],
         [ 12142,  41872,   3872,  19150,  30704,  11812,   7204,    355,  18298,
          102267, 138047,  15361,    420,      2]]), 'attention_mask': tensor([[0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1],
         [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]), 'labels': tensor([[  -100,   -100,   -100,   -100,   -100,   8357,   2758, 147012, 161158,
            4549,    644,  99225,   2498,      2],
         [ 12142,  41872,   3872,  19150,  30704,  11812,   7204,    355,  18298,
          102267, 138047,  15361,    420,      2]])})

In [9]:
# Show the EOS token string together with its id.
tokenizer.eos_token, tokenizer.eos_token_id

('</s>', 2)

## Step4 创建模型

In [10]:
# Load the pretrained bigscience/bloom-560m checkpoint as a causal language model.
model = AutoModelForCausalLM.from_pretrained("bigscience/bloom-560m")

## Step5 配置训练参数

In [11]:
# Training configuration. Every option not listed here keeps its library default.
args = TrainingArguments(
    # Where checkpoints and logs are written.
    output_dir="./causal_lm",
    # A single pass over the training set.
    num_train_epochs=1,
    # 32 samples per device x 8 accumulation steps = 256 samples
    # per optimizer step on each device.
    per_device_train_batch_size=32,
    gradient_accumulation_steps=8,
    # Report the training loss every 50 optimizer steps.
    logging_steps=50,
)

## Step6 创建训练器

In [12]:
# Wire the model, the tokenized data, and the causal-LM collator into a Trainer.
# No evaluation dataset is supplied, so only the training loss is reported.
trainer = Trainer(
    model=model,
    args=args,
    data_collator=DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False),
    train_dataset=tokenized_da,
)

## Step7 模型训练

In [13]:
# Run fine-tuning with the trainer built in the previous step; the returned
# TrainOutput is displayed as the cell result.
trainer.train()



Step,Training Loss
50,4.6062
100,4.2574
150,4.0992
200,3.9988
250,3.928
300,3.8771
350,3.8256


TrainOutput(global_step=390, training_loss=4.05466813307542, metrics={'train_runtime': 1703.0429, 'train_samples_per_second': 58.718, 'train_steps_per_second': 0.229, 'total_flos': 1.2699893900771328e+16, 'train_loss': 4.05466813307542, 'epoch': 1.0})

## Step8 模型推理

In [14]:
from transformers import pipeline

# Build a text-generation pipeline around the fine-tuned model.
# Reuse the device the model already lives on instead of hard-coding GPU
# ordinal 0, so this cell also runs on machines without CUDA device 0.
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, device=model.device)

In [38]:
# Reference text (the full saying): "When the cunning hares are dead, the hounds
# are cooked; when the birds are gone, the good bow is put away; when the enemy
# state falls, the advisers perish."
# The prompt stops just before the last word of the fourth clause.
# Sampling is enabled, so the continuation may differ from run to run.
pipe("狡兔死，走狗烹；飞鸟尽，良弓", max_length=128, do_sample=True)

[{'generated_text': '狡兔死，走狗烹；飞鸟尽，良弓死。'}]

In [60]:
# Ask the model to continue a short classical maxim (sampling is enabled, so
# the continuation may differ from run to run).
pipe("勿以善小而不为，", max_length=128, do_sample=True)

[{'generated_text': '勿以善小而不为，虽小又如小。'}]

In [66]:
# Longer narrative prompt; max_length is raised to 256 to leave room for the
# continuation. Sampling is enabled, so the output may differ from run to run.
pipe("司马光和一群小孩子在庭院里面玩，一个小孩站在大缸上面，失足跌落缸中被水淹没,司马光立马", max_length=256, do_sample=True)

[{'generated_text': '司马光和一群小孩子在庭院里面玩，一个小孩站在大缸上面，失足跌落缸中被水淹没,司马光立马站起，向客人道歉告辞，说：如果看见水池里有鱼儿，那我就不会哭出来。'}]