# 掩码语言模型训练实例

## Step1 导入相关包

In [None]:
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, AutoModelForMaskedLM, DataCollatorForLanguageModeling, TrainingArguments, Trainer

In [None]:
## Step2 加载数据集

In [None]:
# Read the corpus: every line of the text file becomes one {"content": ...} record.
# The encoding is passed explicitly because open() otherwise falls back to the
# platform locale, which may not be able to decode Chinese text.
# NOTE(review): assumes the file is stored as UTF-8 - confirm.
with open('资治通鉴-古文.txt', 'r', encoding='utf-8') as f:
    # Iterate the file object directly; the newline is stripped from each line.
    contents = [{'content': line.replace("\n", "")} for line in f]
ds = Dataset.from_list(contents)
ds

In [None]:
# Show the first record of the dataset as a sanity check.
ds[0]

## Step3 数据集处理

In [None]:
tokenizer = AutoTokenizer.from_pretrained("bigscience/bloom-560m")

def process_func(examples):
    return tokenizer(examples["content"], max_length=128, truncation=True)
tokenized_ds = ds.map(process_func, batched=True, remove_columns=ds.column_names)
tokenized_ds

In [None]:
from torch.utils.data import DataLoader

dl = DataLoader(tokenized_ds, batch_size=2, collate_fn=DataCollatorForLanguageModeling(tokenizer, mlm=True, mlm_probability=0.15))

In [None]:
dd=next(enumerate(dl))[1]
tokenizer.decode(dd.input_ids[0])

In [None]:
tokenizer.mask_token, tokenizer.mask_token_id

## Step4 创建模型

In [None]:
model = AutoModelForMaskedLM.from_pretrained("bigscience/bloom-560m")

## Step5 配置训练参数

In [None]:
# Training configuration collected in one mapping, then unpacked into
# TrainingArguments: output directory, per-device batch size, logging
# interval (in steps) and number of epochs.
training_config = {
    "output_dir": "./masked_lm",
    "per_device_train_batch_size": 128,
    "logging_steps": 50,
    "num_train_epochs": 1,
}
args = TrainingArguments(**training_config)

## Step6 创建训练器

In [None]:
# Build the masked-LM collator first (mlm_probability=0.15), then hand it to
# the Trainer together with the model, arguments and tokenized dataset.
mlm_collator = DataCollatorForLanguageModeling(tokenizer, mlm=True, mlm_probability=0.15)
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_ds,
    data_collator=mlm_collator,
)

## Step7 模型训练

In [None]:
# Run training with the Trainer built in the previous step.
trainer.train()

## Step8 模型推理

In [None]:
import torch
from transformers import pipeline

# Use the first GPU when CUDA is available and fall back to CPU (-1) otherwise,
# so this cell also runs on machines without a GPU.
device = 0 if torch.cuda.is_available() else -1
pipe = pipeline("fill-mask", model=model, tokenizer=tokenizer, device=device)

In [None]:
# Prompt containing a single [MASK] placeholder for the pipeline to fill in.
masked_text = "撒盐空中差可拟,未若柳絮因[MASK]起"
res = pipe(masked_text)

In [None]:
# Reduce each prediction to (score, token, sequence), removing the spaces
# from the decoded sequence.
res_tuple = [
    (pred['score'], pred['token_str'], pred['sequence'].replace(' ', ''))
    for pred in res
]

In [None]:
# Tuples compare on their first element (the score), so this lists the
# candidates from highest to lowest score.
sorted(res_tuple,reverse=True)