## 掩码任务

In [1]:
from transformers import AutoModelForMaskedLM

# Hub identifier of the pretrained checkpoint; later cells reuse it to load the tokenizer.
model_checkpoint = "distilbert-base-uncased"
# Load the checkpoint together with a masked-language-modeling head.
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)

In [2]:
# Parameter count of the loaded model, expressed in millions.
distilbert_num_parameters = model.num_parameters()/1_000_000
print(f"'>>> DistilBERT number of parameters: {round(distilbert_num_parameters)}M'")
# The BERT figure is a hard-coded reference value; it is not computed in this notebook.
print(f"'>>> BERT number of parameters: 110M'")

'>>> DistilBERT number of parameters: 67M'
'>>> BERT number of parameters: 110M'


In [3]:
# Example sentence containing a [MASK] placeholder for the model to fill in.
text = "This is a great [MASK]."
from transformers import AutoTokenizer

# Tokenizer loaded from the same checkpoint as the model.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [4]:
import torch

# Encode the example sentence as PyTorch tensors.
inputs = tokenizer(text, return_tensors='pt')
# Pure inference: disable autograd so no computation graph is built or kept alive.
with torch.no_grad():
    token_logits = model(**inputs).logits
# torch.where returns (row indices, column indices); the columns are the
# sequence positions at which the [MASK] token occurs.
mask_token_index = torch.where(inputs['input_ids'] == tokenizer.mask_token_id)[1]
# Logits of the first sequence at the masked position(s): one row per mask.
mask_token_logits = token_logits[0, mask_token_index]
# topk picks the 5 largest logits along the last axis; .indices are their
# column positions, which are used as token ids below. Only the first mask is used.
top_5_tokens = torch.topk(mask_token_logits, 5, dim=1).indices[0].tolist()
print(mask_token_logits.shape)
print(top_5_tokens)
for token in top_5_tokens:
    # Substitute each candidate token for the mask placeholder in the original text.
    print(f"'>>> {text.replace(tokenizer.mask_token, tokenizer.decode([token]))}'")

torch.Size([1, 30522])
[3066, 3112, 6172, 2801, 8658]
'>>> This is a great deal.'
'>>> This is a great success.'
'>>> This is a great adventure.'
'>>> This is a great idea.'
'>>> This is a great feat.'


In [7]:
# Download the IMDB movie review dataset
from datasets import load_dataset
imdb_dataset = load_dataset("imdb")
# Display the splits and their sizes.
imdb_dataset

train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

unsupervised-00000-of-00001.parquet:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

In [8]:
# Shuffle the training split with a fixed seed and keep the first three rows.
sample = imdb_dataset['train'].shuffle(seed=42).select(range(3))
# Label 0 is a negative review, 1 is a positive review
for row in sample:
    print(f"\n'>>> Review: {row['text']}'")
    print(f"'>>> Label: {row['label']}'")


'>>> Review: There is no relation at all between Fortier and Profiler but the fact that both are police series about violent crimes. Profiler looks crispy, Fortier looks classic. Profiler plots are quite simple. Fortier's plot are far more complicated... Fortier looks more like Prime Suspect, if we have to spot similarities... The main character is weak and weirdo, but have "clairvoyance". People like to compare, to judge, to evaluate. How about just enjoying? Funny thing too, people writing Fortier looks American but, on the other hand, arguing they prefer American series (!!!). Maybe it's the language, or the spirit, but I think this series is more English than American. By the way, the actors are really good and funny. The acting is not superficial at all...'
'>>> Label: 1'

'>>> Review: This movie is a great. The plot is very true to the book which is a classic written by Mark Twain. The movie starts of with a scene where Hank sings a song with a bunch of kids called "when you stu

In [12]:
# Inspect how the integer labels map to class names.
imdb_dataset['train'].features['label']

ClassLabel(names=['neg', 'pos'], id=None)

In [13]:
def tokenize_function(examples):
    """Tokenize a batch of reviews and, when possible, attach word indices.

    Args:
        examples: A batch mapping whose "text" entry holds the raw reviews.

    Returns:
        The tokenizer output for the batch. When the tokenizer is a fast
        tokenizer, a "word_ids" entry is added holding, for every example,
        the index of the source word each token belongs to.
    """
    encoded = tokenizer(examples["text"])
    if not tokenizer.is_fast:
        # Slow tokenizers do not expose word_ids(); return the encoding as is.
        return encoded
    example_count = len(encoded["input_ids"])
    # word_ids(i) maps every token of example i to its word index in the original text.
    encoded["word_ids"] = list(map(encoded.word_ids, range(example_count)))
    return encoded


# Use batched=True to activate fast multithreading!
# The raw "text" and "label" columns are dropped from the mapped result.
tokenized_datasets = imdb_dataset.map(
    tokenize_function, batched=True, remove_columns=["text", "label"]
)
tokenized_datasets

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (720 > 512). Running this sequence through the model will result in indexing errors


Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 50000
    })
})

In [16]:
# Now that the movie reviews are tokenized, the next step is to concatenate them all and split the result into chunks
tokenizer.model_max_length # maximum supported chunk size (bare expression; not the cell's last line, so it is not displayed)
chunk_size = 128
# Slicing produces a list of lists for each feature
tokenized_samples = tokenized_datasets["train"][:3]

for idx, sample in enumerate(tokenized_samples["input_ids"]):
    print(f"'>>> Review {idx} length: {len(sample)}'")

# We can then concatenate all of these examples with a simple dict comprehension:
concatenated_examples = {
    k: sum(tokenized_samples[k], []) for k in tokenized_samples.keys()
}
total_length = len(concatenated_examples["input_ids"])
print(f"'>>> Concatenated reviews length: {total_length}'")

# Split the concatenated reviews into chunks of size chunk_size
chunks = {
    k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)] 
    for k, t in concatenated_examples.items()
}
for chunk in chunks["input_ids"]:
    print(f"'>>> Chunk length: {len(chunk)}'")
# A final chunk smaller than chunk_size is kept in this demo; group_texts below drops it.
# All of the logic above is wrapped in that function so it can be applied to the tokenized datasets

def group_texts(examples):
    """Concatenate a batch of tokenized examples and re-split it into fixed-size chunks.

    Reads the module-level ``chunk_size``.

    Args:
        examples: A batch mapping each feature name to a list of per-example
            lists (e.g. "input_ids", "attention_mask", "word_ids").

    Returns:
        A dict with the same feature names, each holding a list of chunks of
        exactly ``chunk_size`` items, plus a "labels" entry that is a copy of
        the "input_ids" chunk list. Trailing items that do not fill a whole
        chunk are discarded.
    """
    # Function-scope import keeps this cell runnable on its own.
    from itertools import chain

    # Concatenate all examples per feature. chain.from_iterable is linear in the
    # number of tokens, whereas sum(list_of_lists, []) copies the growing
    # accumulator on every step and is quadratic.
    concatenated_examples = {
        k: list(chain.from_iterable(examples[k])) for k in examples.keys()
    }
    # Length of the concatenated text, measured on the first feature.
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # Drop the remainder so that every chunk is exactly chunk_size long.
    total_length = (total_length // chunk_size) * chunk_size
    # Split each feature into consecutive chunks of chunk_size items.
    result = {
        k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
        for k, t in concatenated_examples.items()
    }
    # Keep an unmasked copy of the inputs as labels: masked language modeling
    # predicts the randomly masked tokens, so the original ids are the answers
    # the model learns from.
    result["labels"] = result["input_ids"].copy()
    return result

'>>> Review 0 length: 363'
'>>> Review 1 length: 304'
'>>> Review 2 length: 133'
'>>> Concatenated reviews length: 800'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 32'


In [17]:
# Apply group_texts batch-wise to every split to obtain fixed-size chunks with labels.
lm_datasets = tokenized_datasets.map(group_texts,batched=True)
lm_datasets

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 61291
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 59904
    })
    unsupervised: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 122957
    })
})

In [24]:
# Index the row first and the column second: dataset[i]["col"] reads a single
# row, whereas dataset["col"][i] can load the entire column before indexing.
# Decoded text of the second chunk.
print(tokenizer.decode(lm_datasets['train'][1]['input_ids']))
# Word indices and token ids of the first chunk.
print(lm_datasets['train'][0]['word_ids'])
print(lm_datasets['train'][0]['input_ids'])

as the vietnam war and race issues in the united states. in between asking politicians and ordinary denizens of stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men. < br / > < br / > what kills me about i am curious - yellow is that 40 years ago, this was considered pornographic. really, the sex and nudity scenes are few and far between, even then it ' s not shot like some cheaply made porno. while my countrymen mind find it shocking, in reality sex and nudity are a major staple in swedish cinema. even ingmar bergman,
[None, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104

In [None]:
# At this point input_ids and labels are identical; [MASK] tokens still have to be inserted at random positions in the inputs
# This needs a special data collator that randomly masks some tokens in every batch of text. Fortunately, 🤗 Transformers ships a dedicated DataCollatorForLanguageModeling for this task. It only needs the tokenizer and an mlm_probability argument (the fraction of tokens to mask). Here mlm_probability is set to 15%, which is the amount BERT uses by default and the most common choice in the literature

from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

In [22]:
# Take the first two training chunks as plain dicts and list their fields.
samples = [lm_datasets["train"][i] for i in range(2)]
samples[0].keys()

dict_keys(['input_ids', 'attention_mask', 'word_ids', 'labels'])

In [23]:

# Run the random-masking collator on the two samples and print the masked text.
for sample in samples:
    _ = sample.pop("word_ids") # remove this field (mutates the sample dicts in place)
for chunk in data_collator(samples)["input_ids"]:
    print(f"\n'>>> {tokenizer.decode(chunk)}'")


'>>> [CLS] [MASK] rented i am curious - yellow from [MASK] video store because of all the controversy that surrounded it when it was first released in [MASK]. i also heard [MASK] at first it was seized by u. s. customs aviation it [MASK] tried to [MASK] this country, therefore being a fan of films considered " controversial " researcher really had to see this [MASK] myself. < br / > < br [MASK] > the plot is centered around a young swedish [MASK] student named [MASK] who wants [MASK] learn everything [MASK] can about life. in particular ん [MASK] [MASK] focus her [MASK]s [MASK] making some [MASK] of documentary on what the average swede [MASK] about [MASK] political issues such'

'>>> as the vietnam war and race issues in the united states. in between asking politicians and ordinary denizens of stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men [MASK] < br / > < br / > what kills me about i am curious - pedro is that 40 [MASK] [M

In [25]:
import collections
import numpy as np

from transformers import default_data_collator

wwm_probability = 0.2


def whole_word_masking_data_collator(features):
    """Collate a batch while masking whole words instead of individual tokens.

    Each feature dict is modified in place: its "word_ids" entry is removed,
    the selected positions in "input_ids" are overwritten with the tokenizer's
    mask token id, and "labels" is replaced by a list that keeps the original
    label only at masked positions and holds -100 everywhere else.

    Args:
        features: List of dicts with "input_ids", "labels" and "word_ids".

    Returns:
        The result of ``default_data_collator`` applied to the modified features.
    """
    for example in features:
        token_word_ids = example.pop("word_ids")

        # Group token positions by word: a new group starts whenever the word
        # id differs from the last non-None word id seen. Tokens whose word id
        # is None belong to no group.
        word_spans = []
        previous_word = None
        for position, wid in enumerate(token_word_ids):
            if wid is None:
                continue
            if wid != previous_word:
                previous_word = wid
                word_spans.append([])
            word_spans[-1].append(position)

        # Draw one Bernoulli(wwm_probability) sample per word to choose the words to mask.
        selected = np.random.binomial(1, wwm_probability, (len(word_spans),))
        token_ids = example["input_ids"]
        original_labels = example["labels"]
        masked_labels = [-100] * len(original_labels)
        for word_index in np.where(selected)[0].tolist():
            for position in word_spans[word_index]:
                # Keep the true token as the label and hide it in the input.
                masked_labels[position] = original_labels[position]
                token_ids[position] = tokenizer.mask_token_id
        example["labels"] = masked_labels

    return default_data_collator(features)

In [26]:
# Re-fetch the first two chunks (they still carry word_ids) and apply whole-word masking.
samples = [lm_datasets["train"][i] for i in range(2)]
batch = whole_word_masking_data_collator(samples)

# Decode each masked sequence to inspect which words were hidden.
for chunk in batch["input_ids"]:
    print(f"\n'>>> {tokenizer.decode(chunk)}'")


'>>> [CLS] [MASK] [MASK] i am curious - yellow [MASK] my [MASK] store because of all [MASK] controversy that surrounded it when it [MASK] first released in 1967. [MASK] also heard [MASK] [MASK] first [MASK] was seized by u [MASK] s. customs if it ever tried to enter this country, therefore [MASK] a fan [MASK] films considered " controversial [MASK] i [MASK] had [MASK] see this for myself. < [MASK] / [MASK] < [MASK] / [MASK] the [MASK] is [MASK] around a young swedish drama student named lena who wants to [MASK] everything [MASK] can about life. in particular she [MASK] to focus [MASK] attentions to making some sort of documentary on what the average swede thought about certain political issues such'

'>>> as the [MASK] [MASK] and race issues in the united states. in between asking politicians and ordinary denizens of stockholm about their opinions [MASK] politics, she [MASK] sex with her [MASK] teacher, [MASK], and [MASK] men [MASK] < [MASK] [MASK] > < br / [MASK] what kills me about 

In [27]:
# Reduce the size of the data
train_size = 10_000
# The test split is 10% of the training size.
test_size = int(0.1 * train_size)

# Draw both subsets from the chunked training split, with a fixed seed.
downsampled_dataset = lm_datasets["train"].train_test_split(
    train_size=train_size, test_size=test_size, seed=42
)
downsampled_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 10000
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 1000
    })
})

In [None]:
from transformers import TrainingArguments,Trainer

batch_size = 64
# Log the training loss about once per epoch
logging_steps = len(downsampled_dataset["train"]) // batch_size
# Last path component of the checkpoint name, used to build the output directory.
model_name = model_checkpoint.split("/")[-1]

# NOTE(review): newer transformers releases appear to rename `evaluation_strategy`
# to `eval_strategy` — confirm against the installed version.
training_args = TrainingArguments(
    output_dir=f'./my_model/{model_name}-fintuned-imdb',
    overwrite_output_dir=True,
    evaluation_strategy='epoch',  # run evaluation on the eval dataset at the end of every epoch
    learning_rate = 2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    push_to_hub=False,
    fp16=True,                  # fp16 means computing with half-precision (16-bit) floats instead of the usual single-precision (32-bit) floats; an optimization mainly used to speed up training and reduce GPU memory usage. NOTE(review): presumably requires a supported accelerator — confirm
    logging_steps=logging_steps, # record training log information (such as the loss) every logging_steps training steps
)

# NOTE(review): the truncated warning captured below this cell suggests an argument
# of Trainer(...) is deprecated (possibly `tokenizer=`) — confirm against the installed version.
trainer = Trainer(
    model = model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=downsampled_dataset['train'],
    eval_dataset=downsampled_dataset['test'],
    tokenizer=tokenizer,
)

  trainer = Trainer(


### 困惑度

在微调之前，先在测试集上评估预训练模型的困惑度，作为对比基线。

In [34]:
import math
# Evaluate on the eval dataset; perplexity is the exponential of the evaluation loss.
eval_results = trainer.evaluate()
print(f">>> Perplexity:{math.exp(eval_results['eval_loss']):.2f}")

  0%|          | 0/16 [00:00<?, ?it/s]

>>> Perplexity:21.94


In [35]:
# Train
trainer.train()

  0%|          | 0/471 [00:00<?, ?it/s]

{'loss': 2.6806, 'grad_norm': 3.7878775596618652, 'learning_rate': 1.3460721868365181e-05, 'epoch': 0.99}


  0%|          | 0/16 [00:00<?, ?it/s]

{'eval_loss': 2.511887311935425, 'eval_model_preparation_time': 0.0011, 'eval_runtime': 1.425, 'eval_samples_per_second': 701.737, 'eval_steps_per_second': 11.228, 'epoch': 1.0}
{'loss': 2.5893, 'grad_norm': 3.616826295852661, 'learning_rate': 6.836518046709129e-06, 'epoch': 1.99}


  0%|          | 0/16 [00:00<?, ?it/s]

{'eval_loss': 2.4500088691711426, 'eval_model_preparation_time': 0.0011, 'eval_runtime': 1.4313, 'eval_samples_per_second': 698.644, 'eval_steps_per_second': 11.178, 'epoch': 2.0}
{'loss': 2.5282, 'grad_norm': 3.6923487186431885, 'learning_rate': 2.1231422505307855e-07, 'epoch': 2.98}


  0%|          | 0/16 [00:00<?, ?it/s]

{'eval_loss': 2.482969045639038, 'eval_model_preparation_time': 0.0011, 'eval_runtime': 1.4482, 'eval_samples_per_second': 690.517, 'eval_steps_per_second': 11.048, 'epoch': 3.0}
{'train_runtime': 116.2255, 'train_samples_per_second': 258.119, 'train_steps_per_second': 4.052, 'train_loss': 2.5988421561611688, 'epoch': 3.0}


TrainOutput(global_step=471, training_loss=2.5988421561611688, metrics={'train_runtime': 116.2255, 'train_samples_per_second': 258.119, 'train_steps_per_second': 4.052, 'total_flos': 994208670720000.0, 'train_loss': 2.5988421561611688, 'epoch': 3.0})

In [36]:
# Re-evaluate after training; perplexity is again exp(eval_loss).
eval_results = trainer.evaluate()
print(f">>> Perplexity:{math.exp(eval_results['eval_loss']):.2f}")
# The perplexity is lower than before training

  0%|          | 0/16 [00:00<?, ?it/s]

>>> Perplexity:12.06


In [49]:
# DataCollatorForLanguageModeling also applies random masking at every evaluation, so the perplexity score fluctuates somewhat between runs. One way to remove this randomness is to mask the whole test set only once

def insert_random_mask(batch):
    """Apply ``data_collator`` to a batch and return its output as "masked_*" columns.

    Args:
        batch: Column-oriented batch mapping each column name to a list of
            per-example values.

    Returns:
        A dict with one entry per key produced by ``data_collator``, named
        "masked_" + key and holding that value converted with ``.numpy()``.
    """
    column_names = list(batch)
    # Turn the column-oriented batch into one dict per example.
    examples = [dict(zip(column_names, row)) for row in zip(*batch.values())]
    collated = data_collator(examples)
    # Create a new "masked" column for every column the collator returns.
    masked_columns = {}
    for column, tensor in collated.items():
        masked_columns["masked_" + column] = tensor.numpy()
    return masked_columns

# Drop word_ids before the rows are handed to data_collator (the random-masking
# demo earlier likewise pops this field before collating). The guard makes the
# cell safe to re-run: without it a second run fails, because the column has
# already been removed from downsampled_dataset.
if "word_ids" in downsampled_dataset["test"].column_names:
    downsampled_dataset = downsampled_dataset.remove_columns(["word_ids"])
# Mask the test split once; the original columns are replaced by the
# "masked_*" columns that insert_random_mask returns.
eval_dataset = downsampled_dataset["test"].map(
    insert_random_mask,
    batched=True,
    remove_columns=downsampled_dataset["test"].column_names,
)
# Restore the standard column names.
eval_dataset = eval_dataset.rename_columns(
    {
        "masked_input_ids": "input_ids",
        "masked_attention_mask": "attention_mask",
        "masked_labels": "labels",
    }
)
eval_dataset

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 1000
})

### 使用 PyTorch 与 🤗 Accelerate 手动编写训练循环

In [50]:
from torch.utils.data import DataLoader
from torch.optim import AdamW
from accelerate import Accelerator  
from transformers import get_scheduler
from tqdm.auto import tqdm
import torch
import math

batch_size = 64
# Training batches are shuffled and masked on the fly by data_collator.
train_data_loader = DataLoader(
    downsampled_dataset['train'],
    batch_size=batch_size,
    shuffle=True,
    collate_fn=data_collator,
)
# eval_dataset was masked once beforehand, so it is collated with
# default_data_collator instead of the random-masking collator.
eval_data_loader =  DataLoader(
    eval_dataset,
    batch_size=batch_size,
    collate_fn=default_data_collator,
)

# Reload the pretrained checkpoint so this loop does not reuse the model trained above.
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)
optimizer = AdamW(model.parameters(),lr=5e-5)

# Let Accelerate wrap the model, optimizer and both dataloaders.
accelerator = Accelerator()
model,optimizer,train_dataloader,eval_dataloader = accelerator.prepare(model,optimizer,train_data_loader,eval_data_loader)

num_train_epochs = 3
# Measured on the prepared dataloader, after accelerator.prepare.
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch

lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0, # a warmup phase uses a small learning rate at the start of training and gradually raises it to the normal rate to help the model converge; 0 means no warmup
    num_training_steps=num_training_steps,
)

progress_bar = tqdm(range(num_training_steps))

# model_name is defined in the earlier Trainer setup cell.
output_dir=f'./my_model/{model_name}-fintuned-imdb-accelerator'

for epoch in range(num_train_epochs):
    # Training
    model.train()
    for batch in train_dataloader:
        outputs = model(**batch)
        loss = outputs.loss
        accelerator.backward(loss)

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

    # Evaluation
    model.eval()
    losses = []
    for step, batch in enumerate(eval_dataloader):
        with torch.no_grad():
            outputs = model(**batch)

        loss = outputs.loss
        # Repeat the scalar batch loss batch_size times before gathering it.
        # NOTE(review): a final batch smaller than batch_size is still repeated
        # batch_size times — confirm this weighting is intended.
        losses.append(accelerator.gather(loss.repeat(batch_size)))

    losses = torch.cat(losses)
    # Keep at most len(eval_dataset) loss entries.
    losses = losses[: len(eval_dataset)]
    try:
        # Perplexity is the exponential of the mean loss.
        perplexity = math.exp(torch.mean(losses))
    except OverflowError:
        perplexity = float("inf")

    print(f">>> Epoch {epoch}: Perplexity: {perplexity}")

    # Save the model and tokenizer to output_dir (nothing is uploaded here)
    accelerator.wait_for_everyone()
    unwrapped_model = accelerator.unwrap_model(model)
    unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
    if accelerator.is_main_process:
        tokenizer.save_pretrained(output_dir)


  0%|          | 0/471 [00:00<?, ?it/s]

>>> Epoch 0: Perplexity: 12.085213716475206
>>> Epoch 1: Perplexity: 11.544630048050726
>>> Epoch 2: Perplexity: 11.379201422045155


In [56]:
from transformers import pipeline

# Build a fill-mask pipeline from the directory the manual training loop saved to.
mask_fill = pipeline('fill-mask','./my_model/distilbert-base-uncased-fintuned-imdb-accelerator')
# Predict the masked token of the example sentence.
pred=mask_fill(text)

Device set to use cuda:0


In [60]:
# Show the filled-in sentence of the first prediction.
pred[0]['sequence']

'this is a great film.'