## 掩码任务

In [None]:
from transformers import AutoModelForMaskedLM

# Checkpoint name; kept in a variable because the tokenizer is later loaded from the same name.
model_checkpoint = "distilbert-base-uncased"
# Load the checkpoint through the masked-language-modeling auto class.
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)

model.safetensors:  86%|########6 | 231M/268M [00:00<?, ?B/s]

In [None]:
# Parameter count expressed in millions, printed next to the hard-coded BERT reference figure.
distilbert_num_parameters = model.num_parameters() / 1_000_000
print(f"'>>> DistilBERT number of parameters: {round(distilbert_num_parameters)}M'")
# Constant text: a plain string is enough, no f-string needed (output is unchanged).
print("'>>> BERT number of parameters: 110M'")

'>>> DistilBERT number of parameters: 67M'
'>>> BERT number of parameters: 110M'


In [None]:
# Example sentence with a single [MASK] slot for the model to fill in.
text = "This is a great [MASK]."
from transformers import AutoTokenizer

# Load the tokenizer from the same checkpoint name as the model.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [None]:
import torch

# Tokenize the example sentence into PyTorch tensors.
inputs = tokenizer(text, return_tensors="pt")
# Inference only: disable autograd so no computation graph is built for the logits.
with torch.no_grad():
    token_logits = model(**inputs).logits
# Sequence position(s) of the [MASK] token (second axis of input_ids).
mask_token_index = torch.where(inputs["input_ids"] == tokenizer.mask_token_id)[1]
mask_token_logits = token_logits[0, mask_token_index]
# topk keeps the 5 largest logits; `.indices` gives their positions, i.e. the token ids.
top_5_tokens = torch.topk(mask_token_logits, 5, dim=1).indices[0].tolist()
# Logits of the candidate tokens for the masked position; the column index is the token id.
print(mask_token_logits.shape)
print(top_5_tokens)
# Substitute each candidate back into the sentence to show the completed text.
for token in top_5_tokens:
    print(f"'>>> {text.replace(tokenizer.mask_token, tokenizer.decode([token]))}'")

torch.Size([1, 30522])
[3066, 3112, 6172, 2801, 8658]
'>>> This is a great deal.'
'>>> This is a great success.'
'>>> This is a great adventure.'
'>>> This is a great idea.'
'>>> This is a great feat.'


In [None]:
# Download the IMDB movie review dataset
from datasets import load_dataset
imdb_dataset = load_dataset("imdb")
# Last expression of the cell: displays the splits and their sizes.
imdb_dataset

README.md:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

unsupervised-00000-of-00001.parquet:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

In [None]:
# Take the first 3 reviews of the training split after a seeded (reproducible) shuffle.
sample = imdb_dataset['train'].shuffle(seed=42).select(range(3))
# Label 0 is a negative review, 1 is a positive one
for row in sample:
    print(f"\n'>>> Review: {row['text']}'")
    print(f"'>>> Label: {row['label']}'")


'>>> Review: There is no relation at all between Fortier and Profiler but the fact that both are police series about violent crimes. Profiler looks crispy, Fortier looks classic. Profiler plots are quite simple. Fortier's plot are far more complicated... Fortier looks more like Prime Suspect, if we have to spot similarities... The main character is weak and weirdo, but have "clairvoyance". People like to compare, to judge, to evaluate. How about just enjoying? Funny thing too, people writing Fortier looks American but, on the other hand, arguing they prefer American series (!!!). Maybe it's the language, or the spirit, but I think this series is more English than American. By the way, the actors are really good and funny. The acting is not superficial at all...'
'>>> Label: 1'

'>>> Review: This movie is a great. The plot is very true to the book which is a classic written by Mark Twain. The movie starts of with a scene where Hank sings a song with a bunch of kids called "when you stu

In [None]:
def tokenize_function(examples):
    """Tokenize the ``"text"`` column of a batch with the notebook-level ``tokenizer``.

    When the tokenizer is a fast one, a ``"word_ids"`` entry is added as well:
    for every example in the batch, the list mapping each token to the index of
    the word it came from in the original text.

    Args:
        examples: Batch mapping that provides the texts under ``"text"``.

    Returns:
        The tokenizer output, extended with ``"word_ids"`` for fast tokenizers.
    """
    encoded = tokenizer(examples["text"])
    if not tokenizer.is_fast:
        # Slow tokenizers: return the encoding without a word_ids column.
        return encoded

    batch_length = len(encoded["input_ids"])
    # word_ids(i) gives, for the i-th example, the source word index of each token.
    encoded["word_ids"] = list(map(encoded.word_ids, range(batch_length)))
    return encoded


# Use batched=True to activate fast multithreading!
# The raw "text" and "label" columns are removed from the result.
tokenized_datasets = imdb_dataset.map(
    tokenize_function, batched=True, remove_columns=["text", "label"]
)
tokenized_datasets

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (720 > 512). Running this sequence through the model will result in indexing errors


Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 50000
    })
})

In [None]:
# Now that the movie reviews are tokenized, the next step is to concatenate them all
# and split the result into chunks
# NOTE(review): the next line is a bare expression that is not the last statement of the
# cell, so its value is never displayed; it only documents where the limit can be read.
tokenizer.model_max_length # maximum supported chunk size
chunk_size = 128
# Slicing produces a list of lists for each feature
tokenized_samples = tokenized_datasets["train"][:3]

for idx, sample in enumerate(tokenized_samples["input_ids"]):
    print(f"'>>> Review {idx} length: {len(sample)}'")

# Then we can concatenate all of these examples with a simple dict comprehension, as follows:
concatenated_examples = {
    k: sum(tokenized_samples[k], []) for k in tokenized_samples.keys()
}
total_length = len(concatenated_examples["input_ids"])
print(f"'>>> Concatenated reviews length: {total_length}'")

# Split the concatenated reviews into chunks of size chunk_size
chunks = {
    k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)] 
    for k, t in concatenated_examples.items()
}
for chunk in chunks["input_ids"]:
    print(f"'>>> Chunk length: {len(chunk)}'")
# If the last chunk is smaller than chunk_size, it gets dropped. Finally, wrap all of the
# logic above in a function so that it can be applied to the tokenized datasets

def group_texts(examples):
    """Concatenate a batch of tokenized examples and re-split it into fixed-size chunks.

    The chunk length is read from the notebook-level ``chunk_size`` variable.

    Args:
        examples: Batch mapping each column name to a list of per-example
            sequences; must contain ``"input_ids"``.

    Returns:
        A dict with the same columns, where each value is a list of chunks of
        exactly ``chunk_size`` items, plus a ``"labels"`` column copied from
        ``"input_ids"``. A trailing remainder shorter than ``chunk_size`` is dropped.
    """
    from itertools import chain

    # Concatenate all the texts. chain.from_iterable flattens in linear time, whereas
    # sum(list_of_lists, []) re-copies the growing result for every example in the batch.
    concatenated_examples = {
        k: list(chain.from_iterable(examples[k])) for k in examples.keys()
    }
    # Length of the concatenated text, measured on the first column
    total_length = len(next(iter(concatenated_examples.values())))
    # Drop the last chunk if it is smaller than chunk_size
    total_length = (total_length // chunk_size) * chunk_size
    # Split into chunks of the maximum length
    result = {
        k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
        for k, t in concatenated_examples.items()
    }
    # Create a new labels column as a copy of input_ids: the masked language model
    # has to predict the randomly masked input tokens, so the unmasked ids are kept
    # as the answers it learns from.
    result["labels"] = result["input_ids"].copy()
    return result

'>>> Review 0 length: 363'
'>>> Review 1 length: 304'
'>>> Review 2 length: 133'
'>>> Concatenated reviews length: 800'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 32'


In [None]:
# Apply group_texts batch-wise to the tokenized datasets to build fixed-size chunks.
lm_datasets = tokenized_datasets.map(group_texts,batched=True)
lm_datasets

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 61291
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 59904
    })
    unsupervised: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 122957
    })
})

In [None]:
# Decode the chunk at index 1 of the training split. The row is indexed first so that
# only this row is read, instead of materializing the whole `input_ids` column.
tokenizer.decode(lm_datasets['train'][1]['input_ids'])

"as the vietnam war and race issues in the united states. in between asking politicians and ordinary denizens of stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men. < br / > < br / > what kills me about i am curious - yellow is that 40 years ago, this was considered pornographic. really, the sex and nudity scenes are few and far between, even then it's not shot like some cheaply made porno. while my countrymen mind find it shocking, in reality sex and nudity are a major staple in swedish cinema. even ingmar bergman,"

In [None]:
# input_ids and labels are identical at this point; [MASK] tokens still have to be
# inserted at random positions of the inputs.
# We need a special data collator that can randomly mask some of the tokens in each batch
# of text. Fortunately, 🤗 Transformers provides a dedicated DataCollatorForLanguageModeling
# for this task. We only need to pass it the tokenizer and an mlm_probability argument
# (the fraction of tokens to mask). Here mlm_probability is set to 15%, which is the amount
# used for BERT and the most common choice in the literature.

from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

In [None]:
# Take the first two chunks of the training split as a small demo batch.
samples = [lm_datasets["train"][i] for i in range(2)]
# Show which fields each sample carries.
samples[0].keys()

dict_keys(['input_ids', 'attention_mask', 'word_ids', 'labels'])

In [None]:

# NOTE(review): pop() mutates the sample dicts in place, so re-running this cell without
# rebuilding `samples` first raises KeyError on "word_ids".
for sample in samples:
    _ = sample.pop("word_ids") # remove this field
# Collate the batch and decode each resulting sequence to inspect the inserted masks.
for chunk in data_collator(samples)["input_ids"]:
    print(f"\n'>>> {tokenizer.decode(chunk)}'")

dict_keys(['input_ids', 'attention_mask', 'labels'])

'>>> [CLS] i [MASK] i am curious - yellow from my [MASK] store because of [MASK] the controversy that surrounded it when it was first released in 1967 [MASK] i also vassal that at [MASK] it was seized by [MASK]. s. customs if it ever tried to enter this country, therefore being a fan of films considered " controversial " [MASK] really had to [MASK] this for [MASK]. [MASK] br / > < br / > the plot is centered around a young swedish drama student named lena who [MASK] to learn [MASK] she [MASK] about life. in particular she wants to focus her attentions to making some sort of documentary on [MASK] the average swede thought about certain political issues such'

'>>> as the vietnam war and race issues in [MASK] united states. in between [MASK] politicians and ordinary denizens of stockholm about [MASK] [MASK] [MASK] politics, she has sex with her [MASK] teacher, classmates, and [MASK] men [MASK] < br [MASK] > < br / > what kills me abou

In [None]:
import collections
import numpy as np

from transformers import default_data_collator

# Probability with which each whole word is selected for masking by the collator below.
wwm_probability = 0.2


def whole_word_masking_data_collator(features):
    """Collate a batch while masking whole words instead of individual tokens.

    A word is a run of consecutive tokens that share the same non-None
    ``word_ids`` entry. Each word is selected independently with probability
    ``wwm_probability`` (notebook-level variable). Every token of a selected
    word is replaced by ``tokenizer.mask_token_id`` in ``input_ids`` and keeps
    its original id in ``labels``; all other label positions are set to -100.

    The incoming feature dicts and their sequences are not modified: masking is
    applied to copies, so the same ``features`` can be collated repeatedly.

    Args:
        features: List of dicts, each holding ``"input_ids"``, ``"labels"`` and
            ``"word_ids"`` sequences of the same length.

    Returns:
        The result of ``default_data_collator`` applied to the masked features,
        from which the ``"word_ids"`` entry has been removed.

    Raises:
        KeyError: If a feature lacks ``"word_ids"``, ``"input_ids"`` or ``"labels"``.
    """
    masked_features = []
    for feature in features:
        word_ids = feature["word_ids"]

        # Build a map from each word's running index to the token positions it spans
        mapping = collections.defaultdict(list)
        current_word_index = -1
        current_word = None
        for idx, word_id in enumerate(word_ids):
            if word_id is not None:
                if word_id != current_word:
                    current_word = word_id
                    current_word_index += 1
                mapping[current_word_index].append(idx)

        # Randomly pick the words to mask (one Bernoulli draw per word)
        mask = np.random.binomial(1, wwm_probability, (len(mapping),))
        masked_positions = {
            idx
            for word_index in np.where(mask)[0]
            for idx in mapping[word_index.item()]
        }

        labels = feature["labels"]
        # Shallow copy without word_ids, so the caller's dict stays intact
        masked_feature = {k: v for k, v in feature.items() if k != "word_ids"}
        # New sequences are built instead of assigning into the caller's lists
        masked_feature["input_ids"] = [
            tokenizer.mask_token_id if idx in masked_positions else token_id
            for idx, token_id in enumerate(feature["input_ids"])
        ]
        masked_feature["labels"] = [
            labels[idx] if idx in masked_positions else -100
            for idx in range(len(labels))
        ]
        masked_features.append(masked_feature)

    return default_data_collator(masked_features)

In [None]:
# Fetch the first two training chunks again; these carry the word_ids field
# that the whole-word collator reads.
samples = [lm_datasets["train"][i] for i in range(2)]
batch = whole_word_masking_data_collator(samples)

# Decode each collated sequence to inspect which words were masked.
for chunk in batch["input_ids"]:
    print(f"\n'>>> {tokenizer.decode(chunk)}'")


'>>> [CLS] i rented i am curious - [MASK] from my [MASK] store because [MASK] [MASK] [MASK] controversy that surrounded [MASK] when [MASK] was first released in 1967 [MASK] i [MASK] heard [MASK] [MASK] first it [MASK] seized [MASK] u. [MASK]. customs [MASK] [MASK] ever tried to enter this country [MASK] therefore being [MASK] fan of [MASK] considered [MASK] controversial [MASK] i really had to see this for myself. < br / > < br / > the plot is centered around a [MASK] swedish drama [MASK] named lena who wants to learn everything she can about life. [MASK] [MASK] she [MASK] [MASK] focus her attentions [MASK] making some [MASK] of [MASK] on what the average swede thought about certain political issues such'

'>>> [MASK] the [MASK] war and race issues in the united [MASK]. in between asking politicians [MASK] [MASK] denizens of stockholm about their opinions on politics, she has sex with her drama teacher [MASK] classmates, and married men [MASK] < br / > [MASK] br [MASK] > what kills me

In [None]:
# Reduce the dataset size
train_size = 10_000
# Test set is 10% of the training size.
test_size = int(0.1 * train_size)

# Both subsets are drawn from the chunked training split, with a fixed seed.
downsampled_dataset = lm_datasets["train"].train_test_split(
    train_size=train_size, test_size=test_size, seed=42
)
downsampled_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 10000
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 1000
    })
})