In [2]:
%matplotlib inline

In [11]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf

from transformers import AutoTokenizer, TFAutoModelForMaskedLM
from datasets import Dataset

## Dataset


In [4]:
%store -r DT_rally_speaches_dataset
df = DT_rally_speaches_dataset

In [5]:
df

Unnamed: 0,Location,Month,Year,filename,content
0,Battle Creek,Dec,2019,BattleCreekDec19_2019.txt,Thank you. Thank you. Thank you to Vice Presid...
1,Bemidji,Sep,2020,BemidjiSep18_2020.txt,There's a lot of people. That's great. Thank y...
2,Charleston,Feb,2020,CharlestonFeb28_2020.txt,Thank you. Thank you. Thank you. All I can say...
3,Charlotte,Mar,2020,CharlotteMar2_2020.txt,"I want to thank you very much. North Carolina,..."
4,Cincinnati,Aug,2019,CincinnatiAug1_2019.txt,Thank you all. Thank you very much. Thank you ...
5,Colorador Springs,Feb,2020,ColoradorSpringsFeb20_2020.txt,"Hello Colorado. We love Colorado, most beautif..."
6,Dallas,Oct,2019,DallasOct17_2019.txt,Thank you. Thank you very much. Hello Dallas. ...
7,Des Moines,Jan,2020,DesMoinesJan30_2020.txt,I worked so hard for this state. I worked so h...
8,Fayetteville,Sep,2020,FayettevilleSep19_2020.txt,"What a crowd, what a crowd. Get those people o..."
9,Fayetteville,Sep,2019,FayettevilleSep9_2019.txt,Thank you everybody. Thank you and Vice Presi...


In [6]:
# rallies = [text for text in df["content"]]
# rallies_dict = {
#     'train': rallies[:30],
#     'test': rallies[30:],
#     'unsupervised': rallies
# }

In [7]:
# rallies_dict2 = {
#     'rallies': rallies
# }
# dataset = Dataset.from_dict(rallies_dict2)

## Model

In [8]:
model_checkpoint = "distilbert-base-uncased"
model = TFAutoModelForMaskedLM.from_pretrained(model_checkpoint)

Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertForMaskedLM: ['activation_13']
- This IS expected if you are initializing TFDistilBertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFDistilBertForMaskedLM were initialized from the model checkpoint at distilbert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertForMaskedLM for predictions without further training.


In [9]:
model.summary()

Model: "tf_distil_bert_for_masked_lm"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 distilbert (TFDistilBertMai  multiple                 66362880  
 nLayer)                                                         
                                                                 
 vocab_transform (Dense)     multiple                  590592    
                                                                 
 vocab_layer_norm (LayerNorm  multiple                 1536      
 alization)                                                      
                                                                 
 vocab_projector (TFDistilBe  multiple                 23866170  
 rtLMHead)                                                       
                                                                 
Total params: 66,985,530
Trainable params: 66,985,530
Non-trainable params: 0
__________________________

## Tokenizer

In [18]:
# text = "This is a great [MASK]."
# text = "Make [MASK] great again!"
# text = "[MASK] virus"
text = "kung [MASK]"

In [19]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [20]:
inputs = tokenizer(text, return_tensors="np")
token_logits = model(**inputs).logits
# Find the location of [MASK] and extract its logits
mask_token_index = np.argwhere(inputs["input_ids"] == tokenizer.mask_token_id)[0, 1]
mask_token_logits = token_logits[0, mask_token_index, :]
# Pick the [MASK] candidates with the highest logits
# We negate the array before argsort to get the largest, not the smallest, logits
top_5_tokens = np.argsort(-mask_token_logits)[:5].tolist()

for token in top_5_tokens:
    print(f">>> {text.replace(tokenizer.mask_token, tokenizer.decode([token]))}")

>>> kung fu
>>> kung chung
>>> kung .
>>> kung kung
>>> kung !


In [22]:
dataset = Dataset.from_pandas(df)
dataset

Dataset({
    features: ['Location', 'Month', 'Year', 'filename', 'content'],
    num_rows: 35
})

In [23]:
def tokenize_function(examples):
    result = tokenizer(examples["content"])
    if tokenizer.is_fast:
        result["word_ids"] = [result.word_ids(i) for i in range(len(result["input_ids"]))]
    return result


# Use batched=True to activate fast multithreading!
tokenized_datasets = dataset.map(
    tokenize_function, batched=True, remove_columns=['Location', 'Month', 'Year', 'filename', 'content']
)
tokenized_datasets

Map:   0%|          | 0/35 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (24291 > 512). Running this sequence through the model will result in indexing errors


Dataset({
    features: ['input_ids', 'attention_mask', 'word_ids'],
    num_rows: 35
})

In [24]:
tokenizer.model_max_length

512

In [25]:
chunk_size = 128

In [16]:
tokenized_samples = tokenized_datasets[:3]

for idx, sample in enumerate(tokenized_samples["input_ids"]):
    print(f"'>>> Rally {idx} length: {len(sample)}'")

'>>> Rally 0 length: 24291'
'>>> Rally 1 length: 22976'
'>>> Rally 2 length: 12491'


In [17]:
concatenated_examples = {
    k: sum(tokenized_samples[k], []) for k in tokenized_samples.keys()
}
total_length = len(concatenated_examples["input_ids"])
print(f"'>>> Concatenated reviews length: {total_length}'")

'>>> Concatenated reviews length: 59758'


In [18]:
chunks = {
    k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
    for k, t in concatenated_examples.items()
}

for chunk in chunks["input_ids"]:
    print(f"'>>> Chunk length: {len(chunk)}'")

'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk lengt

In [19]:
def group_texts(examples):
    # Concatenate all texts
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    # Compute length of concatenated texts
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the last chunk if it's smaller than chunk_size
    total_length = (total_length // chunk_size) * chunk_size
    # Split by chunks of max_len
    result = {
        k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
        for k, t in concatenated_examples.items()
    }
    # Create a new labels column
    result["labels"] = result["input_ids"].copy()
    return result

In [20]:
lm_datasets = tokenized_datasets.map(group_texts, batched=True)
lm_datasets

Map:   0%|          | 0/35 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
    num_rows: 3864
})

In [21]:
tokenizer.decode(lm_datasets[1]["input_ids"])

'when i first started this beautiful trip, this beautiful journey, i just said to the first lady, " you are so lucky. i took you on this fantastic journey. it\'s so much fun. they want to impeach you. they want to do worse than that. " by the way, by the way, by the way, it doesn\'t really feel like we\'re being impeached. the country is doing better than ever before. we did nothing wrong. we did nothing wrong. and we have tremendous support in the republican party like we\'ve never had before. nobody\'s ever had this kind'

In [22]:
tokenizer.decode(lm_datasets[1]["labels"])

'when i first started this beautiful trip, this beautiful journey, i just said to the first lady, " you are so lucky. i took you on this fantastic journey. it\'s so much fun. they want to impeach you. they want to do worse than that. " by the way, by the way, by the way, it doesn\'t really feel like we\'re being impeached. the country is doing better than ever before. we did nothing wrong. we did nothing wrong. and we have tremendous support in the republican party like we\'ve never had before. nobody\'s ever had this kind'

In [23]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

In [24]:
samples = [lm_datasets[i] for i in range(2)]
for sample in samples:
    _ = sample.pop("word_ids")

for chunk in data_collator(samples)["input_ids"]:
    print(f"\n'>>> {tokenizer.decode(chunk)}'")

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.



'>>> [CLS] thank [MASK]. thank you. thank you to vice president pence. he's a good guy [MASK] we've done a great job together. [MASK] merry christmas, michigan.ক you, michigan. what a [MASK] we had in michigan. what a victory was that. one of the greats. was that the greatest evening? but i'm thrilled to be here with thousands of hardworking patriots as we celebrate the miracle of christmas [MASK] the great [MASK] paisley america and [MASK] glory of god [MASK] [MASK] you very much. and did you notice that everybody is saying merry christmas again? [MASK] you notice? saying taiwan christmas [MASK] i remember'

'>>> [MASK] [MASK] first [MASK] this beautiful trip, this beautiful journey, i just said to the first lady, " you are so lucky. i scully you on [MASK] [MASK] journey. it'raging so much fun. they want to impeach you. they want to do worse than that. " by the way, by the way, by the [MASK], it doesn [MASK] t really feel like we're being impeached. the country is doing better than e

In [25]:
import collections
import numpy as np

from transformers.data.data_collator import tf_default_data_collator

wwm_probability = 0.2


def whole_word_masking_data_collator(features):
    for feature in features:
        word_ids = feature.pop("word_ids")

        # Create a map between words and corresponding token indices
        mapping = collections.defaultdict(list)
        current_word_index = -1
        current_word = None
        for idx, word_id in enumerate(word_ids):
            if word_id is not None:
                if word_id != current_word:
                    current_word = word_id
                    current_word_index += 1
                mapping[current_word_index].append(idx)

        # Randomly mask words
        mask = np.random.binomial(1, wwm_probability, (len(mapping),))
        input_ids = feature["input_ids"]
        labels = feature["labels"]
        new_labels = [-100] * len(labels)
        for word_id in np.where(mask)[0]:
            word_id = word_id.item()
            for idx in mapping[word_id]:
                new_labels[idx] = labels[idx]
                input_ids[idx] = tokenizer.mask_token_id
        feature["labels"] = new_labels

    return tf_default_data_collator(features)

In [26]:
samples = [lm_datasets[i] for i in range(2)]
batch = whole_word_masking_data_collator(samples)

for chunk in batch["input_ids"]:
    print(f"\n'>>> {tokenizer.decode(chunk)}'")


'>>> [CLS] thank you. thank [MASK]. thank you [MASK] vice president pence. he's a good guy. we [MASK] ve done [MASK] great job together. and merry christmas, michigan. [MASK] you, michigan. [MASK] a [MASK] we had [MASK] michigan. what [MASK] victory was that. [MASK] of the [MASK] [MASK]. was that [MASK] greatest [MASK]? but i'm thrilled to be here with thousands of hardworking patriots as we celebrate the miracle of [MASK], the [MASK] [MASK] of america [MASK] the glory of god. [MASK] [MASK] very much. [MASK] [MASK] you notice that everybody is saying merry christmas again [MASK] [MASK] you [MASK]? saying merry christmas [MASK] i remember'

'>>> when i first started this beautiful trip, this beautiful [MASK] [MASK] [MASK] [MASK] [MASK] to the first lady, [MASK] you are so lucky. i took you [MASK] this fantastic journey. it's so much [MASK] [MASK] they want to impeach [MASK]. they [MASK] to do worse than [MASK]. " by the way, by [MASK] way [MASK] by the way, it [MASK]'t really feel like

In [33]:
train_size = 3500
test_size = int(0.1 * train_size)

downsampled_dataset = lm_datasets.train_test_split(
    train_size=train_size, test_size=test_size, seed=42
)
downsampled_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 3500
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 350
    })
})

In [34]:
tf_train_dataset = model.prepare_tf_dataset(
    downsampled_dataset["train"],
    collate_fn=data_collator,
    shuffle=True,
    batch_size=32,
)

tf_eval_dataset = model.prepare_tf_dataset(
    downsampled_dataset["test"],
    collate_fn=data_collator,
    shuffle=False,
    batch_size=32,
)

In [36]:
from transformers import create_optimizer
from transformers.keras_callbacks import PushToHubCallback
import tensorflow as tf

num_train_steps = len(tf_train_dataset)
optimizer, schedule = create_optimizer(
    init_lr=2e-5,
    num_warmup_steps=1_000,
    num_train_steps=num_train_steps,
    weight_decay_rate=0.01,
)
model.compile(optimizer=optimizer)

# Train in mixed-precision float16
tf.keras.mixed_precision.set_global_policy("mixed_float16")

model_name = model_checkpoint.split("/")[-1]
# callback = PushToHubCallback(
#     output_dir=f"{model_name}-finetuned-imdb", tokenizer=tokenizer
# )

No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! To disable this behaviour please pass a loss argument, or explicitly pass `loss=None` if you do not want your model to compute a loss.


In [37]:
import math

eval_loss = model.evaluate(tf_eval_dataset)
print(f"Perplexity: {math.exp(eval_loss):.2f}")

Perplexity: 14.47


In [39]:
model.fit(tf_train_dataset, validation_data=tf_eval_dataset)



<keras.callbacks.History at 0x186e8d77ca0>

In [40]:
eval_loss = model.evaluate(tf_eval_dataset)
print(f"Perplexity: {math.exp(eval_loss):.2f}")

Perplexity: 9.26


In [44]:
from transformers import pipeline

mask_filler = pipeline(
    "fill-mask", model=model, tokenizer=tokenizer
)

In [45]:
preds = mask_filler(text)

for pred in preds:
    print(f">>> {pred['sequence']}")

>>> this is a great deal.
>>> this is a great success.
>>> this is a great adventure.
>>> this is a great idea.
>>> this is a great shame.
