In [1]:
%matplotlib inline

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
import collections
import math

from datasets import Dataset
from transformers import AutoTokenizer, TFAutoModelForMaskedLM, DataCollatorForLanguageModeling, create_optimizer, pipeline
from transformers.data.data_collator import tf_default_data_collator
from transformers.keras_callbacks import PushToHubCallback
from huggingface_hub import notebook_login

## Dataset


Importing dataframe from a previous notebook

In [3]:
%store -r DT_rally_speaches_dataset
df = DT_rally_speaches_dataset
df

Unnamed: 0,Location,Month,Year,filename,content
0,Battle Creek,Dec,2019,BattleCreekDec19_2019.txt,Thank you. Thank you. Thank you to Vice Presid...
1,Bemidji,Sep,2020,BemidjiSep18_2020.txt,There's a lot of people. That's great. Thank y...
2,Charleston,Feb,2020,CharlestonFeb28_2020.txt,Thank you. Thank you. Thank you. All I can say...
3,Charlotte,Mar,2020,CharlotteMar2_2020.txt,"I want to thank you very much. North Carolina,..."
4,Cincinnati,Aug,2019,CincinnatiAug1_2019.txt,Thank you all. Thank you very much. Thank you ...
5,Colorador Springs,Feb,2020,ColoradorSpringsFeb20_2020.txt,"Hello Colorado. We love Colorado, most beautif..."
6,Dallas,Oct,2019,DallasOct17_2019.txt,Thank you. Thank you very much. Hello Dallas. ...
7,Des Moines,Jan,2020,DesMoinesJan30_2020.txt,I worked so hard for this state. I worked so h...
8,Fayetteville,Sep,2020,FayettevilleSep19_2020.txt,"What a crowd, what a crowd. Get those people o..."
9,Fayetteville,Sep,2019,FayettevilleSep9_2019.txt,Thank you everybody. Thank you and Vice Presi...


## Model

In [4]:
model_checkpoint = "distilbert-base-uncased"
model = TFAutoModelForMaskedLM.from_pretrained(model_checkpoint)
model.summary()

Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertForMaskedLM: ['activation_13']
- This IS expected if you are initializing TFDistilBertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFDistilBertForMaskedLM were initialized from the model checkpoint at distilbert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertForMaskedLM for predictions without further training.


Model: "tf_distil_bert_for_masked_lm"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 distilbert (TFDistilBertMai  multiple                 66362880  
 nLayer)                                                         
                                                                 
 vocab_transform (Dense)     multiple                  590592    
                                                                 
 vocab_layer_norm (LayerNorm  multiple                 1536      
 alization)                                                      
                                                                 
 vocab_projector (TFDistilBe  multiple                 23866170  
 rtLMHead)                                                       
                                                                 
Total params: 66,985,530
Trainable params: 66,985,530
Non-trainable params: 0
__________________________

## Tokenizer

In [5]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

Let's pick a text to test the base model on

In [6]:
# text = "This is a great [MASK]."
text = "Make [MASK] great"
# text = "[MASK] virus"
# text = "kung [MASK]"

In [7]:
base_model = model = TFAutoModelForMaskedLM.from_pretrained(model_checkpoint)

Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertForMaskedLM: ['activation_13']
- This IS expected if you are initializing TFDistilBertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFDistilBertForMaskedLM were initialized from the model checkpoint at distilbert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertForMaskedLM for predictions without further training.


In [8]:
inputs = tokenizer(text, return_tensors="np")
token_logits = base_model(**inputs).logits
# Find the location of [MASK] and extract its logits
mask_token_index = np.argwhere(inputs["input_ids"] == tokenizer.mask_token_id)[0, 1]
mask_token_logits = token_logits[0, mask_token_index, :]
# Pick the [MASK] candidates with the highest logits
# We negate the array before argsort to get the largest, not the smallest, logits
top_5_tokens = np.argsort(-mask_token_logits)[:5].tolist()

for token in top_5_tokens:
    print(f">>> {text.replace(tokenizer.mask_token, tokenizer.decode([token]))}")

>>> Make yourself great
>>> Make it great
>>> Make thee great
>>> Make me great
>>> Make yourselves great


The tokenizer works best using a dataset so let's convert the pandas dataframe

In [9]:
dataset = Dataset.from_pandas(df)
dataset

Dataset({
    features: ['Location', 'Month', 'Year', 'filename', 'content'],
    num_rows: 35
})

Let's set up a tokenize function that can then be mapped onto the dataset. If using a fast tokenizer we can also use the word ids for whole word masking later on. We can also drop the column that will not be required for this task. 

Since we are working with very long texts we cannot truncate the excess since that will lose us most of the dataset. Instead we can split the texts into batches small enough to fit the model.

In [10]:
def tokenize_function(examples):
    result = tokenizer(examples["content"])
    if tokenizer.is_fast:
        result["word_ids"] = [result.word_ids(i) for i in range(len(result["input_ids"]))]
    return result


# Use batched=True to activate fast multithreading!
tokenized_dataset = dataset.map(
    tokenize_function, batched=True, remove_columns=['Location', 'Month', 'Year', 'filename', 'content']
)
tokenized_dataset

Map:   0%|          | 0/35 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (24291 > 512). Running this sequence through the model will result in indexing errors


Dataset({
    features: ['input_ids', 'attention_mask', 'word_ids'],
    num_rows: 35
})

Let's check the model's max context length in order to determine the size of the chunks

In [11]:
tokenizer.model_max_length

512

The capabilities of your machine is also a factor when picking the chunk size. If the machine is lacking in memmory it might be better to pick a smaller number than what the model is capable of handling.

In [12]:
chunk_size = 256

Let's check the number of tokens per speech

In [13]:
for idx, sample in enumerate(tokenized_dataset["input_ids"]):
    print(f"'>>> Rally {idx} length: {len(sample)}'")

'>>> Rally 0 length: 24291'
'>>> Rally 1 length: 22976'
'>>> Rally 2 length: 12491'
'>>> Rally 3 length: 8802'
'>>> Rally 4 length: 10662'
'>>> Rally 5 length: 15759'
'>>> Rally 6 length: 13867'
'>>> Rally 7 length: 15730'
'>>> Rally 8 length: 22452'
'>>> Rally 9 length: 12007'
'>>> Rally 10 length: 13599'
'>>> Rally 11 length: 14241'
'>>> Rally 12 length: 12027'
'>>> Rally 13 length: 13050'
'>>> Rally 14 length: 18351'
'>>> Rally 15 length: 16629'
'>>> Rally 16 length: 11906'
'>>> Rally 17 length: 12482'
'>>> Rally 18 length: 19059'
'>>> Rally 19 length: 15646'
'>>> Rally 20 length: 19325'
'>>> Rally 21 length: 13165'
'>>> Rally 22 length: 11902'
'>>> Rally 23 length: 8570'
'>>> Rally 24 length: 15375'
'>>> Rally 25 length: 14479'
'>>> Rally 26 length: 12752'
'>>> Rally 27 length: 16218'
'>>> Rally 28 length: 3016'
'>>> Rally 29 length: 14459'
'>>> Rally 30 length: 15064'
'>>> Rally 31 length: 12457'
'>>> Rally 32 length: 8942'
'>>> Rally 33 length: 14664'
'>>> Rally 34 length: 8255'


Let's take a look at the full length of the entire dataset at once:

In [14]:
tokenized_dataset_dict = tokenized_dataset.to_dict()
# tokenized_dataset_dict = tokenized_dataset[:2]
concatenated_dataset = {
    k: sum(tokenized_dataset_dict[k], []) for k in tokenized_dataset_dict.keys()
}
total_length = len(concatenated_dataset["input_ids"])
print(f"'>>> Concatenated reviews length: {total_length}'")

'>>> Concatenated reviews length: 494670'


In [15]:
chunks = {
    k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
    for k, t in concatenated_dataset.items()
}

for chunk in chunks["input_ids"]:
    print(f"'>>> Chunk length: {len(chunk)}'")

'>>> Chunk length: 256'
'>>> Chunk length: 256'
'>>> Chunk length: 256'
'>>> Chunk length: 256'
'>>> Chunk length: 256'
'>>> Chunk length: 256'
'>>> Chunk length: 256'
'>>> Chunk length: 256'
'>>> Chunk length: 256'
'>>> Chunk length: 256'
'>>> Chunk length: 256'
'>>> Chunk length: 256'
'>>> Chunk length: 256'
'>>> Chunk length: 256'
'>>> Chunk length: 256'
'>>> Chunk length: 256'
'>>> Chunk length: 256'
'>>> Chunk length: 256'
'>>> Chunk length: 256'
'>>> Chunk length: 256'
'>>> Chunk length: 256'
'>>> Chunk length: 256'
'>>> Chunk length: 256'
'>>> Chunk length: 256'
'>>> Chunk length: 256'
'>>> Chunk length: 256'
'>>> Chunk length: 256'
'>>> Chunk length: 256'
'>>> Chunk length: 256'
'>>> Chunk length: 256'
'>>> Chunk length: 256'
'>>> Chunk length: 256'
'>>> Chunk length: 256'
'>>> Chunk length: 256'
'>>> Chunk length: 256'
'>>> Chunk length: 256'
'>>> Chunk length: 256'
'>>> Chunk length: 256'
'>>> Chunk length: 256'
'>>> Chunk length: 256'
'>>> Chunk length: 256'
'>>> Chunk lengt

And now to put it all in a function to map to our dataset:

In [16]:
def group_texts(examples):
    # Concatenate all texts
    concatenated_dataset = {k: sum(examples[k], []) for k in examples.keys()}
    # Compute length of concatenated texts
    total_length = len(concatenated_dataset[list(examples.keys())[0]])
    # We drop the last chunk if it's smaller than chunk_size
    total_length = (total_length // chunk_size) * chunk_size
    # Split by chunks of max_len
    result = {
        k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
        for k, t in concatenated_dataset.items()
    }
    # Create a new labels column
    result["labels"] = result["input_ids"].copy()
    return result

At the end of the group_texts function we create a labels column which is a copy of the input_ids. That is needed in masked language modeling in order to provide the ground truth for our language model to learn from.

Now let's map the function to the tokenized dataset:

In [17]:
lm_datasets = tokenized_dataset.map(group_texts, batched=True)
lm_datasets

Map:   0%|          | 0/35 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
    num_rows: 1932
})

By Grouping and then splitting the text into chunks we now have ended up with quite a few more additional examples but those examples contain all of the data present in our texts, most of which would have been lost without taking this approach.

Let's have a look at he first rally speech decoded from the tokens using the .decode() method:

In [18]:
tokenizer.decode(lm_datasets[0]["input_ids"])

'[CLS] thank you. thank you. thank you to vice president pence. he\'s a good guy. we\'ve done a great job together. and merry christmas, michigan. thank you, michigan. what a victory we had in michigan. what a victory was that. one of the greats. was that the greatest evening? but i\'m thrilled to be here with thousands of hardworking patriots as we celebrate the miracle of christmas, the greatness of america and the glory of god. thank you very much. and did you notice that everybody is saying merry christmas again? did you notice? saying merry christmas. i remember when i first started this beautiful trip, this beautiful journey, i just said to the first lady, " you are so lucky. i took you on this fantastic journey. it\'s so much fun. they want to impeach you. they want to do worse than that. " by the way, by the way, by the way, it doesn\'t really feel like we\'re being impeached. the country is doing better than ever before. we did nothing wrong. we did nothing wrong. and we have 

And now the labels of that same speech:

In [19]:
tokenizer.decode(lm_datasets[0]["labels"])

'[CLS] thank you. thank you. thank you to vice president pence. he\'s a good guy. we\'ve done a great job together. and merry christmas, michigan. thank you, michigan. what a victory we had in michigan. what a victory was that. one of the greats. was that the greatest evening? but i\'m thrilled to be here with thousands of hardworking patriots as we celebrate the miracle of christmas, the greatness of america and the glory of god. thank you very much. and did you notice that everybody is saying merry christmas again? did you notice? saying merry christmas. i remember when i first started this beautiful trip, this beautiful journey, i just said to the first lady, " you are so lucky. i took you on this fantastic journey. it\'s so much fun. they want to impeach you. they want to do worse than that. " by the way, by the way, by the way, it doesn\'t really feel like we\'re being impeached. the country is doing better than ever before. we did nothing wrong. we did nothing wrong. and we have 

We have the exact same thing in both columns as is to be expected. 

## Fine-tuning DistilBERT with the Trainer API
Next step is to insert the mask tokens into the ids which we do via the use of a data collator. All we need to pass it is the tokenizer that we are using and the <b>mlm_probability</b> argument that specifies what fraction of the tokens to mask.=:

In [20]:
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

Let's take a look at what the masked texts which the collator produces. It expects a list of dicts, where each dict represents a single chunk of contiguous text so we need to first iterate over the dataset before feeding the batch to the collator. We remove the "word_ids" key for this data collator as it does not expect it:

In [21]:
#samples = lm_datasets.to_list()
samples = [lm_datasets[i] for i in range(2)]
for sample in samples:
    _ = sample.pop("word_ids")

for chunk in data_collator(samples)["input_ids"]:
    print(f"\n'>>> {tokenizer.decode(chunk)}'")

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.



'>>> [CLS] thank you. [MASK] [MASK]. thank you to [MASK] president pence. he's a good guy. we've done a great job together. maia merry christmas, michigan. thank you, michigan. what [MASK] victory we [MASK] in michigan. what a victory was that. one of the greats. was that the greatest evening? but i cavendish m thrilled to be [MASK] [MASK] thousands of hardworking patriots as we celebrate the miracle of christmas, the greatness of america and the [MASK] [MASK] god. thank [MASK] very much. and did you notice [MASK] everybody is saying merry christmas again? did you notice? saying merry christmas. i remember when i first started this beautiful trip, this beautiful [MASK], i just said to the [MASK] lady [MASK] " you are so lucky [MASK] i took you on this fantastic [MASK]. it's [MASK] much fun. they want to impeach youdp they want to do worse [MASK] that. " by the way, by [MASK] way, by the way [MASK] it doesn't really feel like [MASK]'re being impeached. the country is doing better than 

We can see that the [MASK] token has been randomly inserted at various locations in our text. These will be the tokens which our model will have to predict during training and those masks will be randomised with each batch during training.

When training models for masked language modeling, one technique that can be used is to mask whole words together, not just individual tokens. This approach is called whole word masking. If we want to use whole word masking, we will need to build a data collator ourselves. A data collator is just a function that takes a list of samples and converts them into a batch, so let’s do this now! We’ll use the word IDs computed earlier to make a map between word indices and the corresponding tokens, then randomly decide which words to mask and apply that mask on the inputs. Note that the labels are all -100 except for the ones corresponding to mask words:

In [22]:
wwm_probability = 0.2


def whole_word_masking_data_collator(features):
    for feature in features:
        word_ids = feature.pop("word_ids")

        # Create a map between words and corresponding token indices
        mapping = collections.defaultdict(list)
        current_word_index = -1
        current_word = None
        for idx, word_id in enumerate(word_ids):
            if word_id is not None:
                if word_id != current_word:
                    current_word = word_id
                    current_word_index += 1
                mapping[current_word_index].append(idx)

        # Randomly mask words
        mask = np.random.binomial(1, wwm_probability, (len(mapping),))
        input_ids = feature["input_ids"]
        labels = feature["labels"]
        new_labels = [-100] * len(labels)
        for word_id in np.where(mask)[0]:
            word_id = word_id.item()
            for idx in mapping[word_id]:
                new_labels[idx] = labels[idx]
                input_ids[idx] = tokenizer.mask_token_id
        feature["labels"] = new_labels

    return tf_default_data_collator(features)

Let's test it on the same sample as before:

In [23]:
# samples = lm_datasets.to_list()
samples = [lm_datasets[i] for i in range(2)]
batch = whole_word_masking_data_collator(samples)

for chunk in batch["input_ids"]:
    print(f"\n'>>> {tokenizer.decode(chunk)}'")


'>>> [CLS] thank you. thank you. [MASK] you to vice president [MASK] [MASK]. [MASK]'s a good guy [MASK] we'[MASK] [MASK] a [MASK] job together. and merry christmas, michigan. thank you [MASK] michigan. [MASK] a victory we had in michigan. [MASK] a victory was that [MASK] one [MASK] the [MASK] [MASK] [MASK] was [MASK] [MASK] greatest evening [MASK] but [MASK]'m thrilled to be here with [MASK] of hardworking patriots [MASK] we [MASK] the miracle [MASK] christmas, the greatness of america and the glory of god. thank [MASK] very [MASK]. and did you notice that everybody [MASK] saying merry [MASK] [MASK]? did you notice? [MASK] merry christmas. i [MASK] when i first started [MASK] [MASK] [MASK], this beautiful journey, [MASK] just said to [MASK] first lady, " you are so lucky. i [MASK] [MASK] on this fantastic journey. it's so much fun. they want to [MASK] [MASK] [MASK] you [MASK] they want to do [MASK] [MASK] [MASK] [MASK] " by [MASK] way [MASK] by [MASK] way, [MASK] the [MASK], it doesn'

### Train/Test Split
We now need to split the data into train and test datasets. We can make use of Dataset.train_test_split():

In [24]:
train_size = round(0.9 * len(lm_datasets))
test_size = (len(lm_datasets) - train_size)

dataset_split = lm_datasets.train_test_split(
    train_size=train_size, test_size=test_size, seed=42
)
dataset_split

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 1739
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 193
    })
})

### Login

In [25]:
from huggingface_hub import notebook_login

notebook_login()

# huggingface-cli login

Token is valid.
Your token has been saved in your configured git credential helpers (manager-core).
Your token has been saved to C:\Users\ksbon\.huggingface\token
Login successful


In [26]:
tf_train_dataset = model.prepare_tf_dataset(
    dataset_split["train"],
    collate_fn=data_collator,
    shuffle=True,
    batch_size=32,
)

tf_eval_dataset = model.prepare_tf_dataset(
    dataset_split["test"],
    collate_fn=data_collator,
    shuffle=False,
    batch_size=32,
)

In [204]:
# Load model from local dir
save_dir = "C:/Users/ksbon/Desktop/Jupyter/repos/Hugging Face models/distilbert-base-uncased-finetuned-dt-rally-speeches"
# model = TFAutoModelForMaskedLM.from_pretrained(save_dir)

# Load model from Huggingface hub
model = TFAutoModelForMaskedLM.from_pretrained("distilbert-base-uncased-finetuned-dt-rally-speeches")

All model checkpoint layers were used when initializing TFDistilBertForMaskedLM.

All the layers of TFDistilBertForMaskedLM were initialized from the model checkpoint at distilbert-base-uncased-finetuned-dt-rally-speeches.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertForMaskedLM for predictions without further training.


In [205]:
# from transformers import AdamWeightDecay
# optimizer = AdamWeightDecay(learning_rate=2e-5, weight_decay_rate=0.01)

In [206]:
# num_train_steps = len(tf_train_dataset)
num_train_steps = 250
optimizer, schedule = create_optimizer(
    init_lr=2e-5,
    num_warmup_steps=1_000,
    num_train_steps=num_train_steps,
    weight_decay_rate=0.01
)
model.compile(optimizer=optimizer, 
#               metrics=['accuracy']
             )

# Train in mixed-precision float16
# tf.keras.mixed_precision.set_global_policy("mixed_float16")

model_name = model_checkpoint.split("/")[-1]
callback = PushToHubCallback(
#     output_dir=f"{model_name}-finetuned-dt-rally-speeches", 
    output_dir=save_dir, 
    tokenizer=tokenizer,
#     hub_model_id="distilbert-base-uncased-finetuned-dt-rally-speeches",
#     hub_token="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",
    save_strategy="no" # getting an error with anything other than no
)

No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! To disable this behaviour please pass a loss argument, or explicitly pass `loss=None` if you do not want your model to compute a loss.
C:\Users\ksbon\Desktop\Jupyter\repos\Hugging Face models\distilbert-base-uncased-finetuned-dt-rally-speeches is already a clone of https://huggingface.co/Shmendel/distilbert-base-uncased-finetuned-dt-rally-speeches. Make sure you pull the latest changes with `repo.git_pull()`.


In [207]:
model.fit(x=tf_train_dataset, 
          validation_data=tf_eval_dataset,
          epochs=10,
          callbacks=[callback]
         )

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


Upload file tf_model.h5:   0%|          | 1.00/347M [00:00<?, ?B/s]

To https://huggingface.co/Shmendel/distilbert-base-uncased-finetuned-dt-rally-speeches
   76ad3d9..9b05dbb  main -> main



<keras.callbacks.History at 0x26b003f6f70>

In [222]:
# Manual model push to hub
# model.push_to_hub("distilbert-base-uncased-finetuned-dt-rally-speeches")

In [208]:
eval_loss = model.evaluate(tf_eval_dataset)
print(f"Perplexity: {math.exp(eval_loss):.2f}")

Perplexity: 5.13


In [37]:
base_model = TFAutoModelForMaskedLM.from_pretrained(model_checkpoint)

Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertForMaskedLM: ['activation_13']
- This IS expected if you are initializing TFDistilBertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFDistilBertForMaskedLM were initialized from the model checkpoint at distilbert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertForMaskedLM for predictions without further training.


In [38]:
# Base model in fill-mask configuration
base_mask_filler = pipeline(
    task="fill-mask", 
    model=base_model, 
    tokenizer=tokenizer
)

# Fine tuned model in fill-mask configuration
mask_filler = pipeline(
    task="fill-mask", 
    model=model, 
#     model="huggingface-Shmendel/distilbert-base-uncased-finetuned-dt-rally-speeches",
    tokenizer=tokenizer
)

In [240]:
# text = "It's [MASK] news" # fake is first

In [246]:
text = 'One [MASK] nation' # great is first and other similar ones are high up

In [247]:
# text = 'Let\'s make America [MASK] again!'  # great is second
# text = 'The [MASK] American nation'  # great is second

In [248]:
# text = 'Make [MASK] great again'
# text = "The [MASK] virus"
# text = "kung [MASK]"

# text = 'We will make America [MASK] again!'
# text = 'keep fighting keep [MASK]'


In [249]:
base_preds = base_mask_filler(text)

for pred in base_preds:
    print(f">>> {pred['sequence']} -> {pred['score']:.2f}")

>>> one hundred nation -> 0.04
>>> one - nation -> 0.04
>>> one sovereign nation -> 0.02
>>> one african nation -> 0.02
>>> one nation nation -> 0.02


In [250]:
preds = mask_filler(text)

for pred in preds:
    print(f">>> {pred['sequence']} -> {pred['score']:.2f}")

>>> one great nation -> 0.09
>>> one proud nation -> 0.04
>>> one strong nation -> 0.04
>>> one - nation -> 0.04
>>> one more nation -> 0.03


In [213]:
preds

[{'score': 0.42231783270835876,
  'token': 7098,
  'token_str': 'proud',
  'sequence': "let's make america proud again!"},
 {'score': 0.07160758972167969,
  'token': 2307,
  'token_str': 'great',
  'sequence': "let's make america great again!"},
 {'score': 0.06872954964637756,
  'token': 2844,
  'token_str': 'strong',
  'sequence': "let's make america strong again!"},
 {'score': 0.027919413521885872,
  'token': 3407,
  'token_str': 'happy',
  'sequence': "let's make america happy again!"},
 {'score': 0.01937711425125599,
  'token': 6428,
  'token_str': 'stronger',
  'sequence': "let's make america stronger again!"}]

In [148]:
# import jupyterthemes as jt
# from jupyterthemes import get_themes
# from jupyterthemes.stylefx import set_nb_theme
# # jt -t monokai -f fira -fs 10 -nf ptsans -nfs 11 -N -kl -cursw 2 -cursc r -cellw 95% -T

In [149]:
# Manual model save
# save_dir = "C:/Users/ksbon/Desktop/Jupyter/repos/Donald-Trump-Rally-Speeches-NLP/distilbert-base-uncased-finetuned-dt-rally-speeches"
# tokenizer.save_pretrained(save_dir)
# model.save_pretrained(save_dir)