In [1]:
from transformers import BertForMaskedLM, BertTokenizer
import torch

# Load BERT for MLM
bert_model = BertForMaskedLM.from_pretrained("bert-large-uncased")
bert_tokenizer = BertTokenizer.from_pretrained("bert-large-uncased")
bert_model.eval()

  from .autonotebook import tqdm as notebook_tqdm
Some weights of the model checkpoint at bert-large-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 1024, padding_idx=0)
      (position_embeddings): Embedding(512, 1024)
      (token_type_embeddings): Embedding(2, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-23): 24 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm): LayerNorm((1024,), eps=1e-

In [2]:
import random

def mask_random_words(text, mask_prob=0.15):
    words = text.split()  # Split by whitespace to get whole words
    masked_text = []
    masked_indices = []

    for i, word in enumerate(words):
        # Apply masking probability to whole words
        if random.random() < mask_prob:
            subwords = bert_tokenizer.tokenize(word)  # Ensure correct tokenization
            masked_text.extend(["[MASK]"] * len(subwords))  # Mask all subword pieces
            masked_indices.append(i)  # Keep track of masked word index
        else:
            masked_text.extend(bert_tokenizer.tokenize(word))  # Keep word as is

    return " ".join(masked_text), masked_indices, words

In [3]:
def predict_with_bert(masked_text):
    inputs = bert_tokenizer(masked_text, return_tensors="pt")
    with torch.no_grad():
        outputs = bert_model(**inputs)

    predictions = outputs.logits
    masked_indices = (inputs.input_ids == bert_tokenizer.mask_token_id).nonzero(as_tuple=True)[1]

    predicted_tokens = []
    for idx in masked_indices:
        predicted_id = predictions[0, idx].argmax(dim=-1).item()
        predicted_tokens.append(bert_tokenizer.decode(predicted_id))

    return predicted_tokens

In [4]:
import pandas as pd
text = pd.read_parquet('../data/text8/test.parquet')['text'][0]
chunks = [text[i : i + 1024] for i in range(0, len(text), 1024)]
masked_text, masked_indices, original_tokens = mask_random_words(chunks[0], 0.05)
predicted_tokens = predict_with_bert(masked_text)


for i, pred in zip(masked_indices, predicted_tokens):
    original_tokens[i] = pred
augmented_text = " ".join(original_tokens)

print(augmented_text)

once ejected and hold it there in the chamber to ensure it is clear allow the action to go forward under control push the forward assist release the action and close the ejection port cover safety precaution magazine fitted perform an unload if the above safety precaution is used with a charged magazine fitted a round will be chambered load insert a charged magazine into the magazine housing there is a slight audible click when the magazine is properly fitted the secureness of the magazine can be tested by gently trying to remove the magazine strike it against the base with medium firmness to make sure it locks home ready pull the charging handle all the way to the rear and release push the forward assist if the weapon is not to be fired immediately then put the fire selector to safe and close the ejection port cover fire marksmanship is a complex and subtle art and is beyond the scope of this article however in general the weapon is fired by putting the fire selector on either semi or

In [20]:
print(chunks[0])

be ejected and hold it there examine the chamber to ensure it is clear allow the action to go forward under control push the forward assist fire the action and close the ejection port cover safety precaution magazine fitted perform an unload if the above safety precaution is used with a charged magazine fitted a round will be chambered load insert a charged magazine into the magazine housing there is a slight audible click when the magazine is properly fitted the secureness of the magazine can be tested by gently trying to remove the magazine strike it from the base with medium firmness to make sure it locks home ready pull the charging handle all the way to the rear and release push the forward assist if the weapon is not to be fired immediately then put the fire selector to safe and close the ejection port cover fire marksmanship is a complex and subtle art and is beyond the scope of this article however in general the weapon is fired by putting the fire selector on either semi or bu

In [21]:
print(masked_text)

be ejected and hold it there examine the chamber to ensure it is clear allow the action to go forward under [MASK] push the [MASK] assist fire the action and close the e ##ject ##ion port cover safety pre ##ca ##ution magazine fitted perform [MASK] un ##load if the above safety pre ##ca ##ution is used with a charged magazine [MASK] [MASK] round will be chamber ##ed load insert a charged [MASK] into the magazine housing there is a slight audible click when the magazine is properly fitted the secure ##ness of the magazine can be tested by gently trying to remove the magazine strike it from the base with medium firm ##ness to make sure it locks home ready pull the charging handle all the way to the rear and release push the forward assist if the [MASK] [MASK] not to be [MASK] immediately then put the fire selector to safe and close the e ##ject ##ion port cover fire marks ##manship is a complex and subtle [MASK] and is beyond the scope of this article however in [MASK] the weapon is fire