In [None]:
from transformers import BertForMaskedLM, BertTokenizer
import torch

# Load BERT for MLM
bert_model = BertForMaskedLM.from_pretrained("bert-large-uncased")
bert_tokenizer = BertTokenizer.from_pretrained("bert-large-uncased")

In [4]:
import numpy as np

def data_augment(text: str, p_bert: float) -> str:
    """Data augmentation with mask language modeling."""

    # Extract the subwords tokens from the text
    tokenized_text = np.array(bert_tokenizer.tokenize(text))

    # Randomly mask tokens in the text
    mask = np.random.rand(len(tokenized_text)) < p_bert
    masked_text = np.where(mask, "[MASK]", tokenized_text).tolist()
    mask_indices = np.where(mask)[0].tolist()

    # Convert to BERT input format
    indexed_tokens = bert_tokenizer.convert_tokens_to_ids(masked_text)
    tokens_tensor = torch.tensor([indexed_tokens])

    # Predict masked tokens
    with torch.no_grad():
        outputs = bert_model(tokens_tensor)
        predictions = outputs[0]

    # Extract predicted token indices for all masked positions at once
    predicted_indices = torch.argmax(predictions[0, mask_indices], dim=1).tolist()
    predicted_tokens = bert_tokenizer.convert_ids_to_tokens(predicted_indices)

    # Replace MASK tokens with predictions
    augmented_tokens = tokenized_text.copy()
    for idx, mask_pos in enumerate(mask_indices):
        augmented_tokens[mask_pos] = predicted_tokens[idx]

    # Convert back to text
    return bert_tokenizer.convert_tokens_to_string(augmented_tokens)

In [7]:
import pandas as pd
text = pd.read_parquet('../data/text8/test.parquet')['text']
text = text[0][:2000]
augmented_text = data_augment(text, p_bert=0.2)

print(augmented_text)

be ejected and hold it there examine the chamber to ensure it is clear , the magazine to go back under control push the manual assist fire the action and close the ejection port cover safety precaution magazine fitted perform to unload if the above safety precaution is used with a charged magazine fitted a round will be chambered load insert the charged magazine into the magazine housing there is a loud audible click when the magazine is properly fitted the firmness of the magazine can be tested by gently trying to remove the magazine strike it from its base with medium firmness to make sure it is home . pull the charging handle all the way to the rear and release with the forward assist if the weapon is not to be used immediately then put the fire selector to safe or close the ejection port cover fire marksmanship is a fine and thou art and is beyond the scope of this article however in case the weapon is fired by putting the fire selectorr either semi or burst auto on some occasions 

In [6]:
print(text)

be ejected and hold it there examine the chamber to ensure it is clear allow the action to go forward under control push the forward assist fire the action and close the ejection port cover safety precaution magazine fitted perform an unload if the above safety precaution is used with a charged magazine fitted a round will be chambered load insert a charged magazine into the magazine housing there is a slight audible click when the magazine is properly fitted the secureness of the magazine can be tested by gently trying to remove the magazine strike it from the base with medium firmness to make sure it locks home ready pull the charging handle all the way to the rear and release push the forward assist if the weapon is not to be fired immediately then put the fire selector to safe and close the ejection port cover fire marksmanship is a complex and subtle art and is beyond the scope of this article however in general the weapon is fired by putting the fire selector on either semi or bu