In [1]:
from transformers import BertForMaskedLM, BertTokenizer
import torch

# Load BERT for MLM
bert_model = BertForMaskedLM.from_pretrained("bert-large-uncased")
bert_tokenizer = BertTokenizer.from_pretrained("bert-large-uncased")

  from .autonotebook import tqdm as notebook_tqdm
Some weights of the model checkpoint at bert-large-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [2]:
import numpy as np

def data_augment(text: str, p_bert: float) -> str:
    """Data augmentation with mask language modeling."""

    # Extract the subwords tokens from the text
    tokenized_text = np.array(text)

    # Randomly mask tokens in the text
    mask = np.random.rand(len(tokenized_text)) < p_bert
    masked_text = np.where(mask, "[MASK]", tokenized_text).tolist()
    mask_indices = np.where(mask)[0].tolist()

    # Convert to BERT input format
    indexed_tokens = bert_tokenizer.convert_tokens_to_ids(masked_text)
    tokens_tensor = torch.tensor([indexed_tokens])

    # Predict masked tokens
    with torch.no_grad():
        outputs = bert_model(tokens_tensor)
        predictions = outputs[0]

    # Extract predicted token indices for all masked positions at once
    predicted_indices = torch.argmax(predictions[0, mask_indices], dim=1).tolist()
    predicted_tokens = bert_tokenizer.convert_ids_to_tokens(predicted_indices)

    # Replace MASK tokens with predictions
    augmented_tokens = tokenized_text.copy()
    for idx, mask_pos in enumerate(mask_indices):
        augmented_tokens[mask_pos] = predicted_tokens[idx]

    # Convert back to text
    return bert_tokenizer.convert_tokens_to_string(augmented_tokens)

In [3]:
import pandas as pd
texts = pd.read_parquet('../data/text8/test.parquet')['text']
text = texts[0][:1000]
augmented_text = data_augment(text, p_bert=0.2)

print(augmented_text)

TypeError: len() of unsized object

In [4]:
from transformers import GPT2Tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

1

In [9]:
import warnings

# Suppress specific warning
warnings.filterwarnings("ignore", message=".*indexing errors.*")

tokenized_text = tokenizer.encode(texts[0][:10000], add_special_tokens=True)
tokenized_text = tokenized_text[:512]

text = tokenizer.decode(tokenized_text)
tokenized_text = bert_tokenizer.tokenize(text)

augmented = data_augment(tokenized_text[:512], p_bert=0.2)
augmented = tokenizer.encode(augmented,truncation=True, max_length=512)

input = augmented[:-1] # pad to 512
target = augmented[1:] # pad to 512

ff


In [10]:
input = augmented[:-1]
target = augmented[1:]

511