In [48]:
!pip install transformers datasets torch




[notice] A new release of pip is available: 24.1.2 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
# Import required libraries
from datasets import load_dataset
from transformers import BertTokenizer, BertForMaskedLM, AdamW, get_linear_schedule_with_warmup
import torch
import random
from torch.utils.data import DataLoader

In [50]:
# Load dataset in streaming mode
dataset = load_dataset("bookcorpus", trust_remote_code=True, streaming=True)

# Get an iterator over the training dataset
train_iter = iter(dataset["train"])

# Fetch and print 5 text samples
for _ in range(5):
    sample = next(train_iter)
    print(sample["text"])


usually , he would be tearing around the living room , playing with his toys .
but just one look at a minion sent him practically catatonic .
that had been megan 's plan when she got him dressed earlier .
he 'd seen the movie almost by mistake , considering he was a little young for the pg cartoon , but with older cousins , along with her brothers , mason was often exposed to things that were older .
she liked to think being surrounded by adults and older kids was one reason why he was a such a good talker for his age .


In [None]:
import random
from transformers import BertTokenizer

# Load tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Function to tokenize and create Masked Language Modeling (MLM) examples
def mask_text(text, tokenizer, mlm_probability=0.3):  # Applies random masking with 30% probability.
    """
    This function masks random tokens in the text to create the Masked Language Modeling (MLM) task.
    """
    print("\nOriginal Text:", text)

    # Tokenize the input text into tokens (subwords for BERT)
    tokens = tokenizer.tokenize(text)
    print("Tokenized Text:", tokens)

    # Create a list to store the masked tokens
    masked_tokens = []
    masked_count = 0  # Counter to ensure that some tokens were replaced

    for token in tokens:
        if random.random() < mlm_probability:
            masked_tokens.append('[MASK]')
            masked_count += 1  # Token replaced
        else:
            masked_tokens.append(token)

    print("Masked Tokens:", masked_tokens)
    print(f"Total Masked Tokens: {masked_count}")  # Print the number of replaced words

    # Convert the masked tokens back to text
    masked_text = tokenizer.convert_tokens_to_string(masked_tokens)
    print("Masked Text:", masked_text)

    return masked_text

# Test the function with a sample text
sample_text = "The quick brown fox jumps over the lazy dog."
masked_sample = mask_text(sample_text, tokenizer)



Original Text: The quick brown fox jumps over the lazy dog.
Tokenized Text: ['the', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog', '.']
Masked Tokens: ['the', 'quick', 'brown', 'fox', 'jumps', '[MASK]', 'the', 'lazy', '[MASK]', '.']
Total Masked Tokens: 2
Masked Text: the quick brown fox jumps [MASK] the lazy [MASK] .


In [None]:
# Step 3: Apply masking on a sample text (fetching from dataset)
train_iter = iter(dataset["train"])  # Reinitialize the iterator

# Get the first sample text from the dataset
sample_text = next(train_iter)["text"]

# Mask some of the words
masked_sample = mask_text(sample_text, tokenizer)

print("\nOriginal Text: ", sample_text[:300])  # Print first 300 chars of original text
print("\nMasked Text: ", masked_sample[:300])  # Print first 300 chars of masked text

# Convert the masked text into input format for BERT (convert to token ids)
inputs = tokenizer(masked_sample, return_tensors='pt', padding=True, truncation=True, max_length=512)

# Print the tokenized input ids
print("\nTokenized Input IDs:", inputs["input_ids"])



Original Text: usually , he would be tearing around the living room , playing with his toys .
Tokenized Text: ['usually', ',', 'he', 'would', 'be', 'tearing', 'around', 'the', 'living', 'room', ',', 'playing', 'with', 'his', 'toys', '.']
Masked Tokens: ['usually', ',', '[MASK]', 'would', '[MASK]', '[MASK]', 'around', 'the', 'living', 'room', ',', 'playing', 'with', '[MASK]', 'toys', '[MASK]']
Total Masked Tokens: 5
Masked Text: usually , [MASK] would [MASK] [MASK] around the living room , playing with [MASK] toys [MASK]

Original Text:  usually , he would be tearing around the living room , playing with his toys .

Masked Text:  usually , [MASK] would [MASK] [MASK] around the living room , playing with [MASK] toys [MASK]

Tokenized Input IDs: tensor([[  101,  2788,  1010,   103,  2052,   103,   103,  2105,  1996,  2542,
          2282,  1010,  2652,  2007,   103, 10899,   103,   102]])


In [None]:
import torch
from torch.utils.data import Dataset, DataLoader

class MaskedLanguageModelingDataset(Dataset):
    def __init__(self, dataset, tokenizer, mlm_probability=0.15):
        """
        This class prepares the dataset for training, applying masking to the text.
        """
        super().__init__()  # Ensure proper initialization
        self.dataset = dataset
        self.tokenizer = tokenizer
        self.mlm_probability = mlm_probability

        print(f"✅ Dataset initialized with {len(dataset)} samples")  # Confirm number of samples

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        # Get the raw text from the dataset
        text = self.dataset[idx]

        print(f"\n🔹 Original Text ({idx}):", text[:100])  # Print only the first 100 characters to avoid long output

        # Apply masking to the text
        masked_text = mask_text(text, self.tokenizer, self.mlm_probability)

        print(f"🔹 Masked Text ({idx}):", masked_text[:100])  # Print first 100 characters of masked text

        # Tokenize the masked text into input ids
        inputs = self.tokenizer(masked_text, return_tensors='pt', padding='max_length', truncation=True, max_length=512)

        print(f"🔹 Tokenized Input IDs Shape: {inputs['input_ids'].shape}")  # Confirm dimensions

        # Convert input ids to PyTorch tensors and return them
        return {
            'input_ids': inputs['input_ids'].squeeze(0),  # Remove the batch dimension ([1, 512] → [512]).
            'attention_mask': inputs['attention_mask'].squeeze(0),
            'labels': inputs['input_ids'].squeeze(0)  # Labels are same as input_ids for MLM
        }

# Step 5: Instantiate the dataset (only using the first 100 texts for testing)
filtered_texts = [next(iter(dataset["train"]))["text"] for _ in range(100)]  # Extract 100 samples from the data
train_dataset = MaskedLanguageModelingDataset(filtered_texts, tokenizer)

# Create a DataLoader for batching
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)

# Test a batch
batch = next(iter(train_dataloader))
print("\n✅ Batch successfully loaded!")
print("🔹 Batch input_ids shape:", batch["input_ids"].shape)
print("🔹 Batch labels shape:", batch["labels"].shape)


✅ Dataset initialized with 100 samples

🔹 Original Text (46): usually , he would be tearing around the living room , playing with his toys .

Original Text: usually , he would be tearing around the living room , playing with his toys .
Tokenized Text: ['usually', ',', 'he', 'would', 'be', 'tearing', 'around', 'the', 'living', 'room', ',', 'playing', 'with', 'his', 'toys', '.']
Masked Tokens: ['usually', ',', 'he', 'would', 'be', 'tearing', '[MASK]', 'the', 'living', '[MASK]', ',', 'playing', 'with', 'his', '[MASK]', '.']
Total Masked Tokens: 3
Masked Text: usually , he would be tearing [MASK] the living [MASK] , playing with his [MASK] .
🔹 Masked Text (46): usually , he would be tearing [MASK] the living [MASK] , playing with his [MASK] .
🔹 Tokenized Input IDs Shape: torch.Size([1, 512])

🔹 Original Text (78): usually , he would be tearing around the living room , playing with his toys .

Original Text: usually , he would be tearing around the living room , playing with his toys .
Toke

In [None]:
# Extract the first 100 texts manually from dataset["train"]
train_texts = [next(iter(dataset["train"]))["text"] for _ in range(100)]

# Print the first 3 samples to ensure data is loading correctly
print("\n✅ Sample extracted texts:")
for i in range(3):
    print(f"🔹 Sample {i+1}: {train_texts[i][:100]}...")  # Print only the first 100 characters

# Step 5: Instantiate the dataset
train_dataset = MaskedLanguageModelingDataset(train_texts, tokenizer)

# Create a DataLoader for batching
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)

# Test loading a batch of data
batch = next(iter(train_dataloader))

# Print batch dimensions to ensure everything is working correctly
print("\n✅ Batch successfully loaded!")
print("🔹 Batch input_ids shape:", batch["input_ids"].shape)
print("🔹 Batch labels shape:", batch["labels"].shape)


✅ Sample extracted texts:
🔹 Sample 1: usually , he would be tearing around the living room , playing with his toys ....
🔹 Sample 2: usually , he would be tearing around the living room , playing with his toys ....
🔹 Sample 3: usually , he would be tearing around the living room , playing with his toys ....
✅ Dataset initialized with 100 samples

🔹 Original Text (18): usually , he would be tearing around the living room , playing with his toys .

Original Text: usually , he would be tearing around the living room , playing with his toys .
Tokenized Text: ['usually', ',', 'he', 'would', 'be', 'tearing', 'around', 'the', 'living', 'room', ',', 'playing', 'with', 'his', 'toys', '.']
Masked Tokens: ['[MASK]', ',', 'he', 'would', 'be', 'tearing', 'around', 'the', 'living', 'room', ',', 'playing', '[MASK]', 'his', 'toys', '.']
Total Masked Tokens: 2
Masked Text: [MASK] , he would be tearing around the living room , playing [MASK] his toys .
🔹 Masked Text (18): [MASK] , he would be tearing ar

In [66]:
# Step 6: Load the pre-trained BERT model for Masked Language Modeling (MLM)
model = BertForMaskedLM.from_pretrained('bert-base-uncased')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
from transformers import AdamW, get_linear_schedule_with_warmup

# Step 7: Set up the optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

# Print the number of trainable parameters
num_trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"✅ Optimizer initialized with {num_trainable_params:,} trainable parameters.")

# Set up a learning rate scheduler
epochs = 1
total_steps = len(train_dataloader) * epochs

scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Print training configuration details
print(f"✅ Training Configuration:")
print(f"🔹 Learning rate: {optimizer.param_groups[0]['lr']}")
print(f"🔹 Total epochs: {epochs}")
print(f"🔹 Total steps: {total_steps}")
print(f"🔹 Warmup steps: 0 (No warmup)")

# Print information about the first update in the scheduler
optimizer.step()  # Perform one update before training
scheduler.step()  # Update the learning rate
print(f"🔹 Updated Learning Rate after one step: {optimizer.param_groups[0]['lr']}")


✅ Optimizer initialized with 109,514,298 trainable parameters.
✅ Training Configuration:
🔹 Learning rate: 5e-05
🔹 Total epochs: 1
🔹 Total steps: 13
🔹 Warmup steps: 0 (No warmup)
🔹 Updated Learning Rate after one step: 4.615384615384616e-05


In [69]:
# Step 8: Training Loop (Example: 1 Epoch)
model.train()  # Set model to training mode
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Loop over the batches of data
for epoch in range(epochs):
    for batch in train_dataloader:
        # Move the batch to the GPU (if available)
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Forward pass: Compute model outputs
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)

        # Get the loss
        loss = outputs.loss

        # Backward pass: Compute gradients
        loss.backward()

        # Optimizer step: Update model parameters
        optimizer.step()
        # Scheduler step: Update learning rate
        scheduler.step()

        # Zero the gradients for the next step
        optimizer.zero_grad()

        # Print the loss for tracking
        print(f"Epoch {epoch+1}, Loss: {loss.item()}")

# After training, you can save the model and tokenizer
model.save_pretrained('./bert_finetuned')
tokenizer.save_pretrained('./bert_finetuned')


🔹 Original Text (84): usually , he would be tearing around the living room , playing with his toys .

Original Text: usually , he would be tearing around the living room , playing with his toys .
Tokenized Text: ['usually', ',', 'he', 'would', 'be', 'tearing', 'around', 'the', 'living', 'room', ',', 'playing', 'with', 'his', 'toys', '.']
Masked Tokens: ['usually', ',', 'he', 'would', '[MASK]', 'tearing', 'around', 'the', 'living', 'room', ',', 'playing', 'with', 'his', 'toys', '.']
Total Masked Tokens: 1
Masked Text: usually , he would [MASK] tearing around the living room , playing with his toys .
🔹 Masked Text (84): usually , he would [MASK] tearing around the living room , playing with his toys .
🔹 Tokenized Input IDs Shape: torch.Size([1, 512])

🔹 Original Text (54): usually , he would be tearing around the living room , playing with his toys .

Original Text: usually , he would be tearing around the living room , playing with his toys .
Tokenized Text: ['usually', ',', 'he', 'wou

('./bert_finetuned\\tokenizer_config.json',
 './bert_finetuned\\special_tokens_map.json',
 './bert_finetuned\\vocab.txt',
 './bert_finetuned\\added_tokens.json')

In [154]:
import torch

def predict_top_k_masked_word(text, model, tokenizer, top_k=5):
    """
    Predict the top `top_k` words for [MASK] within the input text.
    """
    print("\n🔍 Starting masked word prediction...")
    print(f"🔹 Original Text: {text}")

    masked_text = text  # Do not reapply masking
    inputs = tokenizer(masked_text, return_tensors='pt', padding=True, truncation=True, max_length=512).to(device)

    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits

    masked_indices = torch.where(inputs.input_ids[0] == tokenizer.mask_token_id)[0]

    if masked_indices.numel() == 0:
        print("❌ No [MASK] token found in the input! Returning original text.")
        return masked_text  

    print(f"✅ Found {len(masked_indices)} [MASK] tokens at positions: {masked_indices.tolist()}")

    for masked_index in masked_indices:
        top_k_token_ids = torch.topk(logits[0, masked_index], top_k).indices.tolist()
        top_k_words = tokenizer.convert_ids_to_tokens(top_k_token_ids)

        print(f"🔹 Top {top_k} predictions for [MASK] at position {masked_index}: {top_k_words}")

        # Replace only the first occurrence of [MASK], but this can be made more dynamic if needed
        predicted_word = top_k_words[0]
        masked_text = masked_text.replace("[MASK]", predicted_word, 1)

    print(f"✅ Final Predicted Text: {masked_text}\n")
    return masked_text

# 🛠️ **Test the model with top_k=5**
test_sentences = [
    "The quick brown [MASK] jumps over the lazy dog.",
    "I love to eat [MASK] for breakfast.",
    "She went to the [MASK] to buy some groceries.",
    "It was a [MASK] and stormy night."
]

print("\n🔍 **Model Predictions with top_k=5:**")
for sentence in test_sentences:
    print(f"🔹 Input: {sentence}")
    print(f"✅ Prediction: {predict_top_k_masked_word(sentence, model, tokenizer, top_k=5)}\n")



🔍 **Model Predictions with top_k=5:**
🔹 Input: The quick brown [MASK] jumps over the lazy dog.

🔍 Starting masked word prediction...
🔹 Original Text: The quick brown [MASK] jumps over the lazy dog.
✅ Found 1 [MASK] tokens at positions: [4]
🔹 Top 5 predictions for [MASK] at position 4: ['cat', 'mare', 'eye', 'man', 'coat']
✅ Final Predicted Text: The quick brown cat jumps over the lazy dog.

✅ Prediction: The quick brown cat jumps over the lazy dog.

🔹 Input: I love to eat [MASK] for breakfast.

🔍 Starting masked word prediction...
🔹 Original Text: I love to eat [MASK] for breakfast.
✅ Found 1 [MASK] tokens at positions: [5]
🔹 Top 5 predictions for [MASK] at position 5: ['fish', 'them', 'pancakes', 'peanuts', 'squash']
✅ Final Predicted Text: I love to eat fish for breakfast.

✅ Prediction: I love to eat fish for breakfast.

🔹 Input: She went to the [MASK] to buy some groceries.

🔍 Starting masked word prediction...
🔹 Original Text: She went to the [MASK] to buy some groceries.
✅ Found

In [140]:
def predict_single_masked_word(text, model, tokenizer):
    """
    Predict the most likely word for [MASK] within the input text.
    """
    return predict_top_k_masked_word(text, model, tokenizer, top_k=1)  # Set top_k=1 for single prediction


In [160]:
# Test sentences
test_sentences = [
    "The capital of France is [MASK].",
    "It was a [MASK] and stormy night."
]

print("\n🔍 **Testing Single Word Prediction:**")
for sentence in test_sentences:
    print(f"\n🔹 Input: {sentence}")
    predicted_sentence = predict_single_masked_word(sentence, model, tokenizer)
    print(f"✅ Prediction: {predicted_sentence}")



🔍 **Testing Single Word Prediction:**

🔹 Input: The capital of France is [MASK].

🔍 Starting masked word prediction...
🔹 Original Text: The capital of France is [MASK].
✅ Found 1 [MASK] tokens at positions: [6]
🔹 Top 1 predictions for [MASK] at position 6: ['paris']
✅ Final Predicted Text: The capital of France is paris.

✅ Prediction: The capital of France is paris.

🔹 Input: It was a [MASK] and stormy night.

🔍 Starting masked word prediction...
🔹 Original Text: It was a [MASK] and stormy night.
✅ Found 1 [MASK] tokens at positions: [4]
🔹 Top 1 predictions for [MASK] at position 4: ['cold']
✅ Final Predicted Text: It was a cold and stormy night.

✅ Prediction: It was a cold and stormy night.


In [157]:
# Example prediction
text = "I am a boy my name is [MASK], my mom is named [MASK] and my dad's name is [MASK] ."

print("\n🔍 **Running single prediction test...**")
print(f"🔹 Input Sentence: {text}")

# Run the prediction
predicted_sentence = predict_top_k_masked_word(text, model, tokenizer)

# Print the predicted word inside the sentence
print(f"\n✅ Final Prediction: {predicted_sentence}")



🔍 **Running single prediction test...**
🔹 Input Sentence: I am a boy my name is [MASK], my mom is named [MASK] and my dad's name is [MASK] .

🔍 Starting masked word prediction...
🔹 Original Text: I am a boy my name is [MASK], my mom is named [MASK] and my dad's name is [MASK] .
✅ Found 3 [MASK] tokens at positions: [8, 14, 22]
🔹 Top 5 predictions for [MASK] at position 8: ['adam', 'alex', 'joshua', 'james', 'david']
🔹 Top 5 predictions for [MASK] at position 14: ['jennifer', 'emily', 'michelle', 'samantha', 'sarah']
🔹 Top 5 predictions for [MASK] at position 22: ['peter', 'mark', 'mike', 'george', 'john']
✅ Final Predicted Text: I am a boy my name is adam, my mom is named jennifer and my dad's name is peter .


✅ Final Prediction: I am a boy my name is adam, my mom is named jennifer and my dad's name is peter .
