In [1]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import numpy as np
import re

In [None]:
from datasets import load_dataset
dataset = load_dataset("NbAiLab/NCC")

nynorsk_dataset = dataset.filter(lambda example: example.get("language", "") == "nn")


print(nynorsk_dataset)

NameError: name 'tf' is not defined

In [8]:
# tokenize_and_train.py
from tokenizers import BertWordPieceTokenizer

# Initialize a new tokenizer instance with appropriate options.
tokenizer = BertWordPieceTokenizer(lowercase=True)

# Train the tokenizer:
tokenizer.train(
    files=nynorsk_dataset["train"]["text"],
    vocab_size=30000,
    min_frequency=2,
    show_progress=True,
    special_tokens=["[PAD]", "[CLS]", "[SEP]", "[MASK]", "[UNK]"]
)

# Save the tokenizer to a directory for later use.
tokenizer.save_model("tokenizer/")








['tokenizer/vocab.txt']

In [9]:
# load_tokenizer.py
from transformers import BertTokenizer

# Load the custom tokenizer from the saved directory.
tokenizer = BertTokenizer.from_pretrained("tokenizer/")
print("Tokenizer loaded. Example tokens:", tokenizer.tokenize("Dette er en test for Nynorsk."))



Tokenizer loaded. Example tokens: ['[UNK]', '[UNK]', '[UNK]', '[UNK]', '[UNK]', '[UNK]', '[UNK]']


In [None]:
# dataset.py
import torch
from torch.utils.data import Dataset
from transformers import BertTokenizer

class NCCDataset(Dataset):
    def __init__(self, texts, tokenizer: BertTokenizer, max_len=128):
        self.texts = texts
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, index):
        text = self.texts[index]
        # Encode the text using the tokenizer and include special tokens.
        encoded = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            truncation=True,
            padding='max_length',
            return_attention_mask=True,
            return_tensors="pt"
        )
        return {
            'input_ids': encoded['input_ids'].squeeze(),         # shape: [max_len]
            'attention_mask': encoded['attention_mask'].squeeze()    # shape: [max_len]
        }


Dataset sample: {'input_ids': tensor([1, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]), 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])}


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import math
from torch.utils.data import DataLoader

# =======================
# 1. Custom Embedding Modules
# =======================

class PositionalEmbedding(nn.Module):
    def __init__(self, d_model, seq_len=128):
        super().__init__()
        pe = torch.zeros(seq_len, d_model)
        for pos in range(seq_len):
            for i in range(0, d_model, 2):
                pe[pos, i] = math.sin(pos / (10000 ** (i / d_model)))
                pe[pos, i + 1] = math.cos(pos / (10000 ** (i / d_model)))
        self.register_buffer('pe', pe.unsqueeze(0))  # shape: [1, seq_len, d_model]

    def forward(self, x):
        # x shape: [batch_size, seq_len, d_model]
        return x + self.pe[:, :x.size(1)]

class BERTEmbedding(nn.Module):
    def __init__(self, vocab_size, d_model, seq_len=128):
        super().__init__()
        self.token_embedding = nn.Embedding(vocab_size, d_model)
        # Here we assume 2 segments (e.g. sentence A and sentence B)
        self.segment_embedding = nn.Embedding(2, d_model)
        self.positional_embedding = PositionalEmbedding(d_model, seq_len)
        self.dropout = nn.Dropout(0.1)

    def forward(self, token_ids, segment_ids):
        # token_ids and segment_ids: [batch_size, seq_len]
        x = self.token_embedding(token_ids) + self.segment_embedding(segment_ids)
        x = self.positional_embedding(x)
        return self.dropout(x)

# =======================
# 2. Custom BERT Model Definition
# =======================

class CustomBERT(nn.Module):
    def __init__(self, vocab_size, d_model=256, seq_len=128, num_layers=4, num_heads=4, hidden_dim=512):
        super().__init__()
        self.embeddings = BERTEmbedding(vocab_size, d_model, seq_len)
        
        # Define a Transformer encoder (stack of encoder layers)
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=num_heads,
            dim_feedforward=hidden_dim,
            dropout=0.1
        )
        self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        
        # Masked Language Modeling head: project encoder output to the vocabulary space.
        self.mlm_head = nn.Linear(d_model, vocab_size)
    
    def forward(self, token_ids, segment_ids, attention_mask=None):
        # token_ids, segment_ids: [batch_size, seq_len]
        x = self.embeddings(token_ids, segment_ids)  # shape: [batch_size, seq_len, d_model]
        
        # PyTorch's Transformer encoder expects input shape: [seq_len, batch_size, d_model]
        x = x.transpose(0, 1)
        
        # Create key padding mask if provided (mask positions where attention_mask==0)
        key_padding_mask = (attention_mask == 0) if attention_mask is not None else None
        
        # Pass through the encoder layers.
        x = self.encoder(x, src_key_padding_mask=key_padding_mask)
        
        # Transform back to [batch_size, seq_len, d_model]
        x = x.transpose(0, 1)
        
        # Compute MLM logits.
        logits = self.mlm_head(x)  # shape: [batch_size, seq_len, vocab_size]
        return logits


Output logits shape: torch.Size([8, 128, 30522])




In [None]:
# Instantiate the custom tokenizer (assumed to be already trained and saved)
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained("tokenizer/")

# Instantiate the custom BERT model.
VOCAB_SIZE = 30000  # Should match the vocabulary size of your tokenizer.
model = CustomBERT(vocab_size=VOCAB_SIZE, d_model=256, seq_len=128,
                   num_layers=4, num_heads=4, hidden_dim=512)

# Place the model on GPU if available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Define the optimizer.
optimizer = optim.Adam(model.parameters(), lr=1e-4)

# Training loop.
num_epochs = 150

model.train()
for epoch in range(num_epochs):
    total_loss = 0
    for batch in dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        segment_ids = batch['segment_ids'].to(device)

        # In a full MLM training scenario, you would apply a masking strategy.
        # For this simplified example, we'll use input_ids as labels directly.
        labels = input_ids.clone()
        
        optimizer.zero_grad()
        logits = model(token_ids=input_ids, segment_ids=segment_ids, attention_mask=attention_mask)
        
        # Compute loss: flatten logits and labels.
        loss_fct = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)
        loss = loss_fct(logits.view(-1, VOCAB_SIZE), labels.view(-1))
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
        
    print(f"Epoch {epoch+1}/{num_epochs} - Loss: {total_loss/len(dataloader):.4f}")



TypeError: CustomBERT.forward() missing 1 required positional argument: 'segment_ids'

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Prepare a sample sentence with a masked token.
# This example sentence contains a [MASK] token where we want the model to predict the missing word.
test_sentence = "Dette er et [MASK] eksempel på en setning."
print("Test sentence:", test_sentence)

# Tokenize and encode the text, adding special tokens automatically.
encoded = tokenizer.encode_plus(
    test_sentence,
    add_special_tokens=True,
    return_tensors="pt",
    truncation=True,
    max_length=128,
    padding="max_length"
)

input_ids = encoded["input_ids"].to(device)
attention_mask = encoded["attention_mask"].to(device)

# Perform inference with no gradient tracking.
with torch.no_grad():
    outputs = model(input_ids=input_ids, attention_mask=attention_mask)
    # 'outputs.logits' has shape [batch_size, seq_length, vocab_size]
    predictions = outputs.logits

# Find the position of the [MASK] token.
mask_token_index = (input_ids == tokenizer.mask_token_id).nonzero(as_tuple=True)
if len(mask_token_index[0]) == 0:
    raise ValueError("No [MASK] token found in the input!")
# Assuming one [MASK] per input, obtain the index.
mask_index = mask_token_index[1].item()

# Get the predicted token for the masked position.
predicted_token_id = predictions[0, mask_index].argmax(dim=-1).item()
predicted_token = tokenizer.convert_ids_to_tokens(predicted_token_id)
print("Predicted token:", predicted_token)

# Replace the [MASK] token in the original sentence.
predicted_sentence = test_sentence.replace("[MASK]", predicted_token)
print("Predicted sentence:", predicted_sentence)

AttributeError: 'MaskedLMOutput' object has no attribute 'last_hidden_state'