In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import numpy as np
import re

In [5]:
from datasets import load_dataset
def load_nynorsk_sentences(limit=50000):
    dataset = load_dataset("NbAiLab/NCC", streaming=True)
    train_stream = dataset["train"]
    sentences = []
    for example in train_stream:
        if example.get("lang_fasttext") == "nn":
            text = example["text"]
            for s in re.split(r"[.!?]\\s+", text):
                if len(s.split()) > 3:
                    sentences.append(s.strip())
        if len(sentences) >= limit:
            break
    return sentences

nynorsk_sentences = load_nynorsk_sentences()
print("Number of Nynorsk sentences:", len(nynorsk_sentences))

  from .autonotebook import tqdm as notebook_tqdm


Number of Nynorsk sentences: 50000


In [6]:
def load_validation_sentences():
    dataset = load_dataset("NbAiLab/NCC", split="validation")
    sentences = []
    for example in dataset:
        if example.get("lang_fasttext") == "nn":
            text = example["text"]
            for s in re.split(r"[.!?]\\s+", text):
                if len(s.split()) > 3:
                    sentences.append(s.strip())
    return sentences
nynorsk_validation_sentences = load_validation_sentences()
print("Number of Nynorsk validation sentences:", len(nynorsk_validation_sentences))

Number of Nynorsk validation sentences: 10966


In [7]:
# PARAMETERS
VOCAB_SIZE = 12582912
N_SEGMENTS = 3
MAX_LENGTH = 512
EMBEDDING_DIM = 768
N_LAYERS = 12
ATTN_HEADS = 12
DROPOUT_RATE = 0.1





In [None]:
from torch.utils.data import Dataset

class NorwegianDataset(Dataset):
    def __init__(self, sentences, tokenizer, max_length=512):
        self.sentences = sentences
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        sentence = self.sentences[idx]
        encoding = self.tokenizer(
            sentence,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )
        # Squeeze to remove extra dimension (batch dim inside each sample)
        encoding = {key: val.squeeze(0) for key, val in encoding.items()}
        return encoding
from torch.utils.data import DataLoader
from transformers import BertTokenizer
# Load the tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")

dataset = NorwegianDataset(nynorsk_sentences, tokenizer)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)


In [10]:
import torch
import torch.nn as nn
class BERTEmbedding(nn.Module):
    def __init__(self, vocab_size, n_segments, max_length, embedding_dim, dropout_rate):
        super().__init__()
        self.token_embedding = nn.Embedding(vocab_size, embedding_dim)
        self.segment_embedding = nn.Embedding(n_segments, embedding_dim)
        self.position_embedding = nn.Embedding(max_length, embedding_dim)

        self.dropout = nn.Dropout(dropout_rate)
        self.pos_input = torch.tensor([[i for i in range(max_length)]])

    def forward(self, seq, seg):
        embed_values = self.token_embedding(seq) + self.segment_embedding(seg) + self.position_embedding(self.pos_input)
        embed_values = self.dropout(embed_values)
        return embed_values
    
class BERT(nn.Module):
    def __init__(self, vocab_size, n_segments, max_length, embedding_dim, n_layers, attn_heads, dropout_rate):
        super().__init__()
        self.embedding = BERTEmbedding(vocab_size, n_segments, max_length, embedding_dim, dropout_rate)
        self.encoder_layer = nn.TransformerEncoderLayer(embedding_dim, attn_heads, embedding_dim * 4, dropout=dropout_rate, activation="gelu")
        self.encoder_block = nn.TransformerEncoder(self.encoder_layer, n_layers)

    def forward(self, seq, seg):
        out = self.embedding(seq, seg)
        out = self.encoder_block(out)
        return out


In [11]:
class BERTMLM(nn.Module):
    def __init__(self, bert_model, vocab_size):
            super().__init__()
            self.bert = bert_model
            self.mlm_head = nn.Linear(bert_model.embedding.token_embedding.embedding_dim, vocab_size)
            
    def forward(self, seq, seg):
        hidden_states = self.bert(seq, seg)
        # Assuming you want to predict for every token
        prediction_scores = self.mlm_head(hidden_states)
        return prediction_scores

In [13]:
import torch

def mask_tokens(inputs, tokenizer, mlm_probability=0.15):
    labels = inputs.clone()
    # Create a mask for positions to mask
    probability_matrix = torch.full(labels.shape, mlm_probability)
    special_tokens_mask = [
        tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist()
    ]
    probability_matrix.masked_fill_(torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0)
    masked_indices = torch.bernoulli(probability_matrix).bool()
    
    # Only compute loss on masked tokens
    labels[~masked_indices] = -100  # Using -100 as ignore index
    
    # 80% replace with [MASK]
    indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices
    inputs[indices_replaced] = tokenizer.convert_tokens_to_ids(tokenizer.mask_token)
    
    # 10% replace with random token
    indices_random = torch.bernoulli(torch.full(labels.shape, 0.5)).bool() & masked_indices & ~indices_replaced
    random_words = torch.randint(len(tokenizer), labels.shape, dtype=torch.long)
    inputs[indices_random] = random_words[indices_random]
    
    # The rest 10% keep unchanged (masked_indices & not replaced by mask or random)
    return inputs, labels


In [14]:
import torch.optim as optim

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BERTMLM(BERT(VOCAB_SIZE, N_SEGMENTS, MAX_LENGTH, EMBEDDING_DIM, N_LAYERS, ATTN_HEADS, DROPOUT_RATE), VOCAB_SIZE)
model.to(device)

optimizer = optim.AdamW(model.parameters(), lr=5e-5)
loss_fn = nn.CrossEntropyLoss(ignore_index=-100)

num_epochs = 3  # Adjust as needed

model.train()
for epoch in range(num_epochs):
    for batch in dataloader:
        # Move input tensors to device
        input_ids = batch['input_ids'].to(device)
        # Use segment ids if available; otherwise, create a dummy tensor
        token_type_ids = batch.get('token_type_ids', torch.zeros_like(input_ids)).to(device)

        # Apply the MLM masking function
        inputs_masked, labels = mask_tokens(input_ids.clone(), tokenizer)
        inputs_masked = inputs_masked.to(device)
        labels = labels.to(device)
        
        # Forward pass
        outputs = model(inputs_masked, token_type_ids)
        
        # Compute loss; reshape to (batch_size*seq_length, vocab_size)
        loss = loss_fn(outputs.view(-1, outputs.size(-1)), labels.view(-1))
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        print(f"Epoch {epoch} Loss: {loss.item()}")




: 