In [1]:
import numpy as np
import re

In [15]:
from datasets import load_dataset
from itertools import islice

# Load the dataset in streaming mode.
dataset = load_dataset("NbAiLab/NCC", streaming=True)

# Use the 'train' split from the dataset.
# Note: Make sure to filter on the correct language field.
nynorsk_dataset = dataset["train"].filter(lambda example: example.get("lang_fasttext", "") == "nn")

# Write a subset (e.g., first 1000 examples) to a file using islice.
with open("nynorsk_corpus.txt", "w", encoding="utf-8") as f:
    for i, example in enumerate(islice(nynorsk_dataset, 10000)):
        f.write(example["text"].strip() + "\n")


In [16]:
from tokenizers import BertWordPieceTokenizer

tokenizer = BertWordPieceTokenizer(lowercase=True)
tokenizer.train(
    files=["nynorsk_corpus.txt"],
    vocab_size=30000,
    min_frequency=2,
    show_progress=True,
    special_tokens=["[PAD]", "[CLS]", "[SEP]", "[MASK]", "[UNK]"]
)
tokenizer.save_model("tokenizer/")







['tokenizer/vocab.txt']

In [17]:
# load_tokenizer.py
from transformers import BertTokenizer

# Load the custom tokenizer from the saved directory.
tokenizer = BertTokenizer.from_pretrained("tokenizer/")
print("Tokenizer loaded. Example tokens:", tokenizer.tokenize("Dette er ein test for Nynorsk."))
# Check the vocabulary size
print("Vocabulary size:", tokenizer.vocab_size)
# Check the special tokens
print("Special tokens:", tokenizer.special_tokens_map)



Tokenizer loaded. Example tokens: ['dette', 'er', 'ein', 'test', 'for', 'nynorsk', '.']
Vocabulary size: 30000
Special tokens: {'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}


In [19]:
import torch
from torch.utils.data import Dataset
class NCCTorchDataset(Dataset):
    def __init__(self, hf_dataset, tokenizer: BertTokenizer, max_len=128):
        self.hf_dataset = hf_dataset
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.hf_dataset)
    
    def __getitem__(self, index):
        # Assume each item has a 'text' field.
        item = self.hf_dataset[index]
        text = item['text']
        encoded = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            truncation=True,
            padding="max_length",
            return_attention_mask=True,
            return_tensors="pt"
        )
        return {
            'input_ids': encoded['input_ids'].squeeze(0),         # shape: [max_len]
            'attention_mask': encoded['attention_mask'].squeeze(0),   # shape: [max_len]
            # If no segment info is available, default to zeros.
            'segment_ids': torch.zeros(encoded['input_ids'].shape, dtype=torch.long).squeeze(0)
        }

# -----------------------------------------
# 4. Load the NCC Dataset from Hugging Face
# -----------------------------------------
# Load the "NbAiLab/NCC" dataset using the Hugging Face datasets library.
# For demonstration, we select a small subset (e.g., first 100 samples).
hf_dataset = load_dataset("NbAiLab/NCC", split='train').select(range(5000))

# Load the custom tokenizer (assumed to be trained and saved in the "tokenizer/" directory).
tokenizer = BertTokenizer.from_pretrained("tokenizer/")

import torch
from torch.utils.data import DataLoader

# Create our PyTorch dataset.
dataset = NCCTorchDataset(hf_dataset, tokenizer, max_len=128)
dataloader = DataLoader(dataset, batch_size=2, shuffle=True)

In [20]:
import torch
import torch.nn as nn
import torch.optim as optim
import math
from torch.utils.data import DataLoader

# =======================
# 1. Custom Embedding Modules
# =======================

class PositionalEmbedding(nn.Module):
    def __init__(self, d_model, seq_len=128):
        super().__init__()
        pe = torch.zeros(seq_len, d_model)
        for pos in range(seq_len):
            for i in range(0, d_model, 2):
                pe[pos, i] = math.sin(pos / (10000 ** (i / d_model)))
                pe[pos, i + 1] = math.cos(pos / (10000 ** (i / d_model)))
        self.register_buffer('pe', pe.unsqueeze(0))  # shape: [1, seq_len, d_model]

    def forward(self, x):
        # x shape: [batch_size, seq_len, d_model]
        return x + self.pe[:, :x.size(1)]

class BERTEmbedding(nn.Module):
    def __init__(self, vocab_size, d_model, seq_len=128):
        super().__init__()
        self.token_embedding = nn.Embedding(vocab_size, d_model)
        # Here we assume 2 segments (e.g. sentence A and sentence B)
        self.segment_embedding = nn.Embedding(2, d_model)
        self.positional_embedding = PositionalEmbedding(d_model, seq_len)
        self.dropout = nn.Dropout(0.1)

    def forward(self, token_ids, segment_ids):
        # token_ids and segment_ids: [batch_size, seq_len]
        x = self.token_embedding(token_ids) + self.segment_embedding(segment_ids)
        x = self.positional_embedding(x)
        return self.dropout(x)

# =======================
# 2. Custom BERT Model Definition
# =======================

class CustomBERT(nn.Module):
    def __init__(self, vocab_size, d_model=256, seq_len=128, num_layers=4, num_heads=4, hidden_dim=512):
        super().__init__()
        self.embeddings = BERTEmbedding(vocab_size, d_model, seq_len)
        
        # Define a Transformer encoder (stack of encoder layers)
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=num_heads,
            dim_feedforward=hidden_dim,
            dropout=0.1
        )
        self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        
        # Masked Language Modeling head: project encoder output to the vocabulary space.
        self.mlm_head = nn.Linear(d_model, vocab_size)
    
    def forward(self, token_ids, segment_ids, attention_mask=None):
        # token_ids, segment_ids: [batch_size, seq_len]
        x = self.embeddings(token_ids, segment_ids)  # shape: [batch_size, seq_len, d_model]
        
        # PyTorch's Transformer encoder expects input shape: [seq_len, batch_size, d_model]
        x = x.transpose(0, 1)
        
        # Create key padding mask if provided (mask positions where attention_mask==0)
        key_padding_mask = (attention_mask == 0) if attention_mask is not None else None
        
        # Pass through the encoder layers.
        x = self.encoder(x, src_key_padding_mask=key_padding_mask)
        
        # Transform back to [batch_size, seq_len, d_model]
        x = x.transpose(0, 1)
        
        # Compute MLM logits.
        logits = self.mlm_head(x)  # shape: [batch_size, seq_len, vocab_size]
        return logits
    
    def save_model(self, path):
        torch.save(self.state_dict(), path)


In [21]:
# Instantiate the custom tokenizer (assumed to be already trained and saved)
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained("tokenizer/")
print("Tokenizer loaded. Example tokens:", tokenizer.tokenize("Dette er en test for Nynorsk."))

# Instantiate the custom BERT model.
VOCAB_SIZE = 30000  # Should match the vocabulary size of your tokenizer.
model = CustomBERT(vocab_size=VOCAB_SIZE, d_model=256, seq_len=128,
                   num_layers=4, num_heads=4, hidden_dim=512)

# Place the model on GPU if available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Define the optimizer.
optimizer = optim.Adam(model.parameters(), lr=1e-4)

# Training loop.
num_epochs = 3

model.train()
for epoch in range(num_epochs):
    total_loss = 0
    for batch in dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        segment_ids = batch['segment_ids'].to(device)

        # In a full MLM training scenario, you would apply a masking strategy.
        # For this simplified example, we'll use input_ids as labels directly.
        labels = input_ids.clone()
        
        optimizer.zero_grad()
        logits = model(token_ids=input_ids, segment_ids=segment_ids, attention_mask=attention_mask)
        
        # Compute loss: flatten logits and labels.
        loss_fct = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)
        loss = loss_fct(logits.view(-1, VOCAB_SIZE), labels.view(-1))
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
        
    print(f"Epoch {epoch+1}/{num_epochs} - Loss: {total_loss/len(dataloader):.4f}")
# Save the model after training.
import os
directory = "custom_bert_model/"
os.makedirs(directory, exist_ok=True)
model.save_model(os.path.join(directory, "custom_bert_model.pth"))

Tokenizer loaded. Example tokens: ['dette', 'er', 'en', 'test', 'for', 'nynorsk', '.']
Epoch 1/3 - Loss: 4.0497
Epoch 2/3 - Loss: 1.4543
Epoch 3/3 - Loss: 0.6466


In [22]:
tokenizer = BertTokenizer.from_pretrained("tokenizer/")
print("Tokenizer loaded. Example tokens:", tokenizer.tokenize("Dette er en test for Nynorsk."))
model.load_state_dict(torch.load("custom_bert_model/custom_bert_model.pth"))
model.eval()

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Define a test sentence with a [MASK] token.
test_sentence = "Dette er et godt eksempel [MASK] en setning."
print("Test sentence:", test_sentence)

# Tokenize input. Ensure the [MASK] token is preserved.
encoded = tokenizer.encode_plus(
    test_sentence,
    add_special_tokens=True,
    max_length=128,
    truncation=True,
    padding="max_length",
    return_attention_mask=True,
    return_tensors="pt"
)

input_ids = encoded["input_ids"].to(device)
attention_mask = encoded["attention_mask"].to(device)
# For single-sentence input without segment differentiation, set segment_ids to zeros.
segment_ids = torch.zeros_like(input_ids).to(device)

# Run the model
with torch.no_grad():
    logits = model(token_ids=input_ids, segment_ids=segment_ids, attention_mask=attention_mask)

# Find the index of the [MASK] token in the input_ids.
mask_token_index = (input_ids == tokenizer.mask_token_id).nonzero(as_tuple=True)
if len(mask_token_index[0]) == 0:
    raise ValueError("No [MASK] token found in the input!")
# Here, we assume only one [MASK] token per input.
mask_index = mask_token_index[1].item()

# Extract logits for the masked position and get the predicted token ID.
predicted_token_logits = logits[0, mask_index]
predicted_token_id = torch.argmax(predicted_token_logits).item()
predicted_token = tokenizer.convert_ids_to_tokens(predicted_token_id)

print("Predicted token:", predicted_token)
# Replace [MASK] with the predicted token in the string.
predicted_sentence = test_sentence.replace("[MASK]", predicted_token)
print("Predicted sentence:", predicted_sentence)

Tokenizer loaded. Example tokens: ['dette', 'er', 'en', 'test', 'for', 'nynorsk', '.']
Test sentence: Dette er et godt eksempel [MASK] en setning.
Predicted token: lin
Predicted sentence: Dette er et godt eksempel lin en setning.


In [23]:
import torch
import torch.nn.functional as F
from transformers import BertTokenizer

# Assume CustomBERT is defined in your code (see previous examples)
# from custom_bert import CustomBERT

# Load the custom tokenizer and model.
tokenizer = BertTokenizer.from_pretrained("tokenizer/")
model = CustomBERT(vocab_size=30000, d_model=256, seq_len=128, num_layers=4, num_heads=4, hidden_dim=512)
model.load_state_dict(torch.load("custom_bert_model/custom_bert_model.pth"))
model.eval()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Helper function to get the embedding vector for a given word.
def get_token_vector(word, tokenizer, model):
    # Tokenize the word.
    tokens = tokenizer.tokenize(word)
    if not tokens:
        raise ValueError(f"Word '{word}' could not be tokenized.")
    # For simplicity, assume the word corresponds to a single token.
    token_id = tokenizer.convert_tokens_to_ids(tokens[0])
    # Return the token embedding.
    return model.embeddings.token_embedding.weight[token_id]

word_a = "konge"
word_b = "mann"
word_c = "kvinne" 

vec_a = get_token_vector(word_a, tokenizer, model)
vec_b = get_token_vector(word_b, tokenizer, model)
vec_c = get_token_vector(word_c, tokenizer, model)

target_vector = vec_a - vec_b + vec_c

# Compute cosine similarities with all token embeddings.
all_embeddings = model.embeddings.token_embedding.weight
target_vector_norm = F.normalize(target_vector.unsqueeze(0), dim=-1)
all_embeddings_norm = F.normalize(all_embeddings, dim=-1)
cosine_sim = torch.matmul(target_vector_norm, all_embeddings_norm.transpose(0, 1))

# Retrieve the top 3 tokens with highest cosine similarity.
topk = torch.topk(cosine_sim, k=3)
top_values = topk.values.squeeze(0).tolist()
top_indices = topk.indices.squeeze(0).tolist()

print(f"Vector arithmetic result: {word_a} - {word_b} + {word_c} yields:")
for i, (score, idx) in enumerate(zip(top_values, top_indices)):
    token = tokenizer.convert_ids_to_tokens(idx)
    print(f"{i+1}: {token} (cosine similarity: {score:.4f})")


Vector arithmetic result: konge - mann + kvinne yields:
1: kvinne (cosine similarity: 0.6046)
2: konge (cosine similarity: 0.5614)
3: livsl (cosine similarity: 0.2522)
