In [1]:
import numpy as np
import re

# First we need to load our dataset
We have selected the NbAiLab/NCC dataset.
This is a dataset constructed by the National Library of Norway, optimized for training large language models.
We only want the Nynorsk examples, since our BERT model focuses on Nynorsk.

In [None]:
from datasets import load_dataset
from itertools import islice

# Number of Nynorsk documents written to the training corpus file.
N_TRAIN_DOCS = 800

# streaming=True iterates over the remote dataset lazily instead of downloading it all.
dataset = load_dataset("NbAiLab/NCC", streaming=True)

# Keep only documents whose `lang_fasttext` field is "nn" (a missing field counts as no match).
nynorsk_dataset = dataset["train"].filter(lambda example: example.get("lang_fasttext", "") == "nn")

# One stripped document per line; the index from enumerate() was never used, so it is dropped.
with open("nynorsk_corpus.txt", "w", encoding="utf-8") as f:
    for example in islice(nynorsk_dataset, N_TRAIN_DOCS):
        f.write(example["text"].strip() + "\n")


  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# How many validation documents to preview and how many to write to disk.
N_PREVIEW_DOCS = 5
N_TEST_DOCS = 800

# Stream the validation split and keep only documents whose `lang_fasttext` field is "nn".
test_dataset = load_dataset("NbAiLab/NCC", split="validation", streaming=True)

test_nynorsk_dataset = test_dataset.filter(lambda example: example.get("lang_fasttext", "") == "nn")

# Preview a few raw records to see what the fields look like.
for example in islice(test_nynorsk_dataset, N_PREVIEW_DOCS):
    print(example)

# One stripped document per line; the index from enumerate() was never used, so it is dropped.
with open("nynorsk_test_corpus.txt", "w", encoding="utf-8") as f:
    for example in islice(test_nynorsk_dataset, N_TEST_DOCS):
        f.write(example["text"].strip() + "\n")

{'id': 'maalfrid_4b84113933f7134b94f68fbf804a70405b08e68e_23', 'doc_type': 'maalfrid_ssb', 'publish_year': 2021, 'lang_fasttext': 'nn', 'lang_fasttext_conf': '0.446', 'text': 'u) Nr ul c) u) o_ Nr Nr 001 0) 0 1,- 0) e-- CiD CO CD CD 0 Nt LO Nr Tr Co Tr ,- Nr c) c4 0) co_ 0) co r- Tr c» o Nr ul u) Tr co r- Nr a) Nr u)'}
{'id': 'maalfrid_f1ebddd1220542416ef98935770372f0fc8c011b_61', 'doc_type': 'maalfrid_regjeringen', 'publish_year': 2021, 'lang_fasttext': 'nn', 'lang_fasttext_conf': '0.607', 'text': 'Boktittelen Språk er makt av psykologen Rolv Mik\xad kel Blakar frå 1973 opna eit nytt perspektiv på språk og vart raskt eit munnhell. No er det også vorte vanleg å seia at språk er ei viktig kjelde til makt, eller at språk og språkbruk representerer makt. Språk og språkbruk er integrert i samfunnet, «der kontroll, påvirkning og makt er uttrykk for sentrale mellommenneskelege relasjonar», skriv Blakar. Men språk er ikkje alltid makt, og makt dreier seg om mykje meir enn språk og språkbruk. 

# Tokenizing and setting special tokens
Before we start creating our BERT model, we first define our tokenizer. The tokenizer builds a tokenizer model that works with a fixed vocabulary size.

It is also necessary to define some special tokens. These tokens each have different responsibilities:

- [PAD] tokens are added to sentences so all of them have equal length
- [CLS] tokens are classification tokens; they serve as a start of sentence (SOS) and represent the meaning of the sentence
- [SEP] represents the end of sentence (EOS) and also the separation between sentences
- [MASK] is used for word replacement during masked language modeling tasks
- [UNK] is used for filling in for words that are outside the vocabulary of the tokenizer

In [4]:
from tokenizers import BertWordPieceTokenizer

import os

# Train a lowercasing WordPiece tokenizer on the Nynorsk corpus file.
tokenizer = BertWordPieceTokenizer(lowercase=True)
tokenizer.train(
    files=["nynorsk_corpus.txt"],
    vocab_size=10000,
    min_frequency=2,
    show_progress=True,
    special_tokens=["[PAD]", "[CLS]", "[SEP]", "[MASK]", "[UNK]"]
)

# save_model() writes vocab.txt into the given directory but does not create it,
# so make sure it exists before saving (otherwise a fresh checkout fails here).
os.makedirs("tokenizer", exist_ok=True)
tokenizer.save_model("tokenizer/")







['tokenizer/vocab.txt']

# Just a small script for loading in the tokenizer model from a directory
Now we do not need to train it every time we want to test something quickly. Yippieee

In [None]:
# load_tokenizer.py
from transformers import BertTokenizer
import torch

# Reload the vocabulary saved under tokenizer/ and print a quick sanity check.
tokenizer = BertTokenizer.from_pretrained("tokenizer/")

sample_tokens = tokenizer.tokenize("Dette er ein test for Nynorsk.")
print("Tokenizer loaded. Example tokens:", sample_tokens)
print("Vocabulary size:", tokenizer.vocab_size)
print("Special tokens:", tokenizer.special_tokens_map)



Tokenizer loaded. Example tokens: ['dette', 'er', 'ein', 'test', 'for', 'nynorsk', '.']
Vocabulary size: 10000
Special tokens: {'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}


# NCCTorchDataset
Before sending our dataset into a BERT model we need to prepare it.
In this class we use the tokenizer we trained earlier to encode the text included in the dataset.

An important feature to mention here is the DataCollatorForLanguageModeling class. It masks a percentage of the tokens so the data is ready to use for training a BERT model on MLM tasks. For each batch that is pulled from the dataloader a new masking pattern is created, ensuring that the masking differs between batches.

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, DataCollatorForLanguageModeling

class NCCTorchDataset(Dataset):
    """Map-style dataset that tokenizes one text row per item into fixed-length inputs.

    Args:
        hf_dataset: Indexable collection whose items expose a ``'text'`` entry.
        tokenizer: Tokenizer used to encode each text.
        max_len: Length every encoded sequence is truncated or padded to.
    """

    def __init__(self, hf_dataset, tokenizer: BertTokenizer, max_len=128):
        self.hf_dataset = hf_dataset
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Number of rows in the wrapped dataset."""
        return len(self.hf_dataset)

    def __getitem__(self, index):
        """Encode row ``index`` and return its input ids, attention mask and segment ids."""
        encoded = self.tokenizer.encode_plus(
            self.hf_dataset[index]['text'],
            add_special_tokens=True,
            max_length=self.max_len,
            truncation=True,
            padding="max_length",
            return_attention_mask=True,
            return_tensors="pt"
        )
        # squeeze(0) drops the leading size-1 axis so each item is a flat sequence.
        ids = encoded['input_ids'].squeeze(0)
        mask = encoded['attention_mask'].squeeze(0)

        # Every token is assigned segment 0: the items are single texts, not pairs.
        return {
            'input_ids':      ids,
            'attention_mask': mask,
            'token_type_ids': torch.zeros_like(ids, dtype=torch.long),
        }

from datasets import load_dataset
from sklearn.model_selection import train_test_split
# Load the corpus file as a line-per-example dataset and hold out 10% for development.
hf_dataset = load_dataset("text", data_files={"alle": "nynorsk_corpus.txt"}, split="alle")
split = hf_dataset.train_test_split(test_size=0.1, seed=42)
train_hf, dev_hf = split['train'], split['test']

tokenizer = BertTokenizer.from_pretrained("tokenizer/")

train_ds = NCCTorchDataset(train_hf, tokenizer, max_len=128)
dev_ds = NCCTorchDataset(dev_hf, tokenizer, max_len=128)

# The collator applies MLM masking (15% selection probability) each time a batch is built.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

train_loader = DataLoader(train_ds, batch_size=32, shuffle=True, collate_fn=data_collator)
dev_loader = DataLoader(dev_ds, batch_size=32, shuffle=False, collate_fn=data_collator)

# Pull one batch from each loader and show the tensor shapes as a sanity check.
for heading, loader in (("Train batch shapes:", train_loader), ("\nDev batch shapes:", dev_loader)):
    print(heading)
    batch = next(iter(loader))
    for k, v in batch.items():
        print(f"  {k:15s}: {tuple(v.shape)}")

Generating alle split: 800 examples [00:00, 82094.37 examples/s]

Train batch shapes:
  input_ids      : (32, 128)
  attention_mask : (32, 128)
  token_type_ids : (32, 128)
  labels         : (32, 128)

Dev batch shapes:
  input_ids      : (32, 128)
  attention_mask : (32, 128)
  token_type_ids : (32, 128)
  labels         : (32, 128)





# Now the good stuff
In the following cell we have prepared classes for positional embedding, BERT embedding and a CustomBERT model that we are going to train later. 

## Positional embedding

Positional embedding is the first part of the pipeline. It is a clever way to distinguish where in a sequence a word is placed, and also the distance to other words in the sequence. Sine and cosine functions are used to create unique positional encodings for each position in the sequence.

## BERT embedding
The BERTembedding class is used to create input embeddings for the BERT model
It combines token embeddings, segment embeddings and positional embeddings.
The token embeddings are learned from the vocabulary, while the segment embeddings are used to differentiate between two segments in the input. The positional embeddings are added to the token embeddings to provide information about the position of each token. The dropout layer is there to prevent overfitting.

# CustomBERT
The CustomBERT class is a simplified version of the BERT model.
It consists of an embedding layer, a transformer encoder, and a masked language modeling (MLM) head.
The transformer encoder is composed of multiple layers, each containing multi-head self-attention
and feedforward neural networks.
The MLM head is used to predict the masked tokens in the input sequence.
The forward method takes token IDs, segment IDs, and an optional attention mask as input,
and returns the logits for the masked language modeling task.
The save_model method allows saving the model's state dictionary to a specified path.

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import math

class PositionalEmbedding(nn.Module):
    """Fixed sinusoidal position encodings that are added to a batch of embeddings.

    Position ``pos`` gets ``sin(pos / 10000**(i / d_model))`` in every even
    column ``i`` and the cosine of the same angle in column ``i + 1``. The table
    is registered as a buffer named ``pe`` with shape ``(1, seq_len, d_model)``,
    so it is stored with the module but is not a trainable parameter.

    Args:
        d_model: Width of the embeddings. Odd widths are supported; the last
            column then holds a sine term without a cosine partner.
        seq_len: Number of positions to precompute.
    """

    def __init__(self, d_model, seq_len=128):
        super().__init__()
        # Compute the angles in float64 (like the scalar math.sin/math.cos
        # formulation) and cast once when writing into the table.
        positions = torch.arange(seq_len, dtype=torch.float64).unsqueeze(1)  # (seq_len, 1)
        even_cols = torch.arange(0, d_model, 2, dtype=torch.float64)         # (ceil(d_model/2),)
        angles = positions / torch.pow(10000.0, even_cols / d_model)

        pe = torch.zeros(seq_len, d_model)
        pe[:, 0::2] = torch.sin(angles).to(pe.dtype)
        # With an odd d_model there is one odd column fewer than even columns,
        # so only the first d_model // 2 angles get a cosine.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2]).to(pe.dtype)
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        """Return ``x`` plus the encodings of its first ``x.size(1)`` positions."""
        return x + self.pe[:, :x.size(1)]

class BERTEmbedding(nn.Module):
    """Input embeddings: token + segment lookups, then positional encoding and dropout.

    Args:
        vocab_size: Number of rows in the token embedding table.
        d_model: Width of every embedding vector.
        seq_len: Number of positions passed on to the positional embedding.
    """

    def __init__(self, vocab_size, d_model, seq_len=128):
        super().__init__()
        self.token_embedding = nn.Embedding(vocab_size, d_model)
        # Two rows: one per segment id (0 or 1).
        self.segment_embedding = nn.Embedding(2, d_model)
        self.positional_embedding = PositionalEmbedding(d_model, seq_len)
        self.dropout = nn.Dropout(0.1)

    def forward(self, token_ids, segment_ids):
        """Embed ``token_ids`` and ``segment_ids`` and return the combined, dropped-out result."""
        token_vecs = self.token_embedding(token_ids)
        segment_vecs = self.segment_embedding(segment_ids)
        with_positions = self.positional_embedding(token_vecs + segment_vecs)
        return self.dropout(with_positions)
    

class CustomBERT(nn.Module):
    """Small BERT-style encoder with a masked-language-modeling head.

    The model is a ``BERTEmbedding`` layer, a stack of
    ``nn.TransformerEncoderLayer`` blocks and a linear layer that projects each
    position to vocabulary logits.

    Args:
        vocab_size: Size of the token vocabulary (also the width of the logits).
        d_model: Hidden width of embeddings and encoder layers.
        seq_len: Number of positions the embedding layer precomputes.
        num_layers: Number of encoder layers.
        num_heads: Number of attention heads per layer.
        dim_feedforward: Width of the feed-forward sublayer in each encoder layer.
    """

    def __init__(self, vocab_size, d_model=256, seq_len=128, num_layers=4, num_heads=4, dim_feedforward=512):
        super().__init__()
        self.embeddings = BERTEmbedding(vocab_size, d_model, seq_len)
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=num_heads,
            dim_feedforward=dim_feedforward,
            dropout=0.1
        )
        self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)

        # Masked Language Modeling head: project encoder output to the vocabulary space.
        self.mlm_head = nn.Linear(d_model, vocab_size)

    def forward(self, token_ids, segment_ids, attention_mask=None):
        """Return MLM logits of shape ``[batch_size, seq_len, vocab_size]``.

        Args:
            token_ids: Token ids, ``[batch_size, seq_len]``.
            segment_ids: Segment ids, same shape as ``token_ids``.
            attention_mask: Optional mask with the same shape; positions equal
                to 0 are excluded from attention as padding.
        """
        x = self.embeddings(token_ids, segment_ids)  # [batch_size, seq_len, d_model]
        # The encoder layers are built without batch_first, so they expect
        # [seq_len, batch_size, d_model].
        x = x.transpose(0, 1)
        # True marks the positions the encoder must ignore.
        key_padding_mask = (attention_mask == 0) if attention_mask is not None else None
        x = self.encoder(x, src_key_padding_mask=key_padding_mask)
        # Back to [batch_size, seq_len, d_model] for the per-position projection.
        x = x.transpose(0, 1)
        logits = self.mlm_head(x)  # [batch_size, seq_len, vocab_size]
        return logits

    def save_model(self, path):
        """Write the state dict to ``path``, creating missing parent directories.

        ``path`` may be a string/``os.PathLike`` or anything else ``torch.save``
        accepts (for example an open binary file object).
        """
        import os

        # torch.save does not create directories; do it here so a long training
        # run is not lost because the target folder is missing.
        if isinstance(path, (str, os.PathLike)):
            parent = os.path.dirname(os.fspath(path))
            if parent:
                os.makedirs(parent, exist_ok=True)
        torch.save(self.state_dict(), path)


# First we train our own customBERT


## Following steps are used

Create a model and what hyperparameters it should have.

Optimizer
We use an AdamW optimizer to update the model's parameters based on the gradients computed during backpropagation.

Loss function
The loss function is a cross-entropy loss that computes the difference between our model's predicted token distribution and the "true" tokens we want it to predict.
By setting all unmasked positions in the labels to -100, we can use ignore_index=-100 to skip those positions when averaging the loss. This penalizes the model only for getting masked tokens wrong, matching the BERT MLM objective.

GradScaler
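Then we use a gradient scaler together with mixed precision. Mixed precision runs many operations in 16-bit, which uses a lot less memory than 32-bit operations, and the scaler scales the loss so that small gradients do not underflow. ChatGPT suggested this would help, so we tried it.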

## Then the epochs

Each epoch loops through the whole dataset once. The first lines load the tensors from the batch. Then zero_grad() is called to clear the gradient buffers from the previous step.
Autocast enables mixed precision. The logits are reshaped from [B, L, V] to [B*L, V] and the labels from [B, L] to [B*L], where B is the batch size, L the sequence length and V the vocabulary size. This makes it possible for CrossEntropyLoss to compute the average negative log-likelihood over the masked tokens.

Backpropagation is done through the GradScaler, which takes the loss as input and scales it before the gradients are computed and the model is updated.

These techniques were not used for some of the earlier models.

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import get_linear_schedule_with_warmup
from tqdm.auto import tqdm

import os

device     = torch.device("cuda" if torch.cuda.is_available() else "cpu")
VOCAB_SIZE = tokenizer.vocab_size

model = CustomBERT(
    vocab_size=VOCAB_SIZE,
    d_model=768,
    seq_len=128,
    num_layers=4,
    num_heads=12,
    dim_feedforward=3072
).to(device)

optimizer = optim.AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)

# Linear warmup over the first 10% of all optimizer steps, then linear decay.
num_epochs   = 3
total_steps  = num_epochs * len(train_loader)
warmup_steps = int(0.1 * total_steps)
scheduler    = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=total_steps
)

# Label -100 marks positions that do not contribute to the loss.
loss_fn = nn.CrossEntropyLoss(ignore_index=-100)
# Mixed precision has to target the device the model runs on; hard-coding "cpu"
# left CUDA runs without autocast and with a mismatched scaler.
scaler  = torch.GradScaler(device.type)

# torch.save does not create directories, so make sure the target exists
# before the first checkpoint is written.
BEST_MODEL_PATH = "custom_bert_model/best_mlm_model.pth"
os.makedirs(os.path.dirname(BEST_MODEL_PATH), exist_ok=True)

best_dev_loss = float("inf")


def _mlm_batch_loss(batch):
    """Move one collated batch to `device`, run the model and return the MLM loss."""
    input_ids      = batch["input_ids"].to(device)
    attention_mask = batch["attention_mask"].to(device)
    token_type_ids = batch["token_type_ids"].to(device)
    labels         = batch["labels"].to(device)

    logits = model(
        token_ids      = input_ids,
        segment_ids    = token_type_ids,
        attention_mask = attention_mask
    )
    # Flatten [B, L, V] logits and [B, L] labels so the loss sees one row per token.
    return loss_fn(logits.view(-1, VOCAB_SIZE), labels.view(-1))


print("Training started...\n")

for epoch in range(1, num_epochs + 1):
    # — Training —
    model.train()
    running_train_loss = 0.0
    train_loop = tqdm(train_loader, desc=f"Epoch {epoch} [train]", unit="batch")
    for step, batch in enumerate(train_loop, start=1):
        optimizer.zero_grad()
        with torch.autocast(device.type):
            loss = _mlm_batch_loss(batch)
        scaler.scale(loss).backward()
        # Unscale before clipping so the threshold applies to the real gradient norm.
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        scaler.step(optimizer)
        scaler.update()
        scheduler.step()

        running_train_loss += loss.item()
        train_loop.set_postfix(train_loss=f"{running_train_loss/step:.4f}")

    avg_train_loss = running_train_loss / len(train_loader)
    print(f"\n→ Epoch {epoch} Train Loss: {avg_train_loss:.4f}")

    # — Evaluation on Dev —
    model.eval()
    running_dev_loss = 0.0
    dev_loop = tqdm(dev_loader, desc=f"Epoch {epoch} [dev]  ", unit="batch")

    for step, batch in enumerate(dev_loop, start=1):
        with torch.no_grad():
            loss = _mlm_batch_loss(batch)
            running_dev_loss += loss.item()

        dev_loop.set_postfix(dev_loss=f"{running_dev_loss/step:.4f}")

    avg_dev_loss = running_dev_loss / len(dev_loader)
    print(f"Epoch {epoch} Dev   Loss: {avg_dev_loss:.4f}")


    # — Save best model —
    if avg_dev_loss < best_dev_loss:
        best_dev_loss = avg_dev_loss
        torch.save(model.state_dict(), BEST_MODEL_PATH)
        print("  * New best dev loss, model saved.")

print("\nTraining complete.")


Training started...



Epoch 1 [train]:   0%|          | 0/23 [00:00<?, ?batch/s]

In [None]:
import torch
from transformers import BertTokenizer  # pip install transformers

# 1) Load tokenizer & model
tokenizer = BertTokenizer.from_pretrained("tokenizer/")

VOCAB_SIZE = tokenizer.vocab_size

# Model trained on 500 examples with a VOCAB_SIZE of 10000, and tuned parameters, + masked data and backpropagation.
# (`dim_feedforward==3072` was a comparison, not a keyword argument, and did not even parse.)
model = CustomBERT(vocab_size=VOCAB_SIZE, d_model=768, seq_len=128, num_layers=4, num_heads=12, dim_feedforward=3072)


# Load the custom BERT model with tuned parameters trained on 2000 examples.
#model = CustomBERT(vocab_size=VOCAB_SIZE, d_model=768, seq_len=128,num_layers=12, num_heads=12, dim_feedforward=512)


# Load model tuned with parameters trained on 8000 examples.
# model = CustomBERT(vocab_size=VOCAB_SIZE, d_model=768, seq_len=128,num_layers=4, num_heads=4, dim_feedforward=512)

# Load model with smaller parameters trained on 10000 examples.
# model = CustomBERT(vocab_size=VOCAB_SIZE, d_model=256, seq_len=128,num_layers=4, num_heads=4, dim_feedforward=512)


# Load model trained on 100000 examples using the same parameters as the small, with a gpu.
#model = CustomBERT(vocab_size=VOCAB_SIZE, d_model=256, seq_len=128,num_layers=4, num_heads=4, dim_feedforward=512)

# Load model from directory.
# NOTE(review): the checkpoint must have been trained with the hyper-parameters
# chosen above, otherwise load_state_dict() fails on mismatched shapes.
current_model = "mlm_finetuned.pth"
checkpoint = torch.load(
    "custom_bert_model/" + current_model,
    map_location=torch.device("cpu")
)
model.load_state_dict(checkpoint)
model.to("cpu")
model.eval()

# 2) Prepare a masked sentence
sentence = "Dette er ein [MASK] for Nynorsk."
inputs = tokenizer(
    sentence,
    padding='max_length',
    max_length=128,
    truncation=True,
    return_tensors='pt'
)
token_ids = inputs['input_ids']
attention_mask = inputs['attention_mask']
segment_ids = torch.zeros_like(token_ids)

# 3) Forward pass
with torch.no_grad():
    logits = model(token_ids, segment_ids, attention_mask)

# 4) Find the first masked position and read its logits
mask_token_index = (token_ids == tokenizer.mask_token_id).nonzero(as_tuple=True)
if mask_token_index[0].numel() == 0:
    raise ValueError(f"No {tokenizer.mask_token} token found in: {sentence!r}")
batch_idx, seq_pos = mask_token_index[0][0], mask_token_index[1][0]
mask_logits = logits[batch_idx, seq_pos]

# 5) Show the top-k most likely tokens for the masked position
top_k = 5
top_k_indices = torch.topk(mask_logits, top_k).indices
top_k_tokens = tokenizer.convert_ids_to_tokens(top_k_indices.tolist())
print(f"Top {top_k} predictions for [MASK]: {top_k_tokens}")

# 6) Take the single best token and print the completed sentence
predicted_id = mask_logits.argmax(dim=-1).item()
predicted_token = tokenizer.convert_ids_to_tokens(predicted_id)
print(f"Original sentence: {sentence}")
# This is the id of the predicted token, not of [MASK] itself.
print(f"Predicted token ID: {predicted_id}")
print(f"Masked token: {tokenizer.mask_token}")
print(f"Prediction for [MASK]: {predicted_token}")
# Build the filled-in sentence from `sentence` so it stays correct if the input changes.
print(sentence.replace(tokenizer.mask_token, predicted_token))


Top 10 predictions for [MASK]: ['.', 'i', "'", ',', '##ד', '##el', '##ed', '##in', '##et', '##en']
Original sentence: Dette er ein [MASK] for Nynorsk.
Masked token ID: 18
Masked token: [MASK]
Prediction for [MASK]: .
Dette er ein . for Nynorsk.




# Let's try to score the models on classification
We will wrap the CustomBERT in a BertForSequenceClassification.

Then we gather all predictions and all labels as NumPy arrays.

For metrics we create a classification_report that shows precision/recall/F1 scores.

In the end we will visualize the results using a confusion matrix.

In [None]:
import math
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from typing import Dict

def evaluate_mlm(
    model: nn.Module,
    dataloader: DataLoader,
    vocab_size: int,
    device: torch.device
) -> Dict[str, float]:
    """
    Evaluate a masked language model on a dataloader.

    Only positions whose label is not -100 are scored. Batches without any
    such position are skipped.

    Args:
        model: Module called as ``model(token_ids=..., segment_ids=...,
            attention_mask=...)`` and returning logits of shape [B, L, V].
        dataloader: Iterable of dict batches with the keys ``input_ids``,
            ``attention_mask``, ``token_type_ids`` and ``labels``.
        vocab_size: Size V of the last logits dimension.
        device: Device the model and the batches are moved to.

    Returns a dictionary with:
      - avg_loss: mean cross-entropy per scored token
      - perplexity: exp(avg_loss) (``inf`` if that overflows)
      - top1_accuracy: share of scored tokens whose best prediction is correct
      - top5_accuracy: share of scored tokens whose label is among the 5 best
        predictions (or among all V when V < 5)

    If no token is scored at all, every value is ``nan``.
    """
    model.eval()
    model.to(device)
    # Summing per batch and dividing once at the end weights every token equally.
    loss_fn = nn.CrossEntropyLoss(ignore_index=-100, reduction="sum")

    total_loss = 0.0
    total_masked = 0
    correct1 = 0
    correct5 = 0

    with torch.no_grad():
        for batch in dataloader:
            input_ids      = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            token_type_ids = batch["token_type_ids"].to(device)
            labels         = batch["labels"].to(device)

            mask_pos = labels != -100
            num_masked = int(mask_pos.sum().item())
            if num_masked == 0:
                # Nothing to score; a mean loss over zero tokens would be NaN.
                continue

            logits = model(
                token_ids = input_ids,
                segment_ids = token_type_ids,
                attention_mask = attention_mask
            )  # [B, L, V]

            total_loss += loss_fn(
                logits.reshape(-1, vocab_size), labels.reshape(-1)
            ).item()
            total_masked += num_masked

            # Ranking by logits equals ranking by softmax probabilities.
            masked_logits = logits[mask_pos]  # [N_masked, V]
            true_ids = labels[mask_pos]       # [N_masked]

            # Top-1
            correct1 += (masked_logits.argmax(dim=-1) == true_ids).sum().item()

            # Top-5 (capped so tiny vocabularies do not make topk fail)
            k = min(5, masked_logits.size(-1))
            top5 = torch.topk(masked_logits, k=k, dim=-1).indices  # [N_masked, k]
            correct5 += (top5 == true_ids.unsqueeze(-1)).any(dim=-1).sum().item()

    if total_masked == 0:
        nan = float("nan")
        return {
            "avg_loss": nan,
            "perplexity": nan,
            "top1_accuracy": nan,
            "top5_accuracy": nan
        }

    avg_loss = total_loss / total_masked
    try:
        perplexity = math.exp(avg_loss)
    except OverflowError:
        perplexity = float("inf")

    return {
        "avg_loss": avg_loss,
        "perplexity": perplexity,
        "top1_accuracy": correct1 / total_masked,
        "top5_accuracy": correct5 / total_masked
    }

def _load_custom_bert(checkpoint_name, **model_kwargs):
    """Build a CustomBERT from `model_kwargs` and load custom_bert_model/<checkpoint_name> into it.

    The weights are mapped to CPU on load; the model is then moved to `device`
    and put in eval mode.
    """
    bert = CustomBERT(**model_kwargs)
    state = torch.load(
        "custom_bert_model/" + checkpoint_name,
        map_location=torch.device("cpu")
    )
    bert.load_state_dict(state)
    bert.to(device)
    bert.eval()
    return bert


def _print_results(title, results):
    """Print `title` followed by one indented `key: value` line per metric."""
    print(title)
    for key, value in results.items():
        print(f"  {key}: {value:.4f}")


# NOTE(review): these checkpoints use vocab_size=30000 while dev_loader is built
# with the tokenizer loaded from tokenizer/ — confirm they were trained on that
# same vocabulary, otherwise the scores are not comparable.
custom_bert1 = _load_custom_bert(
    "custom_bert_model.pth",
    vocab_size=30000, d_model=256, seq_len=128, num_layers=4, num_heads=4, dim_feedforward=512
)
custom_bert2 = _load_custom_bert(
    "custom_bert_model2.pth",
    vocab_size=30000, d_model=768, seq_len=128, num_layers=12, num_heads=12, dim_feedforward=512
)

# Evaluate the models
dev_results = evaluate_mlm(
    model=custom_bert1,
    dataloader=dev_loader,
    vocab_size=30000,
    device=device
)
_print_results("CustomBert Evaluation results on dev set:", dev_results)

cb_2_results = evaluate_mlm(
    model=custom_bert2,
    dataloader=dev_loader,
    vocab_size=30000,
    device=device
)
_print_results("\n Custombert2 Evaluation results on dev set:", cb_2_results)

# `model` and VOCAB_SIZE are whatever the preceding cells left in the kernel.
mlmf_results = evaluate_mlm(
    model=model,
    dataloader=dev_loader,
    vocab_size=VOCAB_SIZE,
    device=device
)
_print_results("\nMLM Finetuned Evaluation results on dev set:", mlmf_results)

i1_bert = _load_custom_bert(
    "i1_custom_bert_model.pth",
    vocab_size=30000, d_model=256, seq_len=128, num_layers=4, num_heads=4, dim_feedforward=512
)
i1_results = evaluate_mlm(
    model=i1_bert,
    dataloader=dev_loader,
    vocab_size=30000,
    device=device
)
_print_results("\nI1 CustomBert Evaluation results on dev set:", i1_results)
    


CustomBert Evaluation results on dev set:
  avg_loss: 11.1442
  perplexity: 69164.2232
  top1_accuracy: 0.0635
  top5_accuracy: 0.0642

 Custombert2 Evaluation results on dev set:
  avg_loss: 10.5152
  perplexity: 36871.4029
  top1_accuracy: 0.0430
  top5_accuracy: 0.0909

MLM Finetuned Evaluation results on dev set:
  avg_loss: 8.4661
  perplexity: 4750.8034
  top1_accuracy: 0.0395
  top5_accuracy: 0.1118

I1 CustomBert Evaluation results on dev set:
  avg_loss: 12.4231
  perplexity: 248486.2151
  top1_accuracy: 0.0536
  top5_accuracy: 0.0536


# Now let's train a predefined BERT model from Hugging Face

In [None]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from torch import optim

from transformers import BertForMaskedLM
# Start from the pretrained "bert-base-uncased" masked-LM checkpoint and place it
# on the GPU when one is available.
hf_device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = BertForMaskedLM.from_pretrained("bert-base-uncased")
model.to(hf_device)

import random

def mask_tokens(inputs, tokenizer, mlm_probability=0.15):
    """
    Build masked inputs and labels for masked language modeling.

    Each non-special token is selected independently with probability
    ``mlm_probability``. Selected positions are overwritten with the [MASK]
    token id in ``inputs`` (which is modified in place and returned), and the
    labels keep the original id there. All other label positions are -100.

    Returns:
        Tuple ``(inputs, labels)``.
    """
    targets = inputs.clone()

    # 1 marks a special token in each row; those must never be selected.
    is_special = torch.tensor(
        [
            tokenizer.get_special_tokens_mask(row, already_has_special_tokens=True)
            for row in targets.tolist()
        ],
        dtype=torch.bool,
    )

    select_prob = torch.full(targets.shape, mlm_probability)
    select_prob.masked_fill_(is_special, value=0.0)
    chosen = torch.bernoulli(select_prob).bool()

    # The loss is computed only where a token was selected.
    targets[~chosen] = -100

    mask_id = tokenizer.convert_tokens_to_ids(tokenizer.mask_token)
    inputs[chosen] = mask_id
    return inputs, targets

import torch
from tqdm.auto import tqdm

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
optimizer = optim.AdamW(model.parameters(), lr=5e-5)
num_epochs = 3

# NOTE(review): the batches are tokenized with the tokenizer from tokenizer/, while
# the model starts from the bert-base-uncased checkpoint — confirm that the
# vocabulary mismatch is intended.
model.train()
for epoch in range(num_epochs):
    print(f"Epoch {epoch+1}/{num_epochs}")
    epoch_loss = 0.0
    num_batches = 0
    # `train_loader` is the loader defined earlier in the notebook; the name
    # `dataloader` was never defined at notebook level.
    for batch in tqdm(train_loader):
        optimizer.zero_grad()
        # Move inputs to device:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # The dataset yields the segment ids under 'token_type_ids'.
        token_type_ids = batch['token_type_ids'].to(device)

        if 'labels' in batch:
            # The collator already masked this batch; masking again would turn
            # [MASK] ids into prediction targets.
            masked_input_ids = input_ids
            labels = batch['labels'].to(device)
        else:
            # Unmasked batch: apply masking for MLM here.
            masked_input_ids, labels = mask_tokens(input_ids.clone(), tokenizer, mlm_probability=0.15)
            labels = labels.to(device)

        outputs = model(
            input_ids=masked_input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            labels=labels
        )
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        num_batches += 1
    # Report the mean over the epoch rather than only the last batch's loss.
    print(f"Epoch {epoch+1} Loss: {epoch_loss / max(num_batches, 1):.4f}")

In [42]:
# Save the model after training.
import os

directory = "custom_bert_model/"
os.makedirs(directory, exist_ok=True)
# save_pretrained() takes a directory, not a file name. Use the same folder the
# loading cell reads from ("custom_bert_model/BERTmlm"); the former "BERTmlm.pth"
# target did not match it.
model_name = "BERTmlm"
# NOTE: `model` must be the Hugging Face BertForMaskedLM from the training cell above;
# a CustomBERT left in the kernel has no save_pretrained().
model.save_pretrained(os.path.join(directory, model_name))

AttributeError: 'CustomBERT' object has no attribute 'save_pretrained'

In [43]:
from transformers import pipeline, BertTokenizer, BertForMaskedLM

# Where the fine-tuned masked-LM weights and our own vocabulary live.
model_dir = "custom_bert_model/BERTmlm"
tokenizer_dir = "tokenizer/"

model = BertForMaskedLM.from_pretrained(model_dir)
tokenizer = BertTokenizer.from_pretrained(tokenizer_dir)

# Wrap both in a fill-mask pipeline.
fill_mask = pipeline("fill-mask", model=model, tokenizer=tokenizer)

# The sentence must contain the tokenizer's own mask token ("[MASK]" here).
test_sentence = "Dette er et eksempel på [MASK] norsk setning."
results = fill_mask(test_sentence)

# One line per candidate: the proposed token and its score.
for result in results:
    token_str, score = result['token_str'], result['score']
    print(f"Prediction: {token_str}, Score: {score:.4f}")


SafetensorError: Error while deserializing header: MetadataIncompleteBuffer

In [53]:
import torch
import torch.nn.functional as F
from transformers import BertTokenizer

# Assume CustomBERT is defined in your code (see previous examples)
# from custom_bert import CustomBERT

# Load the custom tokenizer and model.
tokenizer = BertTokenizer.from_pretrained("tokenizer/")
# model = CustomBERT(vocab_size=VOCAB_SIZE, d_model=768, seq_len=128,num_layers=12, num_heads=12, dim_feedforward=512)
# CustomBERT calls its feed-forward width `dim_feedforward`; the former
# `hidden_dim=512` keyword is not a parameter of the class defined in this notebook.
model = CustomBERT(vocab_size=30000, d_model=256, seq_len=128, num_layers=4, num_heads=4, dim_feedforward=512)
model.load_state_dict(torch.load("custom_bert_model/i1_custom_bert_model.pth", map_location="cpu"))
model.eval()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

def get_token_vector(word, tokenizer, model):
    """Return the token-embedding row of the first token that ``word`` is split into.

    Only the first token is used; any further pieces of the word are ignored.

    Raises:
        ValueError: If the tokenizer produces no tokens for ``word``.
    """
    pieces = tokenizer.tokenize(word)
    if not pieces:
        raise ValueError(f"Word '{word}' could not be tokenized.")
    first_id = tokenizer.convert_tokens_to_ids(pieces[0])
    embedding_table = model.embeddings.token_embedding.weight
    return embedding_table[first_id]

word_a = "dag"
word_b = "seint"

# Add the two token-embedding vectors to get the query vector.
vec_a = get_token_vector(word_a, tokenizer, model)
vec_b = get_token_vector(word_b, tokenizer, model)
target_vector = vec_a + vec_b

# Cosine similarity between the query and every row of the embedding table:
# normalize both sides to unit length, then take dot products.
all_embeddings = model.embeddings.token_embedding.weight
target_vector_norm = F.normalize(target_vector.unsqueeze(0), dim=-1)
all_embeddings_norm = F.normalize(all_embeddings, dim=-1)
cosine_sim = target_vector_norm @ all_embeddings_norm.T

# Keep the 6 most similar tokens.
topk = torch.topk(cosine_sim, k=6)
top_values = topk.values.squeeze(0).tolist()
top_indices = topk.indices.squeeze(0).tolist()

print(f"Vector arithmetic result: {word_a} + {word_b} yields:")
for rank, (score, idx) in enumerate(zip(top_values, top_indices), start=1):
    token = tokenizer.convert_ids_to_tokens(idx)
    print(f"{rank}: {token} (cosine similarity: {score:.4f})")


Vector arithmetic result: dag + seint yields:
1: seint (cosine similarity: 0.7218)
2: dag (cosine similarity: 0.6949)
3: ##nende (cosine similarity: 0.2359)
4: admin (cosine similarity: 0.2355)
5: ferdene (cosine similarity: 0.2219)
6: ##snemnda (cosine similarity: 0.2189)
