In [None]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules (e.g. the local `src` package) are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from datasets import DatasetDict
from transformers import AutoTokenizer
from collections import Counter
import matplotlib.pyplot as plt

from src.configuration.set_up_config_device import (
    get_allowed_cpu_count,
    set_up_config_device,
    set_up_device,
)
from src.data_processing.loading import load_iob_data
from src.data_processing.tokenization import create_tokenized_dataset
from src.data_processing.descriptive_statistics import descriptive_statistics
from src.model.transformer import TransformerForNER
from src.model.train import (
    train_model,
    plot_train_val_loss,
)
from src.evaluate.evaluate import evaluate_ner_model

In [None]:
# Resolve the compute device and CPU budget through the project's configuration helpers.
# `device` is later passed to model.to(...), train_model and evaluate_ner_model.
device = set_up_device()
cpu_count = get_allowed_cpu_count()
# NOTE(review): n_process is not referenced again in this notebook. It is presumably a
# worker/process count derived from cpu_count — confirm whether it should be used
# (e.g. by the DataLoaders) or whether the call is only needed for its side effects.
n_process = set_up_config_device(cpu_count)

In [None]:
# Load the tokenizer published with the GroNLP/bert-base-dutch-cased checkpoint.
# It is used to tokenize the dataset, to supply the padding id in the collate
# function, and to size the model (pad_idx / vocab_size in MODEL_PARAMETERS).
# NOTE(review): add_prefix_space is usually an option of byte-level BPE tokenizers;
# confirm it has the intended effect for this checkpoint's tokenizer.
tokenizer = AutoTokenizer.from_pretrained(
    "GroNLP/bert-base-dutch-cased", add_prefix_space=True
)

In [None]:
# Training hyperparameters shared by the cells below.
BATCH_SIZE = 32  # batch size of the train / validation / test DataLoaders
LEARNING_RATE = 5e-5  # learning rate given to the AdamW optimizer
NB_EPOCHS = 1  # number of epochs passed to train_model

In [None]:
# Keyword arguments unpacked into TransformerForNER(**MODEL_PARAMETERS, ...).
# pad_idx and vocab_size come straight from the tokenizer so the model agrees with
# the token ids it receives; the remaining entries are fixed architecture sizes.
MODEL_PARAMETERS = dict(
    pad_idx=tokenizer.pad_token_id,
    vocab_size=tokenizer.vocab_size,
    hidden_size=256,
    n_head=4,
    max_len=512,
    ffn_hidden=512,
    n_layers=4,
)

# **LOAD & PROCESS DATA**

In [None]:
# Read the raw Dutch training file. The two returned values feed both the
# tokenization step and descriptive_statistics further down.
# NOTE(review): presumably parallel sequences of tokens and IOB tags — confirm
# against load_iob_data.
sentences, labels = load_iob_data("data/raw/train-nl.tsv")

In [None]:
# Tokenize the sentences and obtain the label <-> id mappings. label2id sizes the
# model's output layer and supplies the label padding value in the collate function;
# id2label is handed to evaluate_ner_model.
# NOTE(review): save_path suggests the tokenized dataset is also written to disk —
# confirm in create_tokenized_dataset.
tokenized_dataset, label2id, id2label = create_tokenized_dataset(
    sentences, labels, tokenizer, save_path="data/tokenized/tokenized_dataset.json"
)

In [None]:
# Step 1: hold out 20% of the examples (fixed seed so the split is repeatable).
train_test = tokenized_dataset.train_test_split(test_size=0.2, seed=42)

# Step 2: cut the held-out 20% in half — one half for validation, one for testing.
val_test = train_test["test"].train_test_split(test_size=0.5, seed=42)

# Step 3: gather the three splits under explicit names (80% / 10% / 10% overall).
split_dataset = DatasetDict(
    train=train_test["train"],
    validation=val_test["train"],
    test=val_test["test"],
)

In [None]:
# Bare expression: display the DatasetDict so the three splits can be inspected.
split_dataset

In [None]:
def DataCollatorForToken(batch):
    """Collate a list of tokenized examples into one batch of padded tensors.

    Every example must provide "input_ids", "attention_mask" and "labels"
    sequences. Each field is converted to tensors and padded (at the end) to the
    longest sequence in the batch, with the batch dimension first.

    Relies on the notebook-level `tokenizer` and `label2id` for the padding values.

    Args:
        batch: list of example mappings as yielded by the dataset.

    Returns:
        dict with the keys "input_ids", "attention_mask" and "labels", each a
        padded tensor.
    """
    # Padding value per field: the tokenizer's pad id, 0 for "masked out", and the
    # id of the "O" tag for labels.
    # NOTE(review): padded label positions become indistinguishable from genuine
    # "O" tokens — confirm that the loss and the evaluation ignore them via
    # attention_mask.
    fill_values = {
        "input_ids": tokenizer.pad_token_id,
        "attention_mask": 0,
        "labels": label2id["O"],
    }

    collated = {}
    for field, fill in fill_values.items():
        sequences = [torch.tensor(example[field]) for example in batch]
        collated[field] = torch.nn.utils.rnn.pad_sequence(
            sequences, batch_first=True, padding_value=fill
        )
    return collated


# The collate function only needs the numeric fields, so the raw text columns are
# dropped before batching.
train_dataset = split_dataset["train"].remove_columns(["tokens", "ner_tags"])
val_dataset = split_dataset["validation"].remove_columns(["tokens", "ner_tags"])

# Settings common to both loaders; only the training loader reshuffles its data.
loader_kwargs = dict(batch_size=BATCH_SIZE, collate_fn=DataCollatorForToken)

train_loader = DataLoader(train_dataset, shuffle=True, **loader_kwargs)
val_loader = DataLoader(val_dataset, **loader_kwargs)

# **DESCRIPTIVE STATISTICS**

In [None]:
# Summarize the raw (pre-tokenization) sentences and labels loaded above.
descriptive_statistics(sentences, labels)

# **DEFINE & TRAIN MODEL**

In [None]:
# Build the tagger with one output class per entry in label2id, then move it to
# the selected device before the optimizer captures its parameters.
model = TransformerForNER(num_labels=len(label2id), **MODEL_PARAMETERS)
model = model.to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)

In [None]:
# Train for NB_EPOCHS epochs on train_loader with val_loader as the validation set.
# The two returned loss histories are plotted in the next cell.
# NOTE(review): save_path presumably receives the best checkpoint — confirm in
# src.model.train, and that the checkpoints/ directory exists before running.
train_losses, val_losses = train_model(
    model,
    train_loader,
    val_loader,
    optimizer,
    device,
    epochs=NB_EPOCHS,
    save_path="checkpoints/best_model.pt",
)

In [None]:
# Plot the training and validation loss histories returned by train_model.
plot_train_val_loss(train_losses, val_losses, title="Training vs Validation Loss")

# **EVALUATE RESULTS**

In [None]:
# Prepare the held-out test split the same way as the train/validation splits:
# drop the raw text columns so only the numeric fields reach the collate function.
test_dataset = split_dataset["test"].remove_columns(["tokens", "ner_tags"])

# Fixed: this cell previously passed `collate_fn=data_collator`, a name that is
# never defined in the notebook (NameError on Restart & Run All). The collate
# function defined earlier is DataCollatorForToken. No shuffling for evaluation.
test_loader = DataLoader(
    test_dataset, batch_size=BATCH_SIZE, collate_fn=DataCollatorForToken
)

In [None]:
# Evaluate the trained model on the held-out test loader; id2label is passed so
# predicted ids can be mapped back to tag names.
# NOTE(review): `model` holds the weights from the end of training — confirm whether
# the checkpoint written to save_path should be reloaded before evaluating.
evaluate_ner_model(model, test_loader, id2label, device=device)