In [None]:
from collections import Counter

import torch
import torch.nn as nn
from datasets import DatasetDict
from torch.utils.data import DataLoader
from transformers import AutoTokenizer
from transformers import DataCollatorForTokenClassification

from src.configuration.set_up_config_device import (
    get_allowed_cpu_count,
    set_up_config_device,
    set_up_device,
)
from src.data_processing.loading import load_iob_data
from src.data_processing.tokenization import create_tokenized_dataset
from src.model.train import train_model
from src.model.transformer import (
    Transformer,
    TransformerForNER,
)

In [None]:
# Resolve the runtime resources through the project helpers from
# src.configuration.set_up_config_device.
# `device` is later used to place the model (`.to(device)`) and is passed to train_model.
device = set_up_device()
cpu_count = get_allowed_cpu_count()
# NOTE(review): presumably the number of worker processes derived from cpu_count —
# confirm against set_up_config_device.
n_process = set_up_config_device(cpu_count)

In [None]:
# Pretrained tokenizer for the "bert-base-cased" checkpoint. Its vocab_size sizes the
# model vocabulary and the tokenizer is also handed to the data collator for padding.
# NOTE(review): add_prefix_space is normally an option of byte-level BPE tokenizers
# (GPT-2 / RoBERTa style) — confirm it has any effect for this checkpoint.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased", add_prefix_space=True)

In [None]:
# Seed PyTorch's global RNG so that a Restart & Run All repeats the same
# DataLoader shuffling order and (torch-based) weight initialisation.
# 42 matches the seed already used for the dataset splits.
SEED = 42
torch.manual_seed(SEED)

# Training hyperparameters — everything a reader is likely to tune lives here.
BATCH_SIZE = 32  # examples per batch for the train and validation DataLoaders
LEARNING_RATE = 5e-5  # learning rate given to the AdamW optimizer
NB_EPOCHS = 5  # passed to train_model as `epochs`
LOG_STEPS = 1  # passed to train_model as `log_steps`

In [None]:
# Keyword arguments for the base Transformer; the dict is unpacked as
# Transformer(**MODEL_PARAMETERS), so every key must match a constructor parameter name.
MODEL_PARAMETERS = {
    # NOTE(review): should equal the id the collator pads input_ids with
    # (tokenizer.pad_token_id) — confirm.
    "pad_idx": 0,
    "voc_size": tokenizer.vocab_size,
    "hidden_size": 256,
    "n_head": 4,
    "max_len": 512,
    "dec_max_len": 128,
    "ffn_hidden": 512,
    "n_layers": 4,
}

# **LOAD & PROCESS DATA**

In [None]:
# Read the raw training file with the project loader.
# NOTE(review): presumably `sentences` and `labels` are parallel per-sentence sequences
# of tokens and IOB tags — confirm against load_iob_data.
sentences, labels = load_iob_data("data/raw/train-nl.tsv")

In [None]:
# Build the tokenized dataset from the loaded sentences and labels.
# NOTE(review): save_path suggests the result is also written to (or cached at) this
# JSON file — confirm against create_tokenized_dataset.
tokenized_dataset = create_tokenized_dataset(
    sentences, labels, save_path="data/tokenized/tokenized_dataset.json"
)

In [None]:
# Hold out 20% of the examples; the remaining 80% form the training split.
train_test = tokenized_dataset.train_test_split(test_size=0.2, seed=42)

# Halve the held-out part: 10% validation and 10% test of the full dataset.
val_test = train_test["test"].train_test_split(test_size=0.5, seed=42)

# Collect the three splits under their final names, one key at a time.
split_dataset = DatasetDict()
split_dataset["train"] = train_test["train"]
split_dataset["validation"] = val_test["train"]
split_dataset["test"] = val_test["test"]

In [None]:
# Display the DatasetDict summary as a sanity check on the train/validation/test splits.
split_dataset

In [None]:
# Pad each batch dynamically to its longest sequence and return PyTorch tensors.
# `max_length` is deliberately not passed: with padding=True the tokenizer pads to the
# longest sequence in the batch and ignores `max_length`, so supplying it had no effect
# other than risking a warning.
data_collator = DataCollatorForTokenClassification(
    tokenizer=tokenizer, padding=True, return_tensors="pt"
)

# Columns removed before batching.
# NOTE(review): presumably the raw word-level columns the collator cannot turn into
# tensors — confirm against create_tokenized_dataset.
raw_columns = ["tokens", "ner_tags"]
train_dataset = split_dataset["train"].remove_columns(raw_columns)
val_dataset = split_dataset["validation"].remove_columns(raw_columns)

# Only the training batches are shuffled; validation keeps a fixed order.
train_loader = DataLoader(
    train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=data_collator
)
val_loader = DataLoader(
    val_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=data_collator
)

# **DESCRIPTIVE STATISTICS**

# **DEFINE & TRAIN MODEL**

In [None]:
# Flatten the per-example tag sequences of the training split into one long list.
all_labels = []
for example in split_dataset["train"]:
    all_labels.extend(example["ner_tags"])

# Distinct tag values seen in training, in ascending order.
unique_ids = sorted(set(all_labels))

# Identity mapping: every tag value maps to itself. Its length gives the number of
# labels for the classification head.
label2id = dict(zip(unique_ids, unique_ids))

In [None]:
# Instantiate the base Transformer from the configuration dict; each key of
# MODEL_PARAMETERS is passed as a keyword argument.
base_model = Transformer(**MODEL_PARAMETERS)

In [None]:
# Wrap the base Transformer for NER and move it to the selected device.
# hidden_size is read from MODEL_PARAMETERS instead of repeating the literal, so the
# wrapper cannot silently drift from the base model's configuration.
model = TransformerForNER(
    base_model,
    hidden_size=MODEL_PARAMETERS["hidden_size"],
    num_labels=len(label2id),
).to(device)
# Targets equal to -100 are excluded from the loss; -100 is also the default label
# padding value of DataCollatorForTokenClassification.
criterion = nn.CrossEntropyLoss(ignore_index=-100)
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)

In [None]:
# Run the project training loop on the train/validation loaders.
# NOTE(review): presumably the two return values are per-epoch (or per-step) loss
# histories — confirm against train_model.
train_losses, val_losses = train_model(
    model,
    train_loader,
    val_loader,
    criterion,
    optimizer,
    device,
    epochs=NB_EPOCHS,
    log_steps=LOG_STEPS,
)

# **EVALUATE RESULTS**