In [1]:
# Enable IPython's autoreload extension in mode 2, so edited modules are
# re-imported automatically before each cell executes.
%load_ext autoreload
%autoreload 2

In [16]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from datasets import DatasetDict
from transformers import DataCollatorForTokenClassification
from transformers import AutoTokenizer
from collections import Counter

from src.configuration.set_up_config_device import (
    get_allowed_cpu_count,
    set_up_config_device,
    set_up_device,
)
from src.data_processing.loading import load_iob_data
from src.data_processing.tokenization import create_tokenized_dataset
from src.data_processing.descriptive_statistics import descriptive_statistics
from src.model.transformer import (
    Transformer,
    TransformerForNER,
)
from src.model.train import train_model

In [3]:
# Resolve the compute device and CPU budget once, up front.
# Each helper logs its decision (see the cell output below).
device = set_up_device()
cpu_count = get_allowed_cpu_count()
# Configures torch from the allowed CPU count; the returned value is kept as `n_process`.
n_process = set_up_config_device(cpu_count)

2025-04-11 13:08:41 - INFO - Using cpu device
2025-04-11 13:08:41 - INFO - Using 128 CPUs
2025-04-11 13:08:41 - INFO - torch set up to use 64 processes


In [4]:
# Pretrained cased BERT tokenizer; its `vocab_size` is later passed to the model as `voc_size`.
# NOTE(review): `add_prefix_space` is normally a byte-level BPE option (RoBERTa / GPT-2) —
# confirm it is actually needed for this checkpoint.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased", add_prefix_space=True)

In [5]:
# Training hyperparameters.
BATCH_SIZE = 32  # batch size used by both the train and validation DataLoaders
LEARNING_RATE = 5e-5  # learning rate handed to AdamW
NB_EPOCHS = 5  # forwarded to train_model as `epochs`
LOG_STEPS = 1  # forwarded to train_model as `log_steps`

In [6]:
# Keyword arguments forwarded verbatim to `Transformer(**MODEL_PARAMETERS)`.
MODEL_PARAMETERS = {
    # Take the padding id from the tokenizer instead of hard-coding 0: the data
    # collator pads `input_ids` with the tokenizer's pad token, so the model's
    # notion of "padding" must follow the tokenizer if the checkpoint changes.
    "pad_idx": tokenizer.pad_token_id,
    "voc_size": tokenizer.vocab_size,
    "hidden_size": 256,
    "n_head": 4,
    "max_len": 512,
    "dec_max_len": 128,
    "ffn_hidden": 512,
    "n_layers": 4,
}

# **LOAD & PROCESS DATA**

In [7]:
# Load the raw training TSV through the project's IOB loader, which returns
# the sentences and their labels as two separate objects.
sentences, labels = load_iob_data("data/raw/train-nl.tsv")

In [8]:
# Tokenize the corpus. The call also persists the result to `save_path`
# (the cell output confirms the JSON file was written).
tokenized_dataset = create_tokenized_dataset(
    sentences, labels, save_path="data/tokenized/tokenized_dataset.json"
)

Map: 100%|██████████| 2199/2199 [00:02<00:00, 1092.83 examples/s]
Creating json from Arrow format: 100%|██████████| 3/3 [00:00<00:00,  5.57ba/s]

Tokenized dataset saved at: data/tokenized/tokenized_dataset.json





In [9]:
# Split train/test
# 80 / 10 / 10 split: hold out 20% of the data first, then cut that
# held-out part in half to obtain validation and test sets.
train_test = tokenized_dataset.train_test_split(test_size=0.2, seed=42)
val_test = train_test["test"].train_test_split(test_size=0.5, seed=42)

# Assemble the three named splits into a single DatasetDict.
split_dataset = DatasetDict(
    train=train_test["train"],
    validation=val_test["train"],
    test=val_test["test"],
)

In [10]:
# Display the columns and row counts of each split.
split_dataset

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 1759
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 220
    })
    test: Dataset({
        features: ['tokens', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 220
    })
})

In [11]:
# Drop the raw word-level columns so that only model-ready fields reach the collator.
train_dataset = split_dataset["train"].remove_columns(["tokens", "ner_tags"])
val_dataset = split_dataset["validation"].remove_columns(["tokens", "ner_tags"])

# Collator that pads each batch and returns PyTorch tensors.
data_collator = DataCollatorForTokenClassification(
    tokenizer=tokenizer,
    padding=True,
    max_length=512,
    return_tensors="pt",
)

# Only the training loader shuffles; validation keeps the dataset order.
train_loader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    collate_fn=data_collator,
)
val_loader = DataLoader(
    val_dataset,
    batch_size=BATCH_SIZE,
    collate_fn=data_collator,
)

# **DESCRIPTIVE STATISTICS**

In [17]:
# Report corpus-level sentence-length statistics and per-sentence IOB tag counts
# (rendered in the cell output below).
descriptive_statistics(sentences, labels)

Corpus Summary:
   num_sentences  min_length  mean_length  max_length
0           2199           1   227.498863         703

IOB Tag Statistics (per sentence):
                                 min        mean    max
B-Organization                   0.0    0.483856    7.0
B-Organization,B-Place           0.0    0.001364    1.0
B-Organization,I-Person          0.0    0.001819    2.0
B-Organization,I-Place           0.0    0.016371    2.0
B-Person                         0.0    5.238290   47.0
B-Person,B-Place                 0.0    0.000910    1.0
B-Person,I-Place                 0.0    0.011369    1.0
B-Place                          0.0    1.858572   12.0
I-Organization                   0.0    1.435198   25.0
I-Organization,B-Place           0.0    0.135516    3.0
I-Organization,I-Person          0.0    0.005912    7.0
I-Organization,I-Person,B-Place  0.0    0.000910    1.0
I-Organization,I-Person,I-Place  0.0    0.000910    1.0
I-Organization,I-Place           0.0    0.073215    5.0


# **DEFINE & TRAIN MODEL**

In [None]:
# Flat list of every tag occurring in the training split.
# Column access (`["ner_tags"]`) avoids decoding every other column row by row.
all_labels = [
    label for tags in split_dataset["train"]["ner_tags"] for label in tags
]

# Build the tag inventory from *all* splits, not just "train": several tags are
# extremely rare (see the descriptive statistics), so one may be missing from
# the training split. Sizing the classifier head from train alone would then
# leave it too small for targets seen during validation / testing.
unique_ids = sorted(
    {
        label
        for split in split_dataset.values()
        for tags in split["ner_tags"]
        for label in tags
    }
)

# NOTE(review): identity mapping — this presumes `ner_tags` already holds the
# integer class ids used in the `labels` column; confirm against
# create_tokenized_dataset.
label2id = {label: label for label in unique_ids}

In [None]:
# Seed torch's global RNG before any weights are created. Without it, parameter
# initialisation — and the shuffling order of `train_loader`, which draws its
# base seed from the same global RNG — differs on every Restart & Run All.
# 42 matches the seed already used for the dataset splits.
torch.manual_seed(42)

# Backbone built from the hyperparameters declared in MODEL_PARAMETERS.
base_model = Transformer(**MODEL_PARAMETERS)

In [None]:
# Token-classification head on top of the backbone.
# `hidden_size` is read from MODEL_PARAMETERS rather than repeated as a literal,
# so the head cannot drift out of sync with the backbone's width.
model = TransformerForNER(
    base_model,
    hidden_size=MODEL_PARAMETERS["hidden_size"],
    num_labels=len(label2id),
).to(device)

# -100 is the label value the token-classification collator assigns to padded
# positions by default; those positions are excluded from the loss.
criterion = nn.CrossEntropyLoss(ignore_index=-100)
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)

In [None]:
# Run the project's training loop for NB_EPOCHS epochs, logging every LOG_STEPS steps.
# The call returns two values, bound here as `train_losses` and `val_losses`.
train_losses, val_losses = train_model(
    model,
    train_loader,
    val_loader,
    criterion,
    optimizer,
    device,
    epochs=NB_EPOCHS,
    log_steps=LOG_STEPS,
)

# **EVALUATE RESULTS**