In [1]:
# Load IPython's autoreload extension; it provides the `%autoreload` magic used in later cells.
%load_ext autoreload

In [2]:
import os

# Default GPU selection for this notebook: a single MIG device, addressed by UUID.
# `setdefault` only applies it when CUDA_VISIBLE_DEVICES is not already set, so a
# device choice made by the launcher/scheduler (or on another machine) is respected
# instead of being overwritten by this machine-specific identifier.
# NOTE(review): keep this cell ahead of the model imports; CUDA presumably reads the
# variable when it is first initialised — confirm.
os.environ.setdefault(
    "CUDA_VISIBLE_DEVICES", "MIG-08137aa2-e69b-5e74-8390-7997329b1336"
)

# Download and convert data

In [3]:
# Reload project modules edited since they were imported, before the data is loaded.
# NOTE(review): `now` is a one-off reload; confirm against the IPython autoreload docs.
%autoreload now

In [4]:
from document_segmentation.pagexml.annotations.generale_missiven import GeneraleMissiven
from document_segmentation.pagexml.datamodel.inventory import Inventory
from document_segmentation.settings import GENERALE_MISSIVEN_SHEET

# First positional argument to all_annotated_inventories().
# NOTE(review): looks like a cap on the number of inventories loaded (the progress
# bar reports 5/5), yet the splits below hold 16 Inventory objects — presumably one
# per annotated document; confirm.
N = 5

# Materialise the annotated inventories of the Generale Missiven sheet into a list.
# NOTE(review): skip_errors=True presumably drops inventories that fail to load
# instead of raising — confirm, since silent drops change the data set size.
inventories: list[Inventory] = [
    inventory
    for inventory in GeneraleMissiven(GENERALE_MISSIVEN_SHEET).all_annotated_inventories(
        N, skip_errors=True
    )
]

Loading Inventories: 100%|██████████| 5/5 [00:02<00:00,  2.19inventory/s]


# Split Data into Training and Validation Sets

In [5]:
# Pick up any edits to project modules before splitting the data.
%autoreload now

In [6]:
# Fraction of the loaded inventories assigned to the training split;
# the remaining inventories become the validation set.
TRAINING_DATA = 0.8

In [7]:
import random

# Shuffle a copy rather than `inventories` itself: shuffling in place would permute
# the already-permuted list again whenever this cell is re-run, silently changing
# the train/validation split. With a copy, the cell is idempotent.
shuffled_inventories: list[Inventory] = list(inventories)

# Fixed seed so the permutation (and therefore the split) is reproducible.
random.seed(0)
random.shuffle(shuffled_inventories)

# Index of the first validation inventory; everything before it is training data.
split = int(len(shuffled_inventories) * TRAINING_DATA)

training_data: list[Inventory] = shuffled_inventories[:split]
# Validation sets are keyed by the annotation sheet's file stem.
validation_data: dict[str, list[Inventory]] = {
    GENERALE_MISSIVEN_SHEET.stem: shuffled_inventories[split:]
}

In [8]:
# Inspect the inventories selected for training.
training_data

[Inventory(inv_nr=1072, inventory_part=, pages=12 pages),
 Inventory(inv_nr=1073, inventory_part=, pages=20 pages),
 Inventory(inv_nr=1070, inventory_part=, pages=31 pages),
 Inventory(inv_nr=1068, inventory_part=, pages=35 pages),
 Inventory(inv_nr=1072, inventory_part=, pages=16 pages),
 Inventory(inv_nr=1068, inventory_part=, pages=24 pages),
 Inventory(inv_nr=1068, inventory_part=, pages=7 pages),
 Inventory(inv_nr=1072, inventory_part=, pages=2 pages),
 Inventory(inv_nr=1073, inventory_part=, pages=11 pages),
 Inventory(inv_nr=1071, inventory_part=, pages=21 pages),
 Inventory(inv_nr=1071, inventory_part=, pages=21 pages),
 Inventory(inv_nr=1070, inventory_part=, pages=9 pages)]

In [9]:
# Inspect the held-out inventories, keyed by the annotation sheet's file stem.
validation_data

{'Generale Missiven': [Inventory(inv_nr=1068, inventory_part=, pages=19 pages),
  Inventory(inv_nr=1070, inventory_part=, pages=52 pages),
  Inventory(inv_nr=1073, inventory_part=, pages=10 pages),
  Inventory(inv_nr=1072, inventory_part=, pages=3 pages)]}

# Train Model

In [10]:
# Passed as `epochs` to tagger.train_() in the training cell.
EPOCHS = 3

In [11]:
# Pick up any edits to project modules before the model class is imported.
%autoreload now

In [12]:
from document_segmentation.model.page_sequence_tagger import PageSequenceTagger

# Build the page sequence tagger with its default constructor arguments.
tagger = PageSequenceTagger()

In [13]:
# Show the device recorded on the tagger.
# NOTE(review): this reads a private attribute (`_device`); prefer a public
# accessor if the class offers one.
tagger._device

'mps'

In [14]:
# Display the model's repr to inspect its layer structure.
tagger

PageSequenceTagger(
  (_page_embedding): PageEmbedding(
    (_region_model): RegionEmbeddingSentenceTransformer(
      (_transformer_model): SentenceTransformer(
        (0): Transformer({'max_seq_length': 128, 'do_lower_case': False}) with Transformer model: RobertaModel 
        (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
      )
      (_region_type): Embedding(10, 16)
      (_linear): Linear(in_features=784, out_features=512, bias=True)
    )
    (_rnn): LSTM(512, 256, num_layers=2, batch_first=True, dropout=0.1, bidirectional=True)
    (_linear): Linear(in_features=512, out_features=256, bias=True)
  )
  (_rnn): LSTM(256, 256, num_layers=2, batch_first=True, dropout=0.1, bidirectional=True)
  (_linear): Linear(in_features=512, out_featur

In [15]:
# Train on the training split for EPOCHS epochs. The validation sets are passed
# alongside; the recorded output shows an evaluation pass after each training pass.
tagger.train_(training_data, validation_data, epochs=EPOCHS)

ERROR:wandb.jupyter:Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mcarschno[0m. Use [1m`wandb login --relogin`[0m to force relogin


Training: 100%|██████████| 12/12 [00:12<00:00,  1.08s/inventory]
Evaluating: 100%|██████████| 4/4 [00:03<00:00,  1.18inventory/s]
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
        [1],
        [3],
        [4]]) classes have zero instances in both the predictions and the ground truth labels. Precision is still logged as zero.
Training: 100%|██████████| 12/12 [00:00<00:00, 24.56inventory/s]
Evaluating: 100%|██████████| 4/4 [00:00<00:00, 27.68inventory/s]
        [4]]) classes have zero instances in both the predictions and the ground truth labels. Precision is still logged as zero.
Training: 100%|██████████| 12/12 [00:00<00:00, 26.39inventory/s]
Evaluating: 100%|██████████| 4/4 [00:00<00:00, 34.31inventory/s]
        [4]]) classes have zero instances

VBox(children=(Label(value='0.105 MB of 0.105 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▅█
inventory length,█▅▂▂▆▃▁▇▃▄▅▅▅▅▄█▂▆▃▇▂▅▁▃▁▂█▅▄▂▃▆▇▅▃▅
loss,█▄▅▆▇▄▅▃▄▃▃▂▃▂▂▅▃▆▂▁▂▁▂▁▁▂▄▁▁▂▁▅▁▁▁▁

0,1
epoch,2.0
inventory length,21.0
loss,5.13455


# Evaluate Model

In [16]:
# `sys` is needed by the reporting cell that follows (it writes to sys.stdout).
import sys

# Evaluate on the validation inventories. The set is looked up under the same key
# expression it was stored with (GENERALE_MISSIVEN_SHEET.stem) instead of a
# hard-coded copy of that string, so renaming the sheet cannot cause a KeyError here.
# eval_() returns per-class precision/recall/F1 metrics, an averaged accuracy
# metric, and a per-page results table.
precision, recall, f1, accuracy, results = tagger.eval_(
    validation_data[GENERALE_MISSIVEN_SHEET.stem]
)

Evaluating: 100%|██████████| 4/4 [00:00<00:00, 18.92inventory/s]


In [17]:
import csv

from document_segmentation.pagexml.datamodel.label import Label

# Tab-separated report on stdout: one row per metric, one column per label.
label_names = [label.name for label in Label]
table = csv.DictWriter(sys.stdout, fieldnames=["Metric", *label_names], delimiter="\t")
table.writeheader()

for per_class_metric in (precision, recall, f1):
    # Per-label scores are only available from a metric that is not averaged.
    assert per_class_metric.average is None

    row = {"Metric": type(per_class_metric).__name__}
    for name, value in zip(label_names, per_class_metric.compute().tolist()):
        row[name] = f"{value:.4f}"
    table.writerow(row)

# Accuracy is reported as a single averaged number instead of per label.
assert accuracy.average is not None

accuracy_name = type(accuracy).__name__
accuracy_average = accuracy.average
accuracy_value = accuracy.compute().item()
print(
    f"{accuracy_name} ({accuracy_average} average):\t{accuracy_value:.4f}",
    file=sys.stdout,
)

        [4]]) classes have zero instances in both the predictions and the ground truth labels. Precision is still logged as zero.


Metric	UNK	BEGIN	IN	END	OUT
MulticlassPrecision	0.0000	1.0000	0.9714	0.4000	0.0000
MulticlassRecall	0.0000	0.8000	0.9189	0.8000	0.0000
MulticlassF1Score	0.0000	0.8889	0.9444	0.5333	0.0000
MulticlassAccuracy (micro average):	0.9048


In [18]:
# Display the per-page results returned by tagger.eval_():
# predicted vs. actual label, page ID, text, and scores.
results

Unnamed: 0,Predicted,Actual,Page ID,Text,Scores
0,BEGIN,BEGIN,NL-HaNA_1.04.02_1068_0021.jpg,427 Cockels foly ende; schepen Int landt heeft...,"[0.009620788507163525, 0.6504104733467102, 0.3..."
1,IN,IN,NL-HaNA_1.04.02_1068_0022.jpg,maer Een schip sel; Molucor banda; Ende Amboin...,"[0.00669676810503006, 0.34863150119781494, 0.6..."
2,IN,IN,NL-HaNA_1.04.02_1068_0023.jpg,Nota opt gebreck van; timmerluyden; overichgew...,"[0.004080756567418575, 0.13704806566238403, 0...."
3,IN,END,NL-HaNA_1.04.02_1068_0024.jpg,daeroverdicht dat haer Capt; dat met pratyck v...,"[0.0027421233244240284, 0.06012945994734764, 0..."
4,IN,BEGIN,NL-HaNA_1.04.02_1068_0025.jpg,Erntfeste Wijse voorsinnige seer Discrete; mau...,"[0.0022508902475237846, 0.03735770657658577, 0..."
...,...,...,...,...,...
79,END,IN,NL-HaNA_1.04.02_1073_0311.jpg,de swaere garnisoenen die nu niet doen dan de ...,"[0.006712354253977537, 0.00901026651263237, 0...."
80,END,END,NL-HaNA_1.04.02_1073_0312.jpg,"van d' oude, ende opmaeckinge vande nieuwe leg...","[0.00875696912407875, 0.006495659705251455, 0...."
81,BEGIN,BEGIN,NL-HaNA_1.04.02_1072_0737.jpg,zedert myne Jonchte met Schip Walcheren syn hi...,"[0.08865992724895477, 0.38069605827331543, 0.2..."
82,END,IN,NL-HaNA_1.04.02_1072_0738.jpg,wel syn te becomen tot penvelatie Harwaerts ae...,"[0.0906975269317627, 0.20733831822872162, 0.30..."
