In [1]:
# Development aid: autoreload re-imports edited modules before each cell runs,
# so changes to the REPUBLIC hOCR parser are picked up without a kernel restart.
# It can be removed once that module is stable.
%reload_ext autoreload
%autoreload 2


# Put the repository root at the front of sys.path so jupyter can load the
# republic modules directly from notebooks anywhere inside the repo.
import os
import sys


def find_repo_dir(cwd, repo_name):
    """Return the repository root for a working directory inside the repo.

    The result is ``cwd`` truncated right after the first occurrence of
    ``repo_name``. This also works when ``cwd`` is the repository root itself.

    :param cwd: the current working directory as a string
    :param repo_name: the directory name of the repository
    :return: the path up to and including ``repo_name``; ``cwd`` unchanged
        (with a printed warning) when ``repo_name`` does not occur in it
    """
    prefix, match, _ = cwd.partition(repo_name)
    if not match:
        # Gluing the repo name onto an unrelated path would give a bogus
        # directory, so fall back to the working directory and say so.
        print(f"warning: '{repo_name}' is not part of {cwd}; "
              "using the working directory as project dir")
        return cwd
    return prefix + repo_name


repo_name = 'republic-project'
repo_dir = find_repo_dir(os.getcwd(), repo_name)
print("adding project dir to path:", repo_dir)
# Drop any existing entry first so the repo dir ends up exactly once, in front.
# Mutate sys.path in place so other references to the list stay valid.
if repo_dir in sys.path:
    sys.path.remove(repo_dir)
sys.path.insert(0, repo_dir)
    


adding project dir to path: /Users/marijnkoolen/Code/Huygens/republic-project


In [2]:
import datetime

import torch

from flair.data import Corpus
from flair.datasets import ColumnCorpus
from flair.embeddings import WordEmbeddings, StackedEmbeddings, CharLMEmbeddings, FlairEmbeddings
from flair.embeddings import TransformerWordEmbeddings
from flair.models import SequenceTagger
from flair.trainers import ModelTrainer


# Prefer Apple's Metal backend (MPS); fall back to CUDA and then the CPU so
# the notebook also runs on machines without MPS support.
if torch.backends.mps.is_available():
    device = torch.device('mps')
elif torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')




In [None]:
# Train a flair NER sequence tagger on the ground truth in column format,
# using the project's own forward/backward character language models.
flair_dir = f'{repo_dir}/data/embeddings/flair_embeddings/'

# 1. get the corpus
# column 0 of the training files holds the token, column 1 its NER tag
columns = {0: 'text', 1: 'ner'}

# this is the folder in which train, test and dev files reside
layer_name = 'single_layer'
# train_size selects which training file is read: train_{train_size}.txt
train_size = 1.0
data_folder = f'{repo_dir}/ground_truth/entities/tag_de_besluiten/flair_training_{layer_name}'

# init a corpus using column format, data folder and the names of the train, dev and test files
corpus: Corpus = ColumnCorpus(data_folder, columns,
                              train_file=f'train_{train_size}.txt',
                              test_file='test.txt',
                              dev_file='validate.txt')

print(corpus)

# 2. what label do we want to predict?
label_type = 'ner'

# 3. make the label dictionary from the corpus
label_dict = corpus.make_label_dictionary(label_type=label_type)
print(label_dict)

# 4. initialize embeddings
# Only the two character language models are stacked. To experiment with
# GysBERT, add TransformerWordEmbeddings('emanjavacas/GysBERT', ...) to this
# list; it is not constructed here because it was never part of the stack.
embedding_types = [
    FlairEmbeddings(f'{flair_dir}/resources/taggers/language_model_bw_char/best-lm.pt'),
    FlairEmbeddings(f'{flair_dir}/resources/taggers/language_model_fw_char/best-lm.pt'),
]

embeddings = StackedEmbeddings(embeddings=embedding_types)

# 5. initialize sequence tagger
tagger = SequenceTagger(hidden_size=256,
                        embeddings=embeddings,
                        tag_dictionary=label_dict,
                        tag_type=label_type,
                        use_crf=True)

# 6. initialize trainer
trainer = ModelTrainer(tagger, corpus)

# the output directory records layer, training-set size and today's date
model_dir = f'{flair_dir}/resources/taggers/ner-tbd-{layer_name}-train_{train_size}-{datetime.date.today().isoformat()}'
# 7. start training
trainer.train(model_dir,
              learning_rate=0.05,
              mini_batch_size=32,
              max_epochs=10)

2023-04-25 16:02:45,147 Reading data from /Users/marijnkoolen/Code/Huygens/republic-project/ground_truth/entities/tag_de_besluiten/flair_training_single_layer
2023-04-25 16:02:45,147 Train: /Users/marijnkoolen/Code/Huygens/republic-project/ground_truth/entities/tag_de_besluiten/flair_training_single_layer/train_1.0.txt
2023-04-25 16:02:45,148 Dev: /Users/marijnkoolen/Code/Huygens/republic-project/ground_truth/entities/tag_de_besluiten/flair_training_single_layer/validate.txt
2023-04-25 16:02:45,148 Test: /Users/marijnkoolen/Code/Huygens/republic-project/ground_truth/entities/tag_de_besluiten/flair_training_single_layer/test.txt
Corpus: 1330 train + 154 dev + 147 test sentences
2023-04-25 16:02:47,370 Computing label dictionary. Progress:


1330it [00:00, 36745.37it/s]

2023-04-25 16:02:47,419 Dictionary created for label 'ner' with 9 values: HOE (seen 5017 times), LOC (seen 4584 times), PER (seen 3188 times), ORG (seen 2466 times), DAT (seen 2218 times), RES (seen 489 times), COM (seen 287 times), NAM (seen 237 times)
Dictionary with 9 tags: <unk>, HOE, LOC, PER, ORG, DAT, RES, COM, NAM





2023-04-25 16:02:49,400 SequenceTagger predicts: Dictionary with 33 tags: O, S-HOE, B-HOE, E-HOE, I-HOE, S-LOC, B-LOC, E-LOC, I-LOC, S-PER, B-PER, E-PER, I-PER, S-ORG, B-ORG, E-ORG, I-ORG, S-DAT, B-DAT, E-DAT, I-DAT, S-RES, B-RES, E-RES, I-RES, S-COM, B-COM, E-COM, I-COM, S-NAM, B-NAM, E-NAM, I-NAM
2023-04-25 16:02:49,405 ----------------------------------------------------------------------------------------------------
2023-04-25 16:02:49,405 Model: "SequenceTagger(
  (embeddings): StackedEmbeddings(
    (list_embedding_0): FlairEmbeddings(
      (lm): LanguageModel(
        (drop): Dropout(p=0.1, inplace=False)
        (encoder): Embedding(138, 100)
        (rnn): LSTM(100, 128)
      )
    )
    (list_embedding_1): FlairEmbeddings(
      (lm): LanguageModel(
        (drop): Dropout(p=0.1, inplace=False)
        (encoder): Embedding(138, 100)
        (rnn): LSTM(100, 128)
      )
    )
  )
  (word_dropout): WordDropout(p=0.05)
  (locked_dropout): LockedDropout(p=0.5)
  (embedding2nn

In [None]:
import datetime

import torch

from flair.data import Corpus
from flair.datasets import ColumnCorpus
from flair.embeddings import WordEmbeddings, StackedEmbeddings, CharLMEmbeddings, FlairEmbeddings
from flair.models import SequenceTagger
from flair.trainers import ModelTrainer

import flair


# Run flair on Apple's Metal backend (MPS) when it is available; otherwise
# leave flair.device untouched so flair keeps the device it picked itself.
if torch.backends.mps.is_available():
    flair.device = torch.device('mps')




In [None]:
# Train a flair NER sequence tagger on the full training file (train.txt),
# placing the model on the device configured in flair.device.
flair_dir = f'{repo_dir}/data/embeddings/flair_embeddings/'

# 1. get the corpus
# column 0 of the training files holds the token, column 1 its NER tag
columns = {0: 'text', 1: 'ner'}

# this is the folder in which train, test and dev files reside
layer_name = 'single_layer'
data_folder = f'{repo_dir}/ground_truth/entities/tag_de_besluiten/flair_training_{layer_name}'

# init a corpus using column format, data folder and the names of the train, dev and test files
corpus: Corpus = ColumnCorpus(data_folder, columns,
                              train_file='train.txt',
                              test_file='test.txt',
                              dev_file='validate.txt')

print(corpus)

# 2. what label do we want to predict?
label_type = 'ner'

# 3. make the label dictionary from the corpus
label_dict = corpus.make_label_dictionary(label_type=label_type)
print(label_dict)

# 4. initialize embeddings: the project's backward and forward character language models
embedding_types = [
    FlairEmbeddings(f'{flair_dir}/resources/taggers/language_model_bw_char/best-lm.pt'),
    FlairEmbeddings(f'{flair_dir}/resources/taggers/language_model_fw_char/best-lm.pt'),
]

embeddings = StackedEmbeddings(embeddings=embedding_types)

# 5. initialize sequence tagger
# Use flair.device rather than a `device` variable from another cell, so this
# cell only depends on the flair import/device cell that precedes it.
tagger = SequenceTagger(hidden_size=256,
                        embeddings=embeddings,
                        tag_dictionary=label_dict,
                        tag_type=label_type,
                        use_crf=True).to(flair.device)

# 6. initialize trainer
trainer = ModelTrainer(tagger, corpus)

# the output directory records the layer name and today's date
model_dir = f'{flair_dir}/resources/taggers/ner-tbd-{layer_name}-{datetime.date.today().isoformat()}'
# 7. start training
# embeddings_storage_mode takes 'none', 'cpu' or 'gpu'; 'gpu' keeps the
# computed embeddings on the accelerator device (here: flair.device).
trainer.train(model_dir,
              learning_rate=0.1,
              mini_batch_size=32,
              embeddings_storage_mode='gpu',
              max_epochs=10)

In [None]:
import sys

!{sys.executable} -m pip install flair --upgrade