In [None]:
import glob
import re
import pathlib
import csv
import numpy as np

In [None]:
from flair.data import Corpus
from flair.datasets import ColumnCorpus
from flair.embeddings import TokenEmbeddings, WordEmbeddings, StackedEmbeddings, CharacterEmbeddings, FlairEmbeddings
from flair.data import Sentence
from flair.models import SequenceTagger

In [None]:
def parse_webanno_format(wf, output_folder="webanno_annotations/formatted/"):
    """Convert one WebAnno TSV export into a two-column ``token<TAB>tag`` file.

    Every non-comment line with at least three tab-separated fields yields one
    output row: the token (third field) and a BIO-style tag derived from the
    fourth field. Every line starting with ``#`` yields an empty row, which
    keeps the blocks of the export separated in the output.

    The output is written to ``<output_folder>/<document>.tsv``, where
    ``<document>`` is the name of the directory containing ``wf`` with
    everything from the first ``.txt`` onwards removed. Nothing is written if
    the file contains none of the known labels.

    Parameters
    ----------
    wf : str or path-like
        Path to the WebAnno TSV file, expected to look like
        ``.../<document>.txt/<something>.tsv``.
    output_folder : str or path-like, optional
        Directory receiving the converted file; created if missing.

    Returns
    -------
    None
    """
    out_dir = pathlib.Path(output_folder)
    out_dir.mkdir(parents=True, exist_ok=True)

    # The parent directory carries the document name. pathlib is used instead
    # of splitting on "/" so that paths with other separators work as well.
    wfilename = pathlib.Path(wf).parent.name.split(".txt")[0]

    # Label values as they appear in the export (underscores are preceded by a
    # backslash there), mapped to the tag names used in the output. Raw strings
    # keep the backslash without relying on an invalid "\_" escape sequence.
    dTags = {r"alternate\_name": "altname",
             r"closing\_date": "closed",
             "company": "company",
             r"opening\_date": "opened",
             "station": "station",
             r"station\_in": "stationIn",
             r"station\_near": "stationNear"}

    annotated = False  # Flag: if there is no annotation in the file, we'll discard it
    rows = []
    previous_id = None  # bracketed id of the most recent multi-token annotation

    # NOTE(review): assumes the export is UTF-8 encoded - confirm against the
    # WebAnno export settings.
    with open(wf, encoding="utf-8") as fr:
        for raw_line in fr:
            if raw_line.startswith("#"):
                # Header / "#Text=" lines become an empty row.
                rows.append([])
                continue

            fields = raw_line.strip().split("\t")
            if len(fields) < 3:
                # Blank or malformed line: no token to emit.
                continue

            token = fields[2]
            tag = "O"
            if len(fields) > 3:
                label = fields[3]
                base = dTags.get(label.split("[")[0])
                if base is not None:
                    annotated = True
                    if label.endswith("]"):
                        # "label[N]": tokens sharing the same N belong to the
                        # same annotation, so only the first one gets "B-".
                        current_id = label.split("]")[0].split("[")[1]
                        prefix = "I-" if current_id == previous_id else "B-"
                        previous_id = current_id
                    else:
                        # No bracketed id: single-token annotation.
                        prefix = "B-"
                    tag = prefix + base
            rows.append([token, tag])

    if not annotated:
        return

    # newline="" lets the csv module control line endings itself.
    # QUOTE_NONE / quotechar=None write tokens verbatim: with the default
    # dialect a token such as '"' would be emitted as '""""'.
    with open(out_dir / (wfilename + ".tsv"), "w", encoding="utf-8", newline="") as csv_file:
        writer = csv.writer(csv_file, delimiter="\t",
                            quoting=csv.QUOTE_NONE, quotechar=None)
        writer.writerows(rows)

In [None]:
# Convert every WebAnno TSV file matched by the export pattern
# (one directory per document, TSV files inside it).
webanno_exports = glob.glob("webanno_annotations/Quick_2021-01-06_0810/annotation/*/*.tsv")
for webanno_path in webanno_exports:
    parse_webanno_format(webanno_path)

In [None]:
# Split into train, val, and test
# glob returns paths in arbitrary (filesystem-dependent) order; sorting makes
# the split identical on every run and on every machine.
formatted_files = sorted(glob.glob("webanno_annotations/formatted/*"))

# First 60% of the files -> train, next 30% -> val, remaining 10% -> test.
n_files = len(formatted_files)
train, val, test = np.split(formatted_files, [int(n_files*0.6), int(n_files*0.9)])

output_folder = "webanno_annotations/datasets/"
pathlib.Path(output_folder).mkdir(parents=True, exist_ok=True)

dSplits = {"train": train, "val": val, "test": test}

# Concatenate the files of each split into a single <split>.txt file.
for spl, split_files in dSplits.items():
    with open(output_folder + spl + ".txt", 'w', encoding="utf-8") as outfile:
        for fname in split_files:
            with open(fname, encoding="utf-8") as infile:
                outfile.write(infile.read())
                # Blank line so the last row of one file and the first row of
                # the next are never adjacent.
                outfile.write("\n")

In [None]:
### FROM: https://github.com/flairNLP/flair/blob/master/resources/docs/TUTORIAL_6_CORPUS.md

# Folder containing the train, dev and test files
data_folder = "webanno_annotations/datasets/"

# Column layout of those files: index -> column name
columns = {
    0: 'text',
    1: 'tag',
}

# Build the corpus from the column-formatted split files
corpus: Corpus = ColumnCorpus(
    data_folder,
    columns,
    train_file='train.txt',
    dev_file='val.txt',
    test_file='test.txt',
)

# Show the corpus
print(corpus)

In [None]:
# The label to learn is the column named 'tag' in `columns`
tag_type = 'tag'

# Build the dictionary of tags from the corpus and show it
tag_dictionary = corpus.make_tag_dictionary(
    tag_type=tag_type,
)
print(tag_dictionary)

In [None]:
### FROM: https://github.com/flairNLP/flair/blob/master/resources/docs/TUTORIAL_7_TRAINING_A_MODEL.md

# Embeddings to stack. Only GloVe word embeddings are active; to use
# character or flair embeddings, add the corresponding entries to the list:
#     CharacterEmbeddings(),
#     FlairEmbeddings('news-forward'),
#     FlairEmbeddings('news-backward'),
embedding_types = [WordEmbeddings('glove')]

embeddings: StackedEmbeddings = StackedEmbeddings(
    embeddings=embedding_types,
)

In [None]:
# Create the sequence tagger over the stacked embeddings and the tag
# dictionary, with a CRF layer enabled
tagger: SequenceTagger = SequenceTagger(
    hidden_size=256,
    embeddings=embeddings,
    tag_dictionary=tag_dictionary,
    tag_type=tag_type,
    use_crf=True,
)

In [None]:
# Create the trainer that fits `tagger` on `corpus`
from flair.trainers import ModelTrainer

trainer: ModelTrainer = ModelTrainer(
    tagger,
    corpus,
)

In [None]:
# Run training; results are stored under models/example-tag
trainer.train(
    'models/example-tag',
    max_epochs=50,
    mini_batch_size=16,
    learning_rate=0.05,
    patience=3,
    anneal_with_restarts=True,
)

In [None]:
# Example entry used below to try out the trained tagger
s = (
    "[[[STATION: FRONGOCH]]] [GW] op 1 November 1882 (N Wales Express 3 rd ) ; "
    "clo 4 January 1960 (RM March) ."
)

In [None]:
# Load the best checkpoint written by the training run
model = SequenceTagger.load('models/example-tag/best-model.pt')

# Wrap the example text in a Sentence and tag it with the loaded model
sentence = Sentence(s)
model.predict(sentence)

# Show the sentence together with its predicted tags
tagged_output = sentence.to_tagged_string()
print(tagged_output)