In [None]:
import os
# Expose only GPU 0 to CUDA; this is set before flair is imported below.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

from flair.data import Corpus
from flair.datasets import ColumnCorpus

# Location of the column-formatted data files and the tag layer used throughout.
data_folder = './data'
tag_type = 'ner'

# Mapping from column index to the name flair stores it under: column 0 is
# the token text, column 3 the NER tag; columns 1 and 2 both get the
# placeholder name '_'.
columns = {
    0: 'text',
    1: '_',
    2: '_',
    3: 'ner',
}

corpus: Corpus = ColumnCorpus(
    data_folder,
    columns,
    train_file='train.txt',
    dev_file='valid.txt',
    test_file='test.txt',
)
print(corpus)

# Label inventory for the 'ner' layer; the taggers are built against it.
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
print(tag_dictionary)

In [None]:
from flair.embeddings import ELMoEmbeddings,BertEmbeddings,FlairEmbeddings,XLNetEmbeddings
from flair.models import SequenceTagger
from ensemble_tagger import EnsembleTagger
from typing import List


def _make_tagger(embeddings):
    """Build a SequenceTagger around ``embeddings`` with the settings shared by all members.

    Every member uses hidden_size=256, a CRF layer, and the notebook-level
    ``tag_dictionary`` / ``tag_type``; only the embeddings differ.
    """
    return SequenceTagger(hidden_size=256,
                          embeddings=embeddings,
                          tag_dictionary=tag_dictionary,
                          tag_type=tag_type,
                          use_crf=True)


# Members are created in the same order as before: ELMo, BERT, XLNet.
elmo_tagger = _make_tagger(ELMoEmbeddings('small'))
bert_tagger = _make_tagger(BertEmbeddings())
xlnet_tagger = _make_tagger(XLNetEmbeddings())
# Disabled member, kept for reference (it is not part of the ensemble below):
# flair_tagger = _make_tagger(FlairEmbeddings('en-forward'))

ensemble_members = [xlnet_tagger, elmo_tagger, bert_tagger]
ensemble_tagger = EnsembleTagger(models=ensemble_members,
                                 tag_type=tag_type,
                                 mode='loss')
print(ensemble_tagger)

In [None]:
from flair.trainers import ModelTrainer
from datetime import datetime

# Train the ensemble on the corpus for a single epoch.
trainer: ModelTrainer = ModelTrainer(ensemble_tagger, corpus)
# Output directory for training artifacts. The string previously opened with
# a double quote and closed with a single quote, which is a SyntaxError.
# The trailing slash is kept: the checkpoint path is later formed with
# `model_path + 'best-model.pt'`.
model_path = "/hdd1/kurisu/cs6207/log/ensemble/"
trainer.train(model_path,
              learning_rate=0.01,
              mini_batch_size=32,
              max_epochs=1)

In [None]:
# Reload the checkpoint written under model_path and inspect one test sentence.
test_ensemble_tagger = EnsembleTagger.load(model_path + 'best-model.pt')

sentence = corpus.test[0]

# State of the sentence as loaded from the corpus (gold annotations).
for entity in sentence.get_spans('ner'):
    print(entity)

for token in sentence.tokens:
    print(str(token.get_tag("ner")))
    print(str(token.get_tags_proba_dist("ner")))

# This notebook reads predictions back through the same "ner" slot that holds
# the gold tag, i.e. predict() replaces the gold annotation in place. Snapshot
# the gold values so this demo does not corrupt corpus.test[0] for evaluation.
gold_values = [token.get_tag("ner").value for token in sentence.tokens]

test_ensemble_tagger.predict(sentence, all_tag_prob=True)

# Predicted tag and the per-tag probability distribution for each token.
for token in sentence.tokens:
    print(token.get_tag("ner").value)
    print(token.get_tags_proba_dist("ner"))

# Put the gold tags back so later cells still see the reference labels.
for token, gold_value in zip(sentence.tokens, gold_values):
    token.add_tag("ner", gold_value)

In [None]:
def test(model, data):
    results = []
    for sentence in data:
        model.predict(sentence,all_tag_prob=True)
        for token in sentence.tokens:
            results.append(token.get_tag("ner").value)
    return results

elmo_pred = test(elmo_tagger, corpus.test)
bert_pred = test(bert_tagger, corpus.test)
flair_pred = test(flair_tagger, corpus.test)
ensemble_pred = test(ensemble_tagger, corpus.test)

In [None]:
from conlleval import evaluate

real = []
for sentence in corpus.test:
    for token in sentence.tokens:
        real.append(token.get_tag("ner").value)

print(evaluate(real, ensemble_pred))
print(evaluate(real, elmo_pred))
print(evaluate(real, bert_pred))
print(evaluate(real, flair_pred))