In [1]:
from flair.data import TaggedCorpus
from flair.data_fetcher import NLPTaskDataFetcher, NLPTask
from flair.embeddings import TokenEmbeddings, PooledFlairEmbeddings, WordEmbeddings, StackedEmbeddings, CharacterEmbeddings, CharLMEmbeddings
from flair.models import LanguageModel
from gensim.models.keyedvectors import KeyedVectors
from typing import List
import os

Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.


def read_data():
    """Load the column-formatted corpus stored under ``../data/conll_format``.

    The data files are read as three columns, named ``text`` (column 0),
    ``pos`` (column 1) and ``np`` (column 2).

    Returns:
        The corpus built by ``NLPTaskDataFetcher.load_column_corpus`` from the
        ``arr.train``, ``arr.test`` and ``arr.dev`` files in that folder.
    """
    # Folder that holds the train, test and dev files.
    corpus_dir = '../data/conll_format'
    # Column index -> name given to the values found in that column.
    column_layout = {0: 'text', 1: 'pos', 2: 'np'}
    # File name of each split, keyed by the loader's keyword argument.
    split_files = {
        'train_file': 'arr.train',
        'test_file': 'arr.test',
        'dev_file': 'arr.dev',
    }
    return NLPTaskDataFetcher.load_column_corpus(corpus_dir, column_layout, **split_files)


In [2]:
# Fetch the English WikiNER corpus, then downsample it with a factor of 0.1.
corpus: TaggedCorpus = (
    NLPTaskDataFetcher
    .load_corpus(NLPTask.WIKINER_ENGLISH)
    .downsample(0.1)
)


2019-04-09 00:47:19,419 Reading data from /root/.flair/datasets/wikiner_english
2019-04-09 00:47:19,421 Train: /root/.flair/datasets/wikiner_english/aij-wikiner-en-wp3.train
2019-04-09 00:47:19,423 Dev: None
2019-04-09 00:47:19,424 Test: None


In [3]:
# Alternative data source: uncomment the next line to load the local column
# corpus through read_data() instead of the corpus loaded above.
#corpus: TaggedCorpus = read_data()
# Print the corpus' textual summary.
print(corpus)

TaggedCorpus: 11515 train + 1280 dev + 1422 test sentences


In [4]:
# Name of the tag layer used for the dictionary and, later, for the tagger.
tag_type: str = 'ner'

# Build the tag dictionary for that layer from the corpus and show its
# index -> item table.
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
print(tag_dictionary.idx2item)

[b'<unk>', b'O', b'B-MISC', b'I-MISC', b'E-MISC', b'B-PER', b'E-PER', b'B-ORG', b'I-ORG', b'E-ORG', b'S-ORG', b'S-PER', b'S-MISC', b'S-LOC', b'B-LOC', b'I-LOC', b'E-LOC', b'I-PER', b'<START>', b'<STOP>']


# Conversion step: load the binary word2vec vectors with gensim and re-save
# them with gensim's own save(), producing the file that is later passed to
# WordEmbeddings.
# NOTE(review): the output is named 'wiki.bg.vec.gensim' although the input is
# the wikipedia-pubmed-and-PMC model — confirm the file name is intentional.
filename = '../data/embeddings/wikipedia-pubmed-and-PMC-w2v.bin'
word_vectors = KeyedVectors.load_word2vec_format(filename, binary=True)
word_vectors.save('../data/embeddings/wiki.bg.vec.gensim')

In [None]:
# Wrap the gensim-format vector file as a flair WordEmbeddings object.
filename = '../data/embeddings/wiki.bg.vec.gensim'
pretrained_word2vec: WordEmbeddings = WordEmbeddings(filename)

In [5]:
# Embeddings to stack: GloVe word vectors plus forward and backward pooled
# Flair embeddings, both with 'min' pooling.
# (pretrained_word2vec is currently not part of the stack.)
glove_embedding = WordEmbeddings('glove')
flair_forward = PooledFlairEmbeddings('news-forward', pooling='min')
flair_backward = PooledFlairEmbeddings('news-backward', pooling='min')

embedding_types: List[TokenEmbeddings] = [
    glove_embedding,
    flair_forward,
    flair_backward,
]

In [None]:
# Combine the individual embeddings into one StackedEmbeddings object.
embeddings: StackedEmbeddings = StackedEmbeddings(
    embeddings=embedding_types,
)


In [None]:
# 5. Build the sequence tagger from the stacked embeddings and the tag
#    dictionary, with a hidden size of 200 and the CRF option enabled.
from flair.models import SequenceTagger

tagger: SequenceTagger = SequenceTagger(
    hidden_size=200,
    embeddings=embeddings,
    tag_dictionary=tag_dictionary,
    tag_type=tag_type,
    use_crf=True,
)

In [None]:
# 6. Create the trainer that pairs the tagger with the corpus.
from flair.trainers import ModelTrainer

trainer: ModelTrainer = ModelTrainer(
    tagger,
    corpus,
)


In [None]:
# 7. Make sure the model output directory exists before training starts.
# The previous guard tested '../data/model' but created the nested
# '../data/model/my_flair_model', so it either failed on a missing parent or
# skipped creating the target. os.makedirs creates any missing parents and,
# with exist_ok=True, does nothing when the directory is already there.
os.makedirs('../data/model/my_flair_model', exist_ok=True)

In [None]:
# Start training: learning rate 0.015, mini-batches of 10, at most 20 epochs.
trainer.train(
    '../data/model/my_flair_model',
    learning_rate=0.015,
    mini_batch_size=10,
    max_epochs=20,
)

2019-04-09 00:50:29,050 ----------------------------------------------------------------------------------------------------
2019-04-09 00:50:29,051 Evaluation method: MICRO_F1_SCORE
2019-04-09 00:50:29,076 ----------------------------------------------------------------------------------------------------
train mode resetting embeddings
train mode resetting embeddings
2019-04-09 00:50:29,567 epoch 1 - iter 0/1152 - loss 52.30263062
2019-04-09 00:51:43,934 epoch 1 - iter 115/1152 - loss 17.87512668
2019-04-09 00:52:44,487 epoch 1 - iter 230/1152 - loss 14.78056660
2019-04-09 00:53:46,840 epoch 1 - iter 345/1152 - loss 13.05060101
2019-04-09 00:54:48,631 epoch 1 - iter 460/1152 - loss 11.90511425
2019-04-09 00:55:49,741 epoch 1 - iter 575/1152 - loss 11.05598874
2019-04-09 00:56:54,705 epoch 1 - iter 690/1152 - loss 10.39897153
2019-04-09 00:57:55,441 epoch 1 - iter 805/1152 - loss 9.87324285
2019-04-09 00:58:58,518 epoch 1 - iter 920/1152 - loss 9.48050676
2019-04-09 01:00:01,120 epoch

2019-04-09 01:26:26,773 epoch 7 - iter 345/1152 - loss 3.18163303
2019-04-09 01:26:50,796 epoch 7 - iter 460/1152 - loss 3.20267837
2019-04-09 01:27:16,182 epoch 7 - iter 575/1152 - loss 3.17125471
2019-04-09 01:27:38,726 epoch 7 - iter 690/1152 - loss 3.14326364
2019-04-09 01:28:00,827 epoch 7 - iter 805/1152 - loss 3.14058395
2019-04-09 01:28:22,300 epoch 7 - iter 920/1152 - loss 3.14359263
2019-04-09 01:28:44,679 epoch 7 - iter 1035/1152 - loss 3.13719327
2019-04-09 01:29:07,166 epoch 7 - iter 1150/1152 - loss 3.15582068
2019-04-09 01:29:09,243 ----------------------------------------------------------------------------------------------------
2019-04-09 01:29:09,244 EPOCH 7 done: loss 3.1549 - lr 0.0150 - bad epochs 0
2019-04-09 01:29:26,346 DEV  : loss 2.75811052 - f-score 0.7894 - acc 0.6521
2019-04-09 01:29:45,184 TEST : loss 2.53786564 - f-score 0.7992 - acc 0.6656
2019-04-09 01:30:00,901 ------------------------------------------------------------------------------------------

2019-04-09 01:56:52,945 ----------------------------------------------------------------------------------------------------
2019-04-09 01:56:52,946 EPOCH 13 done: loss 2.4017 - lr 0.0150 - bad epochs 0
2019-04-09 01:57:11,325 DEV  : loss 2.31659245 - f-score 0.8035 - acc 0.6716
2019-04-09 01:57:30,404 TEST : loss 2.10184789 - f-score 0.8201 - acc 0.6950
2019-04-09 01:57:46,606 ----------------------------------------------------------------------------------------------------
train mode resetting embeddings
train mode resetting embeddings
2019-04-09 01:57:46,779 epoch 14 - iter 0/1152 - loss 3.30039215
2019-04-09 01:58:08,367 epoch 14 - iter 115/1152 - loss 2.27912123
2019-04-09 01:58:30,067 epoch 14 - iter 230/1152 - loss 2.20734091
2019-04-09 01:58:53,057 epoch 14 - iter 345/1152 - loss 2.23892775
2019-04-09 01:59:15,422 epoch 14 - iter 460/1152 - loss 2.29783445
2019-04-09 01:59:38,377 epoch 14 - iter 575/1152 - loss 2.28633769
2019-04-09 02:00:00,395 epoch 14 - iter 690/1152 - los

In [None]:
# Print the current local date and time.
# `now` is a method of the datetime class, not of the datetime module, so with
# `import datetime` it has to be reached as datetime.datetime.now().
import datetime
print(datetime.datetime.now())