In [1]:
from flair.data import TaggedCorpus
from flair.data_fetcher import NLPTaskDataFetcher, NLPTask
from flair.embeddings import TokenEmbeddings, WordEmbeddings, StackedEmbeddings, CharacterEmbeddings, CharLMEmbeddings
from flair.models import LanguageModel
from gensim.models.keyedvectors import KeyedVectors
from typing import List
import os

Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.


In [2]:
def read_data():
    # dirname = './conll_format'
    # define columns
    columns = {0: 'text', 1: 'pos', 2: 'np'}
    # this is the folder in which train, test and dev files reside
    data_folder = '../data/conll_format'
    # retrieve corpus using column format, data folder and the names of the train, dev and test files
    corpus: TaggedCorpus = NLPTaskDataFetcher.load_column_corpus(data_folder, columns,
                                                              train_file='arr.train',
                                                              test_file='arr.test',
                                                              dev_file='arr.dev')
    return corpus


In [3]:
corpus: TaggedCorpus = read_data()
print(corpus)

2019-04-04 16:47:53,157 Reading data from ../data/conll_format
2019-04-04 16:47:53,158 Train: ../data/conll_format/arr.train
2019-04-04 16:47:53,159 Dev: ../data/conll_format/arr.dev
2019-04-04 16:47:53,160 Test: ../data/conll_format/arr.test
TaggedCorpus: 124 train + 127 dev + 127 test sentences


In [4]:
tag_type = 'ner'

# 3. make the tag dictionary from the corpus
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
print(tag_dictionary.idx2item)

[b'<unk>', b'O', b'', b'<START>', b'<STOP>']


filename = '../data/embeddings/wikipedia-pubmed-and-PMC-w2v.bin'
word_vectors = KeyedVectors.load_word2vec_format(filename, binary=True)
word_vectors.save('../data/embeddings/wiki.bg.vec.gensim')

In [5]:
filename = '../data/embeddings/wiki.bg.vec.gensim'
pretrained_word2vec = WordEmbeddings(filename)

In [6]:
embedding_types: List[TokenEmbeddings] = [

    # WordEmbeddings('glove'),

    pretrained_word2vec,

    # comment in this line to use character embeddings
    # CharacterEmbeddings(),

    # comment in these lines to use contextual string embeddings
    CharLMEmbeddings('news-forward'),
    CharLMEmbeddings('news-backward'),
]

  # This is added back by InteractiveShellApp.init_path()
  if sys.path[0] == '':


In [7]:
embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)


In [8]:
# 5. initialize sequence tagger
from flair.models import SequenceTagger

tagger: SequenceTagger = SequenceTagger(hidden_size=200,
                                        embeddings=embeddings,
                                        tag_dictionary=tag_dictionary,
                                        tag_type=tag_type,
                                        use_crf=True)

In [9]:
# 6. initialize trainer
from flair.trainers import ModelTrainer

trainer: ModelTrainer = ModelTrainer(tagger, corpus)


In [10]:
# 7. start training
if os.path.isdir('../data/model') == False:
    os.mkdir('../data/model/my_flair_model')

In [11]:
trainer.train('../data/model/ner_model',
              learning_rate=0.015,
              mini_batch_size=10,
              max_epochs=50)

2019-04-04 16:49:49,671 ----------------------------------------------------------------------------------------------------
2019-04-04 16:49:49,672 Evaluation method: MICRO_F1_SCORE
2019-04-04 16:49:49,691 ----------------------------------------------------------------------------------------------------
2019-04-04 16:49:49,837 epoch 1 - iter 0/13 - loss 12.57546234
2019-04-04 16:49:54,864 epoch 1 - iter 1/13 - loss 11.29389915
2019-04-04 16:49:57,306 epoch 1 - iter 2/13 - loss 10.17862447
2019-04-04 16:49:59,625 epoch 1 - iter 3/13 - loss 10.19319077
2019-04-04 16:50:01,744 epoch 1 - iter 4/13 - loss 9.97314301
2019-04-04 16:50:03,870 epoch 1 - iter 5/13 - loss 9.76995010
2019-04-04 16:50:05,995 epoch 1 - iter 6/13 - loss 9.07564223
2019-04-04 16:50:07,996 epoch 1 - iter 7/13 - loss 8.94653563
2019-04-04 16:50:10,231 epoch 1 - iter 8/13 - loss 8.91288872
2019-04-04 16:50:12,324 epoch 1 - iter 9/13 - loss 8.72805431
2019-04-04 16:50:14,625 epoch 1 - iter 10/13 - loss 8.45175660
2019-

RuntimeError: CUDA out of memory. Tried to allocate 73.62 MiB (GPU 0; 15.90 GiB total capacity; 8.42 GiB already allocated; 23.56 MiB free; 6.85 GiB cached)