In [1]:
from flair.data import TaggedCorpus
from flair.data_fetcher import NLPTaskDataFetcher, NLPTask
from flair.embeddings import TokenEmbeddings, FlairEmbeddings, WordEmbeddings, StackedEmbeddings, CharacterEmbeddings, CharLMEmbeddings
from flair.models import LanguageModel
from gensim.models.keyedvectors import KeyedVectors
from typing import List
import os

Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.


In [2]:
def read_data():
    # dirname = './conll_format'
    # define columns
    columns = {0: 'text', 1: 'pos', 2: 'np'}
    # this is the folder in which train, test and dev files reside
    data_folder = '../data/conll_format'
    # retrieve corpus using column format, data folder and the names of the train, dev and test files
    corpus: TaggedCorpus = NLPTaskDataFetcher.load_column_corpus(data_folder, columns,
                                                              train_file='arr.train',
                                                              test_file='arr.test',
                                                              dev_file='arr.dev')
    return corpus


In [None]:
corpus: TaggedCorpus = read_data()
print(corpus)

In [7]:
tag_type = 'ner'

# 3. make the tag dictionary from the corpus
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
print(tag_dictionary.idx2item)

[b'<unk>', b'O', b'', b'<START>', b'<STOP>']


filename = '../data/embeddings/wikipedia-pubmed-and-PMC-w2v.bin'
word_vectors = KeyedVectors.load_word2vec_format(filename, binary=True)
word_vectors.save('../data/embeddings/wiki.bg.vec.gensim')

In [8]:
filename = '../data/embeddings/wiki.bg.vec.gensim'
pretrained_word2vec = WordEmbeddings(filename)

In [9]:
embedding_types: List[TokenEmbeddings] = [

    # WordEmbeddings('glove'),

    pretrained_word2vec,

    # comment in this line to use character embeddings
    # CharacterEmbeddings(),

    # comment in these lines to use contextual string embeddings
    FlairEmbeddings('news-forward'),
    FlairEmbeddings('news-backward'),
]

In [10]:
embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)


In [None]:
# 5. initialize sequence tagger
from flair.models import SequenceTagger

tagger: SequenceTagger = SequenceTagger(hidden_size=50,
                                        embeddings=embeddings,
                                        tag_dictionary=tag_dictionary,
                                        tag_type=tag_type,
                                        use_crf=True)

In [12]:
# 6. initialize trainer
from flair.trainers import ModelTrainer

trainer: ModelTrainer = ModelTrainer(tagger, corpus)


In [13]:
# 7. start training
if os.path.isdir('../data/model') == False:
    os.mkdir('../data/model/my_flair_model')

In [14]:
trainer.train('../data/model/ner_model',
              learning_rate=0.015,
              mini_batch_size=10,
              max_epochs=50)

2019-04-04 17:07:53,181 ----------------------------------------------------------------------------------------------------
2019-04-04 17:07:53,181 Evaluation method: MICRO_F1_SCORE
2019-04-04 17:07:53,204 ----------------------------------------------------------------------------------------------------
2019-04-04 17:07:53,257 epoch 1 - iter 0/13 - loss 3.66242485
2019-04-04 17:07:55,425 epoch 1 - iter 1/13 - loss 2.88163862
2019-04-04 17:07:57,449 epoch 1 - iter 2/13 - loss 4.09905949
2019-04-04 17:07:59,519 epoch 1 - iter 3/13 - loss 3.84855251
2019-04-04 17:08:01,459 epoch 1 - iter 4/13 - loss 3.39944244
2019-04-04 17:08:03,392 epoch 1 - iter 5/13 - loss 2.97377151
2019-04-04 17:08:05,330 epoch 1 - iter 6/13 - loss 2.76312207
2019-04-04 17:08:07,259 epoch 1 - iter 7/13 - loss 2.55219790
2019-04-04 17:08:09,124 epoch 1 - iter 8/13 - loss 2.36225913
2019-04-04 17:08:11,042 epoch 1 - iter 9/13 - loss 2.19429740
2019-04-04 17:08:12,981 epoch 1 - iter 10/13 - loss 2.05927547
2019-04-0

RuntimeError: CUDA out of memory. Tried to allocate 73.62 MiB (GPU 0; 23.88 GiB total capacity; 12.76 GiB already allocated; 16.56 MiB free; 10.41 GiB cached)