In [1]:
from flair.data import TaggedCorpus
from flair.data_fetcher import NLPTaskDataFetcher, NLPTask
from flair.embeddings import TokenEmbeddings, PooledFlairEmbeddings, WordEmbeddings, StackedEmbeddings, CharacterEmbeddings, CharLMEmbeddings
from flair.models import LanguageModel
from gensim.models.keyedvectors import KeyedVectors
from typing import List
import os

Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.


def read_data():
    """Load the column-formatted corpus stored under ``../data/conll_format``.

    Every line of the data files is interpreted as three columns:
    column 0 is labelled ``text``, column 1 ``pos`` and column 2 ``np``.
    The splits are read from ``arr.train``, ``arr.dev`` and ``arr.test``
    inside that folder.

    Returns:
        Whatever ``NLPTaskDataFetcher.load_column_corpus`` produces for the
        folder, column mapping and split file names given above.
    """
    # Column index -> name of the annotation layer stored in that column.
    column_format = {0: 'text', 1: 'pos', 2: 'np'}

    # File name of each split, keyed by the fetcher's keyword argument.
    split_files = {
        'train_file': 'arr.train',
        'test_file': 'arr.test',
        'dev_file': 'arr.dev',
    }

    return NLPTaskDataFetcher.load_column_corpus(
        '../data/conll_format', column_format, **split_files
    )


In [None]:
# Fetch the WikiNER English task corpus, then downsample it with factor 0.1
# so the experiment runs on a fraction of the data.
corpus: TaggedCorpus = (
    NLPTaskDataFetcher
    .load_corpus(NLPTask.WIKINER_ENGLISH)
    .downsample(0.1)
)


2019-04-09 00:43:28,058 Reading data from /root/.flair/datasets/wikiner_english
2019-04-09 00:43:28,061 Train: /root/.flair/datasets/wikiner_english/aij-wikiner-en-wp3.train
2019-04-09 00:43:28,062 Dev: None
2019-04-09 00:43:28,063 Test: None


In [None]:
# Alternative data source: uncomment the next line to replace the WikiNER
# corpus with the local column corpus loaded by read_data().
# NOTE(review): read_data() names its label columns 'pos' and 'np', while
# tag_type is set to 'ner' further down — adjust tag_type if you switch.
#corpus: TaggedCorpus = read_data()
# Show the corpus object's printed summary.
print(corpus)

In [None]:
# Name of the annotation layer used to build the tag dictionary here and
# passed to the SequenceTagger in a later cell.
tag_type = 'ner'

# 3. make the tag dictionary from the corpus
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
# Display the dictionary's index -> item list.
print(tag_dictionary.idx2item)

# Convert the binary word2vec file into a gensim-saved KeyedVectors file;
# the saved path is the one handed to WordEmbeddings in the next cell.
# NOTE(review): the output name 'wiki.bg.vec.gensim' does not reflect the
# PubMed/PMC source file — presumably a leftover name; the next cell loads
# this exact path, so rename both together if at all.
# NOTE(review): this conversion runs on every execution of the cell, even if
# the output file already exists.
filename = '../data/embeddings/wikipedia-pubmed-and-PMC-w2v.bin'
word_vectors = KeyedVectors.load_word2vec_format(filename, binary=True)
word_vectors.save('../data/embeddings/wiki.bg.vec.gensim')

In [None]:
# Wrap the gensim-format vectors saved by the conversion cell above in a
# flair WordEmbeddings object.
# Note: `filename` is rebound here; it previously pointed at the .bin source.
filename = '../data/embeddings/wiki.bg.vec.gensim'
# Currently only referenced from a commented-out entry in embedding_types.
pretrained_word2vec = WordEmbeddings(filename)

In [None]:
# Embeddings that will be stacked and fed to the tagger: 'glove' word
# embeddings plus pooled Flair embeddings for the 'news-forward' and
# 'news-backward' models, both created with pooling='min'.
embedding_types: List[TokenEmbeddings] = [
    # Uncomment to also include the locally converted word2vec vectors.
    #pretrained_word2vec,
    WordEmbeddings('glove'),
    PooledFlairEmbeddings('news-forward', pooling='min'),
    PooledFlairEmbeddings('news-backward', pooling='min'),
]

In [None]:
# Combine every entry of embedding_types into one StackedEmbeddings object;
# this is what the SequenceTagger receives as its `embeddings` argument.
embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)


In [None]:
# 5. initialize sequence tagger
from flair.models import SequenceTagger

# The tagger is built from the stacked embeddings and the tag dictionary
# prepared in the earlier cells, with hidden_size=200 and use_crf=True.
tagger: SequenceTagger = SequenceTagger(
    hidden_size=200,
    embeddings=embeddings,
    tag_dictionary=tag_dictionary,
    tag_type=tag_type,
    use_crf=True,
)

In [None]:
# 6. initialize trainer
from flair.trainers import ModelTrainer

# Bind the tagger to the corpus it will be trained on; training itself is
# started in the next cell via trainer.train(...).
trainer: ModelTrainer = ModelTrainer(tagger, corpus)


In [None]:
# 7. start training
# Directory handed to trainer.train() as the output location.
model_dir = './model/ner_model'

# Ensure the directory that is actually used exists. The earlier guard
# tested '../data/model' but created '../data/my_flair_model', so it never
# matched the training target and raised FileExistsError on a re-run.
# makedirs(..., exist_ok=True) also creates the './model' parent and is
# safe to execute repeatedly.
os.makedirs(model_dir, exist_ok=True)

trainer.train(model_dir,
              learning_rate=0.015,
              mini_batch_size=20,
              max_epochs=150)

In [None]:
import datetime

# `datetime` is the module here, so now() must be reached through the
# datetime.datetime class; calling datetime.now() raises AttributeError.
print(datetime.datetime.now())