In [1]:
#!/usr/bin/python
# -*- coding: UTF-8 -*-

'''
@version:0.1
@author:Cai Qingpeng
@file: test.py
@time: 2020/3/18 7:30 PM
'''



import os
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

from flair.data import Corpus
from flair.datasets import ColumnCorpus

# define columns
columns = {0: 'text', 1: '_', 2: '_', 3: 'ner'}

# this is the folder in which train, test and dev files reside
data_folder = './data'  # /path/to/data/folder

# init a corpus using column format, data folder and the names of the train, dev and test files
corpus: Corpus = ColumnCorpus(data_folder, columns,
                              train_file='train.txt',
                              test_file='test.txt',
                              dev_file='valid.txt')

print(corpus)

# 2. what tag do we want to predict?
tag_type = 'ner'

# 3. make the tag dictionary from the corpus
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
print(tag_dictionary)


2020-03-18 22:10:57,678 Reading data from data
2020-03-18 22:10:57,679 Train: data/train.txt
2020-03-18 22:10:57,680 Dev: data/valid.txt
2020-03-18 22:10:57,681 Test: data/test.txt
Corpus: 14041 train + 3250 dev + 3453 test sentences
Dictionary with 12 tags: <unk>, O, B-ORG, B-MISC, B-PER, I-PER, B-LOC, I-ORG, I-MISC, I-LOC, <START>, <STOP>


In [29]:
dir(tag_dictionary),tag_dictionary.get_items()

(['__class__',
  '__delattr__',
  '__dict__',
  '__dir__',
  '__doc__',
  '__eq__',
  '__format__',
  '__ge__',
  '__getattribute__',
  '__gt__',
  '__hash__',
  '__init__',
  '__init_subclass__',
  '__le__',
  '__len__',
  '__lt__',
  '__module__',
  '__ne__',
  '__new__',
  '__reduce__',
  '__reduce_ex__',
  '__repr__',
  '__setattr__',
  '__sizeof__',
  '__str__',
  '__subclasshook__',
  '__weakref__',
  'add_item',
  'get_idx_for_item',
  'get_idx_for_items',
  'get_item_for_index',
  'get_items',
  'idx2item',
  'item2idx',
  'load',
  'load_from_file',
  'multi_label',
  'save'],
 ['<unk>',
  'O',
  'B-ORG',
  'B-MISC',
  'B-PER',
  'I-PER',
  'B-LOC',
  'I-ORG',
  'I-MISC',
  'I-LOC',
  '<START>',
  '<STOP>'])

In [15]:
# 4. initialize embeddings
from flair.embeddings import ELMoEmbeddings
from flair.embeddings import BertEmbeddings
elmo_embedding = ELMoEmbeddings("small")

# 5. initialize sequence tagger
from flair.models import SequenceTagger

tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                        embeddings=elmo_embedding,
                                        tag_dictionary=tag_dictionary,
                                        tag_type=tag_type,
                                        use_crf=True)

elmo_model = SequenceTagger.load('./log/elmo/best-model.pt')

2020-03-18 22:19:40,678 loading file ./log/elmo/best-model.pt


In [3]:
# create example sentence
sentence = corpus.test[0]

for entity in sentence.get_spans('ner'):
    print(entity)

for token in sentence.tokens:
    print(str(token.get_tag("ner")))
    print(str(token.get_tags_proba_dist("ner")))

LOC-span [3]: "JAPAN"
PER-span [8]: "CHINA"
O (1.0)
[]
O (1.0)
[]
B-LOC (1.0)
[]
O (1.0)
[]
O (1.0)
[]
O (1.0)
[]
O (1.0)
[]
B-PER (1.0)
[]
O (1.0)
[]
O (1.0)
[]
O (1.0)
[]
O (1.0)
[]


In [4]:
elmo_model.predict(sentence,all_tag_prob=True)

for token in sentence.tokens:
    print(token.get_tag("ner").value)
    print(token.get_tags_proba_dist("ner"))

O
[<unk> (3.2936790375970304e-05), O (0.9889442920684814), B-ORG (0.0054200575686991215), B-MISC (0.0007958008209243417), B-PER (0.00099275226239115), I-PER (4.4818771129939705e-05), B-LOC (0.0030163165647536516), I-ORG (0.0003464781621005386), I-MISC (5.788022463093512e-05), I-LOC (0.00011808297131210566), <START> (0.0), <STOP> (0.00023058376973494887)]
O
[<unk> (3.7415318843159184e-07), O (0.9998051524162292), B-ORG (2.11851074709557e-05), B-MISC (9.066449820238631e-06), B-PER (2.9690656447201036e-05), I-PER (5.832266651850659e-07), B-LOC (9.572742419550195e-05), I-ORG (3.318772314742091e-06), I-MISC (2.276536406498053e-06), I-LOC (5.094615858070028e-07), <START> (0.0), <STOP> (3.221273800591007e-05)]
B-LOC
[<unk> (6.541900074807927e-05), O (0.0016433220589533448), B-ORG (0.08798980712890625), B-MISC (0.020808326080441475), B-PER (0.23673340678215027), I-PER (0.00021848923643119633), B-LOC (0.6437327861785889), I-ORG (0.00015517014253418893), I-MISC (2.072861389024183e-05), I-LOC (0.

In [5]:
real = []

for sentence in corpus.test:
    for token in sentence.tokens:
        real.append(token.get_tag("ner").value)
str(real)

"['O', 'O', 'B-LOC', 'O', 'O', 'O', 'O', 'B-LOC', 'O', 'O', 'O', 'O', 'B-PER', 'I-PER', 'B-LOC', 'O', 'B-LOC', 'I-LOC', 'I-LOC', 'O', 'B-LOC', 'O', 'O', 'O', 'O', 'O', 'B-MISC', 'I-MISC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'O', 'B-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-MISC', 'O', 'B-PER', 'I-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-MISC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PER', 'I-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-MISC', 'O', 'O', 'O', 'O', 'O', 'B-MISC', 'I-MISC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-MISC', 'I-MISC', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', '

In [6]:
pred = []
for sentence in corpus.test:
    elmo_model.predict(sentence,all_tag_prob=True)
    for token in sentence.tokens:
        pred.append(token.get_tag("ner").value)
str(pred)

"['O', 'O', 'B-LOC', 'O', 'O', 'O', 'O', 'B-LOC', 'O', 'O', 'O', 'O', 'B-PER', 'I-PER', 'B-LOC', 'O', 'B-LOC', 'I-LOC', 'I-LOC', 'O', 'B-LOC', 'O', 'O', 'O', 'O', 'O', 'B-MISC', 'I-MISC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'O', 'O', 'B-MISC', 'I-MISC', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'O', 'B-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-MISC', 'O', 'B-PER', 'I-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-MISC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PER', 'I-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-MISC', 'O', 'O', 'O', 'O', 'O', 'B-MISC', 'I-MISC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-MISC', 'I-MISC', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', '

In [7]:
from conlleval import evaluate
print(evaluate(real,pred))

processed 46435 tokens with 5648 phrases; found: 5684 phrases; correct: 5117.
accuracy:  91.77%; (non-O)
accuracy:  97.93%; precision:  90.02%; recall:  90.60%; FB1:  90.31
              LOC: precision:  92.27%; recall:  92.27%; FB1:  92.27  1669
             MISC: precision:  77.12%; recall:  80.20%; FB1:  78.63  730
              ORG: precision:  88.33%; recall:  88.44%; FB1:  88.39  1663
              PER: precision:  95.25%; recall:  95.61%; FB1:  95.43  1622
(90.02463054187191, 90.5984419263456, 90.31062477938579)


In [8]:
os.listdir('./log')

['bert', 'elmo']

In [31]:
import numpy as np
labels = tag_dictionary.get_items()
print(labels)

bert_model = SequenceTagger.load('./log/bert/best-model.pt')
elmo_model = SequenceTagger.load('./log/elmo/best-model.pt')


['<unk>', 'O', 'B-ORG', 'B-MISC', 'B-PER', 'I-PER', 'B-LOC', 'I-ORG', 'I-MISC', 'I-LOC', '<START>', '<STOP>']
2020-03-18 22:33:13,617 loading file ./log/bert/best-model.pt
2020-03-18 22:33:14,109 loading file ./log/elmo/best-model.pt


In [45]:
def cal_proba_score(model,sentence):
    model.predict(sentence,all_tag_prob=True)
    score = []
    for t_id, token in enumerate(sentence.tokens):
        # print(token.get_tag("ner").value)
        # print(token.get_tags_proba_dist("ner"))
        for index,item in enumerate(token.get_tags_proba_dist("ner")):
            # print(item.value)
            # print(item.score)
            score.append(item.score)
    return score

mix_pred = []
for sentence in corpus.test:
    scores = []
    elmo_score = cal_proba_score(bert_model,sentence)
    scores.append(elmo_score) 

    bert_score = cal_proba_score(elmo_model,sentence)
    scores.append(bert_score) 

    scores = np.mean(scores,axis=0)
    result = np.reshape(scores,(len(sentence),len(labels)))

    id_result = np.argmax(result,axis=1)
    la_result = [tag_dictionary.get_item_for_index(i) for i in id_result]
    
    mix_pred.extend(la_result)


In [47]:
print(evaluate(real,mix_pred))

processed 46435 tokens with 5648 phrases; found: 5693 phrases; correct: 5188.
accuracy:  92.90%; (non-O)
accuracy:  98.33%; precision:  91.13%; recall:  91.86%; FB1:  91.49
              LOC: precision:  92.77%; recall:  92.21%; FB1:  92.49  1659
             MISC: precision:  79.56%; recall:  82.05%; FB1:  80.79  724
              ORG: precision:  88.54%; recall:  90.67%; FB1:  89.59  1701
              PER: precision:  97.39%; recall:  96.97%; FB1:  97.18  1609
(91.12945722817494, 91.85552407932012, 91.4910501719425)
