In [None]:
# Dependency: flair. Uncomment the next line to install it into the kernel's environment.
# %pip install flair

In [10]:
from flair.data import Corpus
from flair.datasets import ColumnCorpus

'/home/ubuntu/ML_cylynx_nlp/training/NER/Flair Model'

In [16]:
# Column layout of the data files: column 0 is the token text, column 1 the NER tag.
columns = {0: 'text', 1: 'ner'}
data_folder = '../data'


def _read_corpus(train_file: str, test_file: str) -> Corpus:
    """Build a ColumnCorpus from ``data_folder`` using the shared column map.

    Only a train and a test file are passed; no dev file is given.
    """
    return ColumnCorpus(data_folder, columns,
                        train_file=train_file,
                        test_file=test_file)


# Corpus that the custom tagger is trained and evaluated on.
corpus: Corpus = _read_corpus('news_train_sep.conll', 'news_test_sep.conll')

# Corpus whose test split the pre-trained "ner" baseline is evaluated on.
baseline_corpus: Corpus = _read_corpus('baseline_news_train_sep.conll',
                                       'baseline_news_test_sep.conll')

2021-06-30 14:00:23,222 Reading data from ../data
2021-06-30 14:00:23,225 Train: ../data/news_train_sep.conll
2021-06-30 14:00:23,226 Dev: None
2021-06-30 14:00:23,228 Test: ../data/news_test_sep.conll
2021-06-30 14:00:24,078 Reading data from ../data
2021-06-30 14:00:24,080 Train: ../data/baseline_news_train_sep.conll
2021-06-30 14:00:24,081 Dev: None
2021-06-30 14:00:24,082 Test: ../data/baseline_news_test_sep.conll


In [4]:
# Sanity check of the loaded corpus: size of the train split, then the test
# split, followed by the first training sentence rendered with its 'ner' tags.
for split in (corpus.train, corpus.test):
    print(len(split))

first_sentence = corpus.train[0]
print(first_sentence.to_tagged_string('ner'))

732
349
Former Peopleâ€™s Bank <B-Entity> of <I-Entity> China <I-Entity> deputy governor calls Bitcoin <B-Entity> a â€œcommercial successâ€, Despite Chinaâ€™s reluctance to regulate cryptocurrencies in the country, many anecdotes suggest higher officials both understand and acknowledge the potential of a digital currency-based economy.


In [5]:
# Name of the label layer the tagger will predict; it matches the 'ner' column
# declared when the corpus was loaded.
tag_type = 'ner'
# Tag vocabulary built from the corpus for that label layer; it is passed to the
# SequenceTagger constructor further down.
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
# Print the dictionary so the tag set can be inspected.
print(tag_dictionary)

Dictionary with 6 tags: <unk>, O, B-Entity, I-Entity, <START>, <STOP>


In [None]:
from flair.embeddings import WordEmbeddings, FlairEmbeddings, StackedEmbeddings

# Three embeddings are combined: GloVe word embeddings plus the forward and
# backward 'news' Flair embeddings. They are created and listed in the same
# order as before so the stacked layout does not change.
glove_embedding = WordEmbeddings('glove')
flair_forward = FlairEmbeddings('news-forward')
flair_backward = FlairEmbeddings('news-backward')

embedding_types = [glove_embedding, flair_forward, flair_backward]
embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

In [11]:
from flair.models import SequenceTagger

# Tagger settings gathered in one mapping and unpacked as keyword arguments.
# It reuses the stacked embeddings, tag dictionary and tag type defined in
# earlier cells.
tagger_options = dict(
    hidden_size=256,
    embeddings=embeddings,
    tag_dictionary=tag_dictionary,
    tag_type=tag_type,
    use_crf=True,
)
tagger: SequenceTagger = SequenceTagger(**tagger_options)

In [14]:
from flair.trainers import ModelTrainer

trainer: ModelTrainer = ModelTrainer(tagger, corpus)

# Training settings, unpacked into the train() call below.
training_options = {
    'learning_rate': 0.1,
    'mini_batch_size': 32,
    'max_epochs': 150,
}

# Left as the cell's final expression so the value returned by train() is
# still displayed as the cell output.
trainer.train('resources/taggers/crypto-headlines-ner', **training_options)

2021-03-22 05:39:29,771 ----------------------------------------------------------------------------------------------------
2021-03-22 05:39:29,773 Model: "SequenceTagger(
  (embeddings): StackedEmbeddings(
    (list_embedding_0): WordEmbeddings('glove')
    (list_embedding_1): FlairEmbeddings(
      (lm): LanguageModel(
        (drop): Dropout(p=0.05, inplace=False)
        (encoder): Embedding(300, 100)
        (rnn): LSTM(100, 2048)
        (decoder): Linear(in_features=2048, out_features=300, bias=True)
      )
    )
    (list_embedding_2): FlairEmbeddings(
      (lm): LanguageModel(
        (drop): Dropout(p=0.05, inplace=False)
        (encoder): Embedding(300, 100)
        (rnn): LSTM(100, 2048)
        (decoder): Linear(in_features=2048, out_features=300, bias=True)
      )
    )
  )
  (word_dropout): WordDropout(p=0.05)
  (locked_dropout): LockedDropout(p=0.5)
  (embedding2nn): Linear(in_features=4196, out_features=4196, bias=True)
  (rnn): LSTM(4196, 256, batch_first=True, b

{'dev_loss_history': [5.395751953125,
  3.7161755561828613,
  3.1134839057922363,
  2.9538779258728027,
  3.087730884552002,
  2.6872496604919434,
  2.7048585414886475,
  2.459933280944824,
  2.5288567543029785,
  2.106947898864746,
  2.1538662910461426,
  2.223433494567871,
  2.1805286407470703,
  2.1350769996643066,
  1.95393705368042,
  1.9650219678878784,
  1.9530932903289795,
  2.2455735206604004,
  2.150578498840332,
  2.0434327125549316,
  2.075129985809326,
  2.006521224975586,
  1.9323076009750366,
  2.219982385635376,
  1.9710736274719238,
  1.9719645977020264,
  1.9666823148727417,
  2.0047688484191895,
  1.95192551612854,
  1.945572853088379,
  1.9847053289413452,
  1.9655345678329468,
  1.9777559041976929,
  1.9728507995605469,
  1.9832299947738647,
  1.9691569805145264,
  1.9768331050872803,
  1.971834659576416,
  1.9799174070358276,
  1.9725704193115234,
  1.9746026992797852,
  1.9800546169281006,
  1.9806995391845703,
  1.9834359884262085,
  1.9809513092041016,
  1.9799

In [8]:
import os

from flair.data import Sentence
from flair.models import SequenceTagger

# Location of the custom tagger. The default is the path this notebook was
# originally run with; set CRYPTO_NER_MODEL_PATH to point somewhere else
# without editing the cell.
MODEL_PATH = os.environ.get(
    "CRYPTO_NER_MODEL_PATH",
    "/home/ubuntu/ML_cylynx_nlp/cryptonews/prediction/models/ner-model.pt",
)

# `model` is the custom tagger; `base_model` is the pre-trained "ner" model
# used as the baseline in the evaluation cells.
model = SequenceTagger.load(MODEL_PATH)
base_model = SequenceTagger.load("ner")

# One sample text, tagged by both models so their outputs can be compared.
sample_text = "Coinbase Settles With CFTC for $6.5M Over Old Trading Practices. Coinbase will pay a $6.5 million fine to settle allegations it self-traded cryptocurrencies between 2015 and 2018."

# Each model gets a fresh Sentence so a printed string only carries that
# model's tags. Custom model first, baseline second, as before.
for tagger_under_test in (model, base_model):
    sentence = Sentence(sample_text)
    tagger_under_test.predict(sentence)
    print(sentence.to_tagged_string())


2021-06-30 13:32:48,783 loading file /home/ubuntu/ML_cylynx_nlp/cryptonews/prediction/models/ner-model.pt
2021-06-30 13:32:52,623 --------------------------------------------------------------------------------
2021-06-30 13:32:52,625 The model key 'ner' now maps to 'https://huggingface.co/flair/ner-english' on the HuggingFace ModelHub
2021-06-30 13:32:52,626  - The most current version of the model is automatically downloaded from there.
2021-06-30 13:32:52,627  - (you can alternatively manually download the original model at https://nlp.informatik.hu-berlin.de/resources/models/ner/en-ner-conll03-v0.4.pt)
2021-06-30 13:32:52,628 --------------------------------------------------------------------------------
2021-06-30 13:32:53,501 loading file /home/ubuntu/.flair/models/ner-english/4f4cdab26f24cb98b732b389e6cebc646c36f54cfd6e0b7d3b90b25656e4262f.8baa8ae8795f4df80b28e7f7b61d788ecbb057d1dc85aacb316f1bd02837a4a4
Coinbase <B-Entity> Settles With CFTC <B-Entity> for $ 6.5M Over Old Tradin

In [18]:
# Tag one headline with `model` and print the tagged string.
headline = "Bitcoin Price Chart Shows Bull Fatigue as Analyst Sees ‘Rising Wedge’"
sentence = Sentence(headline)
model.predict(sentence)

print(sentence.to_tagged_string())

Bitcoin <B-Entity> Price Chart Shows Bull Fatigue as Analyst Sees ‘ Rising Wedge’


In [19]:
# Tag one headline with `model` and print the tagged string.
headline = "$4.6M in Filecoin ‘Double Deposited’ on Binance; Exploit Open on Other Exchanges"
sentence = Sentence(headline)
model.predict(sentence)

print(sentence.to_tagged_string())

$ 4.6M in Filecoin <B-Entity> ‘ Double Deposited’ on Binance <B-Entity> ; Exploit Open on Other Exchanges


In [14]:
# Evaluate `model` on the test split of `corpus`; evaluate() is given
# predictions.txt as its output path.
evaluation = model.evaluate(corpus.test, mini_batch_size=32, out_path="predictions.txt")
result, score = evaluation

# Per-class breakdown (tp / fp / fn, precision, recall, F1).
print(result.detailed_results)


Results:
- F1-score (micro) 0.7929
- F1-score (macro) 0.7929

By class:
Entity     tp: 490 - fp: 141 - fn: 115 - precision: 0.7765 - recall: 0.8099 - f1-score: 0.7929


In [17]:
# Evaluate the pre-trained "ner" baseline on the test split of `baseline_corpus`.
#
# The output path is baseline_predictions.txt rather than predictions.txt:
# the custom-model evaluation cell already uses predictions.txt, so sharing the
# name would silently overwrite those predictions.
#
# NOTE(review): in the recorded output LOC, MISC and PER all show fn: 0, i.e.
# the gold data appears to contain only ORG spans. Every LOC/MISC/PER prediction
# is therefore a false positive and pulls macro-F1 down - confirm that this is
# the intended comparison.
result, score = base_model.evaluate(baseline_corpus.test, mini_batch_size=32,
                                    out_path="baseline_predictions.txt")

# Per-class breakdown (tp / fp / fn, precision, recall, F1).
print(result.detailed_results)


Results:
- F1-score (micro) 0.3223
- F1-score (macro) 0.1030

By class:
LOC        tp: 0 - fp: 68 - fn: 0 - precision: 0.0000 - recall: 0.0000 - f1-score: 0.0000
MISC       tp: 0 - fp: 155 - fn: 0 - precision: 0.0000 - recall: 0.0000 - f1-score: 0.0000
ORG        tp: 215 - fp: 224 - fn: 390 - precision: 0.4897 - recall: 0.3554 - f1-score: 0.4119
PER        tp: 0 - fp: 67 - fn: 0 - precision: 0.0000 - recall: 0.0000 - f1-score: 0.0000
