In [1]:
%%capture
!pip install "flair" -q
# !pip install "scispacy" -q
# !pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_core_sci_sm-0.5.1.tar.gz -q

In [12]:
import pandas as pd
from flair.data import Corpus
from flair.datasets import ColumnCorpus
from flair.embeddings import WordEmbeddings, StackedEmbeddings, TokenEmbeddings
from typing import List

In [13]:
file_path = "/content/"
filename = "flair_train.txt"
test_file = "flair_test.txt"

In [14]:
columns = {0:"text", 1:"ner"}

In [15]:
corpus: Corpus = ColumnCorpus(file_path, columns,
                              train_file=filename,
                              test_file=test_file)

2023-04-04 00:10:48,639 Reading data from /content
2023-04-04 00:10:48,641 Train: /content/flair_train.txt
2023-04-04 00:10:48,645 Dev: None
2023-04-04 00:10:48,647 Test: /content/flair_test.txt


In [16]:
print(len(corpus.train))

36464


In [17]:
# print(corpus.train[0].to_tagged_string("ner"))

In [18]:
tag_type = "ner"
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)

  tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)


In [19]:
embedding_types : List[TokenEmbeddings] = [
        WordEmbeddings('glove'),
        ## other embeddings
        ]
embeddings : StackedEmbeddings = StackedEmbeddings(
                                 embeddings=embedding_types)

In [20]:
from flair.models import SequenceTagger
tagger : SequenceTagger = SequenceTagger(hidden_size=256,
                                       embeddings=embeddings,
                                       tag_dictionary=tag_dictionary,
                                       tag_type=tag_type,
                                       use_crf=True)
print(tagger)

2023-04-04 00:11:52,786 SequenceTagger predicts: Dictionary with 3 tags: O, <START>, <STOP>
SequenceTagger(
  (embeddings): StackedEmbeddings(
    (list_embedding_0): WordEmbeddings(
      'glove'
      (embedding): Embedding(400001, 100)
    )
  )
  (word_dropout): WordDropout(p=0.05)
  (locked_dropout): LockedDropout(p=0.5)
  (embedding2nn): Linear(in_features=100, out_features=100, bias=True)
  (rnn): LSTM(100, 256, batch_first=True, bidirectional=True)
  (linear): Linear(in_features=512, out_features=3, bias=True)
  (loss_function): ViterbiLoss()
  (crf): CRF()
)


In [None]:
from flair.trainers import ModelTrainer
trainer : ModelTrainer = ModelTrainer(tagger, corpus)
trainer.train('resources/taggers/example-ner',
              learning_rate=0.1,
              mini_batch_size=32,
              max_epochs=150)

2023-04-04 00:11:52,817 ----------------------------------------------------------------------------------------------------
2023-04-04 00:11:52,819 Model: "SequenceTagger(
  (embeddings): StackedEmbeddings(
    (list_embedding_0): WordEmbeddings(
      'glove'
      (embedding): Embedding(400001, 100)
    )
  )
  (word_dropout): WordDropout(p=0.05)
  (locked_dropout): LockedDropout(p=0.5)
  (embedding2nn): Linear(in_features=100, out_features=100, bias=True)
  (rnn): LSTM(100, 256, batch_first=True, bidirectional=True)
  (linear): Linear(in_features=512, out_features=3, bias=True)
  (loss_function): ViterbiLoss()
  (crf): CRF()
)"
2023-04-04 00:11:52,824 ----------------------------------------------------------------------------------------------------
2023-04-04 00:11:52,827 Corpus: "Corpus: 36464 train + 4052 dev + 26050 test sentences"
2023-04-04 00:11:52,829 ----------------------------------------------------------------------------------------------------
2023-04-04 00:11:52,83

 13%|█▎        | 106/815 [00:11<01:27,  8.11it/s]

In [None]:
from flair.data import Sentence
from flair.models import SequenceTagger
# load the trained model
final_model = "/content/resources/taggers/example-ner/final-model.pt"
model = SequenceTagger.load(final_model)
# create example sentence
sentence = Sentence('I have aspirin in my butt')
# predict the tags
model.predict(sentence)
print(sentence.to_tagged_string())