In [1]:
# Attach Google Drive to the Colab VM. The data and model folders used by the
# rest of the notebook live underneath this mount point.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


In [2]:
%%capture
!pip install "flair" -q

LSTM-CRF training tutorial: [HunFlair Tutorial 2 – Training](https://github.com/flairNLP/flair/blob/master/resources/docs/HUNFLAIR_TUTORIAL_2_TRAINING.md)

Transformer training tutorial: [Training a Sequence Labeling Model](https://github.com/flairNLP/flair/blob/master/resources/docs/TUTORIAL_TRAINING_SEQUENCE_LABELER.md)

In [3]:
import flair
from flair.data import Sentence
from flair.datasets import ColumnCorpus
from flair.embeddings import (
    WordEmbeddings, TransformerWordEmbeddings, StackedEmbeddings
)
from flair.models import SequenceTagger
from flair.trainers import ModelTrainer
flair.__version__

'0.12.2'

In [4]:
# Root of the shared project folder on the mounted Drive; the corpus files and
# the saved taggers are kept in sibling sub-folders beneath it.
PROJECT_DIR = "/content/drive/Shareddrives/CIS522-Project"

DATA_PATH = f"{PROJECT_DIR}/data"      # folder holding the column-format corpus files
MODEL_PATH = f"{PROJECT_DIR}/models"   # folder holding the saved taggers

In [5]:
# Column layout of the corpus files: column 0 is the token text, column 1 its
# NER tag.
columns = {
    0: "text",
    1: "ner",
}

# Train / test split files, looked up inside DATA_PATH.
filename = "flair_ner_train.txt"
test_file = "flair_ner_test.txt"

# No dev file is passed here.
# NOTE(review): the training log reports DEV scores, so flair presumably carves
# a dev split out of the training data by itself - confirm.
corpus = ColumnCorpus(
    data_folder=DATA_PATH,
    column_format=columns,
    train_file=filename,
    test_file=test_file,
)

# Collect the span-level "ner" labels present in the corpus (no <unk> entry)
# and show them.
tag_dictionary = corpus.make_label_dictionary(
    label_type="ner",
    add_unk=False,
)
print(tag_dictionary.get_items())

2023-04-18 23:16:52,013 Reading data from /content/drive/Shareddrives/CIS522-Project/data
2023-04-18 23:16:52,018 Train: /content/drive/Shareddrives/CIS522-Project/data/flair_ner_train.txt
2023-04-18 23:16:52,019 Dev: None
2023-04-18 23:16:52,021 Test: /content/drive/Shareddrives/CIS522-Project/data/flair_ner_test.txt
2023-04-18 23:17:47,830 Computing label dictionary. Progress:


35399it [00:01, 26153.32it/s]

2023-04-18 23:17:49,228 Dictionary created for label 'ner' with 9 values: Drug (seen 84101 times), Strength (seen 59318 times), Form (seen 56550 times), Frequency (seen 48929 times), Route (seen 40228 times), Dosage (seen 32789 times), Reason (seen 13189 times), Duration (seen 3240 times), ADE (seen 2021 times)
['Drug', 'Strength', 'Form', 'Frequency', 'Route', 'Dosage', 'Reason', 'Duration', 'ADE']





In [None]:
# Per-label counts the loss weights are derived from.
# NOTE(review): these figures differ from the counts printed by
# make_label_dictionary in the corpus cell (e.g. Drug: 84101 there) - confirm
# which version of the data they were taken from.
label_counts = {
    'Drug': 87168,
    'Strength': 60400,
    'Form': 57184,
    'Frequency': 49699,
    'Route': 41022,
    'Dosage': 33289,
    'Reason': 14242,
    'Duration': 3350,
    'ADE': 2260,
}

# Weight of a label = (count of the most frequent label) / (its own count), so
# the most frequent label gets 1.0 and rarer labels get proportionally more.
# NOTE(review): the keys are span-level names, while the loaded tagger reports
# BIOES-prefixed tags (e.g. 'S-Drug') - confirm flair matches these keys when
# the dict is passed as loss_weights.
largest_count = max(label_counts.values())
weight_dict = {
    label: largest_count / count
    for label, count in label_counts.items()
}
weight_dict

{'Drug': 1.0,
 'Strength': 1.4431788079470198,
 'Form': 1.5243424734191382,
 'Frequency': 1.753918589911266,
 'Route': 2.1249085856369754,
 'Dosage': 2.6185226351046893,
 'Reason': 6.120488695407948,
 'Duration': 26.020298507462687,
 'ADE': 38.56991150442478}

See https://github.com/flairNLP/flair/issues/2801 for details on the `model_max_length` issue: the setting is passed explicitly below to avoid a tensor size mismatch error.

In [None]:
# The commented-out code below is kept for reference only: it appears to be the
# configuration the tagger was first built with. The only statement that runs in
# this cell is the SequenceTagger.load(...) call at the bottom, which reloads a
# saved tagger from MODEL_PATH instead of constructing a new one.

# # 4. initialize fine-tuneable transformer embeddings WITH document context
# tf_embeddings = TransformerWordEmbeddings(
#     model='emilyalsentzer/Bio_ClinicalBERT',
#     layers="-1",
#     subtoken_pooling="mean",
#     fine_tune=True,
#     use_context=True,
#     model_max_length=256 # required to fix a weird tensor size mismatch error
# )

# embedding_types = [
#     # word embeddings trained on PubMed and PMC
#     # WordEmbeddings("pubmed"),
#     tf_embeddings
# ]

# embeddings = StackedEmbeddings(embeddings=embedding_types)

# # 5. initialize sequence tagger (CRF enabled; no RNN, no reprojection)
# # NOTE(review): hidden_size is presumably unused while use_rnn=False - confirm.
# # NOTE(review): weight_dict is keyed by span-level labels ('Drug', ...) whereas
# # the tagger predicts BIOES tags ('S-Drug', ...) - confirm loss_weights has an
# # effect with these keys and with use_crf=True.
# tf_tagger = SequenceTagger(
#     hidden_size=256,
#     embeddings=embeddings,
#     tag_dictionary=tag_dictionary,
#     tag_type='ner',
#     tag_format='BIOES',
#     use_crf=True,
#     use_rnn=False,
#     reproject_embeddings=False,
#     loss_weights=weight_dict
# )

# Reload the tagger saved in the clinicalbert-crf run folder so the next cell
# trains this saved model further.
tf_tagger = SequenceTagger.load(f"{MODEL_PATH}/taggers/clinicalbert-crf/final-model.pt")

2023-04-15 02:21:10,106 SequenceTagger predicts: Dictionary with 39 tags: O, S-Drug, B-Drug, E-Drug, I-Drug, S-Strength, B-Strength, E-Strength, I-Strength, S-Form, B-Form, E-Form, I-Form, S-Frequency, B-Frequency, E-Frequency, I-Frequency, S-Route, B-Route, E-Route, I-Route, S-Dosage, B-Dosage, E-Dosage, I-Dosage, S-Reason, B-Reason, E-Reason, I-Reason, S-Duration, B-Duration, E-Duration, I-Duration, S-ADE, B-ADE, E-ADE, I-ADE, <START>, <STOP>


In [None]:
# 6. Pair the tagger with the corpus it will be trained on.
trainer = ModelTrainer(model=tf_tagger, corpus=corpus)

# 7. Train on the target corpus. The hyper-parameters are gathered in a single
#    mapping so they can be read and adjusted in one place; outputs of the run
#    are written to the clinicalbert-crf folder under MODEL_PATH.
training_options = {
    "base_path": f"{MODEL_PATH}/taggers/clinicalbert-crf",
    "train_with_dev": False,
    "max_epochs": 50,
    "learning_rate": 0.005,
    "mini_batch_size": 16,
    "embeddings_storage_mode": 'none',
    # "use_amp": True,
}
trainer.train(**training_options)



2023-04-15 02:22:40,544 ----------------------------------------------------------------------------------------------------
2023-04-15 02:22:40,550 Model: "SequenceTagger(
  (embeddings): StackedEmbeddings(
    (list_embedding_0): TransformerWordEmbeddings(
      (model): BertModel(
        (embeddings): BertEmbeddings(
          (word_embeddings): Embedding(28997, 768)
          (position_embeddings): Embedding(512, 768)
          (token_type_embeddings): Embedding(2, 768)
          (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (encoder): BertEncoder(
          (layer): ModuleList(
            (0-11): 12 x BertLayer(
              (attention): BertAttention(
                (self): BertSelfAttention(
                  (query): Linear(in_features=768, out_features=768, bias=True)
                  (key): Linear(in_features=768, out_features=768, bias=True)
                  (value): Linear(in_fea

100%|██████████| 246/246 [02:24<00:00,  1.71it/s]


2023-04-15 03:12:26,477 Evaluating as a multi-label problem: False
2023-04-15 03:12:26,791 DEV : loss 0.11919624358415604 - f1-score (micro avg)  0.9394
2023-04-15 03:12:26,886 BAD EPOCHS (no improvement): 0
2023-04-15 03:12:26,891 saving best model
2023-04-15 03:12:28,491 ----------------------------------------------------------------------------------------------------
2023-04-15 03:17:23,289 epoch 2 - iter 221/2213 - loss 0.22836939 - time (sec): 294.80 - samples/sec: 439.05 - lr: 0.005000
2023-04-15 03:22:17,367 epoch 2 - iter 442/2213 - loss 0.22741216 - time (sec): 588.87 - samples/sec: 437.88 - lr: 0.005000
2023-04-15 03:27:09,303 epoch 2 - iter 663/2213 - loss 0.22671612 - time (sec): 880.81 - samples/sec: 438.78 - lr: 0.005000
2023-04-15 03:32:02,152 epoch 2 - iter 884/2213 - loss 0.22493185 - time (sec): 1173.66 - samples/sec: 439.61 - lr: 0.005000
2023-04-15 03:36:54,623 epoch 2 - iter 1105/2213 - loss 0.22425766 - time (sec): 1466.13 - samples/sec: 439.78 - lr: 0.005000
20

100%|██████████| 246/246 [02:27<00:00,  1.66it/s]

2023-04-15 04:03:48,261 Evaluating as a multi-label problem: False





2023-04-15 04:03:48,544 DEV : loss 0.08421977609395981 - f1-score (micro avg)  0.9522
2023-04-15 04:03:48,639 BAD EPOCHS (no improvement): 0
2023-04-15 04:03:48,646 saving best model
2023-04-15 04:03:50,564 ----------------------------------------------------------------------------------------------------
2023-04-15 04:08:40,995 epoch 3 - iter 221/2213 - loss 0.21032126 - time (sec): 290.43 - samples/sec: 440.68 - lr: 0.005000
2023-04-15 04:13:34,600 epoch 3 - iter 442/2213 - loss 0.21138953 - time (sec): 584.03 - samples/sec: 439.61 - lr: 0.005000
2023-04-15 04:18:29,261 epoch 3 - iter 663/2213 - loss 0.21217257 - time (sec): 878.69 - samples/sec: 438.49 - lr: 0.005000
2023-04-15 04:23:23,292 epoch 3 - iter 884/2213 - loss 0.21111031 - time (sec): 1172.72 - samples/sec: 438.51 - lr: 0.005000
2023-04-15 04:28:19,724 epoch 3 - iter 1105/2213 - loss 0.21050615 - time (sec): 1469.16 - samples/sec: 438.63 - lr: 0.005000
2023-04-15 04:33:07,474 epoch 3 - iter 1326/2213 - loss 0.20975839 - 

100%|██████████| 246/246 [02:22<00:00,  1.73it/s]

2023-04-15 04:55:09,117 Evaluating as a multi-label problem: False





2023-04-15 04:55:09,415 DEV : loss 0.09575002640485764 - f1-score (micro avg)  0.9494
2023-04-15 04:55:09,510 BAD EPOCHS (no improvement): 1
2023-04-15 04:55:09,517 ----------------------------------------------------------------------------------------------------
2023-04-15 05:00:08,747 epoch 4 - iter 221/2213 - loss 0.19917283 - time (sec): 299.23 - samples/sec: 430.17 - lr: 0.005000


In [6]:
# Reload the saved tagger from the clinicalbert-crf run folder.
tagger_file = f"{MODEL_PATH}/taggers/clinicalbert-crf/final-model.pt"
tf_tagger = SequenceTagger.load(tagger_file)

# Score the tagger on the test split of the corpus and print the detailed
# (overall and per-class) report.
result = tf_tagger.evaluate(
    corpus.test,
    gold_label_type='ner',
    mini_batch_size=64,
)
print(result.detailed_results)

Downloading (…)lve/main/config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

2023-04-18 23:18:11,439 SequenceTagger predicts: Dictionary with 39 tags: O, S-Drug, B-Drug, E-Drug, I-Drug, S-Strength, B-Strength, E-Strength, I-Strength, S-Form, B-Form, E-Form, I-Form, S-Frequency, B-Frequency, E-Frequency, I-Frequency, S-Route, B-Route, E-Route, I-Route, S-Dosage, B-Dosage, E-Dosage, I-Dosage, S-Reason, B-Reason, E-Reason, I-Reason, S-Duration, B-Duration, E-Duration, I-Duration, S-ADE, B-ADE, E-ADE, I-ADE, <START>, <STOP>


100%|██████████| 397/397 [13:19<00:00,  2.01s/it]


2023-04-18 23:31:32,356 Evaluating as a multi-label problem: False

Results:
- F-score (micro) 0.9121
- F-score (macro) 0.8242
- Accuracy 0.8446

By class:
              precision    recall  f1-score   support

        Drug     0.8968    0.9390    0.9174     61167
    Strength     0.9414    0.9587    0.9500     42957
        Form     0.9257    0.9212    0.9234     41417
   Frequency     0.8670    0.8726    0.8698     36495
       Route     0.9457    0.9617    0.9536     30583
      Dosage     0.9236    0.9473    0.9353     23506
      Reason     0.6905    0.7782    0.7317      9533
    Duration     0.7659    0.7876    0.7766      1982
         ADE     0.4224    0.3141    0.3603      1299

   micro avg     0.9018    0.9227    0.9121    248939
   macro avg     0.8199    0.8312    0.8242    248939
weighted avg     0.9020    0.9227    0.9121    248939



In [None]:
# Build a demo sentence to run through the tagger.
example_text = "Patients on 40 mg of Topelfate and Topoxy twice a day for stomachache generally suffer from headache"
sentence = Sentence(example_text)

# Tag the same sentence twice and print the result after each pass:
#   1. with force_token_predictions=True -> token-level tags
#   2. with default settings             -> tags on whole spans
for predict_options in ({"force_token_predictions": True}, {}):
    tf_tagger.predict(sentence, **predict_options)
    print(sentence.to_tagged_string())

Sentence[15]: "Patients on 40 mg of Topelfate and Topoxy twice a day generally suffer from headache" → ["40"/B-Strength, "mg"/E-Strength, "Topelfate"/S-Drug, "Topoxy"/S-Drug, "twice"/B-Frequency, "a"/I-Frequency, "day"/E-Frequency, "headache"/S-Reason]
Sentence[15]: "Patients on 40 mg of Topelfate and Topoxy twice a day generally suffer from headache" → ["40 mg"/Strength, "Topelfate"/Drug, "Topoxy"/Drug, "twice a day"/Frequency, "headache"/Reason]
