In [1]:
# Attach Google Drive to the Colab runtime; the data and model paths used
# later in this notebook are all located under this mount point.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


In [2]:
%%capture
!pip install "flair" -q
!pip install "scispacy" -q
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_core_sci_sm-0.5.1.tar.gz -q

LSTM-CRF: [HunFlair tutorial 2 – training NER models](https://github.com/flairNLP/flair/blob/master/resources/docs/HUNFLAIR_TUTORIAL_2_TRAINING.md)

Transformer: [Flair tutorial – training a sequence labeling model](https://github.com/flairNLP/flair/blob/master/resources/docs/TUTORIAL_TRAINING_SEQUENCE_LABELER.md)

In [1]:
import flair
from flair.data import Sentence
from flair.datasets import ColumnCorpus
from flair.embeddings import (
    WordEmbeddings, TransformerWordEmbeddings, StackedEmbeddings
)
from flair.models import SequenceTagger
from flair.trainers import ModelTrainer
flair.__version__

'0.12.2'

In [2]:
# Project folder on the mounted Google Drive. The corpus files are read from
# its "data" subfolder and trained taggers are written under "models".
_PROJECT_DIR = "/content/drive/MyDrive/Courses/2. Spring 23/1. CIS522/Project"

DATA_PATH = _PROJECT_DIR + "/data"
MODEL_PATH = _PROJECT_DIR + "/models"

In [3]:
# Column layout of the corpus files: column 0 holds the token text and
# column 1 holds its NER tag.
columns = dict(enumerate(("text", "ner")))

# Train and test files, both looked up inside DATA_PATH. No dev file is given.
filename, test_file = "flair_train.txt", "flair_test.txt"

corpus = ColumnCorpus(
    data_folder=DATA_PATH,
    column_format=columns,
    train_file=filename,
    test_file=test_file,
)

# Collect the set of NER labels present in the corpus (without an <unk> entry)
# and show them.
tag_dictionary = corpus.make_label_dictionary(label_type="ner", add_unk=False)
print(tag_dictionary.get_items())

2023-04-04 15:31:01,566 Reading data from /content/drive/MyDrive/Courses/2. Spring 23/1. CIS522/Project/data
2023-04-04 15:31:01,571 Train: /content/drive/MyDrive/Courses/2. Spring 23/1. CIS522/Project/data/flair_train.txt
2023-04-04 15:31:01,573 Dev: None
2023-04-04 15:31:01,575 Test: /content/drive/MyDrive/Courses/2. Spring 23/1. CIS522/Project/data/flair_test.txt
2023-04-04 15:31:52,955 Computing label dictionary. Progress:


36464it [00:01, 24693.16it/s]

2023-04-04 15:31:54,443 Dictionary created for label 'ner' with 9 values: Drug (seen 87012 times), Strength (seen 60254 times), Form (seen 57204 times), Frequency (seen 49697 times), Route (seen 41101 times), Dosage (seen 33271 times), Reason (seen 14201 times), Duration (seen 3390 times), ADE (seen 2243 times)
['Drug', 'Strength', 'Form', 'Frequency', 'Route', 'Dosage', 'Reason', 'Duration', 'ADE']





In [4]:
# Per-label occurrence counts used to derive inverse-frequency class weights.
# NOTE(review): these hard-coded counts are close to, but not identical to, the
# counts logged when the label dictionary was built (e.g. Drug: 87012 there vs
# 87168 here) -- confirm which version of the data they were taken from.
label_counts = {
    'Drug': 87168,
    'Strength': 60400,
    'Form': 57184,
    'Frequency': 49699,
    'Route': 41022,
    'Dosage': 33289,
    'Reason': 14242,
    'Duration': 3350,
    'ADE': 2260,
}

# Weight of a label = (count of the most frequent label) / (count of the label),
# so the most frequent label gets 1.0 and rarer labels get larger weights.
most_frequent = max(label_counts.values())
weight_dict = {label: most_frequent / count for label, count in label_counts.items()}
weight_dict

{'Drug': 1.0,
 'Strength': 1.4431788079470198,
 'Form': 1.5243424734191382,
 'Frequency': 1.753918589911266,
 'Route': 2.1249085856369754,
 'Dosage': 2.6185226351046893,
 'Reason': 6.120488695407948,
 'Duration': 26.020298507462687,
 'ADE': 38.56991150442478}

In [19]:
# 4. initialize fine-tuneable transformer embeddings WITH document context
tf_embeddings = TransformerWordEmbeddings(
    model='emilyalsentzer/Bio_ClinicalBERT',
    layers="-1",
    subtoken_pooling="mean",
    fine_tune=True,
    use_context=True,
    model_max_length=512
)

embedding_types = [
    # word embeddings trained on PubMed and PMC
    # WordEmbeddings("pubmed"),
    tf_embeddings
]

embeddings = StackedEmbeddings(embeddings=embedding_types)

# The tagger expands every span label into BIOES-prefixed tags (S-Drug, B-Drug,
# E-Drug, I-Drug, ...), so per-class loss weights have to be keyed by those
# prefixed tags. Keys such as 'Drug' never match a tag and would silently leave
# every class at the default weight.
tag_loss_weights = {
    f"{prefix}-{label}": weight
    for label, weight in weight_dict.items()
    for prefix in ("S", "B", "E", "I")
}

# 5. initialize transformer + CRF sequence tagger (no RNN, no reprojection)
# NOTE(review): in flair 0.12 the loss weights appear to be applied only to the
# cross-entropy loss used when use_crf=False; with the CRF enabled the Viterbi
# loss does not take class weights, so `loss_weights` has no effect on this
# configuration. Set use_crf=False if class weighting is required -- verify
# against the installed flair version.
tf_tagger = SequenceTagger(
    hidden_size=256,  # size of the RNN hidden state; unused while use_rnn=False
    embeddings=embeddings,
    tag_dictionary=tag_dictionary,
    tag_type='ner',
    use_crf=True,
    use_rnn=False,
    reproject_embeddings=False,
    loss_weights=tag_loss_weights
)

2023-04-04 15:54:53,718 SequenceTagger predicts: Dictionary with 37 tags: O, S-Drug, B-Drug, E-Drug, I-Drug, S-Strength, B-Strength, E-Strength, I-Strength, S-Form, B-Form, E-Form, I-Form, S-Frequency, B-Frequency, E-Frequency, I-Frequency, S-Route, B-Route, E-Route, I-Route, S-Dosage, B-Dosage, E-Dosage, I-Dosage, S-Reason, B-Reason, E-Reason, I-Reason, S-Duration, B-Duration, E-Duration, I-Duration, S-ADE, B-ADE, E-ADE, I-ADE


In [None]:
# 6. initialize trainer
trainer = ModelTrainer(tf_tagger, corpus)

# 7. fine-tune on the target corpus
# Use fine_tune() rather than train(): train() defaults to plain SGD with
# plateau annealing, for which a learning rate of 5e-6 is far too small to move
# the transformer weights. fine_tune() forwards to train() with AdamW and a
# linear warmup/decay schedule, which is the setup the Flair transformer
# tutorial pairs with this learning rate. Extra keyword arguments such as
# train_with_dev are passed through to train().
trainer.fine_tune(
    base_path=f"{MODEL_PATH}/taggers/clinicalbert-crf",
    learning_rate=5.0e-6,
    max_epochs=1,
    mini_batch_size=16,
    embeddings_storage_mode='none',
    train_with_dev=False,
)

2023-04-04 15:55:04,817 ----------------------------------------------------------------------------------------------------
2023-04-04 15:55:04,828 Model: "SequenceTagger(
  (embeddings): StackedEmbeddings(
    (list_embedding_0): TransformerWordEmbeddings(
      (model): BertModel(
        (embeddings): BertEmbeddings(
          (word_embeddings): Embedding(28997, 768)
          (position_embeddings): Embedding(512, 768)
          (token_type_embeddings): Embedding(2, 768)
          (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (encoder): BertEncoder(
          (layer): ModuleList(
            (0-11): 12 x BertLayer(
              (attention): BertAttention(
                (self): BertSelfAttention(
                  (query): Linear(in_features=768, out_features=768, bias=True)
                  (key): Linear(in_features=768, out_features=768, bias=True)
                  (value): Linear(in_fea

In [None]:
# Reload the tagger saved by the training cell above (uncomment to run inference).
# loaded_model = SequenceTagger.load(f"{MODEL_PATH}/taggers/clinicalbert-crf/final-model.pt")

In [None]:
# # create example sentence
# sentence = Sentence("Women who smoke 20 cigarettes a day are four times more likely to develop breast cancer.")

# # predict tags and print
# loaded_model.predict(sentence)

# print(sentence.to_tagged_string())