In [1]:
# Attach Google Drive to the Colab runtime; the data and model paths used
# later in this notebook all live under this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [2]:
%%capture
!pip install "flair" -q
!pip install "scispacy" -q
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_core_sci_sm-0.5.1.tar.gz -q

In [16]:
import flair
from flair.data import Sentence
from flair.datasets import DataLoader
from flair.datasets import ColumnCorpus
from flair.embeddings import (
    WordEmbeddings, FlairEmbeddings, StackedEmbeddings, TransformerWordEmbeddings
)
from flair.models import RelationExtractor, SequenceTagger
from flair.trainers import ModelTrainer
# Show the installed flair version so the run can be tied to a specific release.
flair.__version__

'0.12.2'

In [6]:
# Project locations on the mounted shared drive.
# NOTE: DATA_PATH carries a trailing slash while MODEL_PATH does not; the
# f-strings in this notebook that extend MODEL_PATH add their own "/".
MODEL_PATH: str = "/content/drive/Shareddrives/CIS522-Project/models"
DATA_PATH: str = "/content/drive/Shareddrives/CIS522-Project/data/"

In [13]:
# Mapping from file column index to the annotation layer it holds:
# column 1 -> token text, column 2 -> 'ner' tag (column 0 is not mapped).
column_format = {1: 'text', 2: 'ner'}

# Only train and test files are given; no dev file is passed
# (the load log reports "Dev: None").
corpus = ColumnCorpus(
    data_folder=DATA_PATH,
    column_format=column_format,
    train_file='flair_rel_train.txt',
    test_file='flair_rel_test.txt',
    comment_symbol="# ",
)

2023-04-09 23:46:43,210 Reading data from /content/drive/Shareddrives/CIS522-Project/data
2023-04-09 23:46:43,213 Train: /content/drive/Shareddrives/CIS522-Project/data/flair_rel_train.txt
2023-04-09 23:46:43,221 Dev: None
2023-04-09 23:46:43,223 Test: /content/drive/Shareddrives/CIS522-Project/data/flair_rel_test.txt


In [6]:
# Sanity check: display the first training sentence with its parsed entity and relation labels.
corpus.train[0]

Sentence[29]: "He also may have recurrent seizures which should be treated with ativan IV or IM and do not neccessarily indicate patient needs to return to hospital unless they continue" → ["recurrent seizures"/Reason, "recurrent seizures -> ativan"/Reason-Drug, "ativan"/Drug, "IV"/Route, "IM"/Route]

In [7]:
# Sanity check: display the first test sentence with its parsed entity and relation labels.
corpus.test[0]

Sentence[14]: "MEDICATIONS : Lipitor , Tylenol with Codeine , Dilantin , previously on Decadron q.i.d" → ["Lipitor"/Drug, "Tylenol with Codeine"/Drug, "Dilantin"/Drug, "Decadron"/Drug, "q.i.d -> Decadron"/Frequency-Drug, "q.i.d"/Frequency]

In [4]:
# Build the vocabulary of relation labels found in the corpus, without an
# <unk> entry.
RELATION_LABEL_TYPE = "relation"
label_dictionary = corpus.make_label_dictionary(
    label_type=RELATION_LABEL_TYPE,
    add_unk=False,
)

# 'O' is appended by hand: important to capture invalid relations.
label_dictionary.add_item('O')

relation_labels = label_dictionary.get_items()
print(relation_labels)

2023-04-09 20:00:26,870 Computing label dictionary. Progress:


32711it [00:00, 47668.19it/s]

2023-04-09 20:00:27,566 Dictionary created for label 'relation' with 8 values: Strength-Drug (seen 6031 times), Form-Drug (seen 5978 times), Frequency-Drug (seen 5635 times), Route-Drug (seen 5001 times), Reason-Drug (seen 4689 times), Dosage-Drug (seen 3806 times), ADE-Drug (seen 991 times), Duration-Drug (seen 580 times)
['Strength-Drug', 'Form-Drug', 'Frequency-Drug', 'Route-Drug', 'Reason-Drug', 'Dosage-Drug', 'ADE-Drug', 'Duration-Drug', 'O']





In [6]:
# Hard-coded per-relation counts used to derive the loss weights.
# NOTE(review): presumably label frequencies from an earlier corpus load; they
# differ slightly from the counts logged by make_label_dictionary in this
# notebook (e.g. Strength-Drug 6021 here vs 6031 in the log) -- confirm.
relation_counts = {
    'Strength-Drug': 6021,
    'Form-Drug': 6005,
    'Frequency-Drug': 5696,
    'Route-Drug': 4934,
    'Reason-Drug': 4669,
    'Dosage-Drug': 3811,
    'ADE-Drug': 996,
    'Duration-Drug': 579,
}

# Weight each relation by (largest count / its own count): the most frequent
# relation gets 1.0 and rarer relations get proportionally larger weights.
largest_count = max(relation_counts.values())
weight_dict = {
    relation: largest_count / count
    for relation, count in relation_counts.items()
}
weight_dict

{'Strength-Drug': 1.0,
 'Form-Drug': 1.0026644462947543,
 'Frequency-Drug': 1.057057584269663,
 'Route-Drug': 1.220308066477503,
 'Reason-Drug': 1.2895695009638037,
 'Dosage-Drug': 1.5799002886381528,
 'ADE-Drug': 6.045180722891566,
 'Duration-Drug': 10.398963730569948}

In [7]:
# embedding_types = [
#     # word embeddings trained on PubMed and PMC
#     # WordEmbeddings("pubmed"),
#     # flair embeddings trained on PubMed and PMC
#     FlairEmbeddings("pubmed-forward", fine_tune=True),
#     FlairEmbeddings("pubmed-backward", fine_tune=True),
# ]

# tf_embeddings = TransformerWordEmbeddings(
#     model='emilyalsentzer/Bio_ClinicalBERT',
#     layers="-1",
#     subtoken_pooling="mean",
#     fine_tune=True,
#     use_context=True,
#     model_max_length=256 # required to fix a weird tensor size mismatch error 
# )

# embedding_types = [
#     # word embeddings trained on PubMed and PMC
#     # WordEmbeddings("pubmed"),
#     tf_embeddings
# ]

# embeddings = StackedEmbeddings(embeddings=embedding_types)

# rel_extractor = RelationExtractor(
#     embeddings=embeddings,
#     label_type="relation",
#     entity_label_type='ner',
#     pooling_operation="first_last",
#     label_dictionary=label_dictionary,
#     loss_weights=weight_dict,
#     entity_pair_filters=[
#         ('Strength', 'Drug'),
#         ('Form', 'Drug'),
#         ('Frequency', 'Drug'),
#         ('Route', 'Drug'),
#         ('Reason', 'Drug'),
#         ('Dosage', 'Drug'),
#         ('ADE', 'Drug'),
#         ('Duration', 'Drug')
#     ]
# )

# Restore a previously saved relation extractor from its best-model checkpoint
# instead of constructing a new one (the constructor call above is commented out).
extractor_checkpoint = f"{MODEL_PATH}/extractors/flair-embedding-rel/best-model.pt"
rel_extractor = RelationExtractor.load(extractor_checkpoint)

In [15]:
# Training configuration, gathered in one place.
# NOTE: base_path is the same directory the extractor checkpoint is loaded
# from, so this run writes its outputs next to that checkpoint.
training_options = dict(
    base_path=f"{MODEL_PATH}/extractors/flair-embedding-rel",
    train_with_dev=False,
    max_epochs=5,
    learning_rate=0.1,
    mini_batch_size=8,
    embeddings_storage_mode='none',
)

trainer = ModelTrainer(rel_extractor, corpus)

trainer.train(**training_options)

In [25]:
# Run the evaluation procedure on the test split.
# With the flair release used in this notebook (0.12.2), `evaluate` returns a
# single `Result` object rather than a `(result, loss)` pair, so unpacking the
# return value into two names raises a TypeError.
result = rel_extractor.evaluate(
    corpus.test, gold_label_type='relation', mini_batch_size=1
)
# Keep `score` defined for any later cell that reads it; it holds the main
# evaluation metric reported by the Result.
score = result.main_score
print(result.detailed_results)

In [11]:
# Load the saved sequence tagger (final checkpoint); it supplies the entity
# tags for the prediction example that follows.
tagger_checkpoint = f"{MODEL_PATH}/taggers/lstm-crf-augmented/final-model.pt"
tagger = SequenceTagger.load(tagger_checkpoint)

2023-04-09 23:44:38,641 SequenceTagger predicts: Dictionary with 39 tags: O, S-Drug, B-Drug, E-Drug, I-Drug, S-Strength, B-Strength, E-Strength, I-Strength, S-Form, B-Form, E-Form, I-Form, S-Frequency, B-Frequency, E-Frequency, I-Frequency, S-Route, B-Route, E-Route, I-Route, S-Dosage, B-Dosage, E-Dosage, I-Dosage, S-Reason, B-Reason, E-Reason, I-Reason, S-Duration, B-Duration, E-Duration, I-Duration, S-ADE, B-ADE, E-ADE, I-ADE, <START>, <STOP>


In [12]:
# End-to-end example on a single hand-written sentence.
example_text = "Patients on 40 mg of Topelfate and Topoxy twice a day generally suffer from headache"
sentence = Sentence(example_text)

# Apply the models in pipeline order: the entity tagger first, then the
# relation extractor on the same (now tagged) sentence.
for model in (tagger, rel_extractor):
    model.predict(sentence)

# Print the sentence together with every predicted entity and relation label.
print(sentence.to_tagged_string())

Sentence[15]: "Patients on 40 mg of Topelfate and Topoxy twice a day generally suffer from headache" → ["40 mg"/Strength, "Topelfate"/Drug, "headache -> Topelfate"/ADE-Drug, "Topoxy"/Drug, "headache -> Topoxy"/ADE-Drug, "twice a day"/Frequency, "headache"/ADE]
