### Install & Import Packages

In [None]:
%%capture
!pip install "flair" -q

In [None]:
import os

from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
import flair
from flair.data import Sentence
from flair.datasets import DataLoader
from flair.datasets import ColumnCorpus
from flair.embeddings import (
    WordEmbeddings, FlairEmbeddings, StackedEmbeddings, TransformerWordEmbeddings
)
from flair.models import RelationExtractor, SequenceTagger
from flair.trainers import ModelTrainer

flair.__version__

'0.12.2'

### Import Data

In [None]:
# Project folder on the mounted shared drive; the data files and the saved
# models live in sub-folders beneath it.
PROJECT_ROOT = "/content/drive/Shareddrives/CIS522-Project"

# Folder holding the column-formatted train/test files (trailing slash kept).
DATA_PATH = f"{PROJECT_ROOT}/data/"
# Folder under which model checkpoints are read and written.
MODEL_PATH = f"{PROJECT_ROOT}/models"

In [None]:
# Build the flair corpus from the column-formatted files under DATA_PATH.
# Column index 1 of each token line is read as the token text and column
# index 2 as its 'ner' tag. Only train and test files are named here; no dev
# file is supplied.
# Lines beginning with "# " are treated as comment lines rather than tokens
# (presumably this is where the relation annotations are stored -- confirm
# against the data files).
column_format = {1: 'text', 2: 'ner'}

corpus = ColumnCorpus(
    DATA_PATH,
    column_format,
    train_file='flair_rel_train.txt',
    test_file='flair_rel_test.txt',
    comment_symbol="# ",
)

2023-04-12 13:52:21,417 Reading data from /content/drive/Shareddrives/CIS522-Project/data
2023-04-12 13:52:21,429 Train: /content/drive/Shareddrives/CIS522-Project/data/flair_rel_train.txt
2023-04-12 13:52:21,433 Dev: None
2023-04-12 13:52:21,436 Test: /content/drive/Shareddrives/CIS522-Project/data/flair_rel_test.txt


In [None]:
# Sanity check: display the first training sentence so the parsed entity
# labels and relation labels can be inspected by eye.
corpus.train[0]

Sentence[29]: "He also may have recurrent seizures which should be treated with ativan IV or IM and do not neccessarily indicate patient needs to return to hospital unless they continue" → ["recurrent seizures"/Reason, "recurrent seizures -> ativan"/Reason-Drug, "ativan"/Drug, "IV"/Route, "IM"/Route]

In [None]:
# Sanity check: display the first test sentence so the parsed entity labels
# and relation labels can be inspected by eye.
corpus.test[0]

Sentence[14]: "MEDICATIONS : Lipitor , Tylenol with Codeine , Dilantin , previously on Decadron q.i.d" → ["Lipitor"/Drug, "Tylenol with Codeine"/Drug, "Dilantin"/Drug, "Decadron"/Drug, "q.i.d -> Decadron"/Frequency-Drug, "q.i.d"/Frequency]

In [None]:
# The relation label dictionary is required when constructing the model.
# It is built from the corpus without an <unk> entry.
label_dictionary = corpus.make_label_dictionary(
    label_type="relation",
    add_unk=False,
)

# 'O' is added by hand: it is important for capturing invalid relations,
# i.e. candidate pairs that carry none of the relation labels above.
label_dictionary.add_item('O')

relation_labels = label_dictionary.get_items()
print(relation_labels)

2023-04-10 06:59:02,099 Computing label dictionary. Progress:


32711it [00:00, 43156.08it/s]

2023-04-10 06:59:02,899 Dictionary created for label 'relation' with 8 values: Strength-Drug (seen 6063 times), Form-Drug (seen 5968 times), Frequency-Drug (seen 5691 times), Route-Drug (seen 4974 times), Reason-Drug (seen 4642 times), Dosage-Drug (seen 3785 times), ADE-Drug (seen 1011 times), Duration-Drug (seen 577 times)
['Strength-Drug', 'Form-Drug', 'Frequency-Drug', 'Route-Drug', 'Reason-Drug', 'Dosage-Drug', 'ADE-Drug', 'Duration-Drug', 'O']





### Initialize Weight Dictionary
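This is the weight dictionary used by the loss function to counter class imbalance. The weight for a given relation is the count of the most frequent relation (Strength-Drug, seen 6063 times) divided by the count of that relation, so the most frequent relation gets weight 1 and rarer relations get proportionally larger weights.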

In [None]:
# Occurrence counts per relation label, copied from the label-dictionary log
# output ("seen N times").
relation_counts = {
    'Strength-Drug': 6063,
    'Form-Drug': 5968,
    'Frequency-Drug': 5691,
    'Route-Drug': 4974,
    'Reason-Drug': 4642,
    'Dosage-Drug': 3785,
    'ADE-Drug': 1011,
    'Duration-Drug': 577,
}

# Weight = (count of the most frequent relation) / (count of this relation),
# so the most frequent relation gets 1.0 and rarer ones get larger weights.
most_frequent_count = max(relation_counts.values())
weight_dict = {
    relation: most_frequent_count / count
    for relation, count in relation_counts.items()
}

### Initialize/Load Embeddings & Model

In [None]:
# Obtain the relation extractor without any manual (un)commenting:
#   * if a checkpoint has already been saved to Drive, load it;
#   * otherwise build a fresh model from the embeddings and settings below.
# This keeps the cell runnable under "Restart & Run All" both on a fresh
# project folder and after a model has been saved.
best_model_file = f"{MODEL_PATH}/extractors/flair-embedding-rel/best-model.pt"

if os.path.isfile(best_model_file):
    # A copy of the model has been saved to Drive: reuse it.
    rel_extractor = RelationExtractor.load(best_model_file)
else:
    # First run: initialise embeddings and model from scratch.
    # Forward and backward "pubmed" Flair embeddings, stacked, with
    # fine-tuning enabled.
    embedding_types = [
        FlairEmbeddings("pubmed-forward", fine_tune=True),
        FlairEmbeddings("pubmed-backward", fine_tune=True),
    ]
    embeddings = StackedEmbeddings(embeddings=embedding_types)

    rel_extractor = RelationExtractor(
        embeddings=embeddings,
        label_type="relation",
        entity_label_type='ner',
        pooling_operation="first_last",
        label_dictionary=label_dictionary,
        # Per-relation loss weights defined in the weight-dictionary cell.
        loss_weights=weight_dict,
        # Only (attribute, Drug) entity pairs are considered as candidates.
        entity_pair_filters=[
            ('Strength', 'Drug'),
            ('Form', 'Drug'),
            ('Frequency', 'Drug'),
            ('Route', 'Drug'),
            ('Reason', 'Drug'),
            ('Dosage', 'Drug'),
            ('ADE', 'Drug'),
            ('Duration', 'Drug'),
        ],
    )

### Train Model

In [None]:
# Hyper-parameters for this training run, gathered in one place so they are
# easy to tune.
train_options = {
    "train_with_dev": False,
    "max_epochs": 5,
    "learning_rate": 0.1,
    "mini_batch_size": 8,
    "embeddings_storage_mode": 'none',
}

# Bind the model to the corpus, then train. Training output goes to the
# folder given as base_path.
trainer = ModelTrainer(rel_extractor, corpus)
trainer.train(
    base_path=f"{MODEL_PATH}/extractors/flair-embedding-rel",
    **train_options,
)

### Evaluate Model

In [None]:
# Score the model on the held-out test split against the gold 'relation'
# labels, then print the detailed per-class report.
result = rel_extractor.evaluate(
    corpus.test,
    gold_label_type='relation',
    mini_batch_size=64,
)
print(result.detailed_results)

100%|██████████| 367/367 [04:32<00:00,  1.35it/s]

2023-04-12 14:04:15,026 Evaluating as a multi-label problem: False






Results:
- F-score (micro) 0.8235
- F-score (macro) 0.7998
- Accuracy 0.7124

By class:
                precision    recall  f1-score   support

     Form-Drug     0.8904    0.8267    0.8574      4374
Frequency-Drug     0.8763    0.9251    0.9000      4034
    Route-Drug     0.6760    0.8739    0.7624      3546
 Strength-Drug     0.9228    0.7717    0.8405      4244
   Reason-Drug     0.6987    0.8487    0.7664      3410
   Dosage-Drug     0.8643    0.8883    0.8761      2695
      ADE-Drug     0.5217    0.7872    0.6275       733
 Duration-Drug     0.6818    0.8803    0.7684       426

     micro avg     0.7978    0.8508    0.8235     23462
     macro avg     0.7665    0.8502    0.7998     23462
  weighted avg     0.8153    0.8508    0.8274     23462

