In [234]:
# Load base packages
import pandas as pd


import numpy as np


import matplotlib.pyplot as plt


import seaborn as sns
import os

# load dataset tools
import datasets
from datasets import load_dataset

# load models


# load eval tools
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support

In [219]:
# Extract handout.txt from each subdirectory of RawData
def read_handout_txt():
    data = []

    for root, dirs, files in os.walk("./data/RawData/"):
        try:
            with open(os.path.join(root, "handout.txt"), "r") as f:
                handout = f.readlines()

            lines = []
            for line in handout:
                line = line.strip()
                sents = line.split(".")
                for sent in sents:
                    sent = sent.strip()
                    if sent:
                        lines.append(sent)

            # number lines
            lines = [
                {
                    "Drug name": root.split("/")[-1],
                    "Line": line,
                    "Line number": i,
                }
                for i, line in enumerate(lines)
            ]

            data.extend(lines)

        except:
            handout = None
            print(f"{root} Not Found")

    return data

# Load the data

In [220]:
anno_df = pd.read_csv("data/AnnotatedData/AnnotatedDUGData.tsv", sep="\t")
handout_df = pd.DataFrame(read_handout_txt())

./data/RawData/ Not Found
./data/RawData/Coreg Not Found


In [221]:
anno_df.head()

Unnamed: 0,Drug name,Drug number,Line number,Advice Text,AdviceTag1,AdviceTag2,AdviceTag3,AdviceTag4,Medication,Food,Activity,Exercise,Disease
0,Abilify,0,34,To reduce the risk of dizziness and lightheade...,Activity or lifestyle related,,,,,,sitting; lying; rising; getting up;,,
1,Abilify,0,38,This medication may rarely make your blood sug...,Disease or symptom related,,,,,,,,diabetes
2,Abilify,0,43,This medication may rarely cause a condition k...,Disease or symptom related,,,,,,,,tardive dyskinesia
3,Abilify,0,64,This drug may make you dizzy or drowsy or caus...,Disease or symptom related,Activity or lifestyle related,,,,,driving; using machineray; activity requiring ...,,
4,Abilify,0,66,Avoid alcoholic beverages.,Food or beverage related,,,,,alcohol,,,


In [222]:
handout_df.head()

Unnamed: 0,Drug name,Line,Line number
0,Abilify,Patient Educationaripiprazole intramuscular,0
1,Abilify,IMPORTANT: HOW TO USE THIS INFORMATION: This ...,1
2,Abilify,This information does not assure that this pro...,2
3,Abilify,This information is not individual medical adv...,3
4,Abilify,Always ask your health care professional for c...,4


## Combine the raw data and annotations

In [223]:
# merge with annotation data
merged_df = pd.merge(anno_df, handout_df, on=["Drug name", "Line number"])
merged_df.head()

Unnamed: 0,Drug name,Drug number,Line number,Advice Text,AdviceTag1,AdviceTag2,AdviceTag3,AdviceTag4,Medication,Food,Activity,Exercise,Disease,Line
0,Abilify,0,34,To reduce the risk of dizziness and lightheade...,Activity or lifestyle related,,,,,,sitting; lying; rising; getting up;,,,To reduce the risk of dizziness and lightheade...
1,Abilify,0,38,This medication may rarely make your blood sug...,Disease or symptom related,,,,,,,,diabetes,This medication may rarely make your blood sug...
2,Abilify,0,43,This medication may rarely cause a condition k...,Disease or symptom related,,,,,,,,tardive dyskinesia,This medication may rarely cause a condition k...
3,Abilify,0,64,This drug may make you dizzy or drowsy or caus...,Disease or symptom related,Activity or lifestyle related,,,,,driving; using machineray; activity requiring ...,,,This drug may make you dizzy or drowsy or caus...
4,Abilify,0,66,Avoid alcoholic beverages.,Food or beverage related,,,,,alcohol,,,,Avoid alcoholic beverages


## Drop unneeded columns


In [224]:
merged_df = merged_df[
    [
        "Drug number",
        "Line number",
        "Line",
        "Advice Text",
        "AdviceTag1",
        "AdviceTag2",
        "AdviceTag3",
        "AdviceTag4",
    ]
]

merged_df.head()

Unnamed: 0,Drug number,Line number,Line,Advice Text,AdviceTag1,AdviceTag2,AdviceTag3,AdviceTag4
0,0,34,To reduce the risk of dizziness and lightheade...,To reduce the risk of dizziness and lightheade...,Activity or lifestyle related,,,
1,0,38,This medication may rarely make your blood sug...,This medication may rarely make your blood sug...,Disease or symptom related,,,
2,0,43,This medication may rarely cause a condition k...,This medication may rarely cause a condition k...,Disease or symptom related,,,
3,0,64,This drug may make you dizzy or drowsy or caus...,This drug may make you dizzy or drowsy or caus...,Disease or symptom related,Activity or lifestyle related,,
4,0,66,Avoid alcoholic beverages,Avoid alcoholic beverages.,Food or beverage related,,,


In [225]:
# look at the first piece of advice
merged_df[merged_df["Advice Text"] == merged_df.loc[0, "Advice Text"]].shape

(16, 8)

### Encode advice as a binary variable

In [226]:
# stack advice columns
merged_df = merged_df.melt(
    id_vars=["Drug number", "Line number", "Line", "Advice Text"],
    value_vars=["AdviceTag1", "AdviceTag2", "AdviceTag3", "AdviceTag4"],
    value_name="AdviceTag",
)

merged_df.drop("variable", axis=1, inplace=True)
merged_df.head()

Unnamed: 0,Drug number,Line number,Line,Advice Text,AdviceTag
0,0,34,To reduce the risk of dizziness and lightheade...,To reduce the risk of dizziness and lightheade...,Activity or lifestyle related
1,0,38,This medication may rarely make your blood sug...,This medication may rarely make your blood sug...,Disease or symptom related
2,0,43,This medication may rarely cause a condition k...,This medication may rarely cause a condition k...,Disease or symptom related
3,0,64,This drug may make you dizzy or drowsy or caus...,This drug may make you dizzy or drowsy or caus...,Disease or symptom related
4,0,66,Avoid alcoholic beverages,Avoid alcoholic beverages.,Food or beverage related


In [227]:
# look at the first piece of advice
merged_df[merged_df["Advice Text"] == merged_df.loc[0, "Advice Text"]].shape

(64, 5)

In [228]:
# one hot encode advice tags
merged_df = pd.concat(
    [merged_df, pd.get_dummies(merged_df["AdviceTag"], dtype=int)], axis=1
)
merged_df.drop("AdviceTag", axis=1, inplace=True)
merged_df.head()

Unnamed: 0,Drug number,Line number,Line,Advice Text,Activity or lifestyle related,Disease or symptom related,Drug administration related,Exercise related,Food or beverage related,Other drugs related,Pregnancy related,Temporal
0,0,34,To reduce the risk of dizziness and lightheade...,To reduce the risk of dizziness and lightheade...,1,0,0,0,0,0,0,0
1,0,38,This medication may rarely make your blood sug...,This medication may rarely make your blood sug...,0,1,0,0,0,0,0,0
2,0,43,This medication may rarely cause a condition k...,This medication may rarely cause a condition k...,0,1,0,0,0,0,0,0
3,0,64,This drug may make you dizzy or drowsy or caus...,This drug may make you dizzy or drowsy or caus...,0,1,0,0,0,0,0,0
4,0,66,Avoid alcoholic beverages,Avoid alcoholic beverages.,0,0,0,0,1,0,0,0


## Split dataset into two datasets


The data will be split into two seperate datasets at this point to increase the clarity of the two models and task that will be undergone. 

The first dataset will contain the original line, and the advice text which will then be labels for Information Extraction using BIO tagging. 

The second dataset will contain the advice text, and the enconded labels which will be used for multilabel classification.

In [229]:
extraction_df = merged_df[["Line", "Advice Text"]]
labeling_df = merged_df.drop(["Line", "Line number", "Drug number"], axis=1)

In [230]:
# print the first row
extraction_df.loc[1, "Line"]

'This medication may rarely make your blood sugar level rise, which can cause or worsen diabetes'

In [231]:
extraction_df.loc[1, "Advice Text"]

'This medication may rarely make your blood sugar level rise, which can cause or worsen diabetes. Rarely, very serious conditions such as diabetic coma may occur. Tell your doctor right away if you develop symptoms of high blood sugar, such as increased thirst and urination. If you already have diabetes, be sure to check your blood sugars regularly.'

# Preprocess the data

In [235]:
multi_label = datasets.Dataset.from_pandas(labeling_df)

# Baseline Predictions


Two baselines will be tested with for the multilabel classification task.

The first baseline will be a random baseline, where the labels are randomly assigned to the advice text.

The second baseline will be a majority class baseline, where the most common label is assigned to all advice text.

The baselines will be evaluated using the F1 score, Precision, and Recall.

In [238]:
def rand_baseline_pred(dataset, n_labels=8):
    """
    Randomly predicts a label for each example in the dataset.

    Args:
        dataset (datasets.Dataset): The dataset to predict labels for.

    Returns:
        np.ndarray: The predicted labels.
    """

    return np.random.randint(0, 2, size=(len(dataset), n_labels))

In [276]:
def mode_baseline_pred(dataset, n_labels=8):
    """
    Predicts the most common label for each label_id in the dataset.

    Args:
        dataset (datasets.Dataset): The dataset to predict labels for.

    Returns:
        np.ndarray: The predicted labels.
    """

    label_ids = list(dataset.features.keys())
    label_ids = label_ids[len(label_ids) - n_labels :]

    # can use median since our labels are binary
    modes = np.array([np.median(dataset[label_id]) for label_id in label_ids])

    return np.tile(modes, (len(dataset), 1))

## Evaluate the baseline

In [290]:
label_ids = labeling_df.columns[1:]
ground_truth = labeling_df[label_ids].values
ground_truth.shape

(3728, 8)

In [295]:
# make predictions
rand_preds = rand_baseline_pred(multi_label)
print(rand_preds.shape)
precision, recall, f1, _ = precision_recall_fscore_support(
    ground_truth, rand_preds, average="micro"
)
print(f"Random Precision: {precision}, Recall: {recall}, F1: {f1}")

(3728, 8)
Random Precision: 0.05219093916683546, Recall: 0.5153333333333333, F1: 0.09478266200723438


In [296]:
mode_preds = mode_baseline_pred(multi_label)
print(mode_preds.shape)
precision, recall, f1, _ = precision_recall_fscore_support(
    ground_truth, rand_preds, average="micro"
)
print(f"Mode Precision: {precision}, Recall: {recall}, F1: {f1}")

(3728, 8)
Mode Precision: 0.05219093916683546, Recall: 0.5153333333333333, F1: 0.09478266200723438
