In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Raw Annotated

In [14]:
anno_df = pd.read_csv("data/AnnotatedData/AnnotatedDUGData.tsv", sep="\t")

anno_df = anno_df[
    [
        "Drug number",
        "Line number",
        "Advice Text",
        "AdviceTag1",
        "AdviceTag2",
        "AdviceTag3",
        "AdviceTag4",
    ]
]

In [15]:
labels = (
    anno_df[["AdviceTag1", "AdviceTag2", "AdviceTag3", "AdviceTag4"]]
    .fillna("")
    .astype(str)
)

label_ids = list(set(labels.values.flatten()))

# remove the empty string
label_ids.remove("")


n_label_ids = len(label_ids)
n_label_ids

8

In [16]:
# Create new columns for each unique tag and initialize them with 0
for lab in label_ids:
    anno_df[lab] = 0

# Update the values to 1 where the tag is present
for lab in label_ids:
    mask = labels.apply(lambda row: lab in row.values, axis=1)
    anno_df.loc[mask, lab] = 1

# Drop the original AdviceTag columns
anno_df.drop(
    columns=["AdviceTag1", "AdviceTag2", "AdviceTag3", "AdviceTag4"],
    inplace=True,
)

# Save the transformed data to a new file
anno_df.head()

Unnamed: 0,Drug number,Line number,Advice Text,Other drugs related,Temporal,Disease or symptom related,Drug administration related,Exercise related,Food or beverage related,Pregnancy related,Activity or lifestyle related
0,0,34,To reduce the risk of dizziness and lightheade...,0,0,0,0,0,0,0,1
1,0,38,This medication may rarely make your blood sug...,0,0,1,0,0,0,0,0
2,0,43,This medication may rarely cause a condition k...,0,0,1,0,0,0,0,0
3,0,64,This drug may make you dizzy or drowsy or caus...,0,0,1,0,0,0,0,1
4,0,66,Avoid alcoholic beverages.,0,0,0,0,0,1,0,0


In [17]:
plot_df = anno_df[anno_df.columns[3:]]
plot_df = plot_df.melt(var_name="AdviceTag", value_name="Count")
plot_df = plot_df.groupby("AdviceTag").sum().reset_index()
plot_df = plot_df.sort_values("Count", ascending=False)
plot_df

Unnamed: 0,AdviceTag,Count
5,Other drugs related,310
4,Food or beverage related,253
1,Disease or symptom related,245
2,Drug administration related,224
6,Pregnancy related,211
7,Temporal,182
0,Activity or lifestyle related,146
3,Exercise related,40


# Advice Labeling

In [18]:
al = pd.read_csv("model_results.csv")
al

Unnamed: 0,model,precision,recall,f1
0,Random Baseline,0.196951,0.489137,0.280827
1,Binary Relevance - Logistic Regression,0.891892,0.634615,0.741573
2,Binary Relevance - Linear SVC,0.886207,0.823718,0.853821
3,Binary Relevance - K-Nearest Neighbors,0.826367,0.823718,0.82504
4,roberta-base,0.873239,0.794872,0.832215


# Information Extraction

In [19]:
ie = pd.read_csv("ie_model_results.csv")
ie

Unnamed: 0,Model,Token-level Accuracy,Span-level Precision,Span-level Recall,Span-level F1
0,Random Span Baseline,0.505671,0.284946,0.283297,0.278494
1,RoBERTa-CRF,0.505671,0.424237,0.65676,0.491793
2,RoBERTa-CRF - Postprocessed,0.367838,0.411704,0.679263,0.487452
3,RoBERTa,0.367838,0.419971,0.635378,0.476158
4,RoBERTa - Postprocessed,0.383428,0.413437,0.69091,0.488762


In [20]:
def read_handout_txt():
    data = []

    for root, dirs, files in os.walk("./data/RawData/"):
        try:
            with open(os.path.join(root, "handout.txt"), "r") as f:
                handout = f.readlines()
        except:
            print(f"{root}/handout.txt Not Found")
            continue

        for i, line in enumerate(handout):
            line = line.strip()

            # number lines
            line_dict = {
                "Drug name": root.split("/")[-1],
                "Line number": i + 1,
                "Line": line,
            }

            data.append(line_dict)

    return data

In [21]:
raw = pd.DataFrame(read_handout_txt())
raw.shape

./data/RawData//handout.txt Not Found
./data/RawData/Coreg/handout.txt Not Found


(4283, 3)