In [261]:
# Load base packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

# load utalities
from tqdm import tqdm
import re
import random
from collections import defaultdict

# load dataset tools
import datasets
from datasets import load_dataset

# preprocessing tools
from sklearn.preprocessing import OneHotEncoder

# load models


# load eval tools
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support

# load tokenizing tools
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import string

# Load data

In [209]:
# Extract handout.txt from each subdirectory of RawData
def read_handout_txt():
    data = []

    for root, dirs, files in os.walk("./data/RawData/"):
        try:
            with open(os.path.join(root, "handout.txt"), "r") as f:
                handout = f.readlines()
        except:
            print(f"{root}/handout.txt Not Found")
            continue

        for i, line in enumerate(handout):
            line = line.strip()

            # number lines
            line_dict = {
                "Drug name": root.split("/")[-1],
                "Line number": i + 1,
                "Line": line,
            }

            data.append(line_dict)

    return data

In [210]:
anno_df = pd.read_csv("data/AnnotatedData/AnnotatedDUGData.tsv", sep="\t")

In [211]:
raw_df = pd.DataFrame(read_handout_txt())

./data/RawData//handout.txt Not Found
./data/RawData/Coreg/handout.txt Not Found


In [212]:
raw_df.head()

Unnamed: 0,Drug name,Line number,Line
0,Abilify,1,Patient Educationaripiprazole intramuscular
1,Abilify,2,IMPORTANT: HOW TO USE THIS INFORMATION: This ...
2,Abilify,3,ARIPIPRAZOLE EXTENDED RELEASE - INJECTION
3,Abilify,4,(AR-i-PIP-ra-zole)
4,Abilify,5,"COMMON BRAND NAME(S): Abilify Maintena, Aristada"


# Filter data

In [213]:
anno_df = anno_df[["Drug name", "Drug number", "Advice Text"]]
anno_df.head()

Unnamed: 0,Drug name,Drug number,Advice Text
0,Abilify,0,To reduce the risk of dizziness and lightheade...
1,Abilify,0,This medication may rarely make your blood sug...
2,Abilify,0,This medication may rarely cause a condition k...
3,Abilify,0,This drug may make you dizzy or drowsy or caus...
4,Abilify,0,Avoid alcoholic beverages.


# Reassign line numbers

The line number present in the data is determined based off of scentence structure and not line number. We will locate the Advice text in the raw text, and assign it a new line number label based on the corresponing line. 

This will help us to assign IOB tags to the data.

In [214]:
def find_line_number(advice, raw_data_df):

    for i, line in tqdm(raw_data_df.iterrows()):
        if advice in line["Line"]:
            return line["Line number"]
    return None

In [215]:
# Find line number for each advice text
anno_df["Line number"] = anno_df["Advice Text"].apply(
    lambda x: find_line_number(x, raw_df)
)
anno_df.head()

16it [00:00, 8000.58it/s]
19it [00:00, 6328.76it/s]
20it [00:00, 6668.21it/s]
30it [00:00, 7502.33it/s]
30it [00:00, 6000.43it/s]
31it [00:00, 6200.15it/s]
31it [00:00, 10331.62it/s]
31it [00:00, 5164.99it/s]
34it [00:00, 8503.15it/s]
35it [00:00, 8742.30it/s]
36it [00:00, 9005.48it/s]
38it [00:00, 9505.22it/s]
39it [00:00, 9748.38it/s]
40it [00:00, 13333.24it/s]
52it [00:00, 17335.97it/s]
55it [00:00, 18339.03it/s]
55it [00:00, 27521.68it/s]
65it [00:00, 21666.52it/s]
65it [00:00, 16262.81it/s]
79it [00:00, 15795.87it/s]
80it [00:00, 20001.45it/s]
36it [00:00, 18003.45it/s]
83it [00:00, 16590.92it/s]
105it [00:00, 21002.52it/s]
105it [00:00, 26233.14it/s]
113it [00:00, 22608.11it/s]
123it [00:00, 20499.86it/s]
128it [00:00, 15999.73it/s]
129it [00:00, 21505.83it/s]
132it [00:00, 20238.64it/s]
137it [00:00, 22841.34it/s]
137it [00:00, 19579.52it/s]
153it [00:00, 21846.08it/s]
4283it [00:00, 24615.34it/s]
154it [00:00, 22002.34it/s]
154it [00:00, 19253.12it/s]
159it [00:00, 22711.29it/s

Unnamed: 0,Drug name,Drug number,Advice Text,Line number
0,Abilify,0,To reduce the risk of dizziness and lightheade...,17.0
1,Abilify,0,This medication may rarely make your blood sug...,20.0
2,Abilify,0,This medication may rarely cause a condition k...,21.0
3,Abilify,0,This drug may make you dizzy or drowsy or caus...,31.0
4,Abilify,0,Avoid alcoholic beverages.,31.0


# Merge data

In [216]:
# merge dataframes
merged_df = pd.merge(anno_df, raw_df, on=["Drug name", "Line number"])
merged_df.head()

Unnamed: 0,Drug name,Drug number,Advice Text,Line number,Line
0,Abilify,0,To reduce the risk of dizziness and lightheade...,17.0,To reduce the risk of dizziness and lightheade...
1,Abilify,0,This medication may rarely make your blood sug...,20.0,This medication may rarely make your blood sug...
2,Abilify,0,This medication may rarely cause a condition k...,21.0,This medication may rarely cause a condition k...
3,Abilify,0,This drug may make you dizzy or drowsy or caus...,31.0,This drug may make you dizzy or drowsy or caus...
4,Abilify,0,Avoid alcoholic beverages.,31.0,This drug may make you dizzy or drowsy or caus...


# Tokenize data

In [217]:
def tokenize_text(text):
    # remove punctuation
    text = re.sub(r"[^\w\s]", "", text)
    return word_tokenize(text)

In [218]:
# tokenize advice text
merged_df["Advice Text"] = merged_df["Advice Text"].apply(tokenize_text)

In [219]:
# tokenize line text
merged_df["Line"] = merged_df["Line"].apply(tokenize_text)

In [220]:
merged_df.head()

Unnamed: 0,Drug name,Drug number,Advice Text,Line number,Line
0,Abilify,0,"[To, reduce, the, risk, of, dizziness, and, li...",17.0,"[To, reduce, the, risk, of, dizziness, and, li..."
1,Abilify,0,"[This, medication, may, rarely, make, your, bl...",20.0,"[This, medication, may, rarely, make, your, bl..."
2,Abilify,0,"[This, medication, may, rarely, cause, a, cond...",21.0,"[This, medication, may, rarely, cause, a, cond..."
3,Abilify,0,"[This, drug, may, make, you, dizzy, or, drowsy...",31.0,"[This, drug, may, make, you, dizzy, or, drowsy..."
4,Abilify,0,"[Avoid, alcoholic, beverages]",31.0,"[This, drug, may, make, you, dizzy, or, drowsy..."


# IOB tagging

In [225]:
# Use the advice text vs line text to create iob tagging
def tag_iob(line_tokens, advice_tokens):
    tagged_tokens = []
    for i, word in enumerate(line_tokens):
        tag = "O"
        if word in advice_tokens:
            tag = "B" if word == advice_tokens[0] else "I"
            tag = "E" if word == advice_tokens[-1] else tag

        tagged_tokens.append((word, tag))
        if tag == "E":
            break

    for word in line_tokens[i + 1 :]:
        tagged_tokens.append((word, "O"))
    return tagged_tokens


# NOTE: This function will have issues if the last word of the advice
# text also appears in the line text not as the last word
# needs to be fixed

In [226]:
merged_df["IOB Tagged"] = merged_df.apply(
    lambda x: tag_iob(x["Line"], x["Advice Text"]), axis=1
)
merged_df.head()

Unnamed: 0,Drug name,Drug number,Advice Text,Line number,Line,IOB Tagged
0,Abilify,0,"[To, reduce, the, risk, of, dizziness, and, li...",17.0,"[To, reduce, the, risk, of, dizziness, and, li...","[(To, B), (reduce, I), (the, I), (risk, I), (o..."
1,Abilify,0,"[This, medication, may, rarely, make, your, bl...",20.0,"[This, medication, may, rarely, make, your, bl...","[(This, B), (medication, I), (may, I), (rarely..."
2,Abilify,0,"[This, medication, may, rarely, cause, a, cond...",21.0,"[This, medication, may, rarely, cause, a, cond...","[(This, B), (medication, I), (may, I), (rarely..."
3,Abilify,0,"[This, drug, may, make, you, dizzy, or, drowsy...",31.0,"[This, drug, may, make, you, dizzy, or, drowsy...","[(This, B), (drug, I), (may, I), (make, I), (y..."
4,Abilify,0,"[Avoid, alcoholic, beverages]",31.0,"[This, drug, may, make, you, dizzy, or, drowsy...","[(This, O), (drug, O), (may, O), (make, O), (y..."


In [227]:
# look at the first IOB taged text
merged_df["IOB Tagged"][1]

[('This', 'B'),
 ('medication', 'I'),
 ('may', 'I'),
 ('rarely', 'I'),
 ('make', 'I'),
 ('your', 'I'),
 ('blood', 'I'),
 ('sugar', 'I'),
 ('level', 'I'),
 ('rise', 'I'),
 ('which', 'I'),
 ('can', 'I'),
 ('cause', 'I'),
 ('or', 'I'),
 ('worsen', 'I'),
 ('diabetes', 'I'),
 ('Rarely', 'I'),
 ('very', 'I'),
 ('serious', 'I'),
 ('conditions', 'I'),
 ('such', 'I'),
 ('as', 'I'),
 ('diabetic', 'I'),
 ('coma', 'I'),
 ('may', 'I'),
 ('occur', 'I'),
 ('Tell', 'I'),
 ('your', 'I'),
 ('doctor', 'I'),
 ('right', 'I'),
 ('away', 'I'),
 ('if', 'I'),
 ('you', 'I'),
 ('develop', 'I'),
 ('symptoms', 'I'),
 ('of', 'I'),
 ('high', 'I'),
 ('blood', 'I'),
 ('sugar', 'I'),
 ('such', 'I'),
 ('as', 'I'),
 ('increased', 'I'),
 ('thirst', 'I'),
 ('and', 'I'),
 ('urination', 'I'),
 ('If', 'I'),
 ('you', 'I'),
 ('already', 'I'),
 ('have', 'I'),
 ('diabetes', 'I'),
 ('be', 'I'),
 ('sure', 'I'),
 ('to', 'I'),
 ('check', 'I'),
 ('your', 'I'),
 ('blood', 'I'),
 ('sugars', 'I'),
 ('regularly', 'E'),
 ('Your', 'O'),
 ('

In [224]:
# look at teh first Advice Text
merged_df["Advice Text"][1]

['This',
 'medication',
 'may',
 'rarely',
 'make',
 'your',
 'blood',
 'sugar',
 'level',
 'rise',
 'which',
 'can',
 'cause',
 'or',
 'worsen',
 'diabetes',
 'Rarely',
 'very',
 'serious',
 'conditions',
 'such',
 'as',
 'diabetic',
 'coma',
 'may',
 'occur',
 'Tell',
 'your',
 'doctor',
 'right',
 'away',
 'if',
 'you',
 'develop',
 'symptoms',
 'of',
 'high',
 'blood',
 'sugar',
 'such',
 'as',
 'increased',
 'thirst',
 'and',
 'urination',
 'If',
 'you',
 'already',
 'have',
 'diabetes',
 'be',
 'sure',
 'to',
 'check',
 'your',
 'blood',
 'sugars',
 'regularly']

# Convert tagged data to dataset

In [238]:
# create dataframe of input text and IOBE tags
data = pd.DataFrame()
data["text"] = merged_df["IOB Tagged"].apply(lambda x: [i[0] for i in x])
data["tag"] = merged_df["IOB Tagged"].apply(lambda x: [i[1] for i in x])
data.head()

Unnamed: 0,text,tag
0,"[To, reduce, the, risk, of, dizziness, and, li...","[B, I, I, I, I, I, I, I, I, I, I, I, I, I, I, ..."
1,"[This, medication, may, rarely, make, your, bl...","[B, I, I, I, I, I, I, I, I, I, I, I, I, I, I, ..."
2,"[This, medication, may, rarely, cause, a, cond...","[B, I, I, I, I, I, I, I, I, I, E, O, O, O, O, ..."
3,"[This, drug, may, make, you, dizzy, or, drowsy...","[B, I, I, I, I, I, I, I, I, I, I, I, I, I, I, ..."
4,"[This, drug, may, make, you, dizzy, or, drowsy...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."


## Encode the labels

In [240]:
# encode tags
mapping = {"O": 0, "B": 1, "I": 2, "E": 3}

data["tag"] = data["tag"].apply(lambda x: [mapping[i] for i in x])
data.head()

Unnamed: 0,text,tag
0,"[To, reduce, the, risk, of, dizziness, and, li...","[1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ..."
1,"[This, medication, may, rarely, make, your, bl...","[1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ..."
2,"[This, medication, may, rarely, cause, a, cond...","[1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 0, 0, 0, 0, ..."
3,"[This, drug, may, make, you, dizzy, or, drowsy...","[1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ..."
4,"[This, drug, may, make, you, dizzy, or, drowsy...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


## Convert to dataset

# Evaluate the baseline

In [284]:
def pred_random_IOB(text):

    # get the length of the text
    length = len(text)

    # make array of zeros
    preds = np.zeros(length)

    # get a random number between 0 and the length of the text
    random_start = np.random.randint(0, length - 1)
    random_stop = np.random.randint(random_start + 1, length)

    # set the random start to 1
    preds[random_start] = 1

    # set the random stop to 3
    preds[random_stop] = 3

    # set the values in between to 2
    preds[random_start + 1 : random_stop] = 2

    return preds

In [285]:
preds = data["text"].apply(pred_random_IOB)
preds.head()

0    [1.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, ...
1    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
2    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
3    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
4    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0, ...
Name: text, dtype: object

## Evalute the baseline at token level

We will find the accuracy for evaluating the baseline at the token level.

In [288]:
def calculate_accuracy(ground_truth, preds):
    correct = 0
    total = 0
    for true, pred in zip(ground_truth, preds):
        for t, p in zip(true, pred):
            if t == p:
                correct += 1
            total += 1
    return correct / total


# calculate the accuracy of the token level predictions
calculate_accuracy(data["tag"], preds)

0.5096110300407588

## Evaluate the baseline at span-level

In [291]:
def tags_to_spans(tags):
    spans = defaultdict(list)
    current_span = None
    for i, tag in enumerate(tags):
        if tag == 0:  # Outside
            current_span = None
        elif tag == 1:  # Beginning
            current_span = [i]
        elif tag == 2:  # Inside
            if current_span is not None:
                current_span.append(i)
        elif tag == 3:  # End
            if current_span is not None:
                current_span.append(i)
                spans[current_span[0]].append(
                    current_span[1] + 1
                )  # Increment the end index
                current_span = None
    return spans

In [296]:
def evaluate_span_level(preds, ground_truth):
    # Flatten the spans
    flat_predictions = [span for spans in preds for span in spans]
    flat_ground_truth = [span for spans in ground_truth for span in spans]

    # Compute precision, recall, and F1-score
    precision, recall, f1_score, _ = precision_recall_fscore_support(
        flat_ground_truth, flat_predictions, average="weighted"
    )

    return precision, recall, f1_score

In [297]:
precision, recall, f1_score = evaluate_span_level(preds, data["tag"])
print("Span-level precision:", precision)
print("Span-level recall:", recall)
print("Span-level F1-score:", f1_score)

Span-level precision: 0.49092275121061363
Span-level recall: 0.5096110300407588
Span-level F1-score: 0.48860804539093233
