In [1]:
# Load base packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

# load utalities
from tqdm import tqdm
import re
import random
from collections import defaultdict

# load dataset tools
import datasets
from datasets import load_dataset

# preprocessing tools
from sklearn.preprocessing import OneHotEncoder

# load models


# load eval tools
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support

# load tokenizing tools
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import string

# Load data

In [2]:
# Extract handout.txt from each subdirectory of RawData
def read_handout_txt():
    data = []

    for root, dirs, files in os.walk("./data/RawData/"):
        try:
            with open(os.path.join(root, "handout.txt"), "r") as f:
                handout = f.readlines()
        except:
            print(f"{root}/handout.txt Not Found")
            continue

        for i, line in enumerate(handout):
            line = line.strip()

            # number lines
            line_dict = {
                "Drug name": root.split("/")[-1],
                "Line number": i + 1,
                "Line": line,
            }

            data.append(line_dict)

    return data

In [3]:
anno_df = pd.read_csv("data/AnnotatedData/AnnotatedDUGData.tsv", sep="\t")

In [4]:
raw_df = pd.DataFrame(read_handout_txt())

./data/RawData//handout.txt Not Found
./data/RawData/Coreg/handout.txt Not Found


In [5]:
raw_df.head()

Unnamed: 0,Drug name,Line number,Line
0,Abilify,1,Patient Educationaripiprazole intramuscular
1,Abilify,2,IMPORTANT: HOW TO USE THIS INFORMATION: This ...
2,Abilify,3,ARIPIPRAZOLE EXTENDED RELEASE - INJECTION
3,Abilify,4,(AR-i-PIP-ra-zole)
4,Abilify,5,"COMMON BRAND NAME(S): Abilify Maintena, Aristada"


# Filter data

In [6]:
anno_df = anno_df[["Drug name", "Drug number", "Advice Text"]]
anno_df.head()

Unnamed: 0,Drug name,Drug number,Advice Text
0,Abilify,0,To reduce the risk of dizziness and lightheade...
1,Abilify,0,This medication may rarely make your blood sug...
2,Abilify,0,This medication may rarely cause a condition k...
3,Abilify,0,This drug may make you dizzy or drowsy or caus...
4,Abilify,0,Avoid alcoholic beverages.


# Reassign line numbers

The line number present in the data is determined based off of scentence structure and not line number. We will locate the Advice text in the raw text, and assign it a new line number label based on the corresponing line. 

This will help us to assign IOB tags to the data.

In [7]:
def find_line_number(advice, raw_data_df):

    for i, line in raw_data_df.iterrows():
        if advice in line["Line"]:
            return line["Line number"]
    return None

In [8]:
# Find line number for each advice text
anno_df["Line number"] = anno_df["Advice Text"].apply(
    lambda x: find_line_number(x, raw_df)
)
anno_df.head()

Unnamed: 0,Drug name,Drug number,Advice Text,Line number
0,Abilify,0,To reduce the risk of dizziness and lightheade...,17.0
1,Abilify,0,This medication may rarely make your blood sug...,20.0
2,Abilify,0,This medication may rarely cause a condition k...,21.0
3,Abilify,0,This drug may make you dizzy or drowsy or caus...,31.0
4,Abilify,0,Avoid alcoholic beverages.,31.0


# Merge data

In [9]:
# merge dataframes
merged_df = pd.merge(anno_df, raw_df, on=["Drug name", "Line number"])
merged_df.head()

Unnamed: 0,Drug name,Drug number,Advice Text,Line number,Line
0,Abilify,0,To reduce the risk of dizziness and lightheade...,17.0,To reduce the risk of dizziness and lightheade...
1,Abilify,0,This medication may rarely make your blood sug...,20.0,This medication may rarely make your blood sug...
2,Abilify,0,This medication may rarely cause a condition k...,21.0,This medication may rarely cause a condition k...
3,Abilify,0,This drug may make you dizzy or drowsy or caus...,31.0,This drug may make you dizzy or drowsy or caus...
4,Abilify,0,Avoid alcoholic beverages.,31.0,This drug may make you dizzy or drowsy or caus...


# IOB tagging

In [10]:
# Use the advice text vs line text to create iob tagging
def tag_iob(line, advice):

    # basic tokenization
    line_tokens = re.sub(r"[^\w\s]", "", line).split()
    advice_tokens = re.sub(r"[^\w\s]", "", advice).split()

    tagged_tokens = []
    for i, word in enumerate(line_tokens):
        tag = "O"
        if word in advice_tokens:
            tag = "B" if word == advice_tokens[0] else "I"
            tag = "E" if word == advice_tokens[-1] else tag

        tagged_tokens.append((word, tag))
        if tag == "E":
            break

    for word in line_tokens[i + 1 :]:
        tagged_tokens.append((word, "O"))
    return tagged_tokens


# NOTE: This function will have issues if the last word of the advice
# text also appears in the line text not as the last word
# needs to be fixed

In [11]:
tagged_df = merged_df.copy()
tagged_df["IOB Tagged"] = tagged_df.apply(
    lambda x: tag_iob(x["Line"], x["Advice Text"]), axis=1
)



tagged_df.head()

Unnamed: 0,Drug name,Drug number,Advice Text,Line number,Line,IOB Tagged
0,Abilify,0,To reduce the risk of dizziness and lightheade...,17.0,To reduce the risk of dizziness and lightheade...,"[(To, B), (reduce, I), (the, I), (risk, I), (o..."
1,Abilify,0,This medication may rarely make your blood sug...,20.0,This medication may rarely make your blood sug...,"[(This, B), (medication, I), (may, I), (rarely..."
2,Abilify,0,This medication may rarely cause a condition k...,21.0,This medication may rarely cause a condition k...,"[(This, B), (medication, I), (may, I), (rarely..."
3,Abilify,0,This drug may make you dizzy or drowsy or caus...,31.0,This drug may make you dizzy or drowsy or caus...,"[(This, B), (drug, I), (may, I), (make, I), (y..."
4,Abilify,0,Avoid alcoholic beverages.,31.0,This drug may make you dizzy or drowsy or caus...,"[(This, O), (drug, O), (may, O), (make, O), (y..."


In [12]:
# look at the first IOB taged text
tagged_df["IOB Tagged"][1]

[('This', 'B'),
 ('medication', 'I'),
 ('may', 'I'),
 ('rarely', 'I'),
 ('make', 'I'),
 ('your', 'I'),
 ('blood', 'I'),
 ('sugar', 'I'),
 ('level', 'I'),
 ('rise', 'I'),
 ('which', 'I'),
 ('can', 'I'),
 ('cause', 'I'),
 ('or', 'I'),
 ('worsen', 'I'),
 ('diabetes', 'I'),
 ('Rarely', 'I'),
 ('very', 'I'),
 ('serious', 'I'),
 ('conditions', 'I'),
 ('such', 'I'),
 ('as', 'I'),
 ('diabetic', 'I'),
 ('coma', 'I'),
 ('may', 'I'),
 ('occur', 'I'),
 ('Tell', 'I'),
 ('your', 'I'),
 ('doctor', 'I'),
 ('right', 'I'),
 ('away', 'I'),
 ('if', 'I'),
 ('you', 'I'),
 ('develop', 'I'),
 ('symptoms', 'I'),
 ('of', 'I'),
 ('high', 'I'),
 ('blood', 'I'),
 ('sugar', 'I'),
 ('such', 'I'),
 ('as', 'I'),
 ('increased', 'I'),
 ('thirst', 'I'),
 ('and', 'I'),
 ('urination', 'I'),
 ('If', 'I'),
 ('you', 'I'),
 ('already', 'I'),
 ('have', 'I'),
 ('diabetes', 'I'),
 ('be', 'I'),
 ('sure', 'I'),
 ('to', 'I'),
 ('check', 'I'),
 ('your', 'I'),
 ('blood', 'I'),
 ('sugars', 'I'),
 ('regularly', 'E'),
 ('Your', 'O'),
 ('

In [13]:
# look at the first Advice Text
tagged_df["Advice Text"][1]

'This medication may rarely make your blood sugar level rise, which can cause or worsen diabetes. Rarely, very serious conditions such as diabetic coma may occur. Tell your doctor right away if you develop symptoms of high blood sugar, such as increased thirst and urination. If you already have diabetes, be sure to check your blood sugars regularly.'

# Convert tagged data to dataset

In [14]:
# create dataframe of input text and IOBE tags
data = pd.DataFrame()
data["text"] = tagged_df["IOB Tagged"].apply(lambda x: [i[0] for i in x])
data["tag"] = tagged_df["IOB Tagged"].apply(lambda x: [i[1] for i in x])
data.head()

Unnamed: 0,text,tag
0,"[To, reduce, the, risk, of, dizziness, and, li...","[B, I, I, I, I, I, I, I, I, I, I, I, I, I, I, ..."
1,"[This, medication, may, rarely, make, your, bl...","[B, I, I, I, I, I, I, I, I, I, I, I, I, I, I, ..."
2,"[This, medication, may, rarely, cause, a, cond...","[B, I, I, I, I, I, I, I, I, I, E, O, O, O, O, ..."
3,"[This, drug, may, make, you, dizzy, or, drowsy...","[B, I, I, I, I, I, I, I, I, I, I, I, I, I, I, ..."
4,"[This, drug, may, make, you, dizzy, or, drowsy...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."


## Encode the labels

In [15]:
# encode tags
mapping = {"O": 0, "B": 1, "I": 2, "E": 3}

data["tag"] = data["tag"].apply(lambda x: [mapping[i] for i in x])
data.head()

Unnamed: 0,text,tag
0,"[To, reduce, the, risk, of, dizziness, and, li...","[1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ..."
1,"[This, medication, may, rarely, make, your, bl...","[1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ..."
2,"[This, medication, may, rarely, cause, a, cond...","[1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 0, 0, 0, 0, ..."
3,"[This, drug, may, make, you, dizzy, or, drowsy...","[1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ..."
4,"[This, drug, may, make, you, dizzy, or, drowsy...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


## Convert to dataset

# Evaluate the baseline

In [16]:
def pred_random_IOB(text):

    # get the length of the text
    length = len(text)

    # make array of zeros
    preds = np.zeros(length)

    # get a random number between 0 and the length of the text
    random_start = np.random.randint(0, length - 1)
    random_stop = np.random.randint(random_start + 1, length)

    # set the random start to 1
    preds[random_start] = 1

    # set the random stop to 3
    preds[random_stop] = 3

    # set the values in between to 2
    preds[random_start + 1 : random_stop] = 2

    return preds

In [17]:
preds = data["text"].apply(pred_random_IOB)
preds.head()

0    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
1    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
2    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
3    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
4    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 2.0, ...
Name: text, dtype: object

## Evalute the baseline at token level

We will find the accuracy for evaluating the baseline at the token level.

In [18]:
def calculate_accuracy(ground_truth, preds):
    correct = 0
    total = 0
    for true, pred in zip(ground_truth, preds):
        for t, p in zip(true, pred):
            if t == p:
                correct += 1
            total += 1
    return correct / total


# calculate the accuracy of the token level predictions
calculate_accuracy(data["tag"], preds)

0.5084916341678198

## Evaluate the baseline at span-level

In [19]:
def tags_to_spans(tags):
    spans = defaultdict(list)
    current_span = None
    for i, tag in enumerate(tags):
        if tag == 0:  # Outside
            current_span = None
        elif tag == 1:  # Beginning
            current_span = [i]
        elif tag == 2:  # Inside
            if current_span is not None:
                current_span.append(i)
        elif tag == 3:  # End
            if current_span is not None:
                current_span.append(i)
                spans[current_span[0]].append(
                    current_span[1] + 1
                )  # Increment the end index
                current_span = None
    return spans

In [20]:
def evaluate_span_level(preds, ground_truth):
    # Flatten the spans
    flat_predictions = [span for spans in preds for span in spans]
    flat_ground_truth = [span for spans in ground_truth for span in spans]

    # Compute precision, recall, and F1-score
    precision, recall, f1_score, _ = precision_recall_fscore_support(
        flat_ground_truth, flat_predictions, average="macro"
    )

    return precision, recall, f1_score

In [21]:
precision, recall, f1_score = evaluate_span_level(preds, data["tag"])
print("Span-level precision:", precision)
print("Span-level recall:", recall)
print("Span-level F1-score:", f1_score)

Span-level precision: 0.25910493558323733
Span-level recall: 0.2616386113657438
Span-level F1-score: 0.2530667806679219


# Transformer Model

In [22]:
model_id = "roberta-base"

In [23]:
train, test = train_test_split(data, test_size=0.2, random_state=42)
train, val = train_test_split(train, test_size=0.3, random_state=42)

In [24]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_id)

In [25]:
def preprocess_text(example, max_length=512):
    # Because of the way that the BIOE tagging was setup, we need to make sure that the
    # tokenization is aligned with the tagging. This means that we need to tokenize the
    # text and then assign the correct label to each token.

    text = example["text"]
    tags = example["tag"]

    token_ids = [tokenizer.cls_token_id]
    label_alignment = [0]
    attention_mask = [1]
    special_label_for_subwords = -100

    for word, label in zip(text, tags):
        subword_tokens = tokenizer.tokenize(word)

        if len(subword_tokens) > 0:
            token_ids.extend(tokenizer.convert_tokens_to_ids(subword_tokens))
            # Assign the correct label to the first subword token
            label_alignment.append(label)
            # Use a special label for subsequent subword tokens
            label_alignment.extend(
                [special_label_for_subwords] * (len(subword_tokens) - 1)
            )
            attention_mask.extend([1] * len(subword_tokens))

    # add [SEP] token
    token_ids.append(tokenizer.sep_token_id)
    label_alignment.append(0)
    attention_mask.append(1)

    # pad to max length
    padding_length = max_length - len(token_ids)
    token_ids.extend([tokenizer.pad_token_id] * padding_length)
    label_alignment.extend([0] * padding_length)
    attention_mask.extend([0] * padding_length)

    # Make sure everything has correct length
    assert len(token_ids) == max_length
    assert len(label_alignment) == max_length
    assert len(attention_mask) == max_length

    return {
        "input_ids": token_ids,
        "labels": label_alignment,
        "attention_mask": attention_mask,
    }

In [26]:
train_dataset = datasets.Dataset.from_pandas(train)
val_dataset = datasets.Dataset.from_pandas(val)
test_dataset = datasets.Dataset.from_pandas(test)

In [27]:
train_dataset = train_dataset.map(preprocess_text)
val_dataset = val_dataset.map(preprocess_text)
test_dataset = test_dataset.map(preprocess_text)

train_dataset.set_format(
    type="torch", columns=["input_ids", "labels", "attention_mask"]
)
val_dataset.set_format(
    type="torch", columns=["input_ids", "labels", "attention_mask"]
)
test_dataset.set_format(
    type="torch", columns=["input_ids", "labels", "attention_mask"]
)

Map:   0%|          | 0/508 [00:00<?, ? examples/s]

Map:   0%|          | 0/219 [00:00<?, ? examples/s]

Map:   0%|          | 0/182 [00:00<?, ? examples/s]

In [28]:
# look at first example input_ids and labels
train_dataset[0]

{'input_ids': tensor([    0,  6323, 33119, 11990,   154, 48205,  6025, 17304, 40919, 16625,
         22776, 28250,   368,   605,   994,   225, 16625, 12690, 37694,  2407,
         35438, 16625,  3792,  4526, 25058, 12196, 33119,  6968,  1322, 10928,
           463,  4970,  9178,   560,  3698, 35369, 22725,   352, 20345,   438,
          4894,   463, 33912, 33119,   417,  5810,  5526,    29,   368, 42219,
         43452, 16918,   281,  1452,   658,  1001,   506,  4734,  1115, 31726,
           225,     2,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,   

In [29]:
# define training arguments
from transformers import (
    TrainingArguments,
    Trainer,
    AutoModelForTokenClassification,
)

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    evaluation_strategy="steps",
    eval_steps=100,
    save_steps=100,
    save_total_limit=2,
)

In [30]:
model = AutoModelForTokenClassification.from_pretrained(model_id, num_labels=4)

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [31]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [32]:
trainer.train()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mmattcalc[0m. Use [1m`wandb login --relogin`[0m to force relogin


  0%|          | 0/192 [00:00<?, ?it/s]

{'loss': 1.2946, 'grad_norm': 12.73617935180664, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.16}
{'loss': 1.1761, 'grad_norm': 12.37436294555664, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.31}
{'loss': 0.8335, 'grad_norm': 18.499210357666016, 'learning_rate': 3e-06, 'epoch': 0.47}
{'loss': 0.3163, 'grad_norm': 2.6592495441436768, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.62}
{'loss': 0.1446, 'grad_norm': 4.017032623291016, 'learning_rate': 5e-06, 'epoch': 0.78}
{'loss': 0.1147, 'grad_norm': 0.8436582088470459, 'learning_rate': 6e-06, 'epoch': 0.94}
{'loss': 0.0789, 'grad_norm': 0.3250788450241089, 'learning_rate': 7.000000000000001e-06, 'epoch': 1.09}
{'loss': 0.0809, 'grad_norm': 0.3070162534713745, 'learning_rate': 8.000000000000001e-06, 'epoch': 1.25}
{'loss': 0.0896, 'grad_norm': 0.9073410034179688, 'learning_rate': 9e-06, 'epoch': 1.41}
{'loss': 0.0706, 'grad_norm': 1.0010526180267334, 'learning_rate': 1e-05, 'epoch': 1.56}


  0%|          | 0/28 [00:00<?, ?it/s]

{'eval_loss': 0.07211222499608994, 'eval_runtime': 15.079, 'eval_samples_per_second': 14.524, 'eval_steps_per_second': 1.857, 'epoch': 1.56}
{'loss': 0.07, 'grad_norm': 0.6755059957504272, 'learning_rate': 1.1000000000000001e-05, 'epoch': 1.72}
{'loss': 0.0701, 'grad_norm': 1.4353053569793701, 'learning_rate': 1.2e-05, 'epoch': 1.88}
{'loss': 0.0709, 'grad_norm': 0.8421601057052612, 'learning_rate': 1.3000000000000001e-05, 'epoch': 2.03}
{'loss': 0.0703, 'grad_norm': 0.39515814185142517, 'learning_rate': 1.4000000000000001e-05, 'epoch': 2.19}
{'loss': 0.0701, 'grad_norm': 2.3534131050109863, 'learning_rate': 1.5e-05, 'epoch': 2.34}
{'loss': 0.0653, 'grad_norm': 0.6852309703826904, 'learning_rate': 1.6000000000000003e-05, 'epoch': 2.5}
{'loss': 0.0665, 'grad_norm': 0.32582950592041016, 'learning_rate': 1.7000000000000003e-05, 'epoch': 2.66}
{'loss': 0.0732, 'grad_norm': 1.0403127670288086, 'learning_rate': 1.8e-05, 'epoch': 2.81}
{'loss': 0.0687, 'grad_norm': 0.45743808150291443, 'learn

TrainOutput(global_step=192, training_loss=0.2520588703919202, metrics={'train_runtime': 245.6846, 'train_samples_per_second': 6.203, 'train_steps_per_second': 0.781, 'train_loss': 0.2520588703919202, 'epoch': 3.0})