In [30]:
# Load base packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from tqdm import tqdm

# load stats tools
from scipy import stats

# load dataset tools
import datasets
from datasets import load_dataset, DatasetDict

# preprocessing tools
from sklearn.preprocessing import OneHotEncoder

# load models
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
import torch
from transformers import (
    RobertaTokenizer,
    RobertaForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding,
)


# load eval tools
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support

# from transformers import EvalPrediction

# Load Data

In [31]:
anno_df = pd.read_csv("data/AnnotatedData/AnnotatedDUGData.tsv", sep="\t")
load_shape = anno_df.shape

# Prepare data

In [32]:
# we will drop the columns which we are not interested in
anno_df = anno_df[
    [
        "Drug number",
        "Line number",
        "Advice Text",
        "AdviceTag1",
        "AdviceTag2",
        "AdviceTag3",
        "AdviceTag4",
    ]
]

## Extract label_ids

In [33]:
labels = (
    anno_df[["AdviceTag1", "AdviceTag2", "AdviceTag3", "AdviceTag4"]]
    .fillna("")
    .astype(str)
)

label_ids = list(set(labels.values.flatten()))

# remove the empty string
label_ids.remove("")


n_label_ids = len(label_ids)
n_label_ids

8

## Encode advice labels

In [34]:
# Create new columns for each unique tag and initialize them with 0
for lab in label_ids:
    anno_df[lab] = 0

# Update the values to 1 where the tag is present
for lab in label_ids:
    mask = labels.apply(lambda row: lab in row.values, axis=1)
    anno_df.loc[mask, lab] = 1

# Drop the original AdviceTag columns
anno_df.drop(
    columns=["AdviceTag1", "AdviceTag2", "AdviceTag3", "AdviceTag4"],
    inplace=True,
)

# Save the transformed data to a new file
anno_df.head()

Unnamed: 0,Drug number,Line number,Advice Text,Pregnancy related,Activity or lifestyle related,Other drugs related,Exercise related,Drug administration related,Disease or symptom related,Food or beverage related,Temporal
0,0,34,To reduce the risk of dizziness and lightheade...,0,1,0,0,0,0,0,0
1,0,38,This medication may rarely make your blood sug...,0,0,0,0,0,1,0,0
2,0,43,This medication may rarely cause a condition k...,0,0,0,0,0,1,0,0
3,0,64,This drug may make you dizzy or drowsy or caus...,0,1,0,0,0,1,0,0
4,0,66,Avoid alcoholic beverages.,0,0,0,0,0,0,1,0


### Ensure the encoding was correct

In [35]:
assert anno_df.shape[0] == load_shape[0], "Mismatch in number of rows"

# Baseline Predictions


Two baselines will be tested with for the multilabel classification task.

The  baseline will be a random baseline, where the labels are randomly assigned to the advice text.

The baseline will be evaluated using the F1 score, Precision, and Recall.

In [36]:
def rand_baseline_pred(dataset, n_labels=8):
    """
    Randomly predicts a label for each example in the dataset.

    Args:
        dataset (datasets.Dataset): The dataset to predict labels for.

    Returns:
        np.ndarray: The predicted labels.
    """

    return np.random.randint(0, 2, size=(len(dataset), n_labels))

## Load data into dataset

In [37]:
multi_label = datasets.Dataset.from_pandas(anno_df)

## Evaluate the baseline

### Identify ground truth labels

In [38]:
ground_truth = anno_df[label_ids].values

In [39]:
assert ground_truth.shape[0] == load_shape[0], "Mismatch in number of rows"
assert ground_truth.shape[1] == n_label_ids, "Mismatch in number of columns"

### Make predictions and evaluate

#### Random Baseline

In [40]:
# make predictions
rand_preds = rand_baseline_pred(multi_label)
# print(rand_preds.shape)
precision, recall, f1, _ = precision_recall_fscore_support(
    ground_truth, rand_preds, average="micro"
)
print(f"Random Precision: {precision}, Recall: {recall}, F1: {f1}")

Random Precision: 0.19905566600397614, Recall: 0.4972067039106145, F1: 0.2842945874001775


# Train, Test Split

In [41]:
anno_df.columns

Index(['Drug number', 'Line number', 'Advice Text', 'Pregnancy related',
       'Activity or lifestyle related', 'Other drugs related',
       'Exercise related', 'Drug administration related',
       'Disease or symptom related', 'Food or beverage related', 'Temporal'],
      dtype='object')

In [42]:
# Drop the unneeded columns
anno_df.drop(["Drug number", "Line number"], axis=1, inplace=True)

# Create train test split
train, test = train_test_split(anno_df, test_size=0.2, random_state=42)

# Binary Relevance

For binary relevance we will encode the text using a TF-IDF vectorizer and then train a logistic regression model for each label.

In [43]:
# look at our data
train.head()

Unnamed: 0,Advice Text,Pregnancy related,Activity or lifestyle related,Other drugs related,Exercise related,Drug administration related,Disease or symptom related,Food or beverage related,Temporal
78,Some products that may interact with this drug...,0,0,1,0,0,0,0,0
29,"Beta-blocker medications (such as metoprolol, ...",0,0,1,0,0,1,0,0
280,Wash your hands after applying the patch.,0,1,0,0,1,0,0,0
507,Lithium passes into breast milk and may have u...,1,0,0,0,0,0,0,0
652,Limit alcoholic beverages.,0,0,0,0,0,0,1,0


## Data encoding with TF-IDF

In [44]:
# Initialize the vectorizer
vectorizer = TfidfVectorizer(max_features=5000)

# Fit and transform the train data
X_train = vectorizer.fit_transform(train["Advice Text"])

# Transform the test data
X_test = vectorizer.transform(test["Advice Text"])

label_ids = train.columns[1:]
label_ids

Index(['Pregnancy related', 'Activity or lifestyle related',
       'Other drugs related', 'Exercise related',
       'Drug administration related', 'Disease or symptom related',
       'Food or beverage related', 'Temporal'],
      dtype='object')

### Training

In [45]:
# Initialize the model
model = LogisticRegression(max_iter=1000)

In [46]:
results = pd.DataFrame()
for label in label_ids:
    print(f"Training model for {label}")
    y_train = train[label]
    y_test = test[label]

    # Train the model
    model.fit(X_train, y_train)

    # Predict the test data
    preds = model.predict(X_test)

    # Evaluate the model
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_test, preds, average="binary"
    )
    print(f"Precision: {precision}, Recall: {recall}, F1: {f1}")

    results[label] = preds

# Evaluate the model
precision, recall, f1, _ = precision_recall_fscore_support(
    test[label_ids], results, average="micro"
)
print(f"\nPrecision: {precision}, Recall: {recall}, F1: {f1}")

Training model for Pregnancy related
Precision: 1.0, Recall: 0.8636363636363636, F1: 0.9268292682926829
Training model for Activity or lifestyle related
Precision: 1.0, Recall: 0.38461538461538464, F1: 0.5555555555555556
Training model for Other drugs related
Precision: 0.9310344827586207, Recall: 0.8307692307692308, F1: 0.8780487804878049
Training model for Exercise related
Precision: 0.0, Recall: 0.0, F1: 0.0
Training model for Drug administration related
Precision: 1.0, Recall: 0.5333333333333333, F1: 0.6956521739130436
Training model for Disease or symptom related
Precision: 0.6578947368421053, Recall: 0.5208333333333334, F1: 0.5813953488372092
Training model for Food or beverage related
Precision: 0.8787878787878788, Recall: 0.6904761904761905, F1: 0.7733333333333333
Training model for Temporal
Precision: 0.8571428571428571, Recall: 0.5142857142857142, F1: 0.6428571428571428

Precision: 0.8918918918918919, Recall: 0.6346153846153846, F1: 0.7415730337078652


  _warn_prf(average, modifier, msg_start, len(result))


# Transformer Based Model

In [47]:
train, val = train_test_split(train, test_size=0.3, random_state=42)

In [48]:
model_id = "roberta-base"

### Load data into dataset

In [49]:
# Load into datasets
train_dataset = datasets.Dataset.from_pandas(train, preserve_index=False)
val_dataset = datasets.Dataset.from_pandas(val, preserve_index=False)
test_dataset = datasets.Dataset.from_pandas(test, preserve_index=False)

In [50]:
# rename Advice Text to text
train_dataset = train_dataset.rename_column("Advice Text", "text")
val_dataset = val_dataset.rename_column("Advice Text", "text")
test_dataset = test_dataset.rename_column("Advice Text", "text")

### Tokenize and split text

In [51]:
tokenizer = RobertaTokenizer.from_pretrained(model_id)

In [52]:
def preprocess_text(batch):

    text = batch["text"]
    inputs = tokenizer(
        text,
        padding="max_length",
        truncation=True,
        max_length=512,
        return_tensors="pt",
    )

    labels = list(batch.keys())[1:]

    label_array = np.zeros((len(text), len(labels)))

    for i, label in enumerate(labels):
        label_array[:, i] = batch[label]

    inputs["labels"] = label_array

    return inputs

In [53]:
train_encodings = train_dataset.map(preprocess_text, batched=True)
eval_encodings = val_dataset.map(preprocess_text, batched=True)
test_encodings = test_dataset.map(preprocess_text, batched=True)

Map:   0%|          | 0/562 [00:00<?, ? examples/s]

Map:   0%|          | 0/242 [00:00<?, ? examples/s]

Map:   0%|          | 0/201 [00:00<?, ? examples/s]

In [54]:
# format datasets
train_encodings.set_format(
    type="torch", columns=["input_ids", "attention_mask", "labels"]
)
eval_encodings.set_format(
    type="torch", columns=["input_ids", "attention_mask", "labels"]
)
test_encodings.set_format(
    type="torch", columns=["input_ids", "attention_mask", "labels"]
)

In [55]:
# example
example = train_encodings[0]
print(example["input_ids"].shape)
print(example["attention_mask"].shape)
print(example["labels"].shape)

print(tokenizer.decode(example["input_ids"]))

torch.Size([512])
torch.Size([512])
torch.Size([8])
<s>If you are planning pregnancy, become pregnant, or think you may be pregnant, immediately discuss the benefits and risks of using this medication during pregnancy with your doctor.</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><

#### Label Information

In [56]:
class_names = anno_df.columns[1:].tolist()
n_labels = len(class_names)

id2label = {i: label for i, label in enumerate(class_names)}
label2id = {label: i for i, label in enumerate(class_names)}

In [57]:
# metrics function
def compute_metrics(eval_pred):
    logits = eval_pred.predictions
    labels = eval_pred.label_ids

    probs = torch.sigmoid(torch.tensor(logits)).numpy()

    preds = (probs > 0.5).astype(int)

    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average="micro"
    )

    return {"precision": precision, "recall": recall, "f1": f1}

### Train

In [58]:
training_args = TrainingArguments(
    output_dir="./models/roberta-base",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    evaluation_strategy="steps",
    eval_steps=25,
    logging_dir=f"./logs",
    logging_strategy="steps",
    logging_steps=10,
    learning_rate=5e-5,
    weight_decay=0.01,
    warmup_steps=500,
    save_strategy="steps",
    save_steps=25,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    save_total_limit=2,
    report_to="wandb",
)

In [59]:
model = RobertaForSequenceClassification.from_pretrained(
    model_id, num_labels=n_labels, id2label=id2label, label2id=label2id
)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [60]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_encodings,
    eval_dataset=eval_encodings,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [61]:
# Check sizes
print(train_encodings["input_ids"].shape)
print(train_encodings["attention_mask"].shape)
print(train_encodings["labels"].shape)

torch.Size([562, 512])
torch.Size([562, 512])
torch.Size([562, 8])


In [62]:
trainer.train()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mmattcalc[0m. Use [1m`wandb login --relogin`[0m to force relogin


  0%|          | 0/213 [00:00<?, ?it/s]

{'loss': 0.6846, 'grad_norm': 1.3883836269378662, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.14}
{'loss': 0.6829, 'grad_norm': 1.6674448251724243, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.28}


  0%|          | 0/31 [00:00<?, ?it/s]

{'eval_loss': 0.6701627373695374, 'eval_precision': 0.20523415977961432, 'eval_recall': 0.3941798941798942, 'eval_f1': 0.26992753623188404, 'eval_runtime': 4.9216, 'eval_samples_per_second': 49.171, 'eval_steps_per_second': 6.299, 'epoch': 0.35}
{'loss': 0.6681, 'grad_norm': 1.191691517829895, 'learning_rate': 3e-06, 'epoch': 0.42}
{'loss': 0.661, 'grad_norm': 1.1009907722473145, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.56}
{'loss': 0.6363, 'grad_norm': 1.2419812679290771, 'learning_rate': 5e-06, 'epoch': 0.7}


  0%|          | 0/31 [00:00<?, ?it/s]

{'eval_loss': 0.6281105875968933, 'eval_precision': 0.3140877598152425, 'eval_recall': 0.35978835978835977, 'eval_f1': 0.3353884093711467, 'eval_runtime': 4.9141, 'eval_samples_per_second': 49.246, 'eval_steps_per_second': 6.308, 'epoch': 0.7}
{'loss': 0.6049, 'grad_norm': 1.9708466529846191, 'learning_rate': 6e-06, 'epoch': 0.85}
{'loss': 0.5459, 'grad_norm': 1.6542376279830933, 'learning_rate': 7.000000000000001e-06, 'epoch': 0.99}


  0%|          | 0/31 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.4808005392551422, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_runtime': 4.649, 'eval_samples_per_second': 52.054, 'eval_steps_per_second': 6.668, 'epoch': 1.06}
{'loss': 0.5156, 'grad_norm': 1.9831092357635498, 'learning_rate': 8.000000000000001e-06, 'epoch': 1.13}
{'loss': 0.4845, 'grad_norm': 1.5671510696411133, 'learning_rate': 9e-06, 'epoch': 1.27}
{'loss': 0.4471, 'grad_norm': 1.721231460571289, 'learning_rate': 1e-05, 'epoch': 1.41}


  0%|          | 0/31 [00:00<?, ?it/s]

{'eval_loss': 0.4091169536113739, 'eval_precision': 1.0, 'eval_recall': 0.07407407407407407, 'eval_f1': 0.13793103448275862, 'eval_runtime': 4.635, 'eval_samples_per_second': 52.211, 'eval_steps_per_second': 6.688, 'epoch': 1.41}
{'loss': 0.4266, 'grad_norm': 1.5391569137573242, 'learning_rate': 1.1000000000000001e-05, 'epoch': 1.55}
{'loss': 0.3706, 'grad_norm': 1.6853445768356323, 'learning_rate': 1.2e-05, 'epoch': 1.69}


  0%|          | 0/31 [00:00<?, ?it/s]

{'eval_loss': 0.3198750913143158, 'eval_precision': 0.9196787148594378, 'eval_recall': 0.6058201058201058, 'eval_f1': 0.730462519936204, 'eval_runtime': 16.8603, 'eval_samples_per_second': 14.353, 'eval_steps_per_second': 1.839, 'epoch': 1.76}
{'loss': 0.3524, 'grad_norm': 2.5425198078155518, 'learning_rate': 1.3000000000000001e-05, 'epoch': 1.83}
{'loss': 0.337, 'grad_norm': 1.972846269607544, 'learning_rate': 1.4000000000000001e-05, 'epoch': 1.97}
{'loss': 0.2931, 'grad_norm': 2.1307029724121094, 'learning_rate': 1.5e-05, 'epoch': 2.11}


  0%|          | 0/31 [00:00<?, ?it/s]

{'eval_loss': 0.2645148038864136, 'eval_precision': 0.9180887372013652, 'eval_recall': 0.7116402116402116, 'eval_f1': 0.8017883755588673, 'eval_runtime': 4.8484, 'eval_samples_per_second': 49.913, 'eval_steps_per_second': 6.394, 'epoch': 2.11}
{'loss': 0.2637, 'grad_norm': 1.6760720014572144, 'learning_rate': 1.6000000000000003e-05, 'epoch': 2.25}
{'loss': 0.2464, 'grad_norm': 1.8481371402740479, 'learning_rate': 1.7000000000000003e-05, 'epoch': 2.39}


  0%|          | 0/31 [00:00<?, ?it/s]

{'eval_loss': 0.2209036648273468, 'eval_precision': 0.9403973509933775, 'eval_recall': 0.7513227513227513, 'eval_f1': 0.8352941176470587, 'eval_runtime': 4.979, 'eval_samples_per_second': 48.604, 'eval_steps_per_second': 6.226, 'epoch': 2.46}
{'loss': 0.2397, 'grad_norm': 2.067387342453003, 'learning_rate': 1.8e-05, 'epoch': 2.54}
{'loss': 0.2258, 'grad_norm': 2.2678611278533936, 'learning_rate': 1.9e-05, 'epoch': 2.68}
{'loss': 0.2132, 'grad_norm': 1.1475797891616821, 'learning_rate': 2e-05, 'epoch': 2.82}


  0%|          | 0/31 [00:00<?, ?it/s]

{'eval_loss': 0.19929257035255432, 'eval_precision': 0.8944281524926686, 'eval_recall': 0.8068783068783069, 'eval_f1': 0.8484005563282336, 'eval_runtime': 4.936, 'eval_samples_per_second': 49.028, 'eval_steps_per_second': 6.28, 'epoch': 2.82}
{'loss': 0.2159, 'grad_norm': 1.2862308025360107, 'learning_rate': 2.1e-05, 'epoch': 2.96}
{'train_runtime': 170.321, 'train_samples_per_second': 9.899, 'train_steps_per_second': 1.251, 'train_loss': 0.4308803666365538, 'epoch': 3.0}


TrainOutput(global_step=213, training_loss=0.4308803666365538, metrics={'train_runtime': 170.321, 'train_samples_per_second': 9.899, 'train_steps_per_second': 1.251, 'train_loss': 0.4308803666365538, 'epoch': 3.0})

In [63]:
trainer.evaluate()

  0%|          | 0/31 [00:00<?, ?it/s]

{'eval_loss': 0.19929257035255432,
 'eval_precision': 0.8944281524926686,
 'eval_recall': 0.8068783068783069,
 'eval_f1': 0.8484005563282336,
 'eval_runtime': 4.8202,
 'eval_samples_per_second': 50.205,
 'eval_steps_per_second': 6.431,
 'epoch': 3.0}

In [64]:
# evaluate on test set
test_results = trainer.predict(test_encodings)

test_metrics = compute_metrics(test_results)
print(test_metrics)

  0%|          | 0/26 [00:00<?, ?it/s]

{'precision': 0.8962962962962963, 'recall': 0.7756410256410257, 'f1': 0.831615120274914}


### Predicting new text