In [581]:
# Load base packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from tqdm import tqdm

# load stats tools
from scipy import stats

# load dataset tools
import datasets
from datasets import load_dataset, DatasetDict

# preprocessing tools
from sklearn.preprocessing import OneHotEncoder

# load models
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

# load eval tools
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support
from transformers import EvalPrediction

from datasets import load_dataset, DatasetDict
from transformers import (
    RobertaTokenizer,
    RobertaForSequenceClassification,
    Trainer,
    TrainingArguments,
)
import torch

# data collator
from transformers import DataCollatorWithPadding

# Load Data

In [582]:
anno_df = pd.read_csv("data/AnnotatedData/AnnotatedDUGData.tsv", sep="\t")
load_shape = anno_df.shape

# Prepare data

In [583]:
# we will drop the columns which we are not interested in
anno_df = anno_df[
    [
        "Drug number",
        "Line number",
        "Advice Text",
        "AdviceTag1",
        "AdviceTag2",
        "AdviceTag3",
        "AdviceTag4",
    ]
]

## Extract label_ids

In [584]:
labels = (
    anno_df[["AdviceTag1", "AdviceTag2", "AdviceTag3", "AdviceTag4"]]
    .fillna("")
    .astype(str)
)

label_ids = list(set(labels.values.flatten()))

# remove the empty string
label_ids.remove("")


n_label_ids = len(label_ids)
n_label_ids

8

## Encode advice labels

In [585]:
# Create new columns for each unique tag and initialize them with 0
for lab in label_ids:
    anno_df[lab] = 0

# Update the values to 1 where the tag is present
for lab in label_ids:
    mask = labels.apply(lambda row: lab in row.values, axis=1)
    anno_df.loc[mask, lab] = 1

# Drop the original AdviceTag columns
anno_df.drop(
    columns=["AdviceTag1", "AdviceTag2", "AdviceTag3", "AdviceTag4"],
    inplace=True,
)

# Save the transformed data to a new file
anno_df.head()

Unnamed: 0,Drug number,Line number,Advice Text,Other drugs related,Temporal,Disease or symptom related,Pregnancy related,Food or beverage related,Drug administration related,Activity or lifestyle related,Exercise related
0,0,34,To reduce the risk of dizziness and lightheade...,0,0,0,0,0,0,1,0
1,0,38,This medication may rarely make your blood sug...,0,0,1,0,0,0,0,0
2,0,43,This medication may rarely cause a condition k...,0,0,1,0,0,0,0,0
3,0,64,This drug may make you dizzy or drowsy or caus...,0,0,1,0,0,0,1,0
4,0,66,Avoid alcoholic beverages.,0,0,0,0,1,0,0,0


### Ensure the encoding was correct

In [586]:
assert anno_df.shape[0] == load_shape[0], "Mismatch in number of rows"

# Baseline Predictions


Two baselines will be tested with for the multilabel classification task.

The  baseline will be a random baseline, where the labels are randomly assigned to the advice text.

The baseline will be evaluated using the F1 score, Precision, and Recall.

In [587]:
def rand_baseline_pred(dataset, n_labels=8):
    """
    Randomly predicts a label for each example in the dataset.

    Args:
        dataset (datasets.Dataset): The dataset to predict labels for.

    Returns:
        np.ndarray: The predicted labels.
    """

    return np.random.randint(0, 2, size=(len(dataset), n_labels))

## Load data into dataset

In [588]:
multi_label = datasets.Dataset.from_pandas(anno_df)

## Evaluate the baseline

### Identify ground truth labels

In [589]:
ground_truth = anno_df[label_ids].values

In [590]:
assert ground_truth.shape[0] == load_shape[0], "Mismatch in number of rows"
assert ground_truth.shape[1] == n_label_ids, "Mismatch in number of columns"

### Make predictions and evaluate

#### Random Baseline

In [591]:
# make predictions
rand_preds = rand_baseline_pred(multi_label)
# print(rand_preds.shape)
precision, recall, f1, _ = precision_recall_fscore_support(
    ground_truth, rand_preds, average="micro"
)
print(f"Random Precision: {precision}, Recall: {recall}, F1: {f1}")

Random Precision: 0.20141948115516398, Recall: 0.5108628181253879, F1: 0.28892399508513256


# Train, Test Split

In [592]:
anno_df.columns

Index(['Drug number', 'Line number', 'Advice Text', 'Other drugs related',
       'Temporal', 'Disease or symptom related', 'Pregnancy related',
       'Food or beverage related', 'Drug administration related',
       'Activity or lifestyle related', 'Exercise related'],
      dtype='object')

In [593]:
# Drop the unneeded columns
anno_df.drop(["Drug number", "Line number"], axis=1, inplace=True)

# Create train test split
train, test = train_test_split(anno_df, test_size=0.2, random_state=42)

# Binary Relevance

For binary relevance we will encode the text using a TF-IDF vectorizer and then train a logistic regression model for each label.

In [594]:
# look at our data
train.head()

Unnamed: 0,Advice Text,Other drugs related,Temporal,Disease or symptom related,Pregnancy related,Food or beverage related,Drug administration related,Activity or lifestyle related,Exercise related
78,Some products that may interact with this drug...,1,0,0,0,0,0,0,0
29,"Beta-blocker medications (such as metoprolol, ...",1,0,1,0,0,0,0,0
280,Wash your hands after applying the patch.,0,0,0,0,0,1,1,0
507,Lithium passes into breast milk and may have u...,0,0,0,1,0,0,0,0
652,Limit alcoholic beverages.,0,0,0,0,1,0,0,0


## Data encoding with TF-IDF

In [595]:
# Initialize the vectorizer
vectorizer = TfidfVectorizer(max_features=5000)

# Fit and transform the train data
X_train = vectorizer.fit_transform(train["Advice Text"])

# Transform the test data
X_test = vectorizer.transform(test["Advice Text"])

label_ids = train.columns[1:]
label_ids

Index(['Other drugs related', 'Temporal', 'Disease or symptom related',
       'Pregnancy related', 'Food or beverage related',
       'Drug administration related', 'Activity or lifestyle related',
       'Exercise related'],
      dtype='object')

### Training

In [596]:
preds = pd.DataFrame()

for label in label_ids:
    # get labels
    y_train = train[label].values
    y_test = test[label].values

    # Initialize the model
    model = LogisticRegression()

    # Fit the model
    model.fit(X_train, y_train)

    # Make predictions
    y_pred = model.predict(X_test)

    # Evaluate the model
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_test, y_pred, average="micro"
    )
    print(f"Precision: {precision}, Recall: {recall}, F1: {f1}")

    # Save the predictions
    preds[label] = y_pred

# evaluate the model across all labels
precision, recall, f1, _ = precision_recall_fscore_support(
    test[label_ids].values, preds.values, average="micro"
)
print(f"\nOverall Precision: {precision}, Recall: {recall}, F1: {f1}")

preds.head()

Precision: 0.9253731343283582, Recall: 0.9253731343283582, F1: 0.9253731343283582
Precision: 0.900497512437811, Recall: 0.900497512437811, F1: 0.9004975124378111
Precision: 0.8208955223880597, Recall: 0.8208955223880597, F1: 0.8208955223880597
Precision: 0.9701492537313433, Recall: 0.9701492537313433, F1: 0.9701492537313433
Precision: 0.9154228855721394, Recall: 0.9154228855721394, F1: 0.9154228855721394
Precision: 0.8955223880597015, Recall: 0.8955223880597015, F1: 0.8955223880597015
Precision: 0.9203980099502488, Recall: 0.9203980099502488, F1: 0.9203980099502488
Precision: 0.9651741293532339, Recall: 0.9651741293532339, F1: 0.9651741293532339

Overall Precision: 0.8918918918918919, Recall: 0.6346153846153846, F1: 0.7415730337078652


Unnamed: 0,Other drugs related,Temporal,Disease or symptom related,Pregnancy related,Food or beverage related,Drug administration related,Activity or lifestyle related,Exercise related
0,0,0,0,0,0,0,0,0
1,1,0,1,0,0,0,0,0
2,0,0,0,1,0,0,0,0
3,0,0,0,0,0,0,1,0
4,0,0,0,0,0,0,0,0


# Transformer Based Model

### Load data into dataset

In [None]:
# Load the dataset
dataset = datasets.Dataset.from_pandas(anno_df)

### Model

In [None]:
# Load the tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaForSequenceClassification.from_pretrained(
    "roberta-base", num_labels=8
)

### Metric calculations

In [None]:
def compute_metrics(eval_pred: EvalPrediction):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predictions, average="micro"
    )
    return {"precision": precision, "recall": recall, "f1": f1}

### Tokenize and split text

In [None]:
# Tokenize function
def tokenize_function(examples):
    return tokenizer(examples["Advice Text"], truncation=True)

In [None]:
# Map the tokenization function to the dataset
tokenized_dataset = dataset.map(tokenize_function, batched=True)

In [None]:
# Split the dataset into train, validation, and test sets
train_val_test_split = tokenized_dataset.train_test_split(
    test_size=0.2, shuffle=True
)
train_dataset = train_val_test_split["train"]
val_test_dataset = train_val_test_split["test"].train_test_split(
    test_size=0.5, shuffle=True
)
val_dataset = val_test_dataset["train"]
test_dataset = val_test_dataset["test"]

In [None]:
# Data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

### Train

In [1]:
# Define the training arguments
training_args = TrainingArguments(
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    logging_dir="./logs",
    logging_steps=100,
    learning_rate=2e-5,
    do_eval=True,
    evaluation_strategy="steps",
    seed=42,
    eval_steps=50,
    save_strategy="steps",
    save_steps=50,
    load_best_model_at_end=True,
    metric_for_best_model="eval_f1",
    greater_is_better=True,
    report_to="wandb",
    output_dir="./models/roberta-base/checkpoints",
    overwrite_output_dir=True,
)

NameError: name 'TrainingArguments' is not defined

In [None]:
# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [None]:
# Train the model
trainer.train()

In [None]:
# Evaluate the model on validation set
eval_results_val = trainer.evaluate()
print("Validation set results:", eval_results_val)

# Evaluate the model on test set
eval_results_test = trainer.evaluate(eval_dataset=test_dataset)
print("Test set results:", eval_results_test)

### Predicting new text

In [None]:
# Make predictions on new data
def predict(text):
    inputs = tokenizer(
        text,
        return_tensors="pt",
        max_length=128,
        padding="max_length",
        truncation=True,
    )
    outputs = model(**inputs)
    logits = outputs.logits
    predicted_labels = torch.sigmoid(logits)
    return predicted_labels.detach().numpy()