In [2]:
# Load base packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from tqdm import tqdm

# load stats tools
from scipy import stats

# load dataset tools
import datasets
from datasets import load_dataset, DatasetDict

# preprocessing tools
from sklearn.preprocessing import OneHotEncoder

# load models
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
import torch
from transformers import (
    RobertaTokenizer,
    RobertaForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding,
)


# load eval tools
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support

# from transformers import EvalPrediction

# Load Data

In [3]:
anno_df = pd.read_csv("data/AnnotatedData/AnnotatedDUGData.tsv", sep="\t")
load_shape = anno_df.shape

# Prepare data

In [4]:
# we will drop the columns which we are not interested in
anno_df = anno_df[
    [
        "Drug number",
        "Line number",
        "Advice Text",
        "AdviceTag1",
        "AdviceTag2",
        "AdviceTag3",
        "AdviceTag4",
    ]
]

## Extract label_ids

In [5]:
labels = (
    anno_df[["AdviceTag1", "AdviceTag2", "AdviceTag3", "AdviceTag4"]]
    .fillna("")
    .astype(str)
)

label_ids = list(set(labels.values.flatten()))

# remove the empty string
label_ids.remove("")


n_label_ids = len(label_ids)
n_label_ids

8

## Encode advice labels

In [6]:
# Create new columns for each unique tag and initialize them with 0
for lab in label_ids:
    anno_df[lab] = 0

# Update the values to 1 where the tag is present
for lab in label_ids:
    mask = labels.apply(lambda row: lab in row.values, axis=1)
    anno_df.loc[mask, lab] = 1

# Drop the original AdviceTag columns
anno_df.drop(
    columns=["AdviceTag1", "AdviceTag2", "AdviceTag3", "AdviceTag4"],
    inplace=True,
)

# Save the transformed data to a new file
anno_df.head()

Unnamed: 0,Drug number,Line number,Advice Text,Activity or lifestyle related,Food or beverage related,Disease or symptom related,Exercise related,Other drugs related,Drug administration related,Temporal,Pregnancy related
0,0,34,To reduce the risk of dizziness and lightheade...,1,0,0,0,0,0,0,0
1,0,38,This medication may rarely make your blood sug...,0,0,1,0,0,0,0,0
2,0,43,This medication may rarely cause a condition k...,0,0,1,0,0,0,0,0
3,0,64,This drug may make you dizzy or drowsy or caus...,1,0,1,0,0,0,0,0
4,0,66,Avoid alcoholic beverages.,0,1,0,0,0,0,0,0


### Ensure the encoding was correct

In [7]:
assert anno_df.shape[0] == load_shape[0], "Mismatch in number of rows"

# Baseline Predictions


Two baselines will be tested with for the multilabel classification task.

The  baseline will be a random baseline, where the labels are randomly assigned to the advice text.

The baseline will be evaluated using the F1 score, Precision, and Recall.

In [8]:
def rand_baseline_pred(dataset, n_labels=8):
    """
    Randomly predicts a label for each example in the dataset.

    Args:
        dataset (datasets.Dataset): The dataset to predict labels for.

    Returns:
        np.ndarray: The predicted labels.
    """

    return np.random.randint(0, 2, size=(len(dataset), n_labels))

## Load data into dataset

In [9]:
multi_label = datasets.Dataset.from_pandas(anno_df)

## Evaluate the baseline

### Identify ground truth labels

In [10]:
ground_truth = anno_df[label_ids].values

In [11]:
assert ground_truth.shape[0] == load_shape[0], "Mismatch in number of rows"
assert ground_truth.shape[1] == n_label_ids, "Mismatch in number of columns"

### Make predictions and evaluate

#### Random Baseline

In [12]:
# make predictions
rand_preds = rand_baseline_pred(multi_label)
# print(rand_preds.shape)
precision, recall, f1, _ = precision_recall_fscore_support(
    ground_truth, rand_preds, average="micro"
)
print(f"Random Precision: {precision}, Recall: {recall}, F1: {f1}")

Random Precision: 0.19637462235649547, Recall: 0.48417132216014896, F1: 0.27941966684578184


# Train, Test Split

In [13]:
anno_df.columns

Index(['Drug number', 'Line number', 'Advice Text',
       'Activity or lifestyle related', 'Food or beverage related',
       'Disease or symptom related', 'Exercise related', 'Other drugs related',
       'Drug administration related', 'Temporal', 'Pregnancy related'],
      dtype='object')

In [14]:
# Drop the unneeded columns
anno_df.drop(["Drug number", "Line number"], axis=1, inplace=True)

# Create train test split
train, test = train_test_split(anno_df, test_size=0.2, random_state=42)

# Binary Relevance

For binary relevance we will encode the text using a TF-IDF vectorizer and then train a logistic regression model for each label.

In [15]:
# look at our data
train.head()

Unnamed: 0,Advice Text,Activity or lifestyle related,Food or beverage related,Disease or symptom related,Exercise related,Other drugs related,Drug administration related,Temporal,Pregnancy related
78,Some products that may interact with this drug...,0,0,0,0,1,0,0,0
29,"Beta-blocker medications (such as metoprolol, ...",0,0,1,0,1,0,0,0
280,Wash your hands after applying the patch.,1,0,0,0,0,1,0,0
507,Lithium passes into breast milk and may have u...,0,0,0,0,0,0,0,1
652,Limit alcoholic beverages.,0,1,0,0,0,0,0,0


## Data encoding with TF-IDF

In [16]:
# Initialize the vectorizer
vectorizer = TfidfVectorizer(max_features=5000)

# Fit and transform the train data
X_train = vectorizer.fit_transform(train["Advice Text"])

# Transform the test data
X_test = vectorizer.transform(test["Advice Text"])

label_ids = train.columns[1:]
label_ids

Index(['Activity or lifestyle related', 'Food or beverage related',
       'Disease or symptom related', 'Exercise related', 'Other drugs related',
       'Drug administration related', 'Temporal', 'Pregnancy related'],
      dtype='object')

### Training

# Transformer Based Model

In [17]:
model_id = "roberta-base"

### Load data into dataset

In [18]:
# rename Advice Text to text
anno_df = anno_df.rename(columns={"Advice Text": "text"})

# Load into dataset


dataset = datasets.Dataset.from_pandas(anno_df)

In [25]:
# test, train, eval split
train, test = dataset.train_test_split(test_size=0.2)
train, eval = train.train_test_split(test_size=0.3)

AttributeError: 'str' object has no attribute 'train_test_split'

### Tokenize and split text

In [22]:
tokenizer = RobertaTokenizer.from_pretrained(model_id)

In [23]:
def tokenize_function(batch):
    return tokenizer(batch["text"], padding="max_length", truncation=True)

In [24]:
train_encodings = train.map(tokenize_function, batched=True)
eval_encodings = eval.map(tokenize_function, batched=True)
test_encodings = test.map(tokenize_function, batched=True)

AttributeError: 'str' object has no attribute 'map'

In [None]:
# format datasets
train_encodings.set_format(
    type="torch", columns=["input_ids", "attention_mask", "labels"]
)
eval_encodings.set_format(
    type="torch", columns=["input_ids", "attention_mask", "labels"]
)
test_encodings.set_format(
    type="torch", columns=["input_ids", "attention_mask", "labels"]
)

#### Label Information

In [None]:
class_names = anno_df.columns[1:].tolist()
n_labels = len(class_names)

id2label = {i: label for i, label in enumerate(class_names)}
label2id = {label: i for i, label in enumerate(class_names)}

### Model

### Train

In [None]:
training_args = TrainingArguments(
    output_dir="./models/roberta-base",
    num_train_epochs=5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    evaluation_strategy="epoch",
    logging_dir=f"./logs",
    logging_strategy="steps",
    logging_steps=10,
    learning_rate=5e-5,
    weight_decay=0.01,
    warmup_steps=500,
    save_strategy="epoch",
    load_best_model_at_end=True,
    save_total_limit=2,
    report_to="wandb",
)

In [None]:
model = RobertaForSequenceClassification.from_pretrained(
    model_id, num_labels=n_labels, id2label=id2label, label2id=label2id
)

In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_encodings,
    eval_dataset=eval_encodings,
)

In [None]:
trainer.train()

In [None]:
trainer.evaluate()

### Predicting new text