In [177]:
# Load base packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

# load dataset tools
import datasets
from datasets import load_dataset

# preprocessing tools
from sklearn.preprocessing import OneHotEncoder

# load models


# load eval tools
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support

# Load Data

In [178]:
anno_df = pd.read_csv("data/AnnotatedData/AnnotatedDUGData.tsv", sep="\t")
load_shape = anno_df.shape

# Prepare data

In [179]:
# we will drop the columns which we are not interested in
anno_df = anno_df[
    [
        "Drug number",
        "Line number",
        "Advice Text",
        "AdviceTag1",
        "AdviceTag2",
        "AdviceTag3",
        "AdviceTag4",
    ]
]

## Extract label_ids

In [180]:
labels = (
    anno_df[["AdviceTag1", "AdviceTag2", "AdviceTag3", "AdviceTag4"]]
    .fillna("")
    .astype(str)
)

label_ids = list(set(labels.values.flatten()))

# remove the empty string
label_ids.remove("")



n_label_ids = len(label_ids)
n_label_ids

8

## Encode advice labels

In [181]:
# Create new columns for each unique tag and initialize them with 0
for lab in label_ids:
    anno_df[lab] = 0

# Update the values to 1 where the tag is present
for lab in label_ids:
    mask = labels.apply(lambda row: lab in row.values, axis=1)
    anno_df.loc[mask, lab] = 1

# Drop the original AdviceTag columns
anno_df.drop(
    columns=["AdviceTag1", "AdviceTag2", "AdviceTag3", "AdviceTag4"],
    inplace=True,
)

# Save the transformed data to a new file
anno_df.head()

Unnamed: 0,Drug number,Line number,Advice Text,Food or beverage related,Pregnancy related,Drug administration related,Other drugs related,Exercise related,Activity or lifestyle related,Disease or symptom related,Temporal
0,0,34,To reduce the risk of dizziness and lightheade...,0,0,0,0,0,1,0,0
1,0,38,This medication may rarely make your blood sug...,0,0,0,0,0,0,1,0
2,0,43,This medication may rarely cause a condition k...,0,0,0,0,0,0,1,0
3,0,64,This drug may make you dizzy or drowsy or caus...,0,0,0,0,0,1,1,0
4,0,66,Avoid alcoholic beverages.,1,0,0,0,0,0,0,0


### Ensure the encoding was correct

In [182]:
assert anno_df.shape[0] == load_shape[0], "Mismatch in number of rows"

# Baseline Predictions


Two baselines will be tested with for the multilabel classification task.

The first baseline will be a random baseline, where the labels are randomly assigned to the advice text.

The second baseline will be a majority class baseline, where the most common label is assigned to all advice text.

The baselines will be evaluated using the F1 score, Precision, and Recall.

In [183]:
def rand_baseline_pred(dataset, n_labels=8):
    """
    Randomly predicts a label for each example in the dataset.

    Args:
        dataset (datasets.Dataset): The dataset to predict labels for.

    Returns:
        np.ndarray: The predicted labels.
    """

    return np.random.randint(0, 2, size=(len(dataset), n_labels))

In [184]:
def mode_baseline_pred(dataset, n_labels=8):
    """
    Predicts the most common label for each label_id in the dataset.

    Args:
        dataset (datasets.Dataset): The dataset to predict labels for.

    Returns:
        np.ndarray: The predicted labels.
    """

    label_ids = list(dataset.features.keys())
    label_ids = label_ids[len(label_ids) - n_labels :]

    # can use median since our labels are binary
    modes = np.array([np.median(dataset[label_id]) for label_id in label_ids])

    return np.tile(modes, (len(dataset), 1))

## Load data into dataset

In [185]:
multi_label = datasets.Dataset.from_pandas(anno_df)

## Evaluate the baseline

### Identify ground truth labels

In [186]:
ground_truth = anno_df[label_ids].values

In [189]:
assert ground_truth.shape[0] == load_shape[0], "Mismatch in number of rows"
assert ground_truth.shape[1] == n_label_ids, "Mismatch in number of columns"

### Make predictions and evaluate

#### Random Baseline

In [190]:
# make predictions
rand_preds = rand_baseline_pred(multi_label)
print(rand_preds.shape)
precision, recall, f1, _ = precision_recall_fscore_support(
    ground_truth, rand_preds, average="micro"
)
print(f"Random Precision: {precision}, Recall: {recall}, F1: {f1}")

(1005, 8)
Random Precision: 0.20787637088733799, Recall: 0.5176908752327747, F1: 0.2966388049084119


#### Most common class baseline

In [191]:
mode_preds = mode_baseline_pred(multi_label)
print(mode_preds.shape)
precision, recall, f1, _ = precision_recall_fscore_support(
    ground_truth, rand_preds, average="micro"
)
print(f"Mode Precision: {precision}, Recall: {recall}, F1: {f1}")

(1005, 8)
Mode Precision: 0.20787637088733799, Recall: 0.5176908752327747, F1: 0.2966388049084119
