In [133]:
# Load base packages
import pandas as pd


import numpy as np


import matplotlib.pyplot as plt


import seaborn as sns
import os

# load dataset tools
import datasets
from datasets import load_dataset

# preprocessing tools
from sklearn.preprocessing import OneHotEncoder

# load models


# load eval tools
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support

# Load Data

In [134]:
anno_df = pd.read_csv("data/AnnotatedData/AnnotatedDUGData.tsv", sep="\t")
anno_df.shape

(1005, 13)

# Prepare data

In [135]:
# stack advice columns
melt_df = anno_df.melt(
    id_vars=["Drug number", "Line number", "Advice Text"],
    value_vars=["AdviceTag1", "AdviceTag2", "AdviceTag3", "AdviceTag4"],
    value_name="AdviceTag",
)

melt_df.drop("variable", axis=1, inplace=True)
melt_df.head()

Unnamed: 0,Drug number,Line number,Advice Text,AdviceTag
0,0,34,To reduce the risk of dizziness and lightheade...,Activity or lifestyle related
1,0,38,This medication may rarely make your blood sug...,Disease or symptom related
2,0,43,This medication may rarely cause a condition k...,Disease or symptom related
3,0,64,This drug may make you dizzy or drowsy or caus...,Disease or symptom related
4,0,66,Avoid alcoholic beverages.,Food or beverage related


In [136]:
anno_df.shape

(1005, 13)

In [137]:
assert (
    melt_df.shape[0] == 4 * anno_df.shape[0]
), "Error in stacking advice columns"

## Extract labels

In [138]:
label_ids = melt_df["AdviceTag"].unique()
label_ids = [label for label in label_ids if str(label) != "nan"]

n_label_ids = len(label_ids)
n_label_ids

8

## Encode advice labels

In [139]:
encoded_df = melt_df.copy()

# one hot encode labels
enc = OneHotEncoder()
enc.fit(melt_df["AdviceTag"].values.reshape(-1, 1))

# create one hot encoded labels
one_hot_labels = enc.transform(
    melt_df["AdviceTag"].values.reshape(-1, 1)
).toarray()

# add one hot encoded labels to dataframe
for i, label_id in enumerate(label_ids):
    encoded_df[label_id] = one_hot_labels[:, i]

encoded_df.head()

Unnamed: 0,Drug number,Line number,Advice Text,AdviceTag,Activity or lifestyle related,Disease or symptom related,Food or beverage related,Pregnancy related,Other drugs related,Drug administration related,Temporal,Exercise related
0,0,34,To reduce the risk of dizziness and lightheade...,Activity or lifestyle related,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,38,This medication may rarely make your blood sug...,Disease or symptom related,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,43,This medication may rarely cause a condition k...,Disease or symptom related,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0,64,This drug may make you dizzy or drowsy or caus...,Disease or symptom related,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0,66,Avoid alcoholic beverages.,Food or beverage related,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [140]:
encoded_df.drop_duplicates().shape

(2596, 12)

### Ensure the encoding was correct

In [142]:
assert encoded_df.shape[0] == melt_df.shape[0], "Mismatch in number of rows"
assert (
    encoded_df.shape[1] == anno_df.shape[1] + n_labels - 1
), "Mismatch in number of columns"

AssertionError: Mismatch in number of columns

# Baseline Predictions


Two baselines will be tested with for the multilabel classification task.

The first baseline will be a random baseline, where the labels are randomly assigned to the advice text.

The second baseline will be a majority class baseline, where the most common label is assigned to all advice text.

The baselines will be evaluated using the F1 score, Precision, and Recall.

In [None]:
def rand_baseline_pred(dataset, n_labels=8):
    """
    Randomly predicts a label for each example in the dataset.

    Args:
        dataset (datasets.Dataset): The dataset to predict labels for.

    Returns:
        np.ndarray: The predicted labels.
    """

    return np.random.randint(0, 2, size=(len(dataset), n_labels))

In [None]:
def mode_baseline_pred(dataset, n_labels=8):
    """
    Predicts the most common label for each label_id in the dataset.

    Args:
        dataset (datasets.Dataset): The dataset to predict labels for.

    Returns:
        np.ndarray: The predicted labels.
    """

    label_ids = list(dataset.features.keys())
    label_ids = label_ids[len(label_ids) - n_labels :]

    # can use median since our labels are binary
    modes = np.array([np.median(dataset[label_id]) for label_id in label_ids])

    return np.tile(modes, (len(dataset), 1))

## Load data into dataset

In [None]:
multi_label = datasets.Dataset.from_pandas(encoded_df)

## Evaluate the baseline

### Identify ground truth labels

In [None]:
ground_truth = encoded_df[label_ids].values

In [None]:
assert (
    ground_truth.shape[0] == encoded_df.shape[0]
), "Mismatch in number of rows"
assert ground_truth.shape[1] == n_label_ids, "Mismatch in number of columns"

### Make predictions and evaluate

#### Random Baseline

In [None]:
# make predictions
rand_preds = rand_baseline_pred(multi_label)
print(rand_preds.shape)
precision, recall, f1, _ = precision_recall_fscore_support(
    ground_truth, rand_preds, average="micro"
)
print(f"Random Precision: {precision}, Recall: {recall}, F1: {f1}")

(4020, 8)
Random Precision: 0.05019233155478347, Recall: 0.5021725636250776, F1: 0.09126290258897851


#### Most common class baseline

In [None]:
mode_preds = mode_baseline_pred(multi_label)
print(mode_preds.shape)
precision, recall, f1, _ = precision_recall_fscore_support(
    ground_truth, rand_preds, average="micro"
)
print(f"Mode Precision: {precision}, Recall: {recall}, F1: {f1}")

(4020, 8)
Mode Precision: 0.05019233155478347, Recall: 0.5021725636250776, F1: 0.09126290258897851
