In [1]:
%run local_functions.py
from local_functions import *

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re


from datasets import load_dataset
from datasets import Dataset, DatasetDict
from transformers import BertConfig, BertModel


from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification

from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from transformers import EvalPrediction
import torch
from transformers import TrainingArguments, Trainer


from sklearn.model_selection import train_test_split

# Notebook-wide plotting / display configuration (applied once).
plt.style.use("dark_background")

# Show all columns of wide frames, but cap the number of rows and the width of
# each cell so long review texts do not flood the output.
pd.set_option("display.max_columns", 2500)
pd.set_option("display.max_rows", 50)
pd.set_option("display.max_colwidth", 100)

%load_ext lab_black

2023-09-18 11:30:36.849094: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-09-18 11:30:36.869707: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [32]:
# Load the drug-review corpus, then keep only the target label and the raw text.
df = pd.read_parquet("datasets/drug-reviews.parquet")
df = df[["condition", "review"]]

In [33]:
# Clean the raw review text with `text_normalization_3` (not defined in this
# notebook; presumably provided by local_functions) and keep only the label and
# the cleaned text. `assign` builds a new frame rather than writing a column
# into the column-subset produced by the loading cell, which can trigger
# SettingWithCopyWarning.
# NOTE: this cell rebinds `df` and consumes its "review" column, so re-run the
# loading cell before re-running this one.
df = df.assign(text=df["review"].apply(text_normalization_3))[["condition", "text"]]

# Restrict the task to the N most frequently reviewed conditions.
N_CONDITIONS = 100
top_n_conditions = df.condition.value_counts().head(N_CONDITIONS).index.to_list()

df = df[df.condition.isin(top_n_conditions)]

# Down-sample reproducibly to keep fine-tuning time manageable. The size is
# capped at the number of available rows because `DataFrame.sample` raises a
# ValueError when asked for more rows than exist (without replacement).
SAMPLE_SIZE = 25000

df = df.sample(min(SAMPLE_SIZE, len(df)), random_state=42).reset_index(drop=True)

In [61]:
# Class balance of the sampled data: number of reviews per condition.
df["condition"].value_counts()

Birth Control             5048
Depression                1612
Pain                      1107
Anxiety                   1055
Acne                       995
                          ... 
Hot Flashes                 36
Alcohol Withdrawal          34
Pneumonia                   33
Neuropathic Pain            33
Ankylosing Spondylitis      33
Name: condition, Length: 100, dtype: int64

In [34]:
# Hold out a quarter of the sampled reviews for the final evaluation; the
# remaining 75% feeds the fine-tuning pipeline below.
training_df, testing_df = train_test_split(
    df,
    random_state=42,
    test_size=0.25,
)

In [35]:
# One-hot encode the condition into one boolean column per class. The empty
# prefix / separator keeps the bare condition name as the column name.
encoded_df = pd.get_dummies(
    training_df["condition"], prefix="", prefix_sep="", dtype=bool
)

# Put the review text next to its indicator columns (rows align on the index).
encoded_df_con = pd.concat([training_df["text"], encoded_df], axis="columns")

# Dataset

In [36]:
# 70% of the encoded training data goes to `train_df`; the remaining 30% is
# halved into validation and test parts.
train_df, temp_df = train_test_split(
    encoded_df_con, random_state=42, test_size=0.3
)
valid_df, test_df = train_test_split(
    temp_df, random_state=42, test_size=0.5
)

In [37]:
# Wrap each pandas split in a Hugging Face Dataset tagged with its split name.
train, valid, test = (
    Dataset.from_pandas(frame, split=split_name)
    for split_name, frame in (
        ("train", train_df),
        ("validation", valid_df),
        ("test", test_df),
    )
)

# Bundle the three splits so they can be mapped and indexed together.
dataset = DatasetDict({"train": train, "validation": valid, "test": test})

In [38]:
# Every feature except the review text and `__index_level_0__` (presumably the
# pandas index column carried over by `from_pandas`) is a class label.
labels = [
    name
    for name in dataset["train"].features
    if name not in ("text", "__index_level_0__")
]

# Bidirectional mapping between label names and their column positions.
id2label = dict(enumerate(labels))
label2id = {name: idx for idx, name in id2label.items()}

# Tokenizer

In [40]:
# Name of the pretrained checkpoint to fine-tune; the same identifier is used
# below to load both the tokenizer and the classification model.
LM = "bert-base-uncased"

In [41]:
# Load the tokenizer that matches the pretrained checkpoint named by `LM`.
tokenizer = AutoTokenizer.from_pretrained(LM)

In [42]:
def preprocess_data(examples):
    """Tokenize a batch of reviews and attach its label matrix.

    Parameters
    ----------
    examples : mapping
        A batch of columns. It must provide a ``"text"`` entry and one entry per
        name in the module-level ``labels`` list; other entries are ignored.

    Returns
    -------
    The output of the module-level ``tokenizer`` (padded / truncated to 256
    tokens) with an added ``"labels"`` entry: one list of floats per text,
    ordered like ``labels``.
    """
    texts = examples["text"]
    encoding = tokenizer(texts, padding="max_length", truncation=True, max_length=256)

    # One row per text, one column per label; np.zeros yields floats, so the
    # boolean indicators are stored as 0.0 / 1.0.
    label_matrix = np.zeros((len(texts), len(labels)))
    for column, name in enumerate(labels):
        label_matrix[:, column] = examples[name]

    encoding["labels"] = label_matrix.tolist()
    return encoding


# Tokenize every split in batches. The original columns are removed so that only
# what `preprocess_data` returns (tokenizer output plus "labels") remains.
encoded_dataset = dataset.map(
    preprocess_data,
    batched=True,
    remove_columns=dataset["train"].column_names,
)

# Have the datasets return torch tensors when indexed.
encoded_dataset.set_format(type="torch")

Map:   0%|          | 0/13125 [00:00<?, ? examples/s]

Map:   0%|          | 0/2812 [00:00<?, ? examples/s]

Map:   0%|          | 0/2813 [00:00<?, ? examples/s]

# Model

In [43]:
# Pretrained encoder with a freshly initialised classification head that has one
# output per condition.
# NOTE(review): each review carries exactly one condition, yet the head is
# configured as multi-label; the recorded eval metrics (f1 = 0.0, roc_auc = 0.5)
# suggest this setup does not learn in one epoch - consider a single-label
# formulation (would also require changing the label encoding and metrics).
model = AutoModelForSequenceClassification.from_pretrained(
    LM,
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id,
    problem_type="multi_label_classification",
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [44]:
# Training hyper-parameters a reader may want to tune.
EPOCHS = 1
BATCH_SIZE = 8
metric_name = "f1"  # reported by the Trainer as "eval_f1"

args = TrainingArguments(
    # Output directory for checkpoints (plain string: no interpolation needed).
    "bert-drugs-finetuned-sem_eval-english",
    # Evaluate and save once per epoch; both strategies are set to the same
    # value, which `load_best_model_at_end` relies on.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=EPOCHS,
    weight_decay=0.01,
    # After training, restore the checkpoint with the best `metric_name` score.
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
    # push_to_hub=True,
)

In [45]:
# Wire the model, hyper-parameters, tokenized splits and metric function
# together. `compute_metrics` is not defined in this notebook (presumably it
# comes from local_functions).
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [46]:
# Fine-tune the classifier on the training split; per `args`, the validation
# split is scored and a checkpoint is saved at the end of each epoch.
trainer.train()



  0%|          | 0/1641 [00:00<?, ?it/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'loss': 0.1537, 'learning_rate': 1.3906154783668494e-05, 'epoch': 0.3}
{'loss': 0.053, 'learning_rate': 7.81230956733699e-06, 'epoch': 0.61}
{'loss': 0.0495, 'learning_rate': 1.7184643510054846e-06, 'epoch': 0.91}


  0%|          | 0/352 [00:00<?, ?it/s]

{'eval_loss': 0.0484723336994648, 'eval_f1': 0.0, 'eval_roc_auc': 0.5, 'eval_accuracy': 0.0, 'eval_runtime': 16.6741, 'eval_samples_per_second': 168.645, 'eval_steps_per_second': 21.111, 'epoch': 1.0}
{'train_runtime': 289.7486, 'train_samples_per_second': 45.298, 'train_steps_per_second': 5.664, 'train_loss': 0.08226439315033424, 'epoch': 1.0}


TrainOutput(global_step=1641, training_loss=0.08226439315033424, metrics={'train_runtime': 289.7486, 'train_samples_per_second': 45.298, 'train_steps_per_second': 5.664, 'train_loss': 0.08226439315033424, 'epoch': 1.0})

In [47]:
# Score the trainer's eval dataset (the validation split) with the current model.
# NOTE(review): the recorded output shows eval_f1 = 0.0, eval_roc_auc = 0.5 and
# eval_accuracy = 0.0, which looks like no label is ever predicted as positive -
# confirm against the thresholding inside compute_metrics.
trainer.evaluate()

  0%|          | 0/352 [00:00<?, ?it/s]

{'eval_loss': 0.0484723336994648,
 'eval_f1': 0.0,
 'eval_roc_auc': 0.5,
 'eval_accuracy': 0.0,
 'eval_runtime': 16.6375,
 'eval_samples_per_second': 169.016,
 'eval_steps_per_second': 21.157,
 'epoch': 1.0}

# Eval

In [52]:
# Reload the fine-tuned weights from the checkpoint written during training
# (the run above ended at global step 1641). The path must be updated if the
# amount of data, batch size or number of epochs changes.
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-drugs-finetuned-sem_eval-english/checkpoint-1641"
)

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Inference-only trainer: no training arguments or datasets are needed here.
trainer = Trainer(model=model, tokenizer=tokenizer)

# Work on a private copy of the held-out split. The evaluation cell adds
# prediction columns; without the copy they would be written into `testing_df`
# itself (a slice produced by train_test_split), which can trigger
# SettingWithCopyWarning and makes re-running the cell non-idempotent.
df_test = testing_df.copy()

In [57]:
# Evaluation settings. Lower ROWS_TO_EVALUATE to score only the first rows of the
# held-out frame; every derived column below uses the same positional slice.
ROWS_TO_EVALUATE = len(df_test)

CONFIDENCE_THRESHOLD = 0.15  # 0.1 works well for low number of non-responses

TOP_N_PREDS = 5  # number of top predictions to return

MULTI_GUESS_PENALTY = 0.9  # multi_cat_guess_penalty (somewhere around 0.85 works well)


# make predictions
df_test["predicted_class"] = df_test["text"].iloc[:ROWS_TO_EVALUATE].apply(
    predict_class, args=(tokenizer, trainer, id2label, CONFIDENCE_THRESHOLD)
)  # args: text, tokenizer, trainer, id2label, CONFIDENCE_THRESHOLD=0.5

# calculate if prediction is correct
df_test["correct"] = df_test.iloc[:ROWS_TO_EVALUATE].apply(
    lambda row: int(row["condition"] in row["predicted_class"]), axis=1
)
# calculate score (including penalty for guessing multiple categories) used to help find optimal confidence threshold
df_test["correct_w_discount"] = df_test.iloc[:ROWS_TO_EVALUATE].apply(
    multi_cat_guess_penalty,
    axis=1,
    args=(MULTI_GUESS_PENALTY,),
)

# Restricted to the evaluated rows like the columns above, so rows without a
# prediction are never passed to the helper when ROWS_TO_EVALUATE < len(df_test).
df_test["correct_w_non_preds"] = df_test.iloc[:ROWS_TO_EVALUATE].apply(
    multi_positive_outcome, axis=1
)

# get top n predictions
df_test["top_n_preds"] = df_test["text"].iloc[:ROWS_TO_EVALUATE].apply(
    n_most_likely_classes, args=(tokenizer, trainer, id2label, TOP_N_PREDS)
)

# Summary scores, each expressed as a percentage of the evaluated rows.
print(
    f"Percent of predictions that include correct class: {round((df_test.correct.sum() / ROWS_TO_EVALUATE)*100, 2)}%"
)
print(
    f"Multi_guess discount score: {round((df_test.correct_w_discount.sum() / ROWS_TO_EVALUATE)*100, 2)}%"
)
print(
    f"Multi_positive_outcome discount score: {round((df_test.correct_w_non_preds.sum() / ROWS_TO_EVALUATE)*100, 2)}%"
)

# Difference between the two "correct" columns, reported as the share of non-predictions.
print(
    f"Percent of non-preds: {round((df_test.correct_w_non_preds.sum() - df_test.correct.sum()) / ROWS_TO_EVALUATE*100, 2)}% "
)
print(
    f"Percent of wrong preds: {round((1-(df_test.correct_w_non_preds.sum() / ROWS_TO_EVALUATE))*100,2)}%"
)

Percent of predictions that include correct class: 20.69%
Multi_guess discount score: 20.69%
Multi_positive_outcome discount score: 20.69%
Percent of non-preds: 0.0% 
Percent of wrong preds: 79.31%


In [58]:
# Inspect the per-row predictions.
# NOTE(review): in the rows shown in the saved output every prediction is
# "Birth Control" (the most frequent class), with near-identical scores across
# reviews - the model appears to output class frequencies rather than
# text-dependent predictions.
df_test

Unnamed: 0,condition,text,predicted_class,correct,correct_w_discount,correct_w_non_preds,top_n_preds
6868,High Blood Pressure,male 56 i started taking cozaar just 4 days ago to add to my current regiment of diuretic and be...,[Birth Control],0,0,0,"{'Birth Control': 0.187, 'Depression': 0.059, 'Pain': 0.043, 'Anxiety': 0.041, 'Acne': 0.039}"
24016,Acne,i have been using tri nessa for three months and it has been pretty good my doctor put it on me ...,[Birth Control],0,0,0,"{'Birth Control': 0.215, 'Depression': 0.059, 'Acne': 0.042, 'Anxiety': 0.041, 'Pain': 0.041}"
9668,Birth Control,so i 039 ve had the nexplanon since december of 2013 prior to the nexplanon i was on the deep sh...,[Birth Control],1,1,1,"{'Birth Control': 0.215, 'Depression': 0.06, 'Acne': 0.042, 'Anxiety': 0.041, 'Pain': 0.041}"
13640,Depression,i have tried lexapro zoloft calexa and brintellix my body does not tolerate meds well lexapro wa...,[Birth Control],0,0,0,"{'Birth Control': 0.194, 'Depression': 0.059, 'Pain': 0.042, 'Anxiety': 0.041, 'Acne': 0.039}"
14018,Depression,this little pill caused me to have worsening suicidal thoughts i was on edge and crying daily th...,[Birth Control],0,0,0,"{'Birth Control': 0.189, 'Depression': 0.058, 'Pain': 0.043, 'Anxiety': 0.041, 'Acne': 0.039}"
...,...,...,...,...,...,...,...
24717,Birth Control,i am happy overall with the paragard copper iud i like that it 039 s hormone free birth control ...,[Birth Control],1,1,1,"{'Birth Control': 0.231, 'Depression': 0.062, 'Acne': 0.044, 'Anxiety': 0.042, 'Pain': 0.041}"
2578,Asthma,omg i 039 ve been taking singulair for one week i haven 039 t received my advair from the mail o...,[Birth Control],0,0,0,"{'Birth Control': 0.198, 'Depression': 0.059, 'Pain': 0.042, 'Anxiety': 0.041, 'Acne': 0.04}"
10121,Urinary Tract Infection,when the doctor gave me this medicine i felt a lot better in my opinion it is effective,[Birth Control],0,0,0,"{'Birth Control': 0.18, 'Depression': 0.059, 'Pain': 0.045, 'Anxiety': 0.042, 'Acne': 0.038}"
21601,Constipation,so i 039 m 5 039 3 and 135 lbs i have tried dulcolax in the past for constipation but decided th...,[Birth Control],0,0,0,"{'Birth Control': 0.195, 'Depression': 0.059, 'Pain': 0.042, 'Anxiety': 0.041, 'Acne': 0.04}"
