In [1]:
%run local_functions.py
from local_functions import *

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re


from datasets import load_dataset
from datasets import Dataset, DatasetDict
from transformers import BertConfig, BertModel


from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification

from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from transformers import EvalPrediction
import torch
from transformers import TrainingArguments, Trainer


from sklearn.model_selection import train_test_split

# Notebook-wide display configuration.

# Dark theme for every matplotlib figure (set once).
plt.style.use("dark_background")

# Show wide frames in full, but cap printed rows and cell width.
pd.set_option("display.max_columns", 2500)
pd.set_option("display.max_rows", 50)
pd.set_option("display.max_colwidth", 100)

%load_ext lab_black

2023-09-19 19:29:33.094825: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-09-19 19:29:33.115315: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


# Load in and normalize data

In [17]:
N_CONDITIONS = 50  # keep only the most frequently reviewed conditions
SAMPLE_SIZE = 25000  # upper bound on the number of reviews carried forward

# Read only the two columns this notebook uses instead of the whole file.
df = pd.read_parquet(
    "datasets/drug-reviews.parquet", columns=["condition", "review"]
)

# Restrict to the N_CONDITIONS conditions with the most reviews.
top_n_conditions = df.condition.value_counts().head(N_CONDITIONS).index.to_list()
df = df[df.condition.isin(top_n_conditions)]

# Sample before normalising so the (row-wise) normalisation only runs on the
# rows that are kept. The selected rows are the same as sampling afterwards
# because the draw depends only on the frame length and the seed.
# min() avoids a ValueError when fewer than SAMPLE_SIZE rows are available.
df = df.sample(min(SAMPLE_SIZE, len(df)), random_state=42).reset_index(drop=True)

# text_normalization_3 is not defined in this notebook; presumably it is
# provided by local_functions — assumed to be a pure per-string function.
df["text"] = df["review"].apply(text_normalization_3)
df = df[["condition", "text"]]

The next cell draws a fixed number of reviews (100) from each condition so that every class is equally represented.

In [65]:
SAMPLES_PER_CONDITION = 100  # reviews kept for every condition

# Draw the per-condition samples first and concatenate once at the end;
# concatenating inside the loop re-copies the accumulated frame every
# iteration and needs an empty seed frame.
# Conditions are visited in order of first appearance, and .sample() raises
# ValueError if a condition has fewer than SAMPLES_PER_CONDITION rows, so an
# unbalanced result is never produced silently.
balanced_parts = [
    df[df.condition == condition].sample(SAMPLES_PER_CONDITION, random_state=42)
    for condition in df.condition.unique()
]

# pd.concat rejects an empty list, so fall back to an empty frame with the
# same columns when there are no conditions at all.
new_df = pd.concat(balanced_parts) if balanced_parts else df.iloc[0:0]

df = new_df.reset_index(drop=True)

In [67]:
# Display the sampled frame; pandas abbreviates it to the configured max_rows.
df

Unnamed: 0,condition,text
0,Chronic Pain,i am taking this for my chronic pain along with hydrocodone 15mg x 4 daily for breakthrough pain...
1,Chronic Pain,i find it nearly impossible to see how ms contin 30 is equivalent to 20mg of oxycodone in compar...
2,Chronic Pain,prescribed 30mg 3x daily used to be on opana 40mg opana was definitely a better solution because...
3,Chronic Pain,not great at all for relieving pain in my opinion
4,Chronic Pain,i have been on this for almost a full month and can 039 t wait to get off of it the nausea has b...
...,...,...
4995,Overactive Bladde,i was worried the patch would be intrusive but once i put it on the only thing i noticed was my ...
4996,Overactive Bladde,worth every penny
4997,Overactive Bladde,works very well for urgency etc an obscure side effect is diminished ability to achieve an erect...
4998,Overactive Bladde,after 4 years of taking trospium doctor tried me on flomax not only did it not work i expereince...


In [68]:
# Hold out 25% of the reviews (testing_df) for the final evaluation section;
# the remaining 75% (training_df) is encoded and split again for fine-tuning.
# NOTE(review): the split is not stratified by condition — confirm that is intended.
training_df, testing_df = train_test_split(df, test_size=0.25, random_state=42)

In [69]:
# One boolean indicator column per condition. The empty prefix and separator
# make each column name equal to the condition string itself.
encoded_df = pd.get_dummies(
    training_df["condition"], prefix="", prefix_sep="", dtype=bool
)

# Put the review text in front of its indicator columns (rows align on index).
text_and_indicators = [training_df["text"], encoded_df]
encoded_df_con = pd.concat(text_and_indicators, axis=1)

# Dataset

In [70]:
# Split the encoded training frame 70% / 30%, then halve the 30% remainder
# into validation and test frames (15% each of encoded_df_con).
# Both splits use a fixed seed so the partition is reproducible.
train_df, temp_df = train_test_split(encoded_df_con, test_size=0.3, random_state=42)
valid_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

In [71]:
# Convert the split DataFrames into Datasets
train = Dataset.from_pandas(train_df, split="train")
valid = Dataset.from_pandas(valid_df, split="validation")
test = Dataset.from_pandas(test_df, split="test")

dataset = DatasetDict({"train": train, "validation": valid, "test": test})

In [72]:
# Every feature except the review text and the pandas index column carried
# over by Dataset.from_pandas is a condition label.
non_label_columns = ("text", "__index_level_0__")
labels = [
    column for column in dataset["train"].features if column not in non_label_columns
]

# Two-way lookup between a label's position and its name.
id2label = dict(enumerate(labels))
label2id = {name: position for position, name in id2label.items()}

# Tokenizer

In [73]:
# Name of the pretrained checkpoint used for both the tokenizer and the model.
LM = "bert-base-uncased"

In [74]:
# Tokenizer that matches the LM checkpoint.
tokenizer = AutoTokenizer.from_pretrained(LM)

In [75]:
def preprocess_data(examples):
    """Tokenize a batch of reviews and attach their multi-hot label rows.

    Args:
        examples: Batch mapping with a "text" entry (list of strings) and one
            entry per name in the module-level ``labels`` list, each holding
            one value per example.

    Returns:
        The tokenizer's encoding for the batch with an extra "labels" entry:
        a list of ``len(labels)`` floats per example, in ``labels`` order.
    """
    texts = examples["text"]

    # Pad/truncate every review to exactly 256 tokens.
    batch = tokenizer(texts, padding="max_length", truncation=True, max_length=256)

    # Float matrix of shape (batch_size, num_labels), filled one label column
    # at a time from the batch's indicator values.
    targets = np.zeros((len(texts), len(labels)))
    for column, name in enumerate(labels):
        targets[:, column] = examples[name]

    batch["labels"] = targets.tolist()
    return batch


# Run preprocess_data over every split in batches and drop the original
# columns, so only the fields returned by preprocess_data remain.
encoded_dataset = dataset.map(
    preprocess_data, batched=True, remove_columns=dataset["train"].column_names
)

# Have the encoded splits return torch tensors when indexed.
encoded_dataset.set_format("torch")

Map:   0%|          | 0/2625 [00:00<?, ? examples/s]

Map:   0%|          | 0/562 [00:00<?, ? examples/s]

Map:   0%|          | 0/563 [00:00<?, ? examples/s]

# Model

In [76]:
# Pretrained encoder plus a classification head with one output per condition
# label, configured as a multi-label problem. The id/label maps are stored in
# the model config.
model = AutoModelForSequenceClassification.from_pretrained(
    LM,
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id,
    problem_type="multi_label_classification",
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [77]:
# NOTE(review): the recorded one-epoch run reports eval_f1 = 0.0 and
# eval_roc_auc = 0.5, so a single epoch is probably not enough — consider raising.
EPOCHS = 1
BATCH_SIZE = 8
metric_name = "f1"  # metric used to pick the best checkpoint

args = TrainingArguments(
    # Plain string: the name has no placeholders, so no f-string is needed.
    output_dir="bert-drugs-finetuned-sem_eval-english",
    # Evaluate and checkpoint once per epoch so the best epoch can be restored.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=EPOCHS,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
    # push_to_hub=True,
)

In [78]:
# Trainer wired to the train/validation splits. compute_metrics is not defined
# in this notebook; presumably it comes from local_functions.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [79]:
# Fine-tune the model; evaluation and checkpointing happen once per epoch.
trainer.train()



  0%|          | 0/329 [00:00<?, ?it/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


  0%|          | 0/71 [00:00<?, ?it/s]

{'eval_loss': 0.12772415578365326, 'eval_f1': 0.0, 'eval_roc_auc': 0.5, 'eval_accuracy': 0.0, 'eval_runtime': 3.3256, 'eval_samples_per_second': 168.991, 'eval_steps_per_second': 21.349, 'epoch': 1.0}
{'train_runtime': 58.8991, 'train_samples_per_second': 44.568, 'train_steps_per_second': 5.586, 'train_loss': 0.22950851301291794, 'epoch': 1.0}


TrainOutput(global_step=329, training_loss=0.22950851301291794, metrics={'train_runtime': 58.8991, 'train_samples_per_second': 44.568, 'train_steps_per_second': 5.586, 'train_loss': 0.22950851301291794, 'epoch': 1.0})

In [80]:
# Score the trainer's current model on its eval_dataset (the validation split).
trainer.evaluate()

  0%|          | 0/71 [00:00<?, ?it/s]

{'eval_loss': 0.12772415578365326,
 'eval_f1': 0.0,
 'eval_roc_auc': 0.5,
 'eval_accuracy': 0.0,
 'eval_runtime': 3.3227,
 'eval_samples_per_second': 169.142,
 'eval_steps_per_second': 21.368,
 'epoch': 1.0}

# Eval

In [81]:
# Prefer the checkpoint the training Trainer recorded as best, so this cell
# keeps working when the step count changes (different data size, batch size
# or epochs). The literal path is only a fallback for when `trainer` carries
# no training state, e.g. when this cell is re-run after the reassignment below.
best_checkpoint = (
    trainer.state.best_model_checkpoint
    or "bert-drugs-finetuned-sem_eval-english/checkpoint-329"
)

model = AutoModelForSequenceClassification.from_pretrained(best_checkpoint)

# Same checkpoint name as used for training (LM), instead of a repeated literal.
tokenizer = AutoTokenizer.from_pretrained(LM)

# Inference-only trainer: no training arguments or datasets attached.
trainer = Trainer(model=model, tokenizer=tokenizer)

# Work on a copy so the prediction columns added later do not mutate the
# held-out split (and do not trigger SettingWithCopyWarning).
df_test = testing_df.copy()

In [82]:
# The helpers used below (predict_class, multi_cat_guess_penalty,
# multi_positive_outcome, n_most_likely_classes) are not defined in this
# notebook; presumably they come from local_functions.

ROWS_TO_EVALUATE = len(df_test)

CONFIDENCE_THRESHOLD = 0.15  # 0.1 works well for low number of non-responses

TOP_N_PREDS = 5  # number of top predictions to return

MULTI_CAT_GUESS_PENALTY = 0.9  # somewhere around 0.85 works well


# make predictions
df_test["predicted_class"] = (
    df_test["text"]
    .iloc[:ROWS_TO_EVALUATE]
    .apply(predict_class, args=(tokenizer, trainer, id2label, CONFIDENCE_THRESHOLD))
)  # args: text, tokenizer, trainer, id2label, CONFIDENCE_THRESHOLD=0.5

# calculate if prediction is correct
# (each slice is taken after the previous column was added, so the row
# functions can see it)
df_test["correct"] = df_test.iloc[:ROWS_TO_EVALUATE].apply(
    lambda row: int(row["condition"] in row["predicted_class"]), axis=1
)
# calculate score (including penalty for guessing multiple categories) used to help find optimal confidence threshold
df_test["correct_w_discount"] = df_test.iloc[:ROWS_TO_EVALUATE].apply(
    multi_cat_guess_penalty,
    axis=1,
    args=(MULTI_CAT_GUESS_PENALTY,),
)

# Restricted to the evaluated rows like every other column, so rows without a
# prediction are never passed to the helper when ROWS_TO_EVALUATE < len(df_test).
df_test["correct_w_non_preds"] = df_test.iloc[:ROWS_TO_EVALUATE].apply(
    multi_positive_outcome, axis=1
)

# get top n predictions
df_test["top_n_preds"] = (
    df_test["text"]
    .iloc[:ROWS_TO_EVALUATE]
    .apply(n_most_likely_classes, args=(tokenizer, trainer, id2label, TOP_N_PREDS))
)

print(
    f"Percent of predictions that include correct class: {round((df_test.correct.sum() / ROWS_TO_EVALUATE)*100, 2)}%"
)
print(
    f"Multi_guess discount score: {round((df_test.correct_w_discount.sum() / ROWS_TO_EVALUATE)*100, 2)}%"
)
print(
    f"Multi_positive_outcome discount score: {round((df_test.correct_w_non_preds.sum() / ROWS_TO_EVALUATE)*100, 2)}%"
)

print(
    f"Percent of non-preds: {round((df_test.correct_w_non_preds.sum() - df_test.correct.sum()) / ROWS_TO_EVALUATE*100, 2)}% "
)
print(
    f"Percent of wrong preds: {round((1-(df_test.correct_w_non_preds.sum() / ROWS_TO_EVALUATE))*100,2)}%"
)

Percent of predictions that include correct class: 0.0%
Multi_guess discount score: 0.0%
Multi_positive_outcome discount score: 100.0%
Percent of non-preds: 100.0% 
Percent of wrong preds: 0.0%


In [83]:
# Inspect the per-row predictions and scoring columns added in the previous cell.
df_test

Unnamed: 0,condition,text,predicted_class,correct,correct_w_discount,correct_w_non_preds,top_n_preds
1501,Opiate Dependence,i was addicted to opiates for 4years before this shot i finally feel like i 039 m not living a d...,[],0,0,1,"{'Schizophrenia': 0.097, 'Insomnia': 0.095, 'Bacterial Infection': 0.092, 'Sinusitis': 0.092, 'P..."
2586,Psoriasis,been on enbrel for 4 years was totally clear for 2 years when i got a kidney infection went off ...,[],0,0,1,"{'Schizophrenia': 0.099, 'Insomnia': 0.096, 'Bacterial Infection': 0.093, 'Sinusitis': 0.092, 'P..."
2653,Migraine,i have had migraines for approximately 20 yrs at 40 went into early menopause i had been migrain...,[],0,0,1,"{'Schizophrenia': 0.097, 'Insomnia': 0.095, 'Bacterial Infection': 0.093, 'Sinusitis': 0.092, 'P..."
1055,Anxiety,only been on 15 mg for a week now and just upped to 30mg warning do not mix this with alcohol wh...,[],0,0,1,"{'Schizophrenia': 0.097, 'Insomnia': 0.095, 'Bacterial Infection': 0.093, 'Sinusitis': 0.091, 'P..."
705,Panic Disorde,klonopin helps my anxiety panic disorder and insomnia i take it on an as needed basis and it 039...,[],0,0,1,"{'Schizophrenia': 0.097, 'Insomnia': 0.095, 'Bacterial Infection': 0.093, 'Sinusitis': 0.093, 'H..."
...,...,...,...,...,...,...,...
4141,Abnormal Uterine Bleeding,i was 17 when i started on this pill because i had cysts on my ovaries this pill is a good pill ...,[],0,0,1,"{'Schizophrenia': 0.098, 'Insomnia': 0.096, 'Bacterial Infection': 0.093, 'Sinusitis': 0.091, 'P..."
3168,"Diabetes, Type 2",i have no appetite and i do have nausea i have lost 11 pounds this month i did miss 1 week of wo...,[],0,0,1,"{'Schizophrenia': 0.098, 'Insomnia': 0.095, 'Bacterial Infection': 0.093, 'Sinusitis': 0.093, 'P..."
2478,Migraine Prevention,i was given this medication to replace toprimate i took a pill friday night and saturday night 1...,[],0,0,1,"{'Schizophrenia': 0.097, 'Insomnia': 0.095, 'Bacterial Infection': 0.094, 'Sinusitis': 0.091, 'P..."
4214,Irritable Bowel Syndrome,i was skeptical and did not want to take this my doctor prescribed amitriptyline and citalopram ...,[],0,0,1,"{'Schizophrenia': 0.097, 'Insomnia': 0.096, 'Bacterial Infection': 0.094, 'Sinusitis': 0.091, 'P..."
