This notebook contains code for multiple models.

In [None]:
# CELL 1
# import necessary libraries for data manipulation, model evaluation, and plotting
import datetime
import inspect
import json
import math
import os
import shutil
import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, precision_recall_fscore_support
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.metrics import ConfusionMatrixDisplay, RocCurveDisplay
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from sklearn.utils.class_weight import compute_class_weight
from tensorboard.backend.event_processing import event_accumulator
from torch.optim import AdamW  # variant of Adam with weight decay
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2ForSequenceClassification, GPT2Tokenizer, TrainingArguments, Trainer, AutoTokenizer, AutoModelForSequenceClassification
from transformers import EarlyStoppingCallback
from transformers import get_linear_schedule_with_warmup

# NOTE(review): this install runs after the imports earlier in this cell, so an upgraded
# package is presumably only picked up after a runtime restart — consider moving it to its
# own first cell and pinning versions. `plotting` is installed as a separate package here;
# confirm that it is actually needed.
!pip install transformers[torch] accelerate -U plotting



In [None]:
# CELL 2
# Data preprocessing and helper functions

# Separate models for each SDOH

# SDoH category whose label names compute_metrics uses; updated through set_sdoh().
current_sdoh = "behavior_alcohol"

# Each map goes from the integer class id to its human-readable label.

# community present/absent and education (binary)
sdoh_community_education = dict(enumerate(('False', 'True')))

# economics and environment (three-way)
sdoh_economics_environment = dict(enumerate(('None', 'True', 'False')))

# alcohol, drug, tobacco (five-way)
sdbh_alcohol_drug_tobacco = dict(enumerate(('None', 'Present', 'Past', 'Never', 'Unsure')))

# function to determine which SDOH is described
# here, the SDOH for the classification report will be set
def set_sdoh(sdoh_group):
  """Record `sdoh_group` as the module-level `current_sdoh`.

  The module-level value is the one the classification report reads to pick its
  label names.
  """
  globals()["current_sdoh"] = sdoh_group

def balance_data(df):
    """Balance the classes of `df` by oversampling every minority class.

    The class with the highest count in the 'y' column is kept as-is; every other
    class is resampled with replacement (fixed seed 42) up to the size of that
    majority class. The returned frame holds the majority rows first, followed by
    the upsampled classes in descending order of their original frequency.
    """
    class_counts = df['y'].value_counts()  # frequency of each class in the 'y' column
    majority_label = class_counts.idxmax()
    majority = df[df['y'] == majority_label]
    desired_samples = len(majority)  # target number of rows per class after resampling

    upsampled_parts = [
        resample(df[df['y'] == label],
                 replace=True,  # sample with replacement
                 n_samples=desired_samples,  # match the majority class size
                 random_state=42)
        for label in class_counts.index
        if label != majority_label
    ]

    # With a single class there is nothing to upsample.
    if not upsampled_parts:
        return majority
    return pd.concat([majority, *upsampled_parts])

 # new function to calculate class weights for the given data
def get_class_weights(y):
    """Return sklearn 'balanced' class weights for the labels in `y` as a float32 tensor.

    One weight is produced per distinct value found in `y`, in the sorted order
    returned by `np.unique`.
    """
    present_classes = np.unique(y)
    balanced = compute_class_weight('balanced', classes=present_classes, y=y)
    return torch.tensor(balanced, dtype=torch.float32)

# TEST TRAIN SPLIT
# Split the preprocessed notes once per SDoH category and persist both halves as CSV.
dataset = pd.read_csv("/content/PREPROCESSED-NOTES.csv")

# root directory holding one sub-directory of train/test CSVs per SDoH category
base_path = '/content/test_train_split'
os.makedirs(base_path, exist_ok=True)

# label column of every SDoH category, keyed by column name
sdoh_data = {
    column: dataset[column].to_list()
    for column in (
        "sdoh_community_present",
        "sdoh_community_absent",
        "sdoh_education",
        "sdoh_economics",
        "sdoh_environment",
        "behavior_alcohol",
        "behavior_tobacco",
        "behavior_drug",
    )
}

# note text shared by every category
text_data = dataset["text"].to_list()

for category, data in sdoh_data.items():
    category_path = os.path.join(base_path, category)
    os.makedirs(category_path, exist_ok=True)

    # 80/20 split, stratified on this category's labels
    X_train, X_val, y_train, y_val = train_test_split(
        text_data, data, test_size=0.2, random_state=0, stratify=data
    )

    # the held-out 20% is stored as test.csv
    for file_name, texts, labels in (('train.csv', X_train, y_train), ('test.csv', X_val, y_val)):
        split_frame = pd.DataFrame({"text": texts, category: labels})
        split_frame.to_csv(os.path.join(category_path, file_name), index=False)

In [None]:
# CELL 3
# Dataset wrapper and custom Trainer used by the training and testing code

class SDOHDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with integer class labels.

    Args:
        encodings: mapping from field name (e.g. 'input_ids') to an indexable
            collection with one entry per example; entries may be tensors or
            plain Python sequences.
        labels: indexable collection with one class id per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    @staticmethod
    def _to_tensor(value):
        """Return an independent tensor copy of `value` without the copy-construct warning."""
        if isinstance(value, torch.Tensor):
            # torch.tensor(tensor) warns on every call; clone().detach() is the
            # equivalent copy that does not.
            return value.clone().detach()
        return torch.tensor(value)

    def __getitem__(self, idx):
        """Return the encoded fields of example `idx` plus its label under 'labels'."""
        item = {key: self._to_tensor(val[idx]) for key, val in self.encodings.items()}
        # Class ids must be int64 for cross-entropy; labels read from CSV can
        # arrive as floats.
        item['labels'] = self._to_tensor(self.labels[idx]).to(torch.long)
        return item

    def __len__(self):
        return len(self.labels)

class CustomTrainer(Trainer):
    """Trainer with optionally class-weighted cross-entropy and a test-mode flag.

    Args:
        test: when True, the `compute_metrics` callable is invoked with
            `test=True`, provided its signature accepts a `test` parameter.
        weights: optional 1-D tensor of per-class loss weights. The weights can
            also be attached after construction through the `weights` attribute.
        *args, **kwargs: forwarded unchanged to `Trainer`.

    The former `compute_metrics` method was removed: `Trainer.__init__` stores the
    `compute_metrics` argument as an instance attribute, so the method could never
    be reached on an instance.
    """

    def __init__(self, *args, test=False, weights=None, **kwargs):
        metrics_fn = kwargs.get('compute_metrics')
        if test and callable(metrics_fn):
            try:
                accepts_test = 'test' in inspect.signature(metrics_fn).parameters
            except (TypeError, ValueError):
                # Some callables expose no inspectable signature; call them as given.
                accepts_test = False
            if accepts_test:
                kwargs['compute_metrics'] = lambda eval_pred: metrics_fn(eval_pred, test=True)

        # `test` and `weights` are consumed here; Trainer does not accept them.
        super().__init__(*args, **kwargs)
        self.test = test
        if weights is not None:
            self.weights = weights

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute cross-entropy between the model logits and `inputs['labels']`.

        The per-class `weights` attribute is applied when it is set.
        `num_items_in_batch` is accepted for Trainer versions that pass it and is
        otherwise ignored.

        Returns:
            The loss, or `(loss, outputs)` when `return_outputs` is True.
        """
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits

        class_weights = getattr(self, 'weights', None)
        if class_weights is not None:
            # The weight tensor must be on the same device as the logits.
            class_weights = class_weights.to(logits.device)

        loss = torch.nn.functional.cross_entropy(logits, labels, weight=class_weights)
        return (loss, outputs) if return_outputs else loss



In [None]:
# CELL 4
class Model():
    """Fine-tune and evaluate one sequence-classification model for a single SDoH category.

    `train()` reads `<SPLIT_DIR>/<Sdoh_name>/train.csv`, fine-tunes the model and
    saves it under `<project_base_path>/saved_models/`. `test()` reloads that saved
    model and scores it on `<SPLIT_DIR>/<Sdoh_name>/test.csv`.
    """

    # Directory the data-splitting cell writes the per-category CSVs to.
    SPLIT_DIR = "/content/test_train_split"
    # Token budget shared by training and evaluation so both see equally truncated text.
    MAX_LENGTH = 512
    # Number of stratified folds used when `cv` is truthy.
    CV_FOLDS = 5

    def __init__(self, Sdoh_name, num_of_labels, model_name, epochs, batch, project_base_path, balanced, weighted, output_dir=None, cv=None):
        """Load the tokenizer and model and store the run configuration.

        Args:
            Sdoh_name: label column / category name, also used in directory names.
            num_of_labels: number of classes of the classification head.
            model_name: checkpoint name or path passed to `from_pretrained`.
            epochs: number of training epochs.
            batch: per-device batch size for training and evaluation.
            project_base_path: root for graphs, saved models and test results.
            balanced: oversample minority classes in the training split.
            weighted: use class-weighted cross-entropy during training.
            output_dir: root for training logs; falls back to `project_base_path`.
            cv: truthy to train with stratified k-fold cross-validation.
        """
        # Suppress FutureWarning messages
        warnings.simplefilter(action='ignore', category=FutureWarning)

        self.tokenizer = AutoTokenizer.from_pretrained(model_name, do_lower_case=True)
        # Padding to a fixed length needs a pad token.
        if self.tokenizer.pad_token is None:
            self.tokenizer.add_special_tokens({'pad_token': '[PAD]'})

        self.model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_of_labels)

        # A newly added pad token has an id beyond the pretrained embedding matrix;
        # grow the matrix so that id can be looked up.
        if len(self.tokenizer) > self.model.get_input_embeddings().num_embeddings:
            self.model.resize_token_embeddings(len(self.tokenizer))

        # Add pad token id to model configuration if not already set
        if self.model.config.pad_token_id is None:
            self.model.config.pad_token_id = self.tokenizer.pad_token_id

        # Move model to appropriate device
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device)

        self.Sdoh_name = Sdoh_name
        self.num_of_labels = num_of_labels
        self.epochs = epochs
        self.batch = batch
        self.project_base_path = project_base_path
        self.balanced = balanced
        self.weighted = weighted
        self.output_dir = output_dir
        self.cv = cv

    def _variant_name(self, fold=None):
        """Return the directory name for this configuration, e.g. 'x_cv1_balanced_weighted'."""
        name = f'{self.Sdoh_name}'
        if self.cv:
            name += f'_cv{fold}'
        if self.balanced:
            name += '_balanced'
        if self.weighted:
            name += '_weighted'
        return name

    def _tokenize(self, texts):
        """Tokenize a list of strings, truncated and padded to `MAX_LENGTH`."""
        return self.tokenizer(texts, truncation=True, padding='max_length', max_length=self.MAX_LENGTH, return_tensors='pt')

    def train(self):
        """Fine-tune the model on the training CSV and save it.

        Uses a single stratified 80/20 split, or `CV_FOLDS` stratified folds when
        `cv` is truthy. A loss plot, the model and the tokenizer are written once
        per split.

        Raises:
            ValueError: if `weighted` is set and the training labels do not cover
                exactly `num_of_labels` classes.
        """
        data_path = os.path.join(self.SPLIT_DIR, self.Sdoh_name)
        df = pd.read_csv(os.path.join(data_path, 'train.csv'))
        # Reset the index so positions and index labels agree after dropping rows.
        df = df.dropna(subset=['text']).reset_index(drop=True)
        x = df['text']
        y = df[self.Sdoh_name]

        class_weights = None
        if self.weighted:
            self.weights = get_class_weights(y)
            if len(self.weights) != self.num_of_labels:
                raise ValueError(
                    f"Got {len(self.weights)} class weights for {self.num_of_labels} labels; "
                    f"every class must appear in the training data of '{self.Sdoh_name}'."
                )
            class_weights = self.weights.to(self.device)

        if self.cv:
            # NOTE(review): the same model instance keeps training across folds, so
            # later folds start from weights that have seen earlier validation data.
            skf = StratifiedKFold(n_splits=self.CV_FOLDS, random_state=42, shuffle=True)
            split_iterator = skf.split(x, y)
        else:
            # Split positions rather than values so both branches yield `iloc` indices.
            train_pos, val_pos = train_test_split(np.arange(len(df)), test_size=0.2, random_state=42, stratify=y)
            split_iterator = [(train_pos, val_pos)]

        # Newer transformers releases name this argument `eval_strategy`.
        strategy_key = 'eval_strategy' if 'eval_strategy' in getattr(TrainingArguments, '__dataclass_fields__', {}) else 'evaluation_strategy'

        for current_fold, (train_pos, val_pos) in enumerate(split_iterator, start=1):
            X_train, X_val = x.iloc[train_pos], x.iloc[val_pos]
            y_train, y_val = y.iloc[train_pos], y.iloc[val_pos]

            # Handle class imbalance in training data
            if self.balanced:
                balanced_train = balance_data(pd.DataFrame({'X': X_train, 'y': y_train}))
                list_train_x = balanced_train['X'].tolist()
                list_train_y = balanced_train['y'].tolist()
            else:
                list_train_x = X_train.tolist()
                list_train_y = y_train.tolist()

            train_dataset = SDOHDataset(self._tokenize(list_train_x), list_train_y)
            val_dataset = SDOHDataset(self._tokenize(X_val.tolist()), y_val.tolist())

            # Number of batches per epoch; used to convert logged steps into epochs.
            steps_per_epoch = max(1, math.ceil(len(train_dataset) / self.batch))

            timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
            log_dir = self.project_base_path if self.output_dir is None else self.output_dir

            tensor_logs = os.path.join(log_dir, f'logs/{self.Sdoh_name}/tensor_logs/logs_{timestamp}')
            os.makedirs(tensor_logs, exist_ok=True)

            epoch_logs = os.path.join(log_dir, f'logs/{self.Sdoh_name}/epoch_logs/logs_{timestamp}')
            os.makedirs(epoch_logs, exist_ok=True)

            # No optimizer is passed to the trainer, so the Trainer builds its own
            # from these arguments.
            training_args = TrainingArguments(
                output_dir=epoch_logs,
                logging_strategy='epoch',
                num_train_epochs=self.epochs,
                per_device_train_batch_size=self.batch,
                per_device_eval_batch_size=self.batch,
                save_strategy='epoch',
                warmup_steps=500,
                weight_decay=1e-5,
                logging_dir=tensor_logs,
                load_best_model_at_end=True,
                metric_for_best_model="eval_loss",
                **{strategy_key: "epoch"},
            )

            trainer = CustomTrainer(
                model=self.model,
                args=training_args,
                train_dataset=train_dataset,
                eval_dataset=val_dataset,
                compute_metrics=compute_metrics,
                # patience of 3 evaluations: room to improve without wasting compute
                callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
            )
            if class_weights is not None:
                # compute_loss reads the weights from the trainer, not from this object.
                trainer.weights = class_weights

            print(f'Starting Training{" Fold " + str(current_fold) if self.cv else ""}')

            trainer.train()

            print(f'Finished Training{" Fold " + str(current_fold) if self.cv else ""}')

            variant = self._variant_name(fold=current_fold)
            graph_dir = os.path.join(self.project_base_path, 'graphs', variant)
            save_dir = os.path.join(self.project_base_path, 'saved_models', variant)

            os.makedirs(graph_dir, exist_ok=True)
            os.makedirs(save_dir, exist_ok=True)

            # Plot and save
            plot_metric_from_tensor(tensor_logs, graph_dir, steps_per_epoch)
            self.model.save_pretrained(save_dir)
            self.tokenizer.save_pretrained(save_dir)

    def test(self):
        """Evaluate the saved model on the test CSV and write the results.

        Writes the confusion matrix, ROC curves, classification report (each only
        when the metrics function returned it) and `results.csv` to
        `<project_base_path>/test_results/<variant>/`.
        """
        # The metrics function picks its label names from the module-level category.
        set_sdoh(self.Sdoh_name)

        data_path = os.path.join(self.SPLIT_DIR, self.Sdoh_name)
        test_df = pd.read_csv(os.path.join(data_path, 'test.csv')).dropna(subset=['text'])
        test_inputs = test_df['text'].tolist()
        test_labels = test_df[self.Sdoh_name].tolist()

        test_dataset = SDOHDataset(self._tokenize(test_inputs), test_labels)

        # With cross-validation the model saved by the last fold is evaluated.
        sdoh_dir = self._variant_name(fold=self.CV_FOLDS)
        saved_models_dir = os.path.join(self.project_base_path, 'saved_models')

        results_dir = os.path.join(self.project_base_path, 'test_results', sdoh_dir)
        os.makedirs(results_dir, exist_ok=True)

        model = AutoModelForSequenceClassification.from_pretrained(os.path.join(saved_models_dir, sdoh_dir))

        eval_trainer = CustomTrainer(
            model=model,
            test=True,
            eval_dataset=test_dataset,
            compute_metrics=compute_metrics
        )

        results = eval_trainer.evaluate()

        cm = results.get('eval_cm')
        if cm is not None:
            cm.plot()
            cm.figure_.savefig(f"{results_dir}/confusion_matrix.jpg")
            plt.close(cm.figure_)

        curves = results.get('eval_roc') or []
        roc_dir = os.path.join(results_dir, 'roc')
        os.makedirs(roc_dir, exist_ok=True)

        for display, best_threshold in curves:
            # Plot the ROC curve
            display.plot()

            # Set titles and labels
            plt.title(f'ROC Curve for {display.estimator_name}')
            plt.xlabel('False Positive Rate')
            plt.ylabel('True Positive Rate')

            # Add annotation for the best threshold
            plt.text(0.5, 0.5, f'Best Threshold: {best_threshold:.2f}', ha='center', va='center', fontsize=10, bbox=dict(facecolor='white', edgecolor='black', boxstyle='round,pad=0.5'))

            # Save the figure as JPG with the estimator name
            plt.savefig(f'{roc_dir}/{display.estimator_name}.jpg')
            # One figure per class would otherwise stay open for the whole session.
            plt.close()

        report_df = results.get('eval_classification_report')
        if report_df is not None:
            report_df.to_csv(f"{results_dir}/classification_report.csv")

        # Save the remaining scalar results to a CSV file
        results_df = pd.DataFrame([results]).drop(columns=['eval_cm', 'eval_roc', 'eval_classification_report'], errors='ignore')
        results_df.to_csv(f"{results_dir}/results.csv", index=False)
        print("Evaluation results saved to:", results_dir)

        # Clean up temp files
        tmp_dir = os.path.join(os.getcwd(), 'tmp_trainer')
        if os.path.exists(tmp_dir):
            shutil.rmtree(tmp_dir)

In [None]:
# Root directory of the project.
project_base_path = Path("/content").resolve()
# Join file names relative to the base: joining an absolute path would silently
# discard `project_base_path`.
train_dataset_path = project_base_path / "PREPROCESSED-NOTES.csv"  # Adjust as necessary
test_dataset_path = project_base_path / "ANNOTATEDNOTES.csv"

In [None]:
# CELL 5
def compute_metrics(eval_pred, test=False):
    """Compute classification metrics from a Trainer evaluation prediction.

    Args:
        eval_pred: object exposing `label_ids` and `predictions` (the logits).
        test: when True, also build the classification report, the confusion
            matrix display and one ROC curve per class.

    Returns:
        dict with 'accuracy', 'f1', 'precision', 'recall' and 'auc'. With
        `test=True` it additionally holds 'classification_report' (DataFrame),
        'roc' (list of `(RocCurveDisplay, best_threshold)`) and 'cm'
        (ConfusionMatrixDisplay).
    """
    labels = np.asarray(eval_pred.label_ids)
    logits = eval_pred.predictions
    if isinstance(logits, tuple):
        # NOTE(review): assumes the logits come first when a tuple is returned — confirm.
        logits = logits[0]

    # Softmax over the class axis turns the logits into probabilities.
    preds_probs = F.softmax(torch.as_tensor(logits, dtype=torch.float32), dim=-1).numpy()
    num_classes = preds_probs.shape[-1]
    class_ids = list(range(num_classes))

    # PREDICTIONS
    preds = np.argmax(preds_probs, axis=-1)

    # Classifier Metrics
    precision = precision_score(labels, preds, average='weighted', zero_division=0)
    recall = recall_score(labels, preds, average='weighted', zero_division=0)
    f1 = f1_score(labels, preds, average='weighted', zero_division=0)
    acc = accuracy_score(labels, preds)

    # AUC on the predicted probabilities
    greater_class_prob = None
    if num_classes > 2:
        auc = _auc_or_nan(labels, preds_probs, average='weighted', multi_class='ovr', labels=class_ids)
    elif num_classes == 2:
        greater_class_prob = preds_probs[:, 1]
        auc = _auc_or_nan(labels, greater_class_prob)
    else:
        auc = float('nan')

    scores = {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall,
        'auc': auc,
    }
    if not test:
        # Only scalars are returned outside test mode.
        return scores

    # Label names for the report depend on the category currently being evaluated.
    if current_sdoh.startswith("behavior"):
        current_sbdh_dict = sdbh_alcohol_drug_tobacco
    elif current_sdoh == "sdoh_economics" or current_sdoh == "sdoh_environment":
        current_sbdh_dict = sdoh_economics_environment
    else:  # This includes 'sdoh_community_education'
        current_sbdh_dict = sdoh_community_education
    class_names = [current_sbdh_dict.get(i, str(i)) for i in class_ids]

    # Passing `labels` keeps the report aligned with the names even when a class
    # is missing from this evaluation set.
    report = classification_report(labels, preds, labels=class_ids, target_names=class_names, output_dict=True, zero_division=0)
    report_df = pd.DataFrame(report).transpose()

    # Confusion Matrix
    cm = ConfusionMatrixDisplay.from_predictions(labels, preds, labels=class_ids)

    # ROC Curve
    # Handle multi class ROC curves using OvR
    curves = []
    if num_classes > 2:
        for i in class_ids:
            display = RocCurveDisplay.from_predictions(labels == i, preds_probs[:, i], name=f'{current_sdoh}_{class_names[i]}') if False else None
            fpr, tpr, thresholds = _roc_points(labels == i, preds_probs[:, i])
            class_auc = _auc_or_nan(labels == i, preds_probs[:, i])
            display = RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=class_auc, estimator_name=f'{current_sdoh}_{class_names[i]}')
            # Youden's J: the threshold maximising tpr - fpr.
            best_threshold = thresholds[np.argmax(tpr - fpr)]
            curves.append((display, best_threshold))
    elif num_classes == 2:
        fpr, tpr, thresholds = _roc_points(labels == 1, greater_class_prob)
        display = RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=auc, estimator_name='Estimator')
        best_threshold = thresholds[np.argmax(tpr - fpr)]
        curves.append((display, best_threshold))

    scores.update({
        'classification_report': report_df,
        'roc': curves,
        'cm': cm,
    })
    return scores


def _auc_or_nan(y_true, y_score, **kwargs):
    """Return `roc_auc_score(...)`, or NaN when it raises ValueError."""
    try:
        return roc_auc_score(y_true, y_score, **kwargs)
    except ValueError:
        return float('nan')


def _roc_points(is_positive, scores):
    """Return `(fpr, tpr, thresholds)` for boolean targets and positive-class scores.

    Thresholds are the distinct score values in decreasing order, preceded by
    `inf` for the point where nothing is predicted positive.
    """
    is_positive = np.asarray(is_positive, dtype=bool)
    scores = np.asarray(scores, dtype=float)

    order = np.argsort(-scores, kind='mergesort')
    sorted_scores = scores[order]
    sorted_positive = is_positive[order]

    # Keep the last index of every run of equal scores.
    if len(sorted_scores):
        cut = np.r_[np.where(np.diff(sorted_scores))[0], len(sorted_scores) - 1]
    else:
        cut = np.array([], dtype=int)

    true_pos = np.cumsum(sorted_positive)[cut]
    false_pos = (cut + 1) - true_pos

    n_pos = int(is_positive.sum())
    n_neg = int(len(is_positive) - n_pos)

    # NaN marks a rate that is undefined because one class is absent.
    tpr = np.r_[0.0, true_pos / n_pos if n_pos else np.full(len(cut), np.nan)]
    fpr = np.r_[0.0, false_pos / n_neg if n_neg else np.full(len(cut), np.nan)]
    thresholds = np.r_[np.inf, sorted_scores[cut]]
    return fpr, tpr, thresholds

def plot_metric_from_tensor(log_dir, output_dir, steps_per_epoch=None):
    """Plot eval and train loss from TensorBoard event files and save the figure.

    Args:
        log_dir: directory holding the TensorBoard event files.
        output_dir: directory to save 'metrics_plot.png' into, or a path ending in
            an image extension to save to that exact file.
        steps_per_epoch: number of logged steps per epoch. When given and positive,
            the x-axis is shown in epochs; otherwise in raw steps.
    """
    event_acc = event_accumulator.EventAccumulator(log_dir)
    event_acc.Reload()
    available_tags = set(event_acc.Tags().get('scalars', []))

    in_epochs = bool(steps_per_epoch) and steps_per_epoch > 0
    step_scale = steps_per_epoch if in_epochs else 1

    fig, ax = plt.subplots(figsize=(10, 6))

    plotted_any = False
    for tag, label in (("eval/loss", "Eval Loss"), ("train/loss", "Train Loss")):
        if tag not in available_tags:
            # A series that was never logged is skipped rather than raising.
            continue
        events = event_acc.Scalars(tag)
        ax.plot([event.step / step_scale for event in events],
                [event.value for event in events],
                label=label)
        plotted_any = True

    if plotted_any:
        ax.legend()
    ax.set_xlabel("Epoch" if in_epochs else "Step")
    ax.set_ylabel("Loss")
    ax.set_title("Overlap")

    # Accept either a directory or a full image file path.
    target = str(output_dir)
    extension = os.path.splitext(target)[1].lower()
    if extension in ('.jpg', '.jpeg', '.png', '.pdf', '.svg') and not os.path.isdir(target):
        parent = os.path.dirname(target)
        if parent:
            os.makedirs(parent, exist_ok=True)
    else:
        os.makedirs(target, exist_ok=True)
        target = os.path.join(target, 'metrics_plot.png')

    # Save the graph to the specified location
    fig.savefig(target)
    plt.show()
    # Release the figure; one is created per training run / fold.
    plt.close(fig)


In [None]:
### NEW
# Run configuration for a single SDOH category.
config = dict(
    sdoh='behavior_alcohol',  # or any other SDOH category you're working with
    model='gpt2',  # or any model you're planning to use
    epochs=4,
    batch=16,
    balanced=True,
    weighted=True,
    output='path_to_output_directory',
    cv=False,
)

# Number of classes per SDOH category.
sdoh_label_counts = {
    # binary categories
    **dict.fromkeys(('sdoh_community_present', 'sdoh_community_absent', 'sdoh_education'), 2),
    # Assuming 'None', 'True', 'False'
    **dict.fromkeys(('sdoh_economics', 'sdoh_environment'), 3),
    # 'None', 'Present', 'Past', 'Never', 'Unsure'
    **dict.fromkeys(('behavior_alcohol', 'behavior_tobacco', 'behavior_drug'), 5),
}

# Set base path to current working directory in Colab
project_base_path = Path("/content").resolve()

# Example function call, adjust according to your needs
selected_sdoh = config['sdoh']
model = Model(
    selected_sdoh,
    sdoh_label_counts[selected_sdoh],
    config['model'],
    config['epochs'],
    config['batch'],
    project_base_path,
    config['balanced'],
    config['weighted'],
    output_dir=config['output'],
    cv=config['cv'],
)

model.train()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


UnboundLocalError: local variable 'train_dataset' referenced before assignment