In [None]:
import pandas as pd
import numpy as np
from simpletransformers.classification import ClassificationModel, ClassificationArgs
import wandb

# Load the dataset and keep only the free-text reason and its disease label.
df = pd.read_csv('copd_heart_failure_df_no2.csv')[['reason_clean', 'disease_label']]

# Column names under which the two splits are handed to the model further down.
renamed_columns = {'reason_clean': "text", 'disease_label': "labels"}

# Seeded random 90/10 split: the test set is every row that was not sampled
# into the training set.
train_df = df.sample(frac=0.9, random_state=42).rename(columns=renamed_columns)
test_df = df.drop(train_df.index).rename(columns=renamed_columns)





In [None]:
#Plot distribution of labels in test and train
import matplotlib.pyplot as plt
import seaborn as sns

# Number of samples per label in each split.
disease_count = train_df['labels'].value_counts()
disease_count2 = test_df['labels'].value_counts()

# One row, two panels: train split on the left, test split on the right.
fig, ax = plt.subplots(1, 2, figsize=(20, 10))
panels = (
    (ax[0], disease_count, 'Distribution of labels in train set'),
    (ax[1], disease_count2, 'Distribution of labels in test set'),
)
for axis, counts, title in panels:
    sns.barplot(x=counts.index, y=counts.values, ax=axis)
    # Re-apply the current tick labels, rotated to vertical.
    axis.set_xticklabels(axis.get_xticklabels(), rotation=90)
    axis.set_title(title)
    axis.set_xlabel('Disease')
    axis.set_ylabel('Count')
plt.show()

In [None]:
# Hyperparameter sweep definition: Bayesian search over the number of training
# epochs (discrete choices) and the learning rate (continuous range), with the
# sweep asked to minimise the metric named "train_loss".
# NOTE(review): the objective is the training loss rather than an evaluation
# metric -- confirm this is the intended selection criterion.
sweep_config = dict(
    method="bayes",
    metric=dict(name="train_loss", goal="minimize"),
    parameters=dict(
        num_train_epochs=dict(values=[2, 3, 4]),
        learning_rate=dict(min=5e-5, max=4e-4),
    ),
)

# Register the sweep with W&B; the returned id is what the agent attaches to.
sweep_id = wandb.sweep(sweep_config, project="Multi-Class Sweep")

# Model arguments that stay fixed across all sweep runs.
model_args = ClassificationArgs(
    train_batch_size=16,
    eval_batch_size=8,
    overwrite_output_dir=True,
    evaluate_during_training=True,
)

def train():
    """Run one sweep trial: fine-tune a RadBERT classifier and evaluate it.

    Meant to be used as the callback given to ``wandb.agent``. Reads the
    notebook-level ``model_args``, ``train_df`` and ``test_df``; the
    hyperparameters of the current trial are taken from ``wandb.config``.

    Returns:
        None. Metrics are reported through the active wandb run.
    """
    # Using the run as a context manager guarantees it is finished even when
    # model construction, training or evaluation raises, and lets wandb record
    # a failed trial as failed. This also replaces the deprecated wandb.join().
    with wandb.init():
        # Create a TransformerModel configured with this trial's sweep values
        model = ClassificationModel(
            "bert",
            "StanfordAIMI/RadBERT",
            use_cuda=True,
            num_labels=2,
            args=model_args,
            sweep_config=wandb.config,
        )

        # Train the model, evaluating on test_df while training
        model.train_model(train_df, eval_df=test_df)

        # Evaluate the final model on the held-out split
        model.eval_model(test_df)

# Maximum number of trials this agent runs before returning. A Bayesian sweep
# over a continuous learning-rate range has no natural end, so without a cap
# the call below blocks until interrupted and later cells never execute.
# Adjust to the available compute budget.
SWEEP_RUN_COUNT = 10

wandb.agent(sweep_id, function=train, count=SWEEP_RUN_COUNT)

In [None]:
# Evaluate the model
# The models created inside train() are local to that function, so no `model`
# exists at notebook scope. Reload the model saved under model_args.output_dir
# before evaluating.
# NOTE(review): with overwrite_output_dir=True this is presumably the model of
# the most recent sweep trial, not necessarily the best one -- confirm which
# model should be evaluated here.
model = ClassificationModel(
    "bert",
    model_args.output_dir,
    use_cuda=True,
    num_labels=2,
    args=model_args,
)
result, model_outputs, wrong_predictions_train = model.eval_model(train_df)