In [None]:
%%capture
# Installs Unsloth, Xformers (Flash Attention) and all other packages
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install --no-deps "xformers<0.0.27" "trl<0.9.0" peft bitsandbytes
!pip install wandb
!pip install datasets
!pip install pyarrow==15.0.2


In [None]:
import numpy as np
import pandas as pd
import os
import gc
from datasets import Dataset

from unsloth import FastLanguageModel, is_bfloat16_supported
from transformers import (
    AutoModelForSequenceClassification,
    AutoModelForMaskedLM,
    DataCollatorWithPadding,
    AutoModelForPreTraining,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
    BitsAndBytesConfig
)

from trl import SFTTrainer

import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torch.cuda.amp import autocast
import torch.optim as optim
from sklearn.model_selection import train_test_split

from peft import get_peft_model

import wandb

from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Dtype shared by the classification head and the features that are fed into it.
use_dtype = torch.bfloat16


### Classification head to go on LLM
class Classifier(nn.Module):
    """Single linear layer that maps LLM features to the output scores."""

    def __init__(self, input_dim, output_dim):
        """Create an ``input_dim`` -> ``output_dim`` linear layer stored in ``use_dtype``."""
        super().__init__()
        self.fc = nn.Linear(in_features=input_dim, out_features=output_dim, dtype=use_dtype)

    def forward(self, x):
        """Return the linear projection of ``x`` (raw scores, no activation)."""
        projected = self.fc(x)
        return projected

### Combine LLM and Classification Head
class CombinedModel(nn.Module):
    """Language model with frozen forward pass followed by a trainable head.

    The language model is always run under ``torch.no_grad()``, so only the
    classifier receives gradients. The head input is the language model's
    ``logits`` averaged over axis 1; the average covers every position and
    does not use ``attention_mask``.

    Args:
        llama_model: Callable model whose output exposes ``.logits``.
        classifier: Module applied to the pooled logits.
    """

    def __init__(self, llama_model, classifier):
        super(CombinedModel, self).__init__()
        self.llama_model = llama_model
        self.classifier = classifier

    def forward(self, input_ids, attention_mask=None, token_type_ids=None):
        """Return the classifier scores for a batch.

        Args:
            input_ids: Token ids passed to the language model.
            attention_mask: Optional mask forwarded to the language model.
            token_type_ids: Optional ids forwarded to the language model.

        Returns:
            The output of ``self.classifier`` on the mean-pooled logits.
        """
        with torch.no_grad():  # Freeze pretrained model weights
            outputs = self.llama_model(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)

        pooled = outputs.logits.mean(axis=1)

        # Match the head's own device/dtype instead of reading module-level
        # globals, so the model works regardless of when (or whether) those
        # globals are defined.
        head_param = next(self.classifier.parameters(), None)
        if head_param is not None:
            pooled = pooled.to(head_param.device, dtype=head_param.dtype)

        ### Pass the output representation to linear layer
        return self.classifier(pooled)

### Tokenize the datasets
def tokenize_data(data):
    """Tokenize the ``text`` field of ``data``.

    Uses the module-level ``tokenizer`` and ``max_seq_length``; every sequence
    is truncated and padded to exactly ``max_seq_length`` tokens.
    """
    encoded = tokenizer(
        data['text'],
        max_length=max_seq_length,
        padding='max_length',
        truncation=True,
    )
    return encoded

### Save weights and biases (wandb)
def save_wandb(model_name):
    """Upload the module-level ``model`` weights to Weights & Biases.

    The state dict is written to ``<model_name>.pth`` in the working
    directory, attached to a ``model`` artifact named ``model_name`` in the
    "lea" project, and the training settings below are logged to the run.
    """
    checkpoint_path = f'{model_name}.pth'
    run_settings = {
        "per_device_train_batch_size": 2,
        "gradient_accumulation_steps": 4,
        "warmup_steps": 5,
        "num_train_epochs": 20,
        "learning_rate": 1e-4,
        "logging_steps": 1,
        "optim": "adamw_8bit",
        "weight_decay": 0.01,
        "lr_scheduler_type": "linear",
        "seed": 890,
        "output_dir": "outputs",
        "train_test_split_random_state": 42,
    }

    wandb.login()
    wandb.init(project="lea")

    # Serialize the weights locally, then attach the file to the artifact.
    model_artifact = wandb.Artifact(model_name, type='model')
    torch.save(model.state_dict(), checkpoint_path)
    model_artifact.add_file(checkpoint_path)
    wandb.log_artifact(model_artifact)

    wandb.log(run_settings)

def get_pretrained_model(model_name, max_seq_length, dtype, load_in_4bit):
    """Load a pretrained model through ``FastLanguageModel.from_pretrained``.

    All four arguments are forwarded unchanged as keyword arguments, and the
    loader's return value is passed straight back (the caller in this
    notebook unpacks it as ``model, tokenizer``).
    """
    load_options = dict(
        model_name=model_name,
        max_seq_length=max_seq_length,
        dtype=dtype,
        load_in_4bit=load_in_4bit,
    )
    return FastLanguageModel.from_pretrained(**load_options)

### Read weights and biases (wandb)
def read_model_wandb(model, model_name):
    """Load weights from a Weights & Biases artifact into ``model`` in place.

    The artifact reference is fixed inside the function; ``model_name`` is the
    file name to read inside the downloaded artifact directory.
    """
    artifact_ref = "marvinmeng11-foursquare/clinical_nlp/unsloth-llama3-7b-2nd:latest"

    wandb.login()
    wandb.init(project="clinical_nlp")

    artifact = wandb.use_artifact(artifact_ref, type="model")
    artifact_dir = artifact.download()

    weights_path = f"{artifact_dir}/{model_name}"
    state_dict = torch.load(weights_path)
    model.load_state_dict(state_dict)

### Finetune Pretrained LLM
def finetune_LLM(lora_model, tokenizer, tokenized_labelled, max_seq_length):
    """Fine-tune the LoRA-adapted language model with TRL's ``SFTTrainer``.

    Builds the trainer with the fixed hyperparameters below, runs
    ``trainer.train()``, and returns the trainer so the caller can inspect
    logs or save the result.

    Args:
        lora_model: Model (with LoRA adapters attached) to train.
        tokenizer: Tokenizer paired with ``lora_model``.
        tokenized_labelled: Training dataset handed to the trainer.
        max_seq_length: Maximum sequence length given to the trainer.

    Returns:
        The ``SFTTrainer`` instance after training has finished.
    """
    # NOTE(review): `dataset_text_field` names a "text" column, but the caller
    # in this notebook passes a dataset that was already tokenized and reduced
    # to label/input_ids/attention_mask — confirm SFTTrainer accepts that input.
    trainer = SFTTrainer(
        model = lora_model,
        tokenizer = tokenizer,
        train_dataset = tokenized_labelled,
        dataset_text_field = "text",
        max_seq_length = max_seq_length,
        dataset_num_proc = 2,
        packing = False,
        args = TrainingArguments(
            per_device_train_batch_size = 2,
            gradient_accumulation_steps = 4,
            warmup_steps = 5,
            num_train_epochs = 20,
            learning_rate = 1e-4,
            # Use bf16 when the hardware supports it, otherwise fall back to fp16.
            fp16 = not is_bfloat16_supported(),
            bf16 = is_bfloat16_supported(),
            logging_steps = 1,
            optim = "adamw_8bit",
            weight_decay = 0.01,
            lr_scheduler_type = "linear",
            seed = 890,
            output_dir = "outputs",
        ),
    )

    # Constructing the trainer only configures it; training starts here.
    trainer.train()
    return trainer

def confidence_filter(row):
    """Return True when at least one of the row's two label scores is confident.

    A score counts as confident when it lies within ``threshold`` of 0 or of 1
    (i.e. ``score <= threshold`` or ``score >= 1 - threshold``). The scores are
    checked in order, stopping at the first confident one.
    """
    ### Threshold to measure confidence in "Yes" or "No"
    threshold = .2
    upper_bound = 1 - threshold

    for position in (0, 1):
        score = row['label'][position]
        if score <= threshold or score >= upper_bound:
            return True
    return False

# Training

In [None]:
# ### Separate Test Set and Non-Test Set
# base_data = pd.read_csv("drive/MyDrive/Colab Notebooks/data/base_data.csv")

# test_set = base_data[base_data["test_set"] == 1]
# non_test_set = base_data[base_data["test_set"] == 0]

# test_set.to_csv("test_data.csv", index=False)
# non_test_set.to_csv("train_data.csv", index=False)

### Load in Data
train_data = pd.read_csv("drive/MyDrive/Colab Notebooks/health_data/train_data.csv")

### Split Labelled and Unlabelled
# Rows with a missing `has_cancer` value are treated as unlabelled.
# `.copy()` gives each subset its own frame, so adding the `label` column
# below does not write into a slice of `train_data` (SettingWithCopyWarning).
has_label = train_data["has_cancer"].notnull()
train_data_labelled = train_data[has_label].copy()
train_data_unlabelled = train_data[~has_label].copy()

### Combine Cancer and Diabetes Labels
# Each row's label becomes the two-element list [has_cancer, has_diabetes].
train_data_labelled['label'] = train_data_labelled.apply(lambda row: [row['has_cancer'], row['has_diabetes']], axis=1)
train_data_labelled = train_data_labelled.drop(['has_cancer', 'has_diabetes', 'test_set', 'patient_identifier'], axis=1)

### Create Training and Validation sets
# 70/30 split of the labelled rows; the 30% part is used for evaluation later.
train_split_labelled, test_split_labelled = train_test_split(train_data_labelled, test_size=0.3, random_state = 42)


### Convert data to Huggingface Datasets
labelled_dataset = Dataset.from_pandas(train_split_labelled)
unlabelled_dataset = Dataset.from_pandas(train_data_unlabelled)
labelled_eval_dataset = Dataset.from_pandas(test_split_labelled)

### Pull in LLM and Tokenizer
# `max_seq_length` is read by `tokenize_data` and passed to `finetune_LLM`,
# so it has to be defined before either of them runs. The same value is used
# when loading the model so the three stay in sync.
max_seq_length = 1024
model, tokenizer = get_pretrained_model("unsloth/llama-3-8b-bnb-4bit", max_seq_length, use_dtype, True)

### Add LORA Adapters
lora_model = FastLanguageModel.get_peft_model(
    model,
    r = 16,
    target_modules = ["q_proj", "k_proj", "v_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0,
    bias = "none",
    use_gradient_checkpointing = "unsloth",
    random_state = 890,
    use_rslora = True,
)

### Read in trained models, if applicable
# read_model_wandb("unsloth-llama3-7b-3rd.pth")

### Tokenize each split and keep only the columns the model consumes
tokenized_labelled = (
    labelled_dataset
    .map(tokenize_data, batched=True)
    .select_columns(['label', 'input_ids', 'attention_mask'])
)
tokenized_unlabelled = (
    unlabelled_dataset
    .map(tokenize_data, batched=True)
    .select_columns(['input_ids', 'attention_mask'])
)
tokenized_labelled_eval = (
    labelled_eval_dataset
    .map(tokenize_data, batched=True)
    .select_columns(['label','input_ids', 'attention_mask'])
)

# Return torch tensors when the datasets are indexed / iterated.
for tokenized_split in (tokenized_labelled, tokenized_unlabelled, tokenized_labelled_eval):
    tokenized_split.set_format("torch")

### Create Dataloaders
# `test_dataloader` iterates the *unlabelled* rows (pseudo-label candidates).
batch_size = 8
train_dataloader = DataLoader(tokenized_labelled, shuffle=True, batch_size=batch_size)
eval_dataloader = DataLoader(tokenized_labelled_eval, shuffle=True, batch_size=batch_size)
test_dataloader = DataLoader(tokenized_unlabelled, shuffle=False, batch_size=batch_size)


### Finetune LLM
finetune_LLM(lora_model, tokenizer, tokenized_labelled, max_seq_length)

# ### Save Model to wandb
# save_wandb('unsloth-llama3-7b-5')

# Head dimensions. `input_dim` has to equal the last dimension of the language
# model's logits, because CombinedModel feeds mean-pooled logits to the head.
input_dim, output_dim = 128256, 2

# Loop settings for the training / pseudo-labelling rounds that follow
num_epochs = 10
n_iterations = 10
iter_counter = 0

# Instantiate the head and wrap it together with the language model
classifier = Classifier(input_dim, output_dim)
combined_model = CombinedModel(lora_model, classifier)

# Device configuration: follow wherever the language model already lives
device = combined_model.llama_model.device
combined_model.to(device)

# Optimizer and loss function (only the head's parameters are optimized)
# criterion = nn.BCEWithLogitsLoss()
criterion = nn.MSELoss()
optimizer = optim.Adam(combined_model.classifier.parameters(), lr=1e-6)

# Training mode, and number of batches per epoch for loss averaging
combined_model.train()
train_len = len(train_dataloader)

### Self-training with pseudo-labels
### Each round: train the head, score the unlabelled rows, move the rows whose
### scores pass `confidence_filter` into the training set, and keep the rest
### as unlabelled for the next round.
from datasets import concatenate_datasets  # only needed by this loop

# The loop is bounded by `n_iterations` and stops early once nothing is left
# to pseudo-label. (Looping while unlabelled rows remain *or* the counter is
# below the limit never terminates if some rows never become confident.)
while iter_counter < n_iterations:
    torch.cuda.empty_cache()
    iter_counter += 1
    print("Iteration: ", iter_counter)
    print("Starting Training")
    for epoch in range(num_epochs):
        epoch_loss = 0
        for batch in train_dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            # Forward pass
            optimizer.zero_grad()

            with autocast():
                logits = torch.sigmoid(combined_model(input_ids, attention_mask=attention_mask))
                loss = criterion(logits.float(), labels.float())

            # Backward pass and optimization
            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()

        # The training set grows between rounds, so read the batch count from
        # the current dataloader rather than a value captured before the loop.
        print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {epoch_loss / len(train_dataloader)}")

    if tokenized_unlabelled.num_rows == 0:
        print("No unlabelled rows left")
        break

    print("Predict Pseudo-Labels")
    all_predictions = []
    with torch.no_grad():
        for batch in test_dataloader:
            # Move the batch to the same device as the model
            batch = {k: v.to(device) for k, v in batch.items()}

            # Make predictions
            outputs = combined_model(**batch)
            predictions = torch.sigmoid(outputs.float())

            # Collect predictions (one array of scores per row)
            all_predictions.extend(predictions.cpu().float().numpy())

    print("Update Training Set")
    pseudo_labelled_dataset = tokenized_unlabelled.add_column('label', all_predictions)
    confident = pseudo_labelled_dataset.filter(confidence_filter)

    if confident.num_rows > 0:
        # Align column order and dtypes with the labelled set before merging,
        # then rebuild the dataloader so the next round trains on the new rows.
        confident = confident.cast(tokenized_labelled.features)
        tokenized_labelled = concatenate_datasets([tokenized_labelled, confident])
        tokenized_labelled.set_format("torch")
        train_dataloader = DataLoader(tokenized_labelled, batch_size=batch_size, shuffle=True)

    ### Reset unlabelled data
    tokenized_unlabelled = pseudo_labelled_dataset.filter(lambda x: not confidence_filter(x)).remove_columns(['label'])
    tokenized_unlabelled.set_format("torch")
    # Rebuild the dataloader so the next round's predictions line up row-for-row
    # with the reduced unlabelled set (required by `add_column`).
    test_dataloader = DataLoader(tokenized_unlabelled, batch_size=batch_size, shuffle=False)

# Evaluate

In [None]:
### Evaluate

from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    mean_squared_error,
    roc_auc_score,
)

# Score the held-out labelled split in mini-batches: inputs are moved to the
# model's device, and no autograd graph is kept, instead of pushing the whole
# split through the model in a single forward pass.
eval_batches = DataLoader(tokenized_labelled_eval, batch_size=batch_size, shuffle=False)
eval_probabilities = []
eval_labels = []
with torch.no_grad():
    for batch in eval_batches:
        outputs = combined_model(batch['input_ids'].to(device), attention_mask=batch['attention_mask'].to(device))
        eval_probabilities.append(torch.sigmoid(outputs.float()).cpu())
        # Labels are collected from the same batches so they stay aligned
        # with the predictions.
        eval_labels.append(batch['label'])

predictions = torch.cat(eval_probabilities)
eval_labels = torch.cat(eval_labels)

# Column 0 is the cancer score, column 1 the diabetes score (same order as
# the `label` lists built from has_cancer / has_diabetes).
cancer_probabilities = [x[0].item() for x in predictions]
diabetes_probabilities = [x[1].item() for x in predictions]

cancer_predictions = [round(x) for x in cancer_probabilities]
diabetes_predictions = [round(x) for x in diabetes_probabilities]

cancer_labels = [x[0].item() for x in eval_labels]
diabetes_labels = [x[1].item() for x in eval_labels]

cancer_labels = np.array(cancer_labels)
cancer_predictions = np.array(cancer_predictions)
cancer_probabilities = np.array(cancer_probabilities)

# Accuracy
accuracy = accuracy_score(cancer_labels, cancer_predictions)

# Precision
precision = precision_score(cancer_labels, cancer_predictions)

# Recall
recall = recall_score(cancer_labels, cancer_predictions)

# F1-Score
f1 = f1_score(cancer_labels, cancer_predictions)

# Mean Squared Error (MSE)
mse = mean_squared_error(cancer_labels, cancer_probabilities)

# Area Under the Receiver Operating Characteristic Curve (AUROC)
auroc = roc_auc_score(cancer_labels, cancer_probabilities)

# Print the metrics
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-Score: {f1}")
print(f"MSE: {mse}")
print(f"AUROC: {auroc}")

In [None]:
### Diabetes Metrics
# Convert the three diabetes sequences to numpy arrays in one step.
diabetes_labels, diabetes_predictions, diabetes_probabilities = (
    np.array(values)
    for values in (diabetes_labels, diabetes_predictions, diabetes_probabilities)
)

# Metrics on the thresholded (0/1) predictions
hard_inputs = (diabetes_labels, diabetes_predictions)
accuracy = accuracy_score(*hard_inputs)
precision = precision_score(*hard_inputs)
recall = recall_score(*hard_inputs)
f1 = f1_score(*hard_inputs)

# Metrics on the raw probabilities
soft_inputs = (diabetes_labels, diabetes_probabilities)
mse = mean_squared_error(*soft_inputs)
auroc = roc_auc_score(*soft_inputs)

# Print the metrics, one per line
print("\n".join([
    f"Accuracy: {accuracy}",
    f"Precision: {precision}",
    f"Recall: {recall}",
    f"F1-Score: {f1}",
    f"MSE: {mse}",
    f"AUROC: {auroc}",
]))

# Test

In [None]:
### Predict outputs for test set

final_test_set = pd.read_csv("drive/MyDrive/Colab Notebooks/data/test_data.csv")
final_test_dataset = Dataset.from_pandas(final_test_set)
tokenized_final_test = final_test_dataset.map(tokenize_data, batched=True)
# Keep the patient id next to the model inputs so predictions can be joined back.
tokenized_final_test = tokenized_final_test.select_columns(['patient_identifier', 'input_ids', 'attention_mask'])
tokenized_final_test.set_format("torch")

### Create Dataloaders
batch_size = 16
final_test_dataloader = DataLoader(tokenized_final_test, batch_size=batch_size, shuffle=False)
all_predictions = []
ids = []
# Inference only: no autograd graph is needed, and inputs go to the same
# `device` the model was placed on rather than a hard-coded 'cuda'.
with torch.no_grad():
    for batch in final_test_dataloader:
        new_batch = {'input_ids': batch['input_ids'].to(device), 'attention_mask': batch['attention_mask'].to(device)}

        # Make predictions
        outputs = combined_model(**new_batch)
        predictions = torch.sigmoid(outputs.float())

        # Collect predictions
        all_predictions += [i for i in predictions.cpu().detach().numpy()]
        ids += batch['patient_identifier']



# Unwrap the collected per-row values into plain Python numbers.
ids = [x.item() for x in ids]
cancer_probabilities = [x[0].item() for x in all_predictions]
diabetes_probabilities = [x[1].item() for x in all_predictions]

# Hard 0/1 predictions from the probabilities.
cancer_predictions = [round(x) for x in cancer_probabilities]
diabetes_predictions = [round(x) for x in diabetes_probabilities]

final_df = pd.DataFrame({'patient_identifier': ids, 'cancer_probability': cancer_probabilities, 'diabetes_probability': diabetes_probabilities ,'has_cancer': cancer_predictions, 'has_diabetes': diabetes_predictions})
final_df.to_csv("predictions.csv", index=False)