In [None]:
!pip install peft
!pip install trl
!pip install datasets
!pip install bitsandbytes
!pip install evaluate

In [None]:
import numpy as np
import pandas as pd
import os
import gc
from datasets import Dataset

from transformers import (
    AutoConfig,
    AutoModel,
    AutoModelForSequenceClassification,
    AutoModelForMaskedLM,
    AutoModelForPreTraining,
    AutoModelForCausalLM,
    AutoModelForNextSentencePrediction,
    DataCollatorWithPadding,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
    BitsAndBytesConfig
)

from huggingface_hub import login

from trl import SFTTrainer

import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torch.cuda.amp import autocast
import torch.optim as optim
from sklearn.model_selection import train_test_split

from peft import get_peft_model, LoraConfig

from dataclasses import dataclass
import bitsandbytes as bnb
import evaluate

# Mount Google Drive at /content/drive (Colab only). Later cells read the
# training CSV from, and write model checkpoints to, paths under drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')


In [None]:
### Tokenize the datasets
# Tokenizer matching the BERT checkpoint used for every model in this notebook.
# (No quantisation flag is passed here: 4-bit loading is a model-loading option,
# not a tokenizer option.)
tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
# Every example is truncated/padded to this many tokens by `tokenize_data`.
max_seq_length = 128
def tokenize_data(data):
    """Tokenize the ``text`` field of ``data`` with the module-level tokenizer.

    Sequences are truncated and padded to exactly ``max_seq_length`` tokens.

    Args:
        data: A mapping (single example or batch) with a ``text`` entry.

    Returns:
        The tokenizer's output for ``data['text']``.
    """
    encoding_options = {
        "truncation": True,
        "padding": 'max_length',
        "max_length": max_seq_length,
    }
    return tokenizer(data['text'], **encoding_options)


# Metric objects are loaded once and reused: `evaluate.load` is comparatively
# expensive, and `compute_metrics` runs on every evaluation pass.
_METRIC_NAMES = ("precision", "recall", "accuracy", "f1")
_METRIC_CACHE = {}


def _get_metric(name):
    """Return the `evaluate` metric called ``name``, loading it on first use."""
    if name not in _METRIC_CACHE:
        _METRIC_CACHE[name] = evaluate.load(name)
    return _METRIC_CACHE[name]


def compute_metrics(eval_pred):
    """Compute precision, recall, accuracy and F1 for a classification run.

    Args:
        eval_pred: A ``(logits, labels)`` pair. The predicted class of each
            row is the argmax of ``logits`` along axis 1.

    Returns:
        dict with the keys ``precision``, ``recall``, ``accuracy`` and ``f1``.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=1)
    return {
        name: _get_metric(name).compute(predictions=predictions, references=labels)[name]
        for name in _METRIC_NAMES
    }





# Create a custom data collator for MLM and NSP
class DataCollatorForPreTraining:
    """Collate tokenized examples into batches for MLM + NSP pre-training.

    Every batch receives ``labels`` for the language-modelling loss and a
    dummy ``next_sentence_label`` (all zeros), because the data contains
    single texts rather than sentence pairs.

    Args:
        tokenizer: Tokenizer used to pad the examples. It must provide
            ``pad``, ``get_special_tokens_mask``, ``convert_tokens_to_ids``,
            ``mask_token`` and ``__len__``.
        mlm: If True, tokens are randomly corrupted and the loss is limited
            to the selected positions. If False, ``labels`` is a copy of the
            inputs with padding positions set to -100.
        mlm_probability: Probability of selecting each non-special token.

    Raises:
        ValueError: If ``mlm`` is True but the tokenizer has no mask token.
    """

    def __init__(self, tokenizer, mlm=True, mlm_probability=0.15):
        # Fail early instead of producing a batch with an invalid mask id.
        if mlm and getattr(tokenizer, "mask_token", None) is None:
            raise ValueError(
                "This tokenizer does not have a mask token, which is required for "
                "masked language modeling. Pass mlm=False to disable masking."
            )
        self.tokenizer = tokenizer
        self.mlm = mlm
        self.mlm_probability = mlm_probability

    def __call__(self, examples):
        """Pad ``examples`` into one batch and attach the training targets."""
        batch = self.tokenizer.pad(examples, return_tensors="pt")
        if self.mlm:
            inputs, labels = self.mask_tokens(batch["input_ids"])
            batch["input_ids"] = inputs
            batch["labels"] = labels
        else:
            # Clone so labels and inputs do not share storage, and keep
            # padding positions out of the loss.
            labels = batch["input_ids"].clone()
            pad_token_id = getattr(self.tokenizer, "pad_token_id", None)
            if pad_token_id is not None:
                labels[labels == pad_token_id] = -100
            batch["labels"] = labels

        # Add dummy next sentence prediction labels (since we do not have sentence pairs)
        batch["next_sentence_label"] = torch.zeros(len(batch["input_ids"]), dtype=torch.long)
        return batch

    def mask_tokens(self, inputs):
        """
        Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original.

        Args:
            inputs: 2-D tensor of token ids. It is not modified; the
                corrupted ids are returned as a new tensor.

        Returns:
            ``(inputs, labels)`` where ``labels`` is -100 everywhere except
            at the positions selected for prediction.
        """
        # Work on a copy so the caller's tensor is left untouched.
        inputs = inputs.clone()
        labels = inputs.clone()
        # We sample a few tokens in each sequence for MLM training (with probability `self.mlm_probability`)
        probability_matrix = torch.full(labels.shape, self.mlm_probability)
        special_tokens_mask = [
            self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist()
        ]
        # Special tokens are never selected for masking.
        probability_matrix.masked_fill_(torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0)
        masked_indices = torch.bernoulli(probability_matrix).bool()
        labels[~masked_indices] = -100  # We only compute loss on masked tokens

        # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK])
        indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices
        inputs[indices_replaced] = self.tokenizer.convert_tokens_to_ids(self.tokenizer.mask_token)

        # 10% of the time, we replace masked input tokens with random word
        # (half of the remaining 20% of selected positions).
        indices_random = torch.bernoulli(torch.full(labels.shape, 0.5)).bool() & masked_indices & ~indices_replaced
        random_words = torch.randint(len(self.tokenizer), labels.shape, dtype=torch.long)
        inputs[indices_random] = random_words[indices_random]

        # The remaining selected positions keep their original token.
        return inputs, labels

In [None]:
### Load in Data
train_data = pd.read_csv("drive/MyDrive/Colab Notebooks/layer_health_data/train_data.csv")

### Split Labelled and Unlabelled
# A row counts as labelled when `has_cancer` is present.
is_labelled = train_data["has_cancer"].notnull()
# .copy() makes the labelled frame independent of `train_data`, so the column
# assignments below neither trigger SettingWithCopyWarning nor depend on
# whether pandas returned a view.
train_data_labelled = train_data[is_labelled].copy()
train_data_unlabelled = train_data[~is_labelled]

train_data_labelled['has_cancer'] = train_data_labelled['has_cancer'].astype(int)
train_data_labelled['has_diabetes'] = train_data_labelled['has_diabetes'].astype(int)

### Combine Cancer and Diabetes Labels
# `label` is a two-element list per row: [has_cancer, has_diabetes].
train_data_labelled['label'] = train_data_labelled.apply(lambda row: [row['has_cancer'], row['has_diabetes']], axis=1)
train_data_labelled = train_data_labelled.drop(['test_set', 'patient_identifier'], axis=1)

### Create Training and Validation sets
# 70/30 split with a fixed seed; `test_split_labelled` is used as the evaluation set.
train_split_labelled, test_split_labelled = train_test_split(train_data_labelled, test_size=0.3, random_state = 42)


### Convert data to Huggingface Datasets
# preserve_index=False keeps the (non-contiguous) pandas index from being
# carried into the datasets as an extra `__index_level_0__` column.
labelled_dataset = Dataset.from_pandas(train_split_labelled, preserve_index=False)
unlabelled_dataset = Dataset.from_pandas(train_data_unlabelled, preserve_index=False)
labelled_eval_dataset = Dataset.from_pandas(test_split_labelled, preserve_index=False)

device = "cuda" if torch.cuda.is_available() else "cpu"

# Columns every model in this notebook consumes.
MODEL_INPUT_COLUMNS = ['input_ids', 'attention_mask']


def _select_as_torch(dataset, label_column=None, rename_to=None):
    """Return a torch-formatted view of ``dataset`` with only the needed columns.

    Args:
        dataset: Tokenized dataset containing ``input_ids`` and ``attention_mask``.
        label_column: Optional extra column to keep alongside the model inputs.
        rename_to: If given, ``label_column`` is renamed to this name.

    Returns:
        A new dataset holding ``label_column`` (if any) and the model inputs,
        formatted as torch tensors.
    """
    columns = list(MODEL_INPUT_COLUMNS)
    if label_column is not None:
        columns.insert(0, label_column)
    subset = dataset.select_columns(columns)
    if rename_to is not None:
        subset = subset.rename_column(label_column, rename_to)
    subset.set_format("torch")
    return subset


tokenized_labelled = labelled_dataset.map(tokenize_data, batched=True)
tokenized_labelled_eval = labelled_eval_dataset.map(tokenize_data, batched=True)

# Combined [has_cancer, has_diabetes] target, kept under its original name `label`.
tokenized_labelled_all = _select_as_torch(tokenized_labelled, 'label')
tokenized_labelled_eval_all = _select_as_torch(tokenized_labelled_eval, 'label')

# Single-task views: the task column becomes `labels`.
tokenized_labelled_cancer = _select_as_torch(tokenized_labelled, 'has_cancer', rename_to="labels")
tokenized_labelled_eval_cancer = _select_as_torch(tokenized_labelled_eval, 'has_cancer', rename_to="labels")

tokenized_labelled_diabetes = _select_as_torch(tokenized_labelled, 'has_diabetes', rename_to="labels")
tokenized_labelled_eval_diabetes = _select_as_torch(tokenized_labelled_eval, 'has_diabetes', rename_to="labels")

# Unlabelled text for pre-training: model inputs only.
tokenized_unlabelled = _select_as_torch(unlabelled_dataset.map(tokenize_data, batched=True))

In [None]:
# Load the same base BERT checkpoint four times, each with a different head
# class: pre-training, masked-LM, and two sequence classifiers.
pretrainModel = AutoModelForPreTraining.from_pretrained("google-bert/bert-base-uncased")
maskedLMmodel = AutoModelForMaskedLM.from_pretrained("google-bert/bert-base-uncased")
# NOTE(review): cancerModel and diabetesModel are created again from the base
# checkpoint in the fine-tune section, replacing the instances built here —
# confirm these two are still needed at this point.
cancerModel = AutoModelForSequenceClassification.from_pretrained("google-bert/bert-base-uncased")
diabetesModel = AutoModelForSequenceClassification.from_pretrained("google-bert/bert-base-uncased")

# LoRA adapter settings applied to all four models: rank 16, alpha 16, no
# dropout, no bias training, rank-stabilised scaling. Adapters are attached
# to modules whose names match "query", "key", "value" or "dense".
peft_config = LoraConfig(r = 16,
    target_modules = ["query", "key", "value", "dense"],
    lora_alpha = 16,
    lora_dropout = 0,
    bias = "none",
    use_rslora = True,)

# Wrap each model with the adapters described by `peft_config`.
pretrainModel = get_peft_model(pretrainModel, peft_config)
maskedLMmodel = get_peft_model(maskedLMmodel, peft_config)
cancerModel = get_peft_model(cancerModel, peft_config)
diabetesModel = get_peft_model(diabetesModel, peft_config)


# pretrain_data_collator = DataCollatorForPreTraining(tokenizer)
# mlm_data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)

In [None]:
# bf16 mixed precision needs hardware support; requesting it on a GPU without
# bf16 (or on CPU) makes TrainingArguments raise. Detect it once and fall back
# to fp16 on CUDA devices that lack bf16. The `is_available()` guard keeps the
# capability query from running when there is no CUDA device at all.
_BF16_AVAILABLE = torch.cuda.is_available() and torch.cuda.is_bf16_supported()


@dataclass
class TrainArgs:
    """Hyper-parameters for the pre-training run.

    The defaults are plain class attributes, so they can be read either from
    the class (``TrainArgs.learning_rate``) or from an instance (``args``).
    """
    per_device_train_batch_size: int = 2
    gradient_accumulation_steps: int = 4
    warmup_steps: int = 5
    num_train_epochs:int = 1
    learning_rate: float = 1e-4
    # Exactly one of fp16/bf16 is enabled on CUDA; both are off on CPU.
    fp16: bool = torch.cuda.is_available() and not _BF16_AVAILABLE
    bf16: bool = _BF16_AVAILABLE
    logging_steps: int = 1
    optim: str = "adamw_8bit"
    weight_decay: float = 0.01
    lr_scheduler_type: str = "linear"
    seed: int = 890
    output_dir: str = "outputs"
args = TrainArgs()

# Pre Train

In [None]:
# Continued pre-training of `pretrainModel` on the unlabelled text.
#
# A plain `Trainer` is used, matching the fine-tune trainers in this notebook:
# `tokenized_unlabelled` is already tokenized (it only has `input_ids` and
# `attention_mask`, no `text` column) and the model is already wrapped with
# LoRA adapters, so text-field, sequence-length, packing and PEFT options have
# nothing to act on.
pretrain_data_collator = DataCollatorForPreTraining(tokenizer)

trainer = Trainer(
        model = pretrainModel,
        train_dataset = tokenized_unlabelled,
        # Adds the MLM `labels` and the dummy `next_sentence_label` per batch.
        data_collator = pretrain_data_collator,
        args = TrainingArguments(
            # Read from the `args` instance so overrides made when creating
            # TrainArgs(...) are honoured.
            per_device_train_batch_size = args.per_device_train_batch_size,
            gradient_accumulation_steps = args.gradient_accumulation_steps,
            warmup_steps = args.warmup_steps,
            num_train_epochs = args.num_train_epochs,
            learning_rate = args.learning_rate,
            fp16 = args.fp16,
            bf16 = args.bf16,
            logging_steps = args.logging_steps,
            optim = args.optim,
            weight_decay = args.weight_decay,
            lr_scheduler_type = args.lr_scheduler_type,
            seed = args.seed,
            output_dir = args.output_dir,
            # The dataset holds only model inputs, so there is nothing to
            # prune; disabling the pruning also avoids inspecting the
            # adapter wrapper's generic forward signature.
            remove_unused_columns = False,
        ),
    )

In [None]:
# Run the pre-training loop configured in the previous cell.
trainer.train()

In [None]:
checkpoint_dir = "drive/MyDrive/Colab Notebooks/layer_health_data"

# `pretrainModel` is the model the trainer above updated. Save it under the
# name the fine-tune section loads, so a fresh top-to-bottom run does not
# depend on a file left over from an earlier session.
torch.save(pretrainModel.state_dict(), f"{checkpoint_dir}/pretrainHead.pth")

# NOTE(review): `maskedLMmodel` is not trained by the trainer above, so this
# file holds its weights as loaded/wrapped — confirm this is what is wanted.
output_file = f"{checkpoint_dir}/mlmHead.pth"
torch.save(maskedLMmodel.state_dict(), output_file)

In [None]:
## Save locally with torch
# output_file = "drive/MyDrive/Colab Notebooks/layer_health_data/maskedLMModel.pth"
# torch.save(maskedLMmodel.state_dict(), output_file)


# blankModel = AutoModelForMaskedLM.from_pretrained("google-bert/bert-base-uncased")

# peft_config = LoraConfig(r = 16,
#     target_modules = ["query", "key", "value", "dense"],
#     lora_alpha = 16,
#     lora_dropout = 0,
#     bias = "none",
#     use_rslora = True,)

# blankModel = get_peft_model(blankModel, peft_config)
# blankModel.load_state_dict(torch.load(output_file))

# Fine-tune

In [None]:
# Restore the saved weights into the two pre-training models.
# map_location="cpu" lets checkpoints written on a GPU runtime load on any
# runtime (load_state_dict copies the values onto each parameter's device);
# weights_only=True restricts unpickling to tensors and plain containers,
# which is all a state dict contains.
pretrainModel.load_state_dict(
    torch.load(
        "drive/MyDrive/Colab Notebooks/layer_health_data/pretrainHead.pth",
        map_location="cpu",
        weights_only=True,
    )
)
maskedLMmodel.load_state_dict(
    torch.load(
        "drive/MyDrive/Colab Notebooks/layer_health_data/mlmHead.pth",
        map_location="cpu",
        weights_only=True,
    )
)

In [None]:
# ### Transfer weights
# cancerModel.bert = maskedLMmodel.bert # pretrainModel.bert
# diabetesModel.bert = maskedLMmodel.bert # pretrainModel.bert
# NOTE(review): with the transfer above commented out, the two classifiers
# below are built from the stock checkpoint and do not receive the weights
# loaded into pretrainModel / maskedLMmodel — confirm this is intended.

# Fresh sequence-classification models, one per task.
cancerModel = AutoModelForSequenceClassification.from_pretrained("google-bert/bert-base-uncased")
diabetesModel = AutoModelForSequenceClassification.from_pretrained("google-bert/bert-base-uncased")

# LoRA settings for fine-tuning. This rebinds `peft_config`; adapters are
# attached only to modules whose name matches "classifier".
peft_config = LoraConfig(r = 16,
    target_modules = ["classifier"],
    lora_alpha = 16,
    lora_dropout = 0,
    bias = "none",
    use_rslora = True,)

# Wrap both classifiers with the adapters described by `peft_config`.
cancerModel = get_peft_model(cancerModel, peft_config)
diabetesModel = get_peft_model(diabetesModel, peft_config)


# pretrain_data_collator = DataCollatorForPreTraining(tokenizer)
# mlm_data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)

In [None]:
# diabetesModel.bert.encoder.layer[0].attention.self.query.weight


# Parameter containing:
# tensor([[-0.0164,  0.0261, -0.0263,  ...,  0.0154,  0.0768,  0.0548],
#         [-0.0326,  0.0346, -0.0423,  ..., -0.0527,  0.1393,  0.0078],
#         [ 0.0105,  0.0334,  0.0109,  ..., -0.0279,  0.0258, -0.0468],
#         ...,
#         [-0.0085,  0.0514,  0.0555,  ...,  0.0282,  0.0543, -0.0541],
#         [-0.0198,  0.0944,  0.0617,  ..., -0.1042,  0.0601,  0.0470],
#         [ 0.0015, -0.0952,  0.0099,  ..., -0.0191, -0.0508, -0.0085]])

In [None]:
def build_fine_tune_trainer(model, train_dataset, eval_dataset, task_name):
    """Build the `Trainer` used to fine-tune one binary classifier.

    Both tasks share every hyper-parameter; only the model, the datasets and
    the output/run names differ.

    Args:
        model: Sequence-classification model to fine-tune.
        train_dataset: Tokenized training split with a ``labels`` column.
        eval_dataset: Tokenized evaluation split with a ``labels`` column.
        task_name: Short task identifier; used as the prefix of the output
            directory (``<task_name>_outputs``) and of the run name.

    Returns:
        A configured, not yet started, ``Trainer``.
    """
    # bf16 needs hardware support; fall back to fp16 on CUDA devices without
    # it, and use full precision when no CUDA device is present.
    use_bf16 = torch.cuda.is_available() and torch.cuda.is_bf16_supported()
    use_fp16 = torch.cuda.is_available() and not use_bf16

    # No evaluation schedule is configured here; `compute_metrics` is applied
    # whenever an evaluation is run on the returned trainer.
    return Trainer(
        model = model,
        train_dataset = train_dataset,
        eval_dataset = eval_dataset,
        compute_metrics = compute_metrics,
        args = TrainingArguments(
            per_device_train_batch_size = 2,
            gradient_accumulation_steps = 4,
            warmup_steps = 5,
            num_train_epochs = 10,
            learning_rate = 1e-4,
            fp16 = use_fp16,
            bf16 = use_bf16,
            logging_steps = 1,
            optim = "adamw_8bit",
            weight_decay = 0.01,
            lr_scheduler_type = "linear",
            seed = 890,
            output_dir = f"{task_name}_outputs",
            run_name = f"{task_name}_est_run_fine_tune_seq",
        ),
    )


cancer_fine_tune_trainer = build_fine_tune_trainer(
    cancerModel,
    tokenized_labelled_cancer,
    tokenized_labelled_eval_cancer,
    "cancer",
)

diabetes_fine_tune_trainer = build_fine_tune_trainer(
    diabetesModel,
    tokenized_labelled_diabetes,
    tokenized_labelled_eval_diabetes,
    "diabetes",
)



In [None]:
# Fine-tune the cancer classifier on the labelled training split.
cancer_fine_tune_trainer.train()

In [None]:
# Fine-tune the diabetes classifier on the labelled training split.
diabetes_fine_tune_trainer.train()