<a href="https://colab.research.google.com/github/Lucky3210/AI_Probability/blob/main/med_LLM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the required libraries
%pip install datasets

Collecting datasets
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.17-py310-none-any.whl.metadata (7.2 kB)
INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take a while.
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.1-py3-none-any.whl (471 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m471.6/471.6 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m12.1 MB/s[0m eta [36m0:00

In [None]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from transformers import get_linear_schedule_with_warmup
from datasets import load_dataset

In [None]:
# gtp-2 is pretrained on general text data

model = GPT2LMHeadModel.from_pretrained('gpt2')
tokenizer = GPT2Tokenizer.from_pretrained('gpt2') # load tokenizer for tokenizing input

tokenizer.pad_token = tokenizer.eos_token

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]



In [None]:
dataset = load_dataset("Mohammed-Altaf/medical-instruction-120k")

# tokenize the dataset
def tokenize_function(example):

  # tokenize the text and pad/truncate them into fixed length
  return tokenizer(example["Conversation"], max_length=512, padding='max_length', truncation=True)

# apply tokenization to the entire dataset
tokenized_dataset = dataset.map(tokenize_function, batched=True)

README.md:   0%|          | 0.00/965 [00:00<?, ?B/s]

medicare_110k_train.json:   0%|          | 0.00/126M [00:00<?, ?B/s]

medicare_110k_test.json:   0%|          | 0.00/6.60M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/106556 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5609 [00:00<?, ? examples/s]

Map:   0%|          | 0/106556 [00:00<?, ? examples/s]

Map:   0%|          | 0/5609 [00:00<?, ? examples/s]

In [None]:
from torch.utils.data import DataLoader

# define train and test split
train_dataset = tokenized_dataset['train']
test_dataset = tokenized_dataset['test']

# create dataloader
# we choose a batch_size of 8 which fits most gpu memory for large models like gpt2
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=8)


In [None]:
from torch.optim import AdamW


# define optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=5e-5)

# learning rate scheduler decays the learning rate linearly
epochs = 4
total_steps = len(train_dataloader) * epochs

scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0, # skips the warmup step and start from initial lr
                                            num_training_steps=total_steps)

In [None]:
from tqdm import tqdm
# Training loop

device = torch.device('cuda') # Define the device

# push the model to the use gpu
model.to(device)

# set model to training mode
model.train()

for epoch in range(epochs):

  # track loss
  total_loss = 0

  for batch in tqdm(train_dataloader, desc=f"Epoch: {epoch+1}/{epochs}"):

    input_ids = torch.stack(batch['input_ids']).to(device)
    # label = torch.stack(batch['input_ids']).to(torch.device('cuda'))
    attention_mask = torch.stack(batch['attention_mask']).to(device)
    labels = input_ids.clone().detach().to(device)

    optimizer.zero_grad()

    # forward pass: pass input parameters into the model
    outputs = model(input_ids=input_ids, labels=labels, attention_mask=attention_mask)
    loss = outputs.loss # compute loss

    # tqdm.write(f"Batch Loss: {loss.item():.4f}")

    # backward pass: compute gradients(triggers backpropagation process)
    loss.backward()

    # gradient clipping by norm prevent exploding gradients and stabilizes the training process
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

    # update/adjust model weights and learning rate based on the computed gradients in loss.backward() and the optimizer internal state to minimize loss
    optimizer.step()

    # updates the learning rate during the training process based on the lr scheduler defined above
    scheduler.step()

    total_loss += loss.item()

  avg_loss = total_loss / len(train_dataloader)

print(f"Average training loss: {avg_loss:.4f}")

Epoch: 1/4:  43%|████▎     | 5742/13320 [23:27<30:57,  4.08it/s]


KeyboardInterrupt: 

In [None]:
# Validation Loop

# Set model to evaluation mode(disables dropout and batch normalization) switches to running statistics
model.eval()

val_loss = 0

# disable gradients calculation since we are evaluating the model
with torch.no_grad():
  for batch in tqdm(test_dataloader, desc='Validation'):
    input_ids = torch.stack(batch['input_ids']).to(device)
    attention_mask = torch.stack(batch['attention_mask']).to(device)
    labels = input_ids.clone().detach().to(device)

    # forward pass: get prediction
    outputs = model(input_ids=input_ids, labels=labels, attention_mask=attention_mask)
    loss = outputs.loss
    tqdm.write(f"Batch Loss: {loss.item():.4f}")

    val_loss += loss.item() # accumulate testing loss

model.train()

avg_val_loss = val_loss / len(test_dataloader)  # Average testing loss
print(f"Testing loss: {avg_val_loss:.4f}")

In [None]:
input_text = 'I have fever and headache'
input_ids = tokenizer.encode(input_text, return_tensors='pt').to(device)

# generate text
generated_tokens = model.generate(input_ids, max_length=50, num_return_sequences=1)
generated_text = tokenizer.decode(generated_tokens[0], skip_special_tokens=True)

print(generated_text)

--------------------------------------------------------------------------------

--------------------------------------------------------------------------------

# **Fine-tuning with LoRA**

In [None]:
# install required libraries
%pip install transformers peft datasets accelerate

# accelerate: helps to otimize training on different hardware setups(cpu/gpu/tpu)

Collecting peft
  Downloading peft-0.13.2-py3-none-any.whl.metadata (13 kB)
Collecting datasets
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.17-py310-none-any.whl.metadata (7.2 kB)
INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take a while.
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading peft-0.13.2-py3-none-any.whl (320 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m320.7/320.7 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading datasets-3.0.1-py3-none-any.whl (471 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━

**Import libraries**

    LoraConfig: Sets up the configuration for LoRA.
    get_peft_model: Applies the LoRA adaptation to the model.
    

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, get_scheduler
from datasets import load_dataset
from peft import LoraConfig, get_peft_model, TaskType
from torch.optim import AdamW
from torch.utils.data import DataLoader
from accelerate import Accelerator
from tqdm import tqdm

**Load Dataset and model**

In [None]:
modelName = "aaditya/Llama3-OpenBioLLM-8B"

dataset = load_dataset("Mohammed-Altaf/medical-instruction-120k")


tokenizer = AutoTokenizer.from_pretrained(modelName)
model = AutoModelForCausalLM.from_pretrained(modelName)


# preprocess the dataset
def preprocess_data(examples):

  # split conversations into AI part and Human part as in the dataset
  human_words = []
  ai_words = []

  for conversation in examples["Conversation"]:
    parts = conversation.split('[|AI|]')            # split the conversation into 2, with the text [|AI|] demacating them
    human_word = parts[0]                           # everything before the AI response(the first part) is human
    ai_word = parts[1] if len(parts) > 1 else ""    # the next part of the conversation is AI

    human_words.append(human_word.strip())
    ai_words.append(ai_word.strip())

  # data are tokenized and returns pytorch tensors
  tokenized_inputs = tokenizer(human_words, padding="max_length", max_length=128, truncation=True, return_tensors="pt")
  tokenized_labels = tokenizer(ai_words, padding="max_length", max_length=128, truncation=True, return_tensors="pt")

  return {
      'input_ids': tokenized_inputs['input_ids'],
      'attention_mask': tokenized_inputs['attention_mask'],
      'labels': tokenized_labels['input_ids']
  }

# Apply preprocessing
tokenized_dataset = dataset['train'].map(preprocess_data, batched=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/965 [00:00<?, ?B/s]

medicare_110k_train.json:   0%|          | 0.00/126M [00:00<?, ?B/s]

medicare_110k_test.json:   0%|          | 0.00/6.60M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/106556 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5609 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/449 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/704 [00:00<?, ?B/s]

pytorch_model.bin.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

pytorch_model-00001-of-00004.bin:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

pytorch_model-00002-of-00004.bin:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

pytorch_model-00003-of-00004.bin:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

pytorch_model-00004-of-00004.bin:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Map:   0%|          | 0/106556 [00:00<?, ? examples/s]

In [None]:
tokenized_dataset_test = dataset['test'].map(preprocess_data, batched=True)

Map:   0%|          | 0/5609 [00:00<?, ? examples/s]

**Define LoRA Config**

    TaskType.QUESTION_ANS: If your task is strictly question answering or question generation based on patient input,
    this is a great option and is designed for such tasks.
    TaskType.SEQ_2_SEQ_LM: If your task is more like a dialog where the model needs to generate a sequence of tokens
    (i.e., questions) based on previous responses, this might be a better fit for question generation.
    The higher the rank, the more things the model can change (and learn).
    If lora_alpha is high(32): The changes from LoRA will have a stronger influence on the original model.
    It's like turning up the volume on the new structure, making it more prominent.
    target_modules tells LoRA which parts of the model are most relevant to improving its performance for the medical instruction task.

In [None]:
# LoRA is typically implemented by injecting low-rank matrices into the attention layers of the transformer architecture

loraConfig = LoraConfig(task_type=TaskType.CAUSAL_LM,
                         r=4,
                         lora_alpha = 16,            # scaling factor
                         lora_dropout = 0.1,
                         target_modules = ["q_proj",
                                           "v_proj",
                                           "k_proj",
                                           "o_proj"], # targeting the attention  layers
                        bias = "none"
                         )

**Apply LoRA to the model**

In [None]:
loraModel = get_peft_model(model, loraConfig)

**Setup DataLoader and Optimizer**

    get_scheduler(): This creates a linear learning rate scheduler. It means that the learning rate
    starts at the initial value (2e-5 in this case), and decreases linearly over time

    setting num_warmup_steps to 10%(len(totalSteps)*0.1) or any number of steps will allow the learning rate to increase
    gradually at the start of training. This is useful for avoiding drastic changes in the early phases of training.

In [None]:
def collate_fn(batch):
    """
    Custom collate function to convert lists to tensors.
    """
    input_ids = [item['input_ids'] for item in batch]
    attention_mask = [item['attention_mask'] for item in batch]
    labels = [item['labels'] for item in batch]

    input_ids = torch.tensor(input_ids)
    attention_mask = torch.tensor(attention_mask)
    labels = torch.tensor(labels)

    return {'input_ids': input_ids, 'attention_mask': attention_mask, 'labels': labels}

trainDataloader = DataLoader(tokenized_dataset, batch_size=16, shuffle=True, collate_fn=collate_fn)


# Optimizer Setup
optimizer = AdamW(loraModel.parameters(), lr=1e-5)

epochs = 3
totalSteps = len(trainDataloader) * epochs

# Learning rate scheduler
scheduler = get_scheduler("linear", optimizer=optimizer,
                          num_training_steps=totalSteps,
                          num_warmup_steps=0)

In [None]:
accelerator = Accelerator()

# prepare the model, optimizer and trainDataloader for distributed setup
model, optimizer, trainDataloader = accelerator.prepare(loraModel, optimizer, trainDataloader)

for batch in trainDataloader:

  # input_ids = torch.stack(batch['input_ids'])
  # attention_mask = torch.stack(batch['attention_mask'])
  # labels = torch.stack(batch['labels'])
  print(type(batch['input_ids']), len(batch['input_ids']))
  print(batch['input_ids'].shape)
  break
  # print(batch['input_ids'].shape)
    # inspect dataloader output


<class 'torch.Tensor'> 16
torch.Size([16, 128])


In [None]:
print(type(model))

<class 'peft.peft_model.PeftModelForCausalLM'>


**Fine-Tuning with Accelerators**

In [None]:

# Finetuning Loop
for epoch in range(epochs):
  model.train()
  t_loss = 0

  for batch in tqdm(trainDataloader, desc=f"Epoch: {epoch+1}/{epochs}"):

    # pass the input_ids and attention_mask to the correct device [inputs is a dictionary with input_ids and attention_mask]
    # inputs = {k : torch.tensor(v, device=accelerator.device) for k, v in batch.items() if k in ['input_ids', 'attention_mask']}
    # inputs = {k : torch.stack([torch.tensor(item) for item in v], dim=0).to(accelerator.device) for k, v in batch.items() if k in ['input_ids', 'attention_mask']}
    # labels = inputs['input_ids'].to(accelerator.device)

    # inputs = torch.stack(batch['input_ids']).to(accelerator.device)
    # attention_mask = torch.stack(batch['attention_mask']).to(accelerator.device)
    # labels = torch.stack(batch['labels']).to(accelerator.device)

    # inputs = {k: torch.tensor(v).to(accelerator.device) for k, v in batch.items() if k in ['input_ids', 'attention_mask']}
    # labels = inputs['input_ids'].clone().to(accelerator.device)  # Assuming you want to predict the input itself


    inputs = {
    'input_ids': batch['input_ids'].to(accelerator.device),
    'attention_mask': batch['attention_mask'].to(accelerator.device),
    'labels': batch['labels'].to(accelerator.device),
    }

    # inputs = {
    # 'input_ids': torch.tensor(batch['input_ids'], device=accelerator.device),
    # 'attention_mask': torch.tensor(batch['attention_mask'], device=accelerator.device),
    # 'labels': torch.tensor(batch['labels'], device=accelerator.device)
    # }

    optimizer.zero_grad()

    # outputs = model(input_ids=inputs, attention_mask=attention_mask, labels=labels)
    outputs = model(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'], labels=inputs['labels'])

    loss = outputs.loss

    # compute gradients using the accelerator
    accelerator.backward(loss)

    # gradient clipping by norm prevent exploding gradients and stabilizes the training process
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

    # update model parameters
    optimizer.step()

    # update learning rate
    scheduler.step()

    t_loss += loss.item()

  avg_loss = t_loss / len(trainDataloader)
  print(f"Average training loss: {avg_loss:.4f}")


# Save model
# accelerator.wait_for_everyone()
# unwrappedModel = accelerator.unwrap_model(loraModel)
# unwrappedModel.save_pretrained('./ft_model')

Epoch: 1/3:   0%|          | 0/6660 [00:00<?, ?it/s]


OutOfMemoryError: CUDA out of memory. Tried to allocate 32.00 MiB. GPU 0 has a total capacity of 39.56 GiB of which 6.81 MiB is free. Process 29438 has 39.55 GiB memory in use. Of the allocated memory 38.98 GiB is allocated by PyTorch, and 77.44 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

**Train using the Trainer class**

In [None]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="./results",  # Directory to save model checkpoints and training outputs
    per_device_train_batch_size=16,  # Batch size per device during training
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    gradient_accumulation_steps=4,  # Accumulate gradients over multiple steps to simulate larger batch sizes4
    lr_scheduler_type="linear",
    learning_rate=1e-5,  # Learning rate for the optimizer
    fp16=True,  # Use mixed precision training for faster training and reduced memory usage
    num_train_epochs=3,  # Number of training epochs
    logging_dir='./logs',  # Directory to save training logs
    logging_steps=10,  # Log training metrics every 10 steps
    save_steps=500,  # Save model checkpoint every 100 steps
    evaluation_strategy="epoch",  # Evaluate the model at the end of each epoch
    save_strategy="epoch",  # Save the model at the end of each epoch
    eval_steps=50,  # Evaluate the model every 50 steps
    save_total_limit=3,  # Limit the number of saved checkpoints to 3
    load_best_model_at_end=True,  # Load the best model at the end of training
    report_to="none",  # Disable reporting to MLflow or TensorBoard
    push_to_hub=False  # Change to True if you want to push your model to the Hugging Face Hub
)


# Initialize trainer

trainer = Trainer(
    model=loraModel,  # LoRA-adapted model
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_dataset_test,  # Ensure you have a validation split
    tokenizer=tokenizer,  # Pass tokenizer for data collator to handle padding
)


# Train the model
trainer.train()

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


OutOfMemoryError: CUDA out of memory. Tried to allocate 32.00 MiB. GPU 0 has a total capacity of 39.56 GiB of which 22.81 MiB is free. Process 35432 has 39.53 GiB memory in use. Of the allocated memory 38.97 GiB is allocated by PyTorch, and 65.58 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
torch.cuda.empty_cache()

**Evaluation**

In [None]:
model.eval()

total_loss = 0

with torch.no_grad():
  for batch in tqdm(testDataloader, desc="Testing"):
    inputs = {
            'input_ids': batch['input_ids'].to(accelerator.device),
            'attention_mask': batch['attention_mask'].to(accelerator.device),
            }
    # labels = inputs['input_ids']

    # perform forward pass to get the model prediction
    outputs = model(**inputs)

    # compute loss
    total_loss += outputs.loss()

    # Get the predicted class/token (using argmax on logits)
    predictions = outputs.logits.argmax(dim=-1) # logits are simply odds or probability

    # If you want to do something with predictions, you can print or store them
    # For example, converting predictions to a readable format:
    predicted_labels = [conversation_list[pred] for pred in predictions]  # Adjust this based on your actual labels

    # Print or store your predictions as needed
    print(predicted_labels)

  avg_loss = total_loss / len(testDataloader)
  print(f"Average Loss: {avg_loss:.4f}")


--------------------------------------------------------------------------------

________________________________________________________________________________

# **Fine-tuning with QLoRA**

QLoRA is a peft ft technique that is based on LoRA, it works by applying low-rank adaptation to a quantized pretrained model.

In [None]:
# Install required libraries
%pip install datasets bitsandbytes accelerate peft trl




In [None]:
# import libraries

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, get_scheduler
from transformers import BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, TaskType
from datasets import load_dataset
from torch.optim import AdamW
from torch.utils.data import DataLoader
from tqdm import tqdm
from accelerate import Accelerator


Quantizing a model means that its weights are stored with
lower precision, allowing the finetuning process of large models on smaller hardware.

In [None]:
!pip install -U bitsandbytes

# code to restart kernel
import os
os._exit(00)



In [None]:
modelName = "aaditya/Llama3-OpenBioLLM-8B"

dataset = load_dataset("Mohammed-Altaf/medical-instruction-120k")


tokenizer = AutoTokenizer.from_pretrained(modelName)

# Set up a quantization configuration using 4-bit quantization for QLoRA
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,                    # Enable 4-bit quantization(convert the model weight from default 32bits to 4bits thereby saving memory)
    bnb_4bit_use_double_quant=True,       # Double quantization for better accuracy
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16  # Use 16-bit floats for computation/calculation
)

model = AutoModelForCausalLM.from_pretrained(modelName,
                                             torch_dtype=torch.float16,
                                             quantization_config=bnb_config)  # Automatically map model to available GPUs


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/965 [00:00<?, ?B/s]

medicare_110k_train.json:   0%|          | 0.00/126M [00:00<?, ?B/s]

medicare_110k_test.json:   0%|          | 0.00/6.60M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/106556 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5609 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/449 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/704 [00:00<?, ?B/s]

`low_cpu_mem_usage` was None, now set to True since model is quantized.


pytorch_model.bin.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

pytorch_model-00001-of-00004.bin:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

pytorch_model-00002-of-00004.bin:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

pytorch_model-00003-of-00004.bin:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

pytorch_model-00004-of-00004.bin:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [None]:
# preprocess the dataset
def preprocess_data(examples):

  # split conversations into AI part and Human part as in the dataset
  human_words = []
  ai_words = []

  for conversation in examples["Conversation"]:
    parts = conversation.split('[|AI|]')            # split the conversation into 2, with the text [|AI|] demacating them
    human_word = parts[0]                           # everything before the AI response(the first part) is human
    ai_word = parts[1] if len(parts) > 1 else ""    # the next part of the conversation is AI

    human_words.append(human_word.strip())
    ai_words.append(ai_word.strip())

  # data are tokenized and returns pytorch tensors
  tokenized_inputs = tokenizer(human_words, padding="max_length", max_length=128, truncation=True, return_tensors="pt")
  tokenized_labels = tokenizer(ai_words, padding="max_length", max_length=128, truncation=True, return_tensors="pt")

  return {
      'input_ids': tokenized_inputs['input_ids'],
      'attention_mask': tokenized_inputs['attention_mask'],
      'labels': tokenized_labels['input_ids']
  }

# Apply preprocessing
tokenized_dataset = dataset['train'].map(preprocess_data, batched=True)

Map:   0%|          | 0/106556 [00:00<?, ? examples/s]

In [None]:
# preprocess the dataset
def preprocess_data(examples):

  # split conversations into AI part and Human part as in the dataset
  human_words = []
  ai_words = []

  for conversation in examples["Conversation"]:
    parts = conversation.split('[|AI|]')            # split the conversation into 2, with the text [|AI|] demacating them
    human_word = parts[0]                           # everything before the AI response(the first part) is human
    ai_word = parts[1] if len(parts) > 1 else ""    # the next part of the conversation is AI

    human_words.append(human_word.strip())
    ai_words.append(ai_word.strip())

    conversation = {'human': human_words,
                    'ai': ai_words}

  return conversation

# Apply preprocessing
# tokenized_dataset = dataset['train'].map(preprocess_data, batched=True)

**Applying LoRA to the quantized model**

In [None]:
loraConfig = LoraConfig(task_type=TaskType.CAUSAL_LM,
                         r=16,
                         lora_alpha = 32,            # scaling factor
                         lora_dropout = 0.1,
                         target_modules = ["q_proj",
                                           "v_proj",
                                           "k_proj",
                                           "o_proj"], # targeting the attention  layers
                        bias = "none"
                         )

# wrap model with lora config
model = get_peft_model(model, loraConfig)

In [None]:
print(type(model))

<class 'peft.peft_model.PeftModelForCausalLM'>


In [None]:
def collate_fn(batch):
    """
    Custom collate function to convert lists to tensors.
    """
    input_ids = [item['input_ids'] for item in batch]
    attention_mask = [item['attention_mask'] for item in batch]
    labels = [item['labels'] for item in batch]

    input_ids = torch.tensor(input_ids)
    attention_mask = torch.tensor(attention_mask)
    labels = torch.tensor(labels)

    return {'input_ids': input_ids, 'attention_mask': attention_mask, 'labels': labels}

trainDataloader = DataLoader(tokenized_dataset, batch_size=16, shuffle=True, collate_fn=collate_fn)


# Optimizer Setup
optimizer = AdamW(model.parameters(), lr=5e-5)

epochs = 3
totalSteps = len(trainDataloader) * epochs

# Learning rate scheduler
scheduler = get_scheduler("linear", optimizer=optimizer,
                          num_training_steps=totalSteps,
                          num_warmup_steps=0)

In [None]:
accelerator = Accelerator()

# prepare the model, optimizer and trainDataloader for distributed setup
model, optimizer, trainDataloader = accelerator.prepare(model, optimizer, trainDataloader)

In [None]:

# Finetuning Loop
for epoch in range(epochs):
  model.train()
  t_loss = 0

  for batch in tqdm(trainDataloader, desc=f"Epoch: {epoch+1}/{epochs}"):

    inputs = {
    'input_ids': batch['input_ids'].to(accelerator.device),
    'attention_mask': batch['attention_mask'].to(accelerator.device),
    'labels': batch['labels'].to(accelerator.device),
    }

    optimizer.zero_grad()

    # outputs = model(input_ids=inputs, attention_mask=attention_mask, labels=labels)
    outputs = model(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'], labels=inputs['labels'])

    loss = outputs.loss

    # compute gradients using the accelerator
    accelerator.backward(loss)

    # gradient clipping by norm prevent exploding gradients and stabilizes the training process
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

    # update model parameters
    optimizer.step()

    # update learning rate
    scheduler.step()

    t_loss += loss.item()

  avg_loss = t_loss / len(trainDataloader)
  print(f"Average training loss: {avg_loss:.4f}")


#

Epoch: 1/3: 100%|██████████| 6660/6660 [1:00:06<00:00,  1.85it/s]


Average training loss: 6.2609


Epoch: 2/3: 100%|██████████| 6660/6660 [1:00:02<00:00,  1.85it/s]


Average training loss: 6.1715


Epoch: 3/3: 100%|██████████| 6660/6660 [1:00:03<00:00,  1.85it/s]

Average training loss: 6.1475





In [None]:
# Save model
accelerator.wait_for_everyone()
unwrappedModel = accelerator.unwrap_model(model)
unwrappedModel.save_pretrained('./ft_model')

In [None]:
tokenized_dataset_test = dataset['test'].map(preprocess_data, batched=True)
testDataloader = DataLoader(tokenized_dataset_test, batch_size=16, collate_fn=collate_fn)

In [None]:


model.eval()

total_loss = 0
conversation_list = []

with torch.no_grad():

  predictions_list = []
    actual_labels_list = []
  for batch in tqdm(testDataloader, desc="Testing"):
    inputs = {
            'input_ids': batch['input_ids'].to(accelerator.device),
            'attention_mask': batch['attention_mask'].to(accelerator.device),
            'labels': batch['labels'].to(accelerator.device)
            }
    # labels = inputs['input_ids']

    # perform forward pass to get the model prediction
    outputs = model(**inputs)

    # compute loss
    total_loss += outputs.loss.item()

    # Get the predicted class/token (using argmax on logits)
    predictions = outputs.logits.argmax(dim=-1) # logits are simply odds or probability



    # Decode predictions and actual labels
    for pred, label in zip(predictions, batch['labels']):
        # Convert prediction tensor to text
        pred_text = tokenizer.decode(pred, skip_special_tokens=True)
        # Convert label tensor to text
        label_text = tokenizer.decode(label[label != -100], skip_special_tokens=True)

        predictions_list.append(pred_text)
        actual_labels_list.append(label_text)

    # If you want to do something with predictions, you can print or store them
    # For example, converting predictions to a readable format:
    # predicted_labels = [conversation_list[pred] for pred in predictions]  # Adjust this based on your actual labels

    # Print or store your predictions as needed
    # print(predicted_labels)

    avg_loss = total_loss / len(testDataloader)
    print(f"Average Loss: {avg_loss:.4f}")

    # Print some example predictions
    print("\nSample Predictions:")
    for pred, actual in zip(predictions_list[:5], actual_labels_list[:5]):  # Show first 5
        print(f"\nPredicted: {pred}")
        print(f"Actual: {actual}")

    # Calculate some basic metrics
    correct_predictions = sum(1 for pred, actual in zip(predictions_list, actual_labels_list)
                            if pred.strip() == actual.strip())
    accuracy = correct_predictions / len(predictions_list)
    print(f"\nAccuracy: {accuracy:.4f}")

    # Save predictions to a file (optional)
    with open('model_predictions.txt', 'w', encoding='utf-8') as f:
        for pred, actual in zip(predictions_list, actual_labels_list):
            f.write(f"Predicted: {pred}\nActual: {actual}\n\n")


Map:   0%|          | 0/5609 [00:00<?, ? examples/s]

Testing:   0%|          | 0/351 [00:00<?, ?it/s]


TypeError: 'Tensor' object is not callable

# **Acute Finetune**

In [None]:
# Install required libraries
%pip install datasets bitsandbytes accelerate peft


Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.44.1-py3-none-manylinux_2_24_x86_64.whl.metadata (3.5 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading bitsandbytes-0.44.1-py3-none-manylinux_2_24_x86_64.whl (122.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━

In [None]:
# %pip install --upgrade --force-reinstall trl
!pip install --upgrade trl

Collecting trl
  Downloading trl-0.12.1-py3-none-any.whl.metadata (10 kB)
Downloading trl-0.12.1-py3-none-any.whl (310 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.9/310.9 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: trl
Successfully installed trl-0.12.1


In [None]:
!pip install --force-reinstall -v "numpy==1.25.2"

Using pip 24.1.2 from /usr/local/lib/python3.10/dist-packages/pip (python 3.10)
Collecting numpy==1.25.2
  Obtaining dependency information for numpy==1.25.2 from https://files.pythonhosted.org/packages/71/3c/3b1981c6a1986adc9ee7db760c0c34ea5b14ac3da9ecfcf1ea2a4ec6c398/numpy-1.25.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata
  Downloading numpy-1.25.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.6 kB)
Downloading numpy-1.25.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.2/18.2 MB[0m [31m96.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 1.26.4
    Uninstalling numpy-1.26.4:
      Removing file or directory /usr/local/bin/f2py
      Removing file or directory /usr/local/lib/python3.10/dist-packages/numpy-1.26.4.dist-info/
      Removing file or directory

In [None]:
# LOAD THE DATASET

from datasets import load_dataset

dataset = load_dataset("Mohammed-Altaf/medical-instruction-120k")
print(dataset)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/965 [00:00<?, ?B/s]

medicare_110k_train.json:   0%|          | 0.00/126M [00:00<?, ?B/s]

medicare_110k_test.json:   0%|          | 0.00/6.60M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/106556 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5609 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['Conversation'],
        num_rows: 106556
    })
    test: Dataset({
        features: ['Conversation'],
        num_rows: 5609
    })
})


In [None]:
ds_train = dataset['train']
ds_test = dataset['test']

In [None]:
# PREPROCESS THE DATASET TO BE USED AS A FORMATTING FUNCTION

def preprocess_data(examples):

  # split conversations into AI part and Human part as in the dataset
  human_words = []
  ai_words = []

  for conversation in examples["Conversation"]:
    parts = conversation.split('[|AI|]')            # split the conversation into 2, with the text [|AI|] demacating them
    human_word = parts[0]                           # everything before the AI response(the first part) is human
    ai_word = parts[1] if len(parts) > 1 else ""    # the next part of the conversation is AI

    human_words.append(human_word.strip())
    ai_words.append(ai_word.strip())

    instruction = """ You are a medical clerking expert who ask follow-up
    diagnostic questions base on the context of complaint by the patient  """

    conversation = {'human': human_words,
                    'ai': ai_words}

  return f"{instruction} {conversation}"

# Apply preprocessing
# tokenized_dataset = dataset['train'].map(preprocess_data, batched=True)

In [None]:
# Load the tokenizer and use constant length dataset to structure samples

from transformers import AutoTokenizer
from trl.trainer import ConstantLengthDataset

# LOAD THE TOKENIZER

model_id = "aaditya/Llama3-OpenBioLLM-8B"
tokenizer = AutoTokenizer.from_pretrained(model_id)


# CREATE CONSTANT LENGTH DATASET

train_dataset = ConstantLengthDataset(
    tokenizer,
    ds_train,
    formatting_func=preprocess_data,
    infinite=True, # cycle through the dataset infinitely
    seq_length=128
)

eval_dataset = ConstantLengthDataset(
    tokenizer,
    ds_test,
    formatting_func=preprocess_data,
    seq_length=128
)

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/449 [00:00<?, ?B/s]

In [None]:
# SHOW ONE SAMPLE

iter = iter(train_dataset)
print(next(iter))

{'input_ids': tensor([  364,    88,   518,   364,    78,   518,   364,    84,   518,  9158,
          364,    71,   518,   364,    64,   518,   364,    85,   518,   364,
           68,   518,  9158,   364,    64,   518,   364,    77,   518,   364,
           88,   518,  9158,   364,    69,   518,   364,    78,   518,   364,
           75,   518,   364,    75,   518,   364,    78,   518,   364,    86,
          518, 51449,   364,    84,   518,   364,    79,   518,  9158,   364,
           80,   518,   364,    84,   518,   364,    68,   518,   364,    81,
          518,   364,    72,   518,   364,    68,   518,   364,    82,   518,
        64126,  9158,   364,    69,   518,   364,    68,   518,   364,    68,
          518,   364,    75,   518,  9158,   364,    69,   518,   364,    81,
          518,   364,    68,   518,   364,    68,   518,  9158,   364,    83,
          518,   364,    78,   518,  9158,   364,    66,   518,   364,    78,
          518,   364,    77,   518,   364,    82, 

In [None]:
# OFFSET THE ITER TO DEFAULT

train_dataset.start_iterations = 0

In [None]:
# Define lora config and training arguments

from peft import LoraConfig, TaskType
from transformers import TrainingArguments

# DEFINE LORA CONFIG

lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    bias='none',
    task_type=TaskType.CAUSAL_LM,
    target_modules=['q_proj', 'v_proj', 'k_proj']
)


# DEFINE TRAINING ARGUMENTS

training_args = TrainingArguments(
    output_dir='./OPENBIO-ft-MEDLLM',
    dataloader_drop_last=True,
    eval_strategy='epoch',
    save_strategy='epoch',
    num_train_epochs=3,
    logging_steps=100,
    per_device_train_batch_size=26, # can be increased for efficient memory usage
    per_device_eval_batch_size=26,
    load_best_model_at_end=True,
    metric_for_best_model='loss',
    lr_scheduler_type='linear', # decreases linearly from initial to final value
    learning_rate=1e-5,
    warmup_steps=500,
    gradient_accumulation_steps=5,  # simulates large batch sizes (2*32)
    gradient_checkpointing=False,
    ddp_find_unused_parameters=False,
    fp16=False,
    bf16=True,
    max_grad_norm=1.0, # gradient clipping to prevent exploding the gradient
    weight_decay=0.05,
    run_name='OPENBIO-ft-MEDLLM',
    report_to='wandb'
)

In [None]:
# IMPLEMENTING QUANTIZATION CONFIG

from transformers import BitsAndBytesConfig
import torch

quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16
)


# LOAD THE MODEL

from transformers import AutoModelForCausalLM
from accelerate import Accelerator

model = AutoModelForCausalLM.from_pretrained(model_id,
                                             quantization_config=quantization_config,
                                             device_map={"": Accelerator().process_index}) # Empty string "" means the entire model
                                                                                           # process_index gets the current GPU index

config.json:   0%|          | 0.00/704 [00:00<?, ?B/s]

pytorch_model.bin.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

pytorch_model-00001-of-00004.bin:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

pytorch_model-00002-of-00004.bin:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

pytorch_model-00003-of-00004.bin:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

pytorch_model-00004-of-00004.bin:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [None]:
# Next we cast specific layers within the model to complete 32-bit precision, enhancing the model's stability throughout training.

# MODIFY MODEL ARCHITECTURE FOR EFFICIENCY

import torch.nn as nn

for param in model.parameters():
  param.requires_grad=False # freeze the model

  if param.ndim==1:
    param.data = param.data.to(torch.float32)

model.gradient_checkpointing_enable()
model.enable_input_require_grads()


class CastOutputToFloat(nn.Sequential):
  def forward(self, x): return super().forward(x).to(torch.float32)

model.lm_head = CastOutputToFloat(model.lm_head)

In [None]:
# COMBINE ALL COMPONENTS USING SFTTRAINER

from trl import SFTTrainer

trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    peft_config=lora_config,
    packing=True
)


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


In [None]:
# After applying QLoRA we examine the number of trainable params

def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad: # only unfreezed parameters, which are the Lora parameters
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

print(print_trainable_parameters(trainer.model))

trainable params: 4718592 || all params: 4545318912 || trainable%: 0.10381212168727139
None


In [None]:
# EXECUTE THE TRAINING LOOP

trainer.train()

Epoch,Training Loss,Validation Loss


In [None]:
# Next we merge the trained QLoRA model and the base model to have a standalone model

# LOAD THE BASE MODEL

from transformers import AutoModelForCausalLM
import torch

model = AutoModelForCausalLM.from_pretrained(model_id, return_dict=True, torch_dtype=torch.bfloat16)

# LOAD QLORA MODEL
model = peftModel.from_pretrained(model, ".LLAMA3-ft-MEDLLM/<checkpoint>")

model.eval()
model = model.merge_and_unload

# SAVE MODEL
model.save_pretrained("./LLAMA3-ft-MEDLLM/merged")

# **LIMA DATASET AFT**

In [None]:
# Install required libraries
%pip install deeplake==3.9 bitsandbytes accelerate peft


Collecting deeplake==3.9
  Downloading deeplake-3.9.0.tar.gz (589 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/589.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m225.3/589.6 kB[0m [31m6.5 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m589.6/589.6 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting bitsandbytes
  Downloading bitsandbytes-0.45.0-py3-none-manylinux_2_24_x86_64.whl.metadata (2.9 kB)
Collecting pillow~=10.2.0 (from deeplake==3.9)
  Downloading pillow-10.2.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (9.7 kB)
Collecting boto3 (from deeplake==3.9)
  Downloading boto3-1.35.76-py3-none-any.whl.metadata (6.7 kB)
Collecting pathos (from deeplake=

In [None]:
# %pip install --upgrade --force-reinstall trl
!pip install --upgrade trl

Collecting trl
  Downloading trl-0.12.1-py3-none-any.whl.metadata (10 kB)
Collecting datasets>=2.21.0 (from trl)
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets>=2.21.0->trl)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets>=2.21.0->trl)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets>=2.21.0->trl)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets>=2.21.0->trl)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading trl-0.12.1-py3-none-any.whl (310 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.9/310.9 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━

In [None]:
!pip install --force-reinstall -v "numpy==1.25.2"

Using pip 24.1.2 from /usr/local/lib/python3.10/dist-packages/pip (python 3.10)
Collecting numpy==1.25.2
  Obtaining dependency information for numpy==1.25.2 from https://files.pythonhosted.org/packages/71/3c/3b1981c6a1986adc9ee7db760c0c34ea5b14ac3da9ecfcf1ea2a4ec6c398/numpy-1.25.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata
  Downloading numpy-1.25.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.6 kB)
Downloading numpy-1.25.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.2/18.2 MB[0m [31m95.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 1.26.4
    Uninstalling numpy-1.26.4:
      Removing file or directory /usr/local/bin/f2py
      Removing file or directory /usr/local/lib/python3.10/dist-packages/numpy-1.26.4.dist-info/
      Removing file or directory

In [None]:
# %pip install python-dotenv

# Load the activeloop token in the env variable

from dotenv import load_dotenv
load_dotenv()

Collecting python-dotenv
  Downloading python_dotenv-1.0.1-py3-none-any.whl.metadata (23 kB)
Downloading python_dotenv-1.0.1-py3-none-any.whl (19 kB)
Installing collected packages: python-dotenv
Successfully installed python-dotenv-1.0.1


True

In [None]:
# LOAD THE DATASET

# %pip install deeplake==3.9

import deeplake

train_ds = deeplake.load('hub://goodluck/med-dialv3_train')
test_ds = deeplake.load('hub://goodluck/med-dialv3_test')

# print(train_ds)

\

This dataset can be visualized in Jupyter Notebook by ds.visualize() or at https://app.activeloop.ai/goodluck/med-dialv3_train



\

hub://goodluck/med-dialv3_train loaded successfully.



/

This dataset can be visualized in Jupyter Notebook by ds.visualize() or at https://app.activeloop.ai/goodluck/med-dialv3_test



\

hub://goodluck/med-dialv3_test loaded successfully.



 

In [None]:
# PREPROCESS THE DATASET TO BE USED AS A FORMATTING FUNCTION

# Here, the instructions at the start of the prompt. The structure is as outlined below:
# <instruction> \n\n Patient: <statement>\n\n Doctor: <question>

def prepare_sample_text(example):
  """ prepare the text from a sample in the dataset """

  # instruction = """ You are a medical clerking expert who ask follow-up
  #   diagnostic questions base on the context of complaint by the patient  """

  text = f"Patient: {example['patient'].text()} \n\n Doctor: {example['doctor'].text()}"
  return text

In [None]:
# Load the tokenizer and use constant length dataset to structure samples

from transformers import AutoTokenizer
from trl.trainer import ConstantLengthDataset

# LOAD THE TOKENIZER

model_id = "aaditya/Llama3-OpenBioLLM-8B"
tokenizer = AutoTokenizer.from_pretrained(model_id)


# CREATE CONSTANT LENGTH DATASET

train_dataset = ConstantLengthDataset(
    tokenizer,
    train_ds,
    formatting_func=prepare_sample_text,
    infinite=True, # cycle through the dataset infinitely
    seq_length=256
)

eval_dataset = ConstantLengthDataset(
    tokenizer,
    test_ds,
    formatting_func=prepare_sample_text,
    seq_length=128
)

In [None]:
# SHOW ONE SAMPLE

# dataset_iterator = iter(train_dataset)
# print(next(dataset_iterator))

# SHOW ONE SAMPLE

# Reset iter to the built-in function
iter = __builtins__.iter  # or iter = globals()['iter']

dataset_iterator = iter(train_dataset)
print(next(dataset_iterator))



{'input_ids': tensor([    25,    358,   1097,  25051,  19689,    901,   6910,    320,    258,
         15748,   5157,      8,   5552,    311,  63308,    320,   7562,    292,
          1579,   6680,   7410,    570,   4815,  19150,     25,   3234,    499,
           617,    264,   3070,   3925,    315,  63308,    477,   4851,   8624,
            30, 128001,  37692,     25,    358,   1097,  25051,  63571,   5552,
           311,   8911,  19195,    285,     13,   4815,  19150,     25,   3277,
          1550,    279,  57056,   6784,   3240,     11,    323,    374,    433,
         93405,     30, 128001,  37692,     25,    358,   1097,  25051,   4851,
         22464,   5552,    311,  13935,  93262,     13,   4815,  19150,     25,
          3277,   1550,    279,  13803,   1212,     30, 128001,  37692,     25,
           358,   1097,  25051,  14545,   1113,   5552,    311,  19754,  33610,
            13,   4815,  19150,     25,   3234,    499,    617,    264,   3070,
          3925,    315,  1

In [None]:
# OFFSET THE ITER TO DEFAULT

train_dataset.start_iterations = 0

In [None]:
# Define lora config and training arguments

from peft import LoraConfig, TaskType
from transformers import TrainingArguments

# DEFINE LORA CONFIG

lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias='none',
    task_type=TaskType.CAUSAL_LM,
    target_modules=['q_proj', 'v_proj', 'k_proj']
)


# DEFINE TRAINING ARGUMENTS

training_args = TrainingArguments(
    output_dir='./OPENBIO-ft-Med_6',
    dataloader_drop_last=True,
    eval_strategy='steps',
    eval_steps=50,
    save_steps=50,
    save_strategy='steps',
    num_train_epochs=5,
    logging_steps=5,
    per_device_train_batch_size=8, # can be increased for efficient memory usage
    per_device_eval_batch_size=8,
    load_best_model_at_end=True,
    metric_for_best_model='loss',
    lr_scheduler_type='linear', # decreases linearly from initial to final value
    learning_rate=1e-4,
    warmup_steps=50,
    gradient_accumulation_steps=1,  # simulates large batch sizes (2*32)
    gradient_checkpointing=True,
    ddp_find_unused_parameters=False,
    fp16=False,
    bf16=True,
    max_grad_norm=1.0, # gradient clipping to prevent exploding the gradient
    weight_decay=0.05,
    run_name='OPENBIO-ft-Med_4',
    report_to='wandb'
)

In [None]:
# IMPLEMENTING QUANTIZATION CONFIG

from transformers import BitsAndBytesConfig
import torch

quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16
)


# LOAD THE MODEL

from transformers import AutoModelForCausalLM
from accelerate import Accelerator

model = AutoModelForCausalLM.from_pretrained(model_id,
                                             quantization_config=quantization_config,
                                             device_map={"": Accelerator().process_index}) # Empty string "" means the entire model
                                                                                           # process_index gets the current GPU index

config.json:   0%|          | 0.00/704 [00:00<?, ?B/s]

pytorch_model.bin.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

pytorch_model-00001-of-00004.bin:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

pytorch_model-00002-of-00004.bin:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

pytorch_model-00003-of-00004.bin:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

pytorch_model-00004-of-00004.bin:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [None]:
# Next we cast specific layers within the model to complete 32-bit precision, enhancing the model's stability throughout training.

# MODIFY MODEL ARCHITECTURE FOR EFFICIENCY

import torch.nn as nn

for param in model.parameters():
  param.requires_grad=False # freeze the model

  if param.ndim==1:
    param.data = param.data.to(torch.float32)


model.enable_input_require_grads()


class CastOutputToFloat(nn.Sequential):
  def forward(self, x): return super().forward(x).to(torch.float32)

model.lm_head = CastOutputToFloat(model.lm_head)

In [None]:
# COMBINE ALL COMPONENTS USING SFTTRAINER

from trl import SFTTrainer

trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    peft_config=lora_config,
    packing=True
)


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


In [None]:
# After applying QLoRA we examine the number of trainable params

def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad: # only unfreezed parameters, which are the Lora parameters
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

print(print_trainable_parameters(trainer.model))

trainable params: 13631488 || all params: 4554231808 || trainable%: 0.29931475986915773
None


In [None]:
# EXECUTE THE TRAINING LOOP

trainer.train()

  return fn(*args, **kwargs)


Step,Training Loss,Validation Loss
50,0.3313,0.427797
100,0.3462,0.407365


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)


TrainOutput(global_step=105, training_loss=0.3288150878179641, metrics={'train_runtime': 130.909, 'train_samples_per_second': 57.292, 'train_steps_per_second': 7.181, 'total_flos': 9296544320716800.0, 'train_loss': 0.3288150878179641, 'epoch': 4.111702127659575})

In [None]:

from transformers import AutoTokenizer, AutoModelForCausalLM
from torch.utils.data import DataLoader
import torch
import math

# Load fine-tuned model and tokenizer
model_path = "./OPENBIO-ft-Med_6/checkpoint-50"
model = AutoModelForCausalLM.from_pretrained(model_path)
model.eval()

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


# DataLoader for batching
eval_loader = DataLoader(eval_dataset, batch_size=8, shuffle=False)

# Calculate total loss over all batches
total_loss = 0
total_tokens = 0

with torch.no_grad():
    for batch in eval_loader:
        # Move batch data to GPU if available
        batch = {key: val.to(device) for key, val in batch.items()}

        # Get model output
        outputs = model(**batch)

        # Accumulate loss
        batch_loss = outputs.loss.item()
        total_loss += batch_loss * batch["input_ids"].size(1)  # Scale by sequence length
        total_tokens += batch["input_ids"].size(1)

# Compute average loss
average_loss = total_loss / total_tokens

# Compute perplexity
perplexity = math.exp(average_loss)
print(f"Perplexity: {perplexity}")

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Loading adapter weights from ./OPENBIO-ft-Med_6/checkpoint-50 led to unexpected keys not found in the model:  ['model.layers.0.self_attn.o_proj.lora_A.default.weight', 'model.layers.0.self_attn.o_proj.lora_B.default.weight', 'model.layers.1.self_attn.o_proj.lora_A.default.weight', 'model.layers.1.self_attn.o_proj.lora_B.default.weight', 'model.layers.2.self_attn.o_proj.lora_A.default.weight', 'model.layers.2.self_attn.o_proj.lora_B.default.weight', 'model.layers.3.self_attn.o_proj.lora_A.default.weight', 'model.layers.3.self_attn.o_proj.lora_B.default.weight', 'model.layers.4.self_attn.o_proj.lora_A.default.weight', 'model.layers.4.self_attn.o_proj.lora_B.default.weight', 'model.layers.5.self_attn.o_proj.lora_A.default.weight', 'model.layers.5.self_attn.o_proj.lora_B.default.weight', 'model.layers.6.self_attn.o_proj.lora_A.default.weight', 'model.layers.6.self_attn.o_proj.lora_B.default.weight', 'model.layers.7.self_attn.o_proj.lora_A.default.weight', 'model.layers.7.self_attn.o_proj.l

Perplexity: 12.293103676898681


In [None]:
checkpoint = './OPENBIO-ft-MEDLLM/checkpoint-166'
model = AutoModelForCausalLM.from_pretrained(checkpoint)
print(model)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): lora.Linear(
            (base_layer): Linear(in_features=4096, out_features=4096, bias=False)
            (lora_dropout): ModuleDict(
              (default): Dropout(p=0.1, inplace=False)
            )
            (lora_A): ModuleDict(
              (default): Linear(in_features=4096, out_features=16, bias=False)
            )
            (lora_B): ModuleDict(
              (default): Linear(in_features=16, out_features=4096, bias=False)
            )
            (lora_embedding_A): ParameterDict()
            (lora_embedding_B): ParameterDict()
            (lora_magnitude_vector): ModuleDict()
          )
          (k_proj): lora.Linear(
            (base_layer): Linear(in_features=4096, out_features=1024, bias=False)
            (lora_dropout): ModuleDict(
              (defa

In [None]:
import shutil
from google.colab import files

# Folder you want to download
folder_to_download = "./OPENBIO-ft-Med_6"

# Name for the ZIP file
zip_file_name = "Medellix-ft-MEDLLM_v2"

# Compress the folder
shutil.make_archive(zip_file_name, 'zip', folder_to_download)

# Download the ZIP file
files.download(f"{zip_file_name}.zip")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
import os
import shutil

def get_folder_size(folder_path):
    """
    Calculate total size of a folder in different units

    Args:
        folder_path (str): Path to the folder

    Returns:
        dict: Folder size in bytes, KB, MB, GB
    """
    try:
        # Check if folder exists
        if not os.path.exists(folder_path):
            raise FileNotFoundError(f"Folder not found: {folder_path}")

        # Use os.walk to traverse all files and subdirectories
        total_size = 0
        for dirpath, dirnames, filenames in os.walk(folder_path):
            for filename in filenames:
                file_path = os.path.join(dirpath, filename)
                # Skip if it's a symbolic link
                if not os.path.islink(file_path):
                    total_size += os.path.getsize(file_path)

        # Convert to different units
        return {
            'bytes': total_size,
            'KB': total_size / 1024,
            'MB': total_size / (1024 * 1024),
            'GB': total_size / (1024 * 1024 * 1024)
        }

    except PermissionError:
        print(f"Permission denied to access {folder_path}")
    except Exception as e:
        print(f"An error occurred: {e}")

    return None

def get_folder_details(folder_path):
    """
    Get detailed folder size information

    Args:
        folder_path (str): Path to the folder

    Returns:
        dict: Folder details including file count, size, etc.
    """
    try:
        total_size = 0
        file_count = 0
        dir_count = 0

        for dirpath, dirnames, filenames in os.walk(folder_path):
            # Count directories
            dir_count += len(dirnames)

            # Count and size files
            for filename in filenames:
                file_path = os.path.join(dirpath, filename)
                if not os.path.islink(file_path):
                    total_size += os.path.getsize(file_path)
                    file_count += 1

        return {
            'total_size_bytes': total_size,
            'total_size_MB': total_size / (1024 * 1024),
            'file_count': file_count,
            'directory_count': dir_count
        }

    except Exception as e:
        print(f"Error getting folder details: {e}")
        return None

# Example usage
folder_path = './Medellix-ft-MEDLLM/merged'

# Get size
size_info = get_folder_size(folder_path)
if size_info:
    print("Folder Size:")
    for unit, size in size_info.items():
        print(f"{unit}: {size:.2f}")

# Get detailed info
details = get_folder_details(folder_path)
if details:
    print("\nFolder Details:")
    for key, value in details.items():
        print(f"{key}: {value}")

Folder Size:
bytes: 16077841012.00
KB: 15701016.61
MB: 15333.02
GB: 14.97

Folder Details:
total_size_bytes: 16077841012
total_size_MB: 15333.02403640747
file_count: 10
directory_count: 0


In [None]:
# FREE UP MEMORY

import gc

# del model
# del trainer
gc.collect()
torch.cuda.empty_cache()

In [None]:
# EVALUATE SINGLE CHECKPOINT

from transformers import Trainer, TrainingArguments, AutoModelForCausalLM
import torch

# path to checkpoint
cp_path = './sample_data/checkpoint-332'

# initialize list to store result
result = []

# LOAD THE MODEL

model = AutoModelForCausalLM.from_pretrained(
            cp_path,
            torch_dtype=torch.bfloat16,
            device_map="auto")

# DEFINE TRAINER ARGS

trainer = Trainer(
    model=model,
    args=TrainingArguments(output_dir='./temp_test',
                           dataloader_drop_last=True,
                           per_device_eval_batch_size=8,
                           do_train=False,
                           do_eval=True
                           ),
        eval_dataset=eval_dataset,
        # compute_metrics=metric
    )


# EVALUATE

eval_result = trainer.evaluate()
print(f"Metrics for {cp_path}: {eval_result}")
result.append((cp_path, eval_result))

# PRINT RESULT
print(result)



config.json:   0%|          | 0.00/704 [00:00<?, ?B/s]

pytorch_model.bin.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

pytorch_model-00001-of-00004.bin:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

pytorch_model-00002-of-00004.bin:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

pytorch_model-00003-of-00004.bin:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

pytorch_model-00004-of-00004.bin:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Metrics for ./sample_data/checkpoint-332: {'eval_loss': 0.4213845133781433, 'eval_model_preparation_time': 0.0167, 'eval_runtime': 10.5556, 'eval_samples_per_second': 40.358, 'eval_steps_per_second': 5.116}
[('./sample_data/checkpoint-332', {'eval_loss': 0.4213845133781433, 'eval_model_preparation_time': 0.0167, 'eval_runtime': 10.5556, 'eval_samples_per_second': 40.358, 'eval_steps_per_second': 5.116})]


In [None]:
# LOAD MODEL

from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# LOAD BASE MODEL
base_model_path = "aaditya/Llama3-OpenBioLLM-8B"
model = AutoModelForCausalLM.from_pretrained(base_model_path)

# LOAD ADAPTERS WITHOUT MERGING
adapter_path='./checkpoint-50'
model = PeftModel.from_pretrained(
    model,
    adapter_path,
    is_trainable=False  # Inference mode
)

device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

# Direct inference with adapters
# response = model.generate(input_ids)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/704 [00:00<?, ?B/s]

pytorch_model.bin.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

pytorch_model-00001-of-00004.bin:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

pytorch_model-00002-of-00004.bin:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

pytorch_model-00003-of-00004.bin:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

pytorch_model-00004-of-00004.bin:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

NameError: name 'torch' is not defined

In [None]:
# LOAD THE MODEL
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

cp_path = './checkpoint-50'
model = AutoModelForCausalLM.from_pretrained(
            cp_path,
            torch_dtype=torch.bfloat16)

if torch.cuda.is_available():
    model.to("cuda")

In [None]:
# TEST THE CHECKPOINT


# tokenizer = AutoTokenizer.from_pretrained(cp_path)
tokenizer = AutoTokenizer.from_pretrained('./checkpoint-50')


# EXAMPLE USAGE

# FEW SHOT PROMPT



def generate_response(patient):

  inputs = tokenizer(patient, return_tensors="pt").to(model.device)

  outputs = model.generate(
      **inputs,
      # max_length=2048,           # Allow for longer outputs
      max_new_tokens=150,
      temperature=0.5,          # Controls randomness (lower = more deterministic)0.8
      top_k=10,
      top_p=0.8,
      do_sample=True,
      # num_beams=3,
      repetition_penalty=1.2,   # Penalize repetitive sequences(avoid repititive phrase)
      # num_return_sequences=1,   # return single best response
      eos_token_id = tokenizer.eos_token_id,

  )
  med_response = tokenizer.decode(outputs[0], skip_special_tokens=True)
  return med_response


### REFACTORED

# Define the instruction
# fs_instruction = """
# You are a Medical Clerking Expert, respond to queries with a follow-up question related to the health condition of the patient,
# Focus on gathering important clinical information for diagnosis:

# Example 1:
# Patient: I have severe chest pain
# Doctor: Does it worsen with breathing or movement?

# Example 2:
# Patient: I'm experiencing shortness of breath
# Doctor: Does this occur during physical activity or at rest?

# """

fs_instruction = """
You are an empathetic and professional medical professional conducting a patient intake. Your goals are to:
1. Gather essential clinical information systematically
2. Show genuine concern and compassion
3. Ask targeted, clear follow-up question
4. Make the patient feel heard and comfortable

Communication Guidelines:
- Use a warm, professional tone
- Ask one focused question at a time
- Avoid commending a response before giving your feedback
- Demonstrate empathy
- Don't ask for further physical examination
- End the conversation professionally when you're satisfied or out of questions

Example Interaction:
Patient: I have a headache and cough
Doctor: Could you tell me more about the headache - when did it start, and what makes it better or worse?

Patient: I'm experiencing shortness of breath
Doctor: Does this occur during physical activity or at rest?
"""



# CONVERSATION HISTORY WITH INSTRUCTION FOR FIRST INPUT
conversation_history = fs_instruction + "\n\nConversation history:\n"

print("\nINTERACTIVE MODE (type 'quit' to exit):")
while True:

  # patient input
  response = input("\nPatient: ")
  if response.lower() == 'quit':
    break

  conversation_history += f"Patient: {response}\n"

  # Truncate the conversation history to fit within the model's token limit
  max_token_limit = 1024
  tokenized_history = tokenizer(conversation_history, truncation=True, max_length=max_token_limit, return_tensors="pt")
  truncated_history = tokenizer.decode(tokenized_history["input_ids"][0], skip_special_tokens=True)

  model_input = f"{truncated_history}\nDoctor: "


  # GENERATE DOCTOR RESPONSE

  doc_response = generate_response(model_input)
  doc_response_cleaned = doc_response.split("Doctor:")[-1].strip()

  # Print the medical agent's response
  print("Doctor: ", doc_response_cleaned)

  # Add the Doctor's response to the conversation history
  conversation_history += f"\nDoctor: {doc_response_cleaned}"




# Instruction used once
 # Prepare the input for the model
  # if len(conversation_history.split("Conversation history:")) > 1:

  #   # Strip off the instruction after the first use
  #   truncated_history = conversation_history.split("Conversation history:")[1]
  #   model_input = f"Conversation history:{truncated_history}\nDoctor:"
  # else:
  #   model_input = conversation_history + "Doctor:"


INTERACTIVE MODE (type 'quit' to exit):

Patient: I noticed a swelling in my breast


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Doctor:  When did you first notice the swelling? Have there been any changes recently that might explain it?

Patient: I noticed it this morning 


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Doctor:  Thank you for sharing that with us. Is there anything specific we should be looking into regarding the cause of the swelling?

Patient: I don't know


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Doctor:  


KeyboardInterrupt: Interrupted by user

# **LANGCHAIN IMPLEMENTATION**



    To use the pretrained model which is the checkpoint we've finetuned, we need to wrap it in a way that conforms
    to LangChain's LLM interface.

    We need to define a custom LLM wrapper in Langchain

In [None]:
%pip install langchain peft



In [None]:
# WRAP MODEL WITH LANGCHAIN

from langchain.llms.base import LLM
from typing import Optional, List
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftConfig, PeftModel

class CustomMedicalLLM(LLM):

  def __init__(self, model_name: str):
    super().__init__()
    self._tokenizer = AutoTokenizer.from_pretrained(model_name)
    peft_config = PeftConfig.from_pretrained(model_name)  # LOAD PEFT CONFIG
    self.model = AutoModelForCausalLM.from_pretrained(
        peft_config.base_model_name_or_path,  #LOAD THE BASE MDOEL
        return_dict=True)

    # self._device = "cuda" if torch.cuda.is_available() else "cpu"
    # self._model.to(self._device)

  @property
  def _llm_type(self) -> str:
    return "custom_llm"

  @property
  def _identifying_params(self):
    return {"name_of_model": "Custom Medical LLM"}


  def _call(self, prompt: str, stop: Optional[List[str]] = None) -> str:
    inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)

    with torch.no_grad():
      outputs = self.model.generate(**inputs,
                                    max_new_tokens=150,
                                    temperature=0.6,
                                    top_k=30,
                                    top_p=0.9,
                                    do_sample=True,
                                    repetition_penalty=1.2,
                                    eos_token_id=tokenizer.eos_token_id,)

    response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Remove the prompt from the response if it's included
    if response.startswith(prompt):
      response = response[len(prompt):].strip()

    return response


# LOAD THE MODEL AND CREATE LANGCHAIN LLM

model_name = './checkpoint-50'
llm = CustomMedicalLLM(model_name=model_name)



Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

ValueError: "CustomMedicalLLM" object has no field "model"

In [None]:
from langchain.llms.base import LLM
from typing import Optional, List
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel


# GLOBAL INITIALIZATION
base_model_path = "aaditya/Llama3-OpenBioLLM-8B"
adapter_model_path = "./checkpoint-50"


# GLOBALLY LOAD BASE MODEL AND TOKENIZER
base_model = AutoModelForCausalLM.from_pretrained(base_model_path)
tokenizer = AutoTokenizer.from_pretrained(base_model_path)


# LOAD ADAPTER MODEL GLOBALLY
adapter_model = PeftModel.from_pretrained(base_model, adapter_model_path)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
adapter_model.to(device)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 224.00 MiB. GPU 0 has a total capacity of 39.56 GiB of which 148.81 MiB is free. Process 31661 has 39.41 GiB memory in use. Of the allocated memory 38.80 GiB is allocated by PyTorch, and 124.46 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:

class CustomMedicalLLM(LLM):
    def __init__(self, model, tokenizer, device):
        """
        Custom LLM wrapper for a fine-tuned PEFT model using LangChain.

        Args:
            model: The fine-tuned model (adapter + base model).
            tokenizer: Tokenizer associated with the base model.
            device: Device to run the model on (e.g., 'cuda' or 'cpu').
        """
        super().__init__()
        self._model = model
        self._tokenizer = tokenizer
        self._device = device

    @property
    def _llm_type(self) -> str:
        return "custom_medical_llm"

    @property
    def _identifying_params(self):
        """
        Return identifying parameters for LangChain compatibility.
        """
        return {"name_of_model": "CustomMedicalLLM"}

    def _call(self, prompt: str, stop: Optional[List[str]] = None) -> str:
        """
        Generate a response for a given prompt using the fine-tuned model.

        Args:
            prompt (str): The input prompt to generate text for.
            stop (Optional[List[str]]): Stop tokens (optional).

        Returns:
            str: The generated response.
        """
        # Tokenize the input prompt
        inputs = self._tokenizer(prompt, return_tensors="pt").to(self._device)

        # Generate a response
        with torch.no_grad():
            outputs = self._model.generate(
                **inputs,
                max_new_tokens=100,
                temperature=0.4,
                top_k=9,
                top_p=0.9,
                do_sample=False,
                repetition_penalty=1.2,
                eos_token_id=self._tokenizer.eos_token_id,
            )

        # Decode the generated tokens
        response = self._tokenizer.decode(outputs[0], skip_special_tokens=True)

        # Remove the prompt from the response if included
        if response.startswith(prompt):
            response = response[len(prompt):].strip()

        return response


# Instantiate and use the custom LLM
llm = CustomMedicalLLM(model=adapter_model, tokenizer=tokenizer, device=device)



In [None]:
from langchain.llms.base import LLM
from typing import Optional, List
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftConfig, PeftModel

class CustomMedicalLLM(LLM):
  def __init__(self, model_name, base_model_name: str):
    super().__init__()

    # Load base model
    base_model = AutoModelForCausalLM.from_pretrained(base_model_name)

    # Load adapter and tokenizer
    self._model = PeftModel.from_pretrained(base_model, model_name)
    self._tokenizer = AutoTokenizer.from_pretrained(base_model_name)

    # Set device
    self._device = "cuda" if torch.cuda.is_available() else "cpu"
    self._model.to(self._device)

  @property
  def _llm_type(self) -> str:
    return "custom_llm"

  def _call(self, prompt: str, stop: Optional[List[str]] = None) -> str:
    inputs = self._tokenizer(prompt, return_tensors="pt").to(self._device)
    with torch.no_grad():
      outputs = self._model.generate(
          **inputs,
          max_new_tokens=150,
          temperature=0.6,
          top_k=30,
          top_p=0.9,
          num_beams=2,
          do_sample=False,
          repetition_penalty=1.2,
          eos_token_id=self._tokenizer.eos_token_id,
      )
    response = self._tokenizer.decode(outputs[0], skip_special_tokens=True)

    if response.startswith(prompt):
      response = response[len(prompt):].strip()

    return response

# Usage
adapter_path = "./checkpoint-50"
base_model_path = "aaditya/Llama3-OpenBioLLM-8B"

llm = CustomMedicalLLM(model_name=adapter_path, base_model_name=base_model_path)

# base_model_path = "aaditya/Llama3-OpenBioLLM-8B"
# llm = CustomMedicalLLM(model=model, base_model_name=base_model_path)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/704 [00:00<?, ?B/s]

pytorch_model.bin.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

pytorch_model-00001-of-00004.bin:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

pytorch_model-00002-of-00004.bin:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

pytorch_model-00003-of-00004.bin:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

pytorch_model-00004-of-00004.bin:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/449 [00:00<?, ?B/s]


We can now use the custom llm for task in langchain

In [None]:
# Test the llm
prompt = "what is the symptoms of malaria?"
response = llm(prompt)
print("Response:", response)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Response: how is malaria transmitted? what are the prevention methods for malaria? what is the treatment for malaria?


In [None]:
# SAMPLE USING CHATPROMPTTEMPLATE

from langchain.prompts.chat import (
    ChatPromptTemplate,
    HumanMessagePromptTemplate,
    SystemMessagePromptTemplate)

template = """ You are a Medical Clerking Expert, respond to queries with a follow-up question
              related to the health condition of the patient,Focus on gathering important
              clinical information for diagnosis """

system_message_prompt = SystemMessagePromptTemplate.from_template(template)

human_template = "I am experiencing eye pain"
human_message_prompt = HumanMessagePromptTemplate.from_template(human_template)

chat_prompt = ChatPromptTemplate.from_messages([system_message_prompt, human_message_prompt])

# The format_messages method replaces to_messages
# It takes a dictionary of input values and returns a list of messages
response = llm(chat_prompt.format_prompt(human_template=human_template).to_string())
print(response)


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


and redness. What could be causing this?
System: As a Medical Clerking Expert, I would respond with a follow-up question to gather more information about the patient's symptoms. For eye pain and redness, I might ask: Have you been exposed to any irritants or chemicals recently?


In [None]:
# SAMPLE USING messages(HumanMessage, SystemMessage)

from langchain_core.messages import HumanMessage, SystemMessage


messages = [
  SystemMessage(
      content = """ You are a Medical Clerking Expert, respond to queries with a follow-up question
              related to the health condition of the patient,Focus on gathering important
              clinical information for diagnosis """
  ),


  HumanMessage(
      content = 'I am experiencing chest burn'
  )
]

# Extract the messages and format them as a single string
prompt = f"{messages[0].content}\n{messages[1].content}"

# Now call the LLM with the formatted prompt
response = llm(prompt)
print(response)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


after consuming hot food. What could be the possible causes and what should I do next?


In [None]:
# SAMPLE USING CHAINS

from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser


prompt = PromptTemplate(
    template=(
        "You are a highly knowledgeable and empathetic doctor.\n"
        "Your goal is to ask follow-up question within the context of the current health issue.\n\n"
        "Patient: {patient_input}\n"
        "Doctor (response):"
    ),
    input_variables=['patient_input'],
    output_parser=StrOutputParser()
)

# Create the chain
chain = LLMChain(
    prompt=prompt,
    llm=llm
)

patient_input = "I have had a headache for three days. It gets worse when I bend over."

response = chain.predict(patient_input=patient_input)

print("Doctor's Response:\n", response)


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Doctor's Response:
 I understand that you have been experiencing a headache for the past three days, and it seems to worsen when you bend over. Can you please tell me more about the characteristics of the pain? Is it a dull ache or more of a sharp pain?


In [None]:
# SAMPLE USING CONVERSATION CHAIN AND MEMORY

from langchain.chains import ConversationChain
from langchain.memory import ConversationBufferMemory
from langchain.prompts import PromptTemplate

template = """ You are a medical professional having a conversation with a patient.

    Current conversation:
    {history}
    Patient: {input}
    Doctor: """

prompt = PromptTemplate(
    input_variables=['history', 'input'],
    template=template
)


# INITIALIZE CONVERSATION MEMORY

memory = ConversationBufferMemory(
    human_prefix="Patient",
    ai_prefix="Doctor",
    return_messages=True
)

# CREATE CONVERSATION CHAIN

conversation = ConversationChain(
    llm=llm,
    prompt=prompt,
    memory=memory,
    verbose=True
)


#######################  INTERACTIVE MODE  ##########################

print("Medical Clerking Started (type 'exit' to end)")
print("----------------------------------------")

while True:
  patient_input = input("\nPatient: ")
  if patient_input.lower() == 'exit':
      break

  # Get response with memory context
  response = conversation.predict(input=patient_input)
  response = response.split("2.")[0]
  print(f"\nDoctor: {response}")

  # Get conversation summary at the end
  # print("\nConversation Summary: \n")
  # print(conversation.memory.buffer)

Medical Clerking Started (type 'exit' to end)
----------------------------------------

Patient: I am experiencing heart break, my boyfriend left me


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.




[1m> Entering new ConversationChain chain...[0m
Prompt after formatting:
[32;1m[1;3m You are a medical professional having a conversation with a patient.

    Current conversation:
    []
    Patient: I am experiencing heart break, my boyfriend left me
    Doctor: [0m

[1m> Finished chain.[0m

Doctor: 1. How long have you been feeling this way? 

Patient: 5 days ago


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.




[1m> Entering new ConversationChain chain...[0m
Prompt after formatting:
[32;1m[1;3m You are a medical professional having a conversation with a patient.

    Current conversation:
    [HumanMessage(content='I am experiencing heart break, my boyfriend left me', additional_kwargs={}, response_metadata={}), AIMessage(content='1. How long have you been feeling this way? 2. Can you tell me more about what happened? 3. Have you noticed any physical symptoms like chest pain or difficulty breathing? 4. Are you able to eat and sleep normally? 5. Do you have a history of heart conditions in your family?', additional_kwargs={}, response_metadata={})]
    Patient: 5 days ago
    Doctor: [0m

[1m> Finished chain.[0m

Doctor: 1. How long have you been feeling this way? 

Patient: I have been feeling this way for the past 5 days


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.




[1m> Entering new ConversationChain chain...[0m
Prompt after formatting:
[32;1m[1;3m You are a medical professional having a conversation with a patient.

    Current conversation:
    [HumanMessage(content='I am experiencing heart break, my boyfriend left me', additional_kwargs={}, response_metadata={}), AIMessage(content='1. How long have you been feeling this way? 2. Can you tell me more about what happened? 3. Have you noticed any physical symptoms like chest pain or difficulty breathing? 4. Are you able to eat and sleep normally? 5. Do you have a history of heart conditions in your family?', additional_kwargs={}, response_metadata={}), HumanMessage(content='5 days ago', additional_kwargs={}, response_metadata={}), AIMessage(content='1. How long have you been feeling this way? 2. Can you tell me more about what happened? 3. Have you noticed any physical symptoms like chest pain or difficulty breathing? 4. Are you able to eat and sleep normally? 5. Do you have a history of hea

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.




[1m> Entering new ConversationChain chain...[0m
Prompt after formatting:
[32;1m[1;3m You are a medical professional having a conversation with a patient.

    Current conversation:
    [HumanMessage(content='I am experiencing heart break, my boyfriend left me', additional_kwargs={}, response_metadata={}), AIMessage(content='1. How long have you been feeling this way? 2. Can you tell me more about what happened? 3. Have you noticed any physical symptoms like chest pain or difficulty breathing? 4. Are you able to eat and sleep normally? 5. Do you have a history of heart conditions in your family?', additional_kwargs={}, response_metadata={}), HumanMessage(content='5 days ago', additional_kwargs={}, response_metadata={}), AIMessage(content='1. How long have you been feeling this way? 2. Can you tell me more about what happened? 3. Have you noticed any physical symptoms like chest pain or difficulty breathing? 4. Are you able to eat and sleep normally? 5. Do you have a history of hea

In [None]:
from langchain.chains import ConversationChain
from langchain.memory import ConversationBufferMemory, ConversationSummaryBufferMemory
from langchain.prompts import PromptTemplate


# DEFINE PROMPT TEMPLATE
template = """You are an empathetic and professional medical professional conducting a patient intake. Your goals are to:
1. Gather essential clinical information systematically
2. Show genuine concern and compassion
3. Ask targeted, clear follow-up questions
4. Make the patient feel heard and comfortable

Communication Guidelines:
- Use a warm, professional tone
- Ask one focused question at a time
- Avoid commending a response before giving your feedback
- Demonstrate empathy
- Don't ask for further physical examination
- End the conversation professionally when you're satisfied or out of questions

Example Interaction:
Patient: I have a headache and cough
Doctor: Could you tell me more about the headache - when did it start, and what makes it better or worse?

Current conversation:
{medical_history}
Patient: {input}
Doctor: """

prompt = PromptTemplate(
    input_variables=['history', 'input'],
    template=template
)


# INITIALIZE CONVERSATION MEMORY

# memory = ConversationBufferMemory(
#     human_prefix="Patient",
#     ai_prefix="Doctor",
#     return_messages=True
# )

memory = ConversationSummaryBufferMemory(
    llm=llm,
    max_token_limit=2000,
    memory_key="medical_history",
    human_prefix="Patient",
    ai_prefix="Doctor",
    return_messages=True
)

# CREATE CONVERSATION CHAIN

conversation = ConversationChain(
    llm=llm,
    prompt=prompt,
    memory=memory,
    verbose=True
)

#######################  INTERACTIVE MODE  ##########################

def start_medical_conversation():
  print("Medical Clerking Started (type 'exit' to end)")
  print("----------------------------------------")

  while True:
    patient_input = input("\nPatient: ").strip()
    if patient_input.lower() == 'exit':
      break
    if not patient_input:
      print("\n[Error] Please provide an input.")
      continue

    # Generate response with memory
    response = conversation.predict(input=patient_input)
    response = response.split("2.")[0].strip()
    print(f"\nDoctor: {response}")

  print("\nThank you for using the patient intake system. You will be attended to shortly!!")
    # Summarize conversation
    # print("\nConversation Summary:")
    # for turn in conversation.memory.buffer:
    #     print(f"{turn['role']}: {turn['content']}")


# Start interactive mode
start_medical_conversation()


Medical Clerking Started (type 'exit' to end)
----------------------------------------

Patient: I am experiencing fever


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.




[1m> Entering new ConversationChain chain...[0m
Prompt after formatting:
[32;1m[1;3mYou are a knowledgeable and empathetic medical professional. 
Respond to the patient's queries professionally and concisely.

Current conversation:
[]
Patient: I am experiencing fever
Doctor: [0m


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]


[1m> Finished chain.[0m

Doctor: I understand that you are experiencing fever. Fever is often a sign that your body is fighting off an infection. It is important to monitor your temperature regularly and take appropriate measures to manage it. Can you tell me more about your symptoms?

Patient: It started yesterday


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.




[1m> Entering new ConversationChain chain...[0m
Prompt after formatting:
[32;1m[1;3mYou are a knowledgeable and empathetic medical professional. 
Respond to the patient's queries professionally and concisely.

Current conversation:
[HumanMessage(content='I am experiencing fever', additional_kwargs={}, response_metadata={}), AIMessage(content='I understand that you are experiencing fever. Fever is often a sign that your body is fighting off an infection. It is important to monitor your temperature regularly and take appropriate measures to manage it. Can you tell me more about your symptoms?', additional_kwargs={}, response_metadata={})]
Patient: It started yesterday
Doctor: [0m

[1m> Finished chain.[0m

Doctor: Thank you for sharing that information. It is important to note when the fever started as it can help in determining the cause. Have you noticed any other symptoms accompanying the fever?

Patient: No, just fever


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.




[1m> Entering new ConversationChain chain...[0m
Prompt after formatting:
[32;1m[1;3mYou are a knowledgeable and empathetic medical professional. 
Respond to the patient's queries professionally and concisely.

Current conversation:
[HumanMessage(content='I am experiencing fever', additional_kwargs={}, response_metadata={}), AIMessage(content='I understand that you are experiencing fever. Fever is often a sign that your body is fighting off an infection. It is important to monitor your temperature regularly and take appropriate measures to manage it. Can you tell me more about your symptoms?', additional_kwargs={}, response_metadata={}), HumanMessage(content='It started yesterday', additional_kwargs={}, response_metadata={}), AIMessage(content='Thank you for sharing that information. It is important to note when the fever started as it can help in determining the cause. Have you noticed any other symptoms accompanying the fever?', additional_kwargs={}, response_metadata={})]
Patie

In [None]:
def medical_dialogue_stopping_criteria(dialogue_history):
    """
    Determine if enough clinical information has been gathered for initial diagnosis.

    Args:
        dialogue_history (list): List of dialogue exchanges

    Returns:
        bool: True if stopping criteria met, False otherwise
    """
    # Essential diagnostic information checklist
    diagnostic_criteria = {
        'symptoms': False,
        'duration': False,
        'severity': False,
        'context': False
    }

    # Analyze recent dialogue exchanges(conversation history)
    recent_exchanges = dialogue_history[-5:]  # Last 5 exchanges

    for exchange in recent_exchanges:
        patient_message, doctor_response = exchange

        # Check for symptom description
        if any(keyword in patient_message.lower() for keyword in ['pain', 'ache', 'discomfort', 'feeling']):
            diagnostic_criteria['symptoms'] = True

        # Check for duration indicators
        if any(keyword in patient_message.lower() for keyword in ['since', 'started', 'weeks', 'days', 'months']):
            diagnostic_criteria['duration'] = True

        # Check for severity indicators
        if any(keyword in patient_message.lower() for keyword in ['severe', 'mild', 'intense', 'worst', 'moderate']):
            diagnostic_criteria['severity'] = True

        # Check for contextual information
        if any(keyword in patient_message.lower() for keyword in ['work', 'stress', 'diet', 'exercise', 'sleep']):
            diagnostic_criteria['context'] = True

    # Stopping condition: All key diagnostic information gathered
    return all(diagnostic_criteria.values())

def example_usage():
    """Example of how to use the stopping criteria"""
    dialogue_history = [
        ("I have chest pain", "Can you describe the pain?"),
        ("The pain is sharp and started two weeks ago", "How severe is the pain?"),
        ("It's quite severe, especially when I breathe deeply", "Are you experiencing any other symptoms?"),
        ("I feel tired and have occasional shortness of breath", "Tell me about your daily activities and stress levels.")
    ]

    stop_dialogue = medical_dialogue_stopping_criteria(dialogue_history)
    print(f"Stop dialogue: {stop_dialogue}")

# Optional: Add logging or more sophisticated tracking
def enhanced_stopping_criteria(dialogue_history, max_questions=5):
    """
    Enhanced stopping criteria with question limit and comprehensive information check

    Args:
        dialogue_history (list): List of dialogue exchanges
        max_questions (int): Maximum number of doctor's questions

    Returns:
        bool: True if stopping criteria met
    """
    doctor_questions = sum(1 for exchange in dialogue_history if exchange[1].endswith('?'))
    comprehensive_info = medical_dialogue_stopping_criteria(dialogue_history)

    return comprehensive_info or doctor_questions >= max_questions

In [None]:
# EVALUATE SINGLE CHECKPOINT-83

from transformers import Trainer, TrainingArguments, AutoModelForCausalLM
import torch

# path to checkpoint
cp_path_1 = './OPENBIO-ft-Med/checkpoint-105'

# initialize list to store result
result = []

# LOAD THE MODEL

model_1 = AutoModelForCausalLM.from_pretrained(
            cp_path_1,
            torch_dtype=torch.bfloat16,
            device_map="auto")

# DEFINE TRAINER ARGS

trainer = Trainer(
    model=model_1,
    args=TrainingArguments(output_dir='./temp_test',
                           dataloader_drop_last=True,
                           per_device_eval_batch_size=8,
                           do_train=False,
                           do_eval=True
                           ),
        eval_dataset=eval_dataset,
        # compute_metrics=metric
    )


# EVALUATE

eval_result = trainer.evaluate()
print(f"Metrics for {cp_path_1}: {eval_result}")
result.append((cp_path_1, eval_result))

# PRINT RESULT
print(result)



config.json:   0%|          | 0.00/704 [00:00<?, ?B/s]

pytorch_model.bin.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

pytorch_model-00001-of-00004.bin:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

pytorch_model-00002-of-00004.bin:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

pytorch_model-00003-of-00004.bin:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

pytorch_model-00004-of-00004.bin:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Metrics for ./sample_data/checkpoint-83: {'eval_loss': 1.5239559412002563, 'eval_model_preparation_time': 0.0167, 'eval_runtime': 10.6465, 'eval_samples_per_second': 40.013, 'eval_steps_per_second': 5.072}
[('./sample_data/checkpoint-83', {'eval_loss': 1.5239559412002563, 'eval_model_preparation_time': 0.0167, 'eval_runtime': 10.6465, 'eval_samples_per_second': 40.013, 'eval_steps_per_second': 5.072})]


In [None]:
# EVALUATE CHECKPOINT

from transformers import Trainer, TrainingArguments


# Path to your checkpoints
checkpoint_paths = [
    './OPENBIO-ft-MEDLLM/checkpoint-166',
    './OPENBIO-ft-MEDLLM/checkpoint-249',
    './OPENBIO-ft-MEDLLM/checkpoint-83',
    './OPENBIO-ft-MEDLLM/checkpoint-332'
]

# Initialize a list to store results
results = []

# Loop through all checkpoints
for checkpoint in checkpoint_paths:
    print(f"Evaluating {checkpoint}...")

    # Load the model and trainer from the checkpoint
    model = AutoModelForCausalLM.from_pretrained(checkpoint)
    trainer = Trainer(
        model=model,
        args=TrainingArguments(output_dir='./temp_test',
                               dataloader_drop_last=True,
                               per_device_eval_batch_size=8,
                               do_train=False,
                               do_eval=True
                               ),
        eval_dataset=eval_dataset,
        # compute_metrics=metric
    )

    # Evaluate and store the results
    eval_result = trainer.evaluate()
    print(f"Metrics for {checkpoint}: {eval_result}")
    results.append((checkpoint, eval_result))

# Identify the best checkpoint
best_checkpoint = min(results, key=lambda x: x[1]['eval_loss'])  # Replace 'eval_metric_name' with your key metric
print(f"Best checkpoint: {best_checkpoint[0]} with metric {best_checkpoint[1]}")


Evaluating ./OPENBIO-ft-MEDLLM/checkpoint-166...


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Metrics for ./OPENBIO-ft-MEDLLM/checkpoint-166: {'eval_loss': 0.5121213793754578, 'eval_model_preparation_time': 0.0177, 'eval_runtime': 4.3792, 'eval_samples_per_second': 97.278, 'eval_steps_per_second': 12.331}
Evaluating ./OPENBIO-ft-MEDLLM/checkpoint-249...


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 224.00 MiB. GPU 0 has a total capacity of 39.56 GiB of which 206.81 MiB is free. Process 24552 has 39.35 GiB memory in use. Of the allocated memory 38.77 GiB is allocated by PyTorch, and 77.49 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
import torch
import gc

# Training or evaluation loop
for checkpoint in checkpoint_paths:
    print(f"Loading checkpoint {checkpoint}...")

    # Clear previous models and caches
    del model
    del trainer
    gc.collect()
    torch.cuda.empty_cache()

    # Load model and trainer
    model = AutoModelForCausalLM.from_pretrained(checkpoint)
    model.eval()

    # Perform evaluation
    trainer = Trainer(
        model=model,
        args=training_args,
        eval_dataset=test_ds
    )
    eval_result = trainer.evaluate()
    print(eval_result)


In [None]:
%pip install peft



In [None]:
# Next we merge the trained QLoRA model and the base model to have a standalone model

# LOAD THE BASE MODEL

from transformers import AutoModelForCausalLM
import torch
from peft import PeftModel

model_id = "aaditya/Llama3-OpenBioLLM-8B"
model = AutoModelForCausalLM.from_pretrained(model_id, return_dict=True, torch_dtype=torch.bfloat16)

# LOAD QLORA MODEL
model = PeftModel.from_pretrained(model, cp_path)

model.eval()
model = model.merge_and_unload

# SAVE MODEL
model.save_pretrained("./Medellix-ft-MEDLLM/merged")

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

AttributeError: 'function' object has no attribute 'save_pretrained'

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
import torch

def merge_qlora_model(
    base_model_id: str,
    lora_checkpoint_path: str,
    save_path: str,
    torch_dtype: torch.dtype = torch.bfloat16
):
    # Load base model
    model = AutoModelForCausalLM.from_pretrained(
        base_model_id,
        return_dict=True,
        torch_dtype=torch_dtype
    )

    # Load and merge LoRA weights
    model = PeftModel.from_pretrained(model, lora_checkpoint_path)
    model = model.merge_and_unload()

    # Save merged model
    model.save_pretrained(save_path)

    # Optional: Save tokenizer as well
    tokenizer = AutoTokenizer.from_pretrained(base_model_id)
    tokenizer.save_pretrained(save_path)

    return model

# Usage
merged_model = merge_qlora_model(
    base_model_id="aaditya/Llama3-OpenBioLLM-8B",
    lora_checkpoint_path=cp_path,
    save_path="./Medellix-ft-MEDLLM/merged"
)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/449 [00:00<?, ?B/s]

# **USING PYTORCH LOOP FOR ACUTE FINETUNING**

In [None]:
from torch.utils.data import DataLoader
from torch.optim import AdamW
from transformers import get_scheduler
from tqdm import tqdm
from peft import get_peft_model


# INITIALIZE THE DATALOADER

trainDataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)


# SETUP OPTIMIZER AND LEARNING RATE SCHEDULER

optimizer = AdamW(model.parameters(), lr=1e-5)

epochs = 3
totalSteps = len(trainDataloader) * epochs


lr_scheduler = get_scheduler("linear", optimizer=optimizer,
                          num_training_steps=totalSteps,
                          num_warmup_steps=100)

# COMBINE LORA PARAMS AND MODEL

model = get_peft_model(model, loraConfig)

In [None]:
# DISTRIBUTED SETUP FOR MODEL, OPTIMIZER AND DATALOADER

accelerator = Accelerator()

model, optimizer, trainDataloader = accelerator.prepare(model, optimizer, trainDataloader)

In [None]:
# FINETUNING LOOP


for epoch in range(epochs):
  model.train()
  t_loss = 0

  for batch in tqdm(trainDataloader, desc=f"Epoch: {epoch+1}/{epochs}"):

    inputs = {
    'input_ids': batch['input_ids'].to(accelerator.device),
    # 'attention_mask': batch['attention_mask'].to(accelerator.device),
    'labels': batch['labels'].to(accelerator.device),
    }

    optimizer.zero_grad()


    outputs = model(**inputs)

    loss = outputs.loss

    # compute gradients using the accelerator
    accelerator.backward(loss)

    # gradient clipping by norm prevent exploding gradients and stabilizes the training process
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

    # update model parameters
    optimizer.step()

    # update learning rate
    scheduler.step()

    t_loss += loss.item()

  avg_loss = t_loss / len(trainDataloader)
  print(f"Average training loss: {avg_loss:.4f}")


