<a href="https://colab.research.google.com/github/Harshit0722/MediTalk/blob/main/MediTalk_(fine_tuned_dolly_on_medmcqa).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Using PEFT & bitsandbytes to fine-tune a LoRA adapter on a medical dataset (MedMCQA)




## Importing Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
!pip install pytorch

import re
import string
from tqdm import tqdm

import json
import os

In [None]:
# Runtime dependencies for 8-bit model loading and LoRA fine-tuning.
!pip install -q bitsandbytes datasets accelerate loralib
# transformers (at its `main` branch) and peft are installed straight from GitHub,
# so the exact versions depend on when this cell is run.
!pip install -q git+https://github.com/huggingface/transformers.git@main git+https://github.com/huggingface/peft.git

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


In [None]:
# List the GPUs available to this runtime.
!nvidia-smi -L

GPU 0: Tesla T4 (UUID: GPU-44fbe700-e45e-03cf-a495-c4a74fb7b754)


### Set up the model

In [None]:
import os
# Restrict CUDA to GPU 0; this is set before torch is imported below.
os.environ["CUDA_VISIBLE_DEVICES"]="0"
import torch
import torch.nn as nn
import bitsandbytes as bnb
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM

# Load the Dolly v2 3B checkpoint as a causal language model.
# load_in_8bit=True requests 8-bit weights; device_map='auto' leaves device placement to the library.
model = AutoModelForCausalLM.from_pretrained(
    "databricks/dolly-v2-3b",
    load_in_8bit=True,
    device_map='auto',
)


Welcome to bitsandbytes. For bug reports, please run

python -m bitsandbytes

 and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
bin /usr/local/lib/python3.10/dist-packages/bitsandbytes/libbitsandbytes_cuda118.so
CUDA SETUP: CUDA runtime path found: /usr/local/cuda/lib64/libcudart.so.11.0
CUDA SETUP: Highest compute capability among GPUs detected: 7.5
CUDA SETUP: Detected CUDA version 118
CUDA SETUP: Loading binary /usr/local/lib/python3.10/dist-packages/bitsandbytes/libbitsandbytes_cuda118.so...


  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
Either way, this might cause trouble in the future:
If you get `CUDA error: invalid device function` errors, the above might be the cause and the solution is to make sure only one ['libcudart.so', 'libcudart.so.11.0', 'libcudart.so.12.0'] in the paths that we search based on your env.
  warn(msg)


In [None]:
# Tokenizer for the same checkpoint as the model, configured to pad on the left.
# NOTE(review): `batch_size=125` is passed as an extra keyword here; confirm the tokenizer actually uses it.
tokenizer = AutoTokenizer.from_pretrained('databricks/dolly-v2-3b', batch_size=125, padding_side='left')

In [None]:
# Print the tokenizer's settings (vocabulary size, padding/truncation side, special tokens).
print(tokenizer)

GPTNeoXTokenizerFast(name_or_path='databricks/dolly-v2-3b', vocab_size=50254, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'pad_token': '<|endoftext|>', 'additional_special_tokens': ['### End', '### Instruction:', '### Response:']}, clean_up_tokenization_spaces=True)


### Freezing the original weights


In [None]:
# Freeze every weight of the base model; only the adapters attached later are trained.
for param in model.parameters():
  param.requires_grad = False  # freeze the model - train adapters later
  if param.ndim == 1:
    # cast the small parameters (e.g. layernorm) to fp32 for stability
    param.data = param.data.to(torch.float32)

model.gradient_checkpointing_enable()  # reduce number of stored activations
model.enable_input_require_grads()

class CastOutputToFloat(nn.Sequential):
  """Sequential container that casts the output of its wrapped modules to float32."""
  def forward(self, x): return super().forward(x).to(torch.float32)

# The output projection of this model is stored as `embed_out` (it is the module being wrapped),
# so the wrapper has to be assigned back to that same attribute. Assigning it to `lm_head`
# would only create an extra attribute that the forward pass never calls, and the logits
# would not be cast to float32 at all.
model.embed_out = CastOutputToFloat(model.embed_out)

### Setting up the LoRA adapters

In [None]:
def print_trainable_parameters(model):
    """
    Report how many of the model's parameters are trainable.

    Walks ``model.named_parameters()``, counts the elements of every parameter
    and of those with ``requires_grad`` set, then prints both totals together
    with the trainable share as a percentage.
    """
    # (element count, is trainable) for every parameter tensor
    sizes = [(tensor.numel(), tensor.requires_grad) for _, tensor in model.named_parameters()]
    all_param = sum(count for count, _ in sizes)
    trainable_params = sum(count for count, is_trainable in sizes if is_trainable)
    percentage = 100 * trainable_params / all_param
    print(f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {percentage}")

In [None]:
from peft import LoraConfig, get_peft_model

# LoRA hyper-parameters for the causal language modelling task.
config = LoraConfig(
    r=16, # rank of the LoRA update matrices (not the number of attention heads)
    lora_alpha=32, #alpha scaling
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

# Wrap the frozen base model with the adapters, then report how many parameters are trainable.
model = get_peft_model(model, config)
print_trainable_parameters(model)

trainable params: 5242880 || all params: 2780328960 || trainable%: 0.18857049203271256


## Dataset loading & pre-processing

In [None]:
import transformers
# `datasets` is also installed by the dependency cell near the top of the notebook.
!pip install datasets
from datasets import load_dataset
# Load the MedMCQA dataset by name; `data` is indexed by split name in the next cell.
data = load_dataset('medmcqa')

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/




  0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
# Use only the training split and print how many rows it has.
train_data = data['train']
print(len(train_data))

182822


In [None]:
# Display the split's column names and row count.
train_data

Dataset({
    features: ['id', 'question', 'opa', 'opb', 'opc', 'opd', 'cop', 'choice_type', 'exp', 'subject_name', 'topic_name'],
    num_rows: 182822
})

In [None]:
def pred_answer(example):
    """Build the training text for one dataset row and store it under ``"prediction"``.

    The text has the form
    ``Question asked is <question> ; Options were <opa> or <opb> or <opc> or <opd> ; Answer is -> (<cop>) <exp>``.

    Args:
        example: mapping with the keys ``question``, ``opa``, ``opb``, ``opc``,
            ``opd``, ``cop`` and ``exp``.

    Returns:
        The same mapping with the ``"prediction"`` key added.
    """
    options = " or ".join(example[key] for key in ("opa", "opb", "opc", "opd"))
    answer = "(" + str(example["cop"]) + ")"

    # A row without an explanation must not put the literal text "None" into
    # the training prompt, so the explanation is appended only when present.
    explanation = example["exp"]
    if explanation:
        answer += " " + str(explanation)

    example["prediction"] = (
        "Question asked is " + example["question"]
        + " ; Options were " + options
        + " ; Answer is -> " + answer
    )
    return example

# Add the `prediction` text column to every training row.
train_data = train_data.map(pred_answer)



In [None]:
# Tokenize the prompt text in batches, truncating anything longer than 1000 tokens.
# No padding is applied here: the DataCollatorForLanguageModeling passed to the Trainer pads
# each batch to its own longest sequence, so padding every row to 1000 tokens up front would
# only inflate the stored dataset and the size of every training batch.
train_data = train_data.map(lambda samples: tokenizer(samples['prediction'], truncation = True, max_length = 1000), batched=True)

Map:   0%|          | 0/182822 [00:00<?, ? examples/s]

In [None]:
# Display the columns again to confirm that the prompt and tokenizer outputs were added.
train_data

Dataset({
    features: ['id', 'question', 'opa', 'opb', 'opc', 'opd', 'cop', 'choice_type', 'exp', 'subject_name', 'topic_name', 'prediction', 'input_ids', 'attention_mask'],
    num_rows: 182822
})

### Training

In [None]:
#Training was performed with varying per_device_train_batch_size = {1,2,4,8} & gradient_accumulation_steps = {1,2,4,8} at different runtimes.
# With the values below, each optimizer step accumulates 8 x 8 = 64 sequences per device,
# and max_steps=100 ends the run after 100 such steps.
trainer = transformers.Trainer(
    model=model,
    train_dataset=train_data,
    args=transformers.TrainingArguments(
        per_device_train_batch_size=8,
        gradient_accumulation_steps=8,
        warmup_steps=50,
        max_steps=100,
        learning_rate=2e-4,
        fp16=True,
        logging_steps=1,
        output_dir='outputs'
    ),
    # mlm=False: causal language modelling rather than masked language modelling
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)
)
model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
trainer.train()

You're using a GPTNeoXTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  attn_scores = torch.where(causal_mask, attn_scores, mask_value)


Step,Training Loss
1,3.7613
2,3.6935
3,3.7562
4,3.8158
5,3.8466
6,3.6591
7,3.9934
8,3.8876
9,3.5588
10,3.6739


## Inference

In [None]:
from huggingface_hub import notebook_login

# Interactive Hugging Face Hub login; the model is pushed to the Hub in the next cell.
notebook_login()

In [None]:
#Model was pushed as a commit after all possible iters. Final fine-tuned model is then imported and inferenced upon.
# Push to a private Hub repository under the id below.
# NOTE(review): `use_auth_token` is reported as deprecated in favour of `token` in newer
# Hugging Face releases; confirm against the installed version.
model.push_to_hub("Harshit0722/dolly-fine-tuned-on-med-data",
                  use_auth_token=True,
                  commit_message="basic training",
                  private=True)

In [None]:
#Code to retrieve model from huggingface:-
import torch
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM, AutoTokenizer

# The adapter's config records which base model it was trained on; that name is used
# to load both the base model (8-bit, as during training) and its tokenizer.
peft_model_id = "Harshit0722/dolly-fine-tuned-on-med-data"
config = PeftConfig.from_pretrained(peft_model_id)
model = AutoModelForCausalLM.from_pretrained(config.base_model_name_or_path, return_dict=True, load_in_8bit=True, device_map='auto')
# NOTE(review): unlike the training tokenizer, this one is created without padding_side='left'.
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)

# Load the Lora model
model = PeftModel.from_pretrained(model, peft_model_id)

In [None]:
#Analytics using the following code is done and compared in another notebook:-
# Tokenize the prompt and move the resulting tensors to the device the model runs on.
# The model is loaded with device_map='auto', while the tokenizer returns CPU tensors.
batch = tokenizer("“Question asked is what is blood? ; Answer is ” ->: ", return_tensors='pt').to(model.device)

# Generate up to 50 new tokens under CUDA autocast (mixed precision).
with torch.cuda.amp.autocast():
  output_tokens = model.generate(**batch, max_new_tokens=50)

# Decode the first (only) generated sequence, dropping special tokens.
print('\n\n', tokenizer.decode(output_tokens[0], skip_special_tokens=True))