In [None]:
!pip install -q -U watermark
!pip install -q accelerate peft bitsandbytes transformers trl datasets langchain langchain-huggingface

In [None]:
import trl
import peft
import torch
import datasets
import watermark
import langchain
import accelerate
import transformers
import bitsandbytes

from datasets import load_dataset
from trl import SFTTrainer, SFTConfig
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from transformers import pipeline, TrainingArguments
from langchain.memory import ConversationBufferMemory
from langchain_huggingface import HuggingFacePipeline
from langchain_core.output_parsers import StrOutputParser
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import AutoPeftModelForCausalLM, LoraConfig, get_peft_model, prepare_model_for_kbit_training

# **Loading Dataset**

In [None]:
# Fetch the MedTuned instruction dataset; the result is indexed by split name below.
dataset = load_dataset(path = 'nlpie/Llama2-MedTuned-Instructions')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/2.96k [00:00<?, ?B/s]

(…)-00000-of-00001-a8790d88efc2bc45.parquet:   0%|          | 0.00/91.1M [00:00<?, ?B/s]

(…)-00000-of-00001-b543c64b1786c03e.parquet:   0%|          | 0.00/6.08M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/200252 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/70066 [00:00<?, ? examples/s]

In [None]:
# Two disjoint slices of the train split: rows 0-999 are used for fine-tuning,
# rows 1000-1199 are held back for evaluation.
train_dataset = dataset['train'].select(range(0, 1000))
test_dataset = dataset['train'].select(range(1000, 1200))

In [None]:
# Show the training subset's summary (feature names and row count).
train_dataset

Dataset({
    features: ['instruction', 'input', 'output', 'source'],
    num_rows: 1000
})

# **Create Prompts**

In [None]:
# Creates the training prompt(s) for a dataset sample (instruction, input and output).
def create_prompt(sample):
  """Format one dataset row, or a column-wise batch of rows, as training text.

  Args:
    sample: Mapping with 'instruction', 'input' and 'output' keys. Each value
      is either a plain string (a single row such as ``train_dataset[0]``) or
      a list of strings of equal length (the batch a batched ``Dataset.map``
      passes to a formatting function).

  Returns:
    A list of prompt strings: one element for a single row, one element per
    row for a batch.
  """
  # NOTE(review): the `<<SYS>>` tag is never closed with `<</SYS>>`; the text
  # is kept as-is so the training data does not change -- confirm against the
  # Llama 2 chat format.
  template = "[INST]<<SYS>> {instruction}\n{input}[/INST]\n{output}"

  instructions = sample['instruction']
  inputs = sample['input']
  outputs = sample['output']

  # A batch arrives column-wise (every field is a list). Formatting those lists
  # directly would collapse the whole batch into a single prompt containing
  # their repr, so a single row is normalised to a batch of one and every row
  # is formatted on its own.
  if not isinstance(instructions, (list, tuple)):
    instructions, inputs, outputs = [instructions], [inputs], [outputs]

  return [template.format(instruction = instruction, input = text, output = answer)
          for instruction, text, answer in zip(instructions, inputs, outputs)]

In [None]:
# Sanity-check the prompt formatter on the first training row.
create_prompt(train_dataset[0])

"[INST]<<SYS>> In your role as a medical professional, address the user's medical questions and concerns.\nMy relative suffering from secondary lever cancer ( 4th stage as per Allopathic doctor) and primary is in rectum. He is continuously with 103 to 104 degree F fever. Allpathic doctor suggested chemo only after fever subsidises. Is treatment possible at Lavanya & what is the time scale of recover.[/INST]\nHi, dairy have gone through your question. I can understand your concern. He has rectal cancer with liver metastasis. It is stage 4 cancer. Surgery is not possible at this stage. Only treatment options are chemotherapy and radiotherapy according to type of cancer. Inspite of all treatment prognosis is poor. Life expectancy is not good. Consult your doctor and plan accordingly. Hope I have answered your question, if you have any doubts then contact me at bit.ly/ Chat Doctor. Thanks for using Chat Doctor. Wish you a very good health."

# **Quantization Configuration**

In [None]:
# Load the model weights in 4-bit precision
use_4bit = True

# 4-bit quantization scheme
bnb_4bit_quant_type = 'nf4'

# Whether double quantization is applied
use_double_quant = False

# Name of the torch dtype used for computation, resolved to the dtype object itself
bnb_4bit_compute_dtype = 'float16'
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

# Bundle the settings above into the bitsandbytes quantization config.
bnb_config = BitsAndBytesConfig(
    load_in_4bit = use_4bit,
    bnb_4bit_use_double_quant = use_double_quant,
    bnb_4bit_quant_type = bnb_4bit_quant_type,
    bnb_4bit_compute_dtype = compute_dtype,
)



# **Loading LLM and Tokenizer**

In [None]:
# Checkpoint shared by the tokenizer and the model.
llm_name = 'NousResearch/Llama-2-7b-chat-hf'

tokenizer = AutoTokenizer.from_pretrained(llm_name)

# Load the causal LM with the quantization config defined above,
# automatic device mapping and `use_cache` disabled.
model = AutoModelForCausalLM.from_pretrained(
    llm_name,
    device_map = 'auto',
    use_cache = False,
    quantization_config = bnb_config,
)

tokenizer_config.json:   0%|          | 0.00/746 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/21.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/435 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/583 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/200 [00:00<?, ?B/s]

In [None]:
# Pad on the right-hand side and reuse the end-of-sequence token as the pad token.
tokenizer.padding_side = 'right'
tokenizer.pad_token = tokenizer.eos_token

# **LoRA Configuration**

In [None]:
# LoRA adapter hyper-parameters for the causal language modelling task.
peft_config = LoraConfig(
    task_type = 'CAUSAL_LM',
    r = 8,
    lora_alpha = 16,
    lora_dropout = 0.05,
    bias = 'none',
)

# Prepare the quantized base model for k-bit training, then wrap it with the
# LoRA adapters. This rebinds `model`, so re-running the cell would feed the
# already wrapped model back in.
model = get_peft_model(prepare_model_for_kbit_training(model), peft_config)

# **Fine-Tuning**

In [None]:
# Directory handed to the training configuration as its `output_dir`.
output_dir = 'models'

In [None]:
# A single SFTConfig carries both the generic Trainer settings and the
# SFT-specific ones (SFTConfig extends TrainingArguments), so every option set
# here is the one the trainer actually receives.
training_arguments = SFTConfig(output_dir = output_dir,
                               per_device_train_batch_size = 1,
                               gradient_accumulation_steps = 4,
                               optim = 'paged_adamw_32bit',
                               learning_rate = 2e-4,
                               lr_scheduler_type = 'cosine',
                               save_strategy = 'epoch',
                               logging_steps = 10,
                               # A positive max_steps overrides num_train_epochs.
                               num_train_epochs = 3,
                               max_steps = 150,
                               fp16 = True,
                               max_seq_length = 512,
                               # `create_prompt` returns a list of prompts, which is what
                               # the non-packed preprocessing path expects.
                               packing = False)

# Both configuration names used in this notebook refer to the same object.
training_args = training_arguments

trainer = SFTTrainer(model = model,
                     peft_config = peft_config,
                     formatting_func = create_prompt,
                     processing_class = tokenizer,
                     args = training_arguments,
                     train_dataset = train_dataset,
                     eval_dataset = test_dataset)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

In [None]:
# Run the fine-tuning loop with the configuration given to the trainer.
trainer.train()

In [None]:
# Save the trained model to the 'latest_model' directory.
trainer.save_model('latest_model')

In [None]:
# Merge the LoRA adapters into the base model; the result is what the
# generation pipeline below is built on.
merged_model = model.merge_and_unload()



# **LangChain Configuration**

In [None]:
# Inference prompt: a fixed instruction followed by the user's question.
pre_prompt = """[INST] <<SYS>>\nAnalyze the question and answer with the best option.\n"""

# Close the instruction block with '[/INST]', the same tag the training prompts
# built by `create_prompt` use.
prompt_text = pre_prompt + 'Here is my question {context}' + '[/INST]'

prompt = PromptTemplate(template = prompt_text, input_variables = ['context'])

In [None]:
# Put the merged model in inference mode before generating.
merged_model.eval()

pipe = pipeline(task = 'text-generation',
                model = merged_model,
                tokenizer = tokenizer,
                max_new_tokens = 512,
                # Reuse past key/values while decoding. Set explicitly because
                # the model was loaded with `use_cache = False` for training.
                use_cache = True,
                do_sample = True,
                pad_token_id = tokenizer.eos_token_id,
                top_p = 0.7,
                temperature = 0.4)

# Expose the Transformers pipeline through LangChain's LLM interface.
llm_pipeline = HuggingFacePipeline(pipeline = pipe)

Device set to use cuda:0


# **Create LLM Chain**

In [None]:
# Conversation buffer attached to the chain. The prompt template declares only
# the `context` variable, so nothing in it refers to the buffered history.
memory = ConversationBufferMemory()

# Chain that fills the prompt template and sends it to the generation pipeline.
chat_llm_chain = LLMChain(
    prompt = prompt,
    llm = llm_pipeline,
    memory = memory,
    verbose = False,
)

#chain = prompt | llm_pipeline | StrOutputParser()

# **Deploy**

In [None]:
# Sample multiple-choice question used to exercise the chain.
context = "\n".join([
    "###Question: All of the following provisions are included in the Primary health care according to the Alma ata declaration except:",
    "###Options:",
    "A. Adequate supply of safe drinking water",
    "B. Nutrition",
    "C. Provision of free medicines",
    "D. Basic sanitation",
])

In [None]:
# Fill the prompt's `context` variable with the sample question and generate an answer.
chat_llm_chain.predict(context = context)

  return fn(*args, **kwargs)


"[INST]<<SYS>>\nAnalyze the question and answer with the best option.\nHere is my question ###Question: All of the following provisions are included in the Primary health care according to the Alma ata declaration except:\n###Options:\nA. Adequate supply of safe drinking water\nB. Nutrition\nC. Provision of free medicines\nD. Basic sanitation[\\INST]  Great, let's analyze the question and answer options:\n\nQuestion: All of the following provisions are included in the Primary Health Care according to the Alma Ata Declaration except:\n\nOptions:\nA. Adequate supply of safe drinking water\nB. Nutrition\nC. Provision of free medicines\nD. Basic sanitation\n\nAnalysis:\nThe Alma Ata Declaration is a international document that outlines the minimum requirements for a comprehensive primary health care system. The declaration was adopted by the World Health Organization (WHO) in 1978 and has been widely accepted as the basis for primary health care policies and programs around the world.\n\nT

# **Library Versions**

In [None]:
# Load the watermark extension and report the Python version and machine details.
%load_ext watermark
%watermark -v -m

Python implementation: CPython
Python version       : 3.10.12
IPython version      : 7.34.0

Compiler    : GCC 11.4.0
OS          : Linux
Release     : 6.1.85+
Machine     : x86_64
Processor   : x86_64
CPU cores   : 2
Architecture: 64bit



In [None]:
# Report the versions of the packages imported in this notebook.
%watermark --iversions

langchain_huggingface: 0.1.2
transformers         : 4.47.1
datasets             : 3.2.0
trl                  : 0.13.0
peft                 : 0.14.0
langchain            : 0.3.14
bitsandbytes         : 0.45.0
langchain_core       : 0.3.29
torch                : 2.5.1+cu121
watermark            : 2.5.0
accelerate           : 1.2.1

