# LangQA – Language-powered question and answer system

## Imports

In [1]:
import torch
from trl import SFTTrainer
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from transformers import pipeline, TrainingArguments
from peft import AutoPeftModelForCausalLM, LoraConfig, get_peft_model, prepare_model_for_kbit_training
from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline
from langchain.prompts import PromptTemplate
from langchain.memory import ConversationBufferMemory
from langchain.chains import LLMChain
import warnings

warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm


## Load Data

https://huggingface.co/datasets/nlpie/Llama2-MedTuned-Instructions

In [2]:
# Load the MedTuned instruction dataset by its Hugging Face Hub id
dataset = load_dataset(path='nlpie/Llama2-MedTuned-Instructions')

In [3]:
# Keep the first 1000 rows of the 'train' split as the fine-tuning set
train_data = dataset['train'].select(range(1000))

# Show the subset summary (features and row count)
train_data

Dataset({
    features: ['instruction', 'input', 'output', 'source'],
    num_rows: 1000
})

In [4]:
# Rows 1000-1199 of the same 'train' split; they do not overlap with train_data
# and are later handed to the trainer as eval_dataset
test_data = dataset['train'].select(range(1000, 1200))

## Understanding the format of the text

In [5]:
# Print the three text fields of the first three rows to show the record layout
train_split = dataset['train']
for position in range(1, 4):
    row = train_split[position - 1]
    print(f"Data point {position}:")
    for label, field in (("Instruction:", 'instruction'),
                         ("Input:", 'input'),
                         ("Output:", 'output')):
        print(label, row[field])
    print("\n-----------------------------\n")

Data point 1:
Instruction: In your role as a medical professional, address the user's medical questions and concerns.
Input: My relative suffering from secondary lever cancer ( 4th stage as per Allopathic doctor) and primary is in rectum. He is continuously with 103 to 104 degree F fever. Allpathic doctor suggested chemo only after fever subsidises. Is treatment possible at Lavanya & what is the time scale of recover.
Output: Hi, dairy have gone through your question. I can understand your concern. He has rectal cancer with liver metastasis. It is stage 4 cancer. Surgery is not possible at this stage. Only treatment options are chemotherapy and radiotherapy according to type of cancer. Inspite of all treatment prognosis is poor. Life expectancy is not good. Consult your doctor and plan accordingly. Hope I have answered your question, if you have any doubts then contact me at bit.ly/ Chat Doctor. Thanks for using Chat Doctor. Wish you a very good health.

-----------------------------



## Automating the Creation of Prompts for Model Training

In [6]:
def create_prompt(sample):
    r"""Build the supervised fine-tuning prompt(s) for one row or a batch of rows.

    Every prompt has the layout
    ``[INST]<<SYS>> {instruction}\n{input}[/INST]\n{output}``.

    Args:
        sample: Mapping with the keys 'instruction', 'input' and 'output'.
            The values are either single values (one dataset row) or
            lists/tuples of equal length (a batch in column layout, which is
            what a batched formatting call passes in).

    Returns:
        list[str]: One formatted prompt per row. A single row therefore
        yields a one-element list; an empty batch yields an empty list.
    """
    instructions = sample['instruction']
    inputs = sample['input']
    outputs = sample['output']

    # A single row arrives as scalars: wrap it so both layouts share one code path.
    # Without this distinction a batch would be rendered as the repr of its lists
    # inside a single prompt.
    if not isinstance(instructions, (list, tuple)):
        instructions, inputs, outputs = [instructions], [inputs], [outputs]

    # Plain f-string formatting: the field values are inserted verbatim, so braces
    # inside the data are never interpreted as template placeholders.
    return [
        f"[INST]<<SYS>> {instruction}\n{question}[/INST]\n{answer}"
        for instruction, question, answer in zip(instructions, inputs, outputs)
    ]

In [7]:
# Smoke-test the formatter on the first training row and show the result
example_prompt = create_prompt(train_data[0])
print(example_prompt)

["[INST]<<SYS>> In your role as a medical professional, address the user's medical questions and concerns.\nMy relative suffering from secondary lever cancer ( 4th stage as per Allopathic doctor) and primary is in rectum. He is continuously with 103 to 104 degree F fever. Allpathic doctor suggested chemo only after fever subsidises. Is treatment possible at Lavanya & what is the time scale of recover.[/INST]\nHi, dairy have gone through your question. I can understand your concern. He has rectal cancer with liver metastasis. It is stage 4 cancer. Surgery is not possible at this stage. Only treatment options are chemotherapy and radiotherapy according to type of cancer. Inspite of all treatment prognosis is poor. Life expectancy is not good. Consult your doctor and plan accordingly. Hope I have answered your question, if you have any doubts then contact me at bit.ly/ Chat Doctor. Thanks for using Chat Doctor. Wish you a very good health."]


## Quantization Process

In [8]:
# Load the base model weights in 4-bit precision
use_4bit = True

# 4-bit quantization data type, with nested (double) quantization switched off
bnb_4bit_quant_type = "nf4"
use_nested_quant = False

# Name of the compute dtype, and the torch dtype object that name resolves to
bnb_4bit_compute_dtype = "float16"
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

In [9]:
# Bundle the quantization settings into the config object that is passed
# to from_pretrained as quantization_config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_use_double_quant=use_nested_quant,
    bnb_4bit_compute_dtype=compute_dtype,
)

In [10]:
# Report whether the GPU could also run bfloat16 (compute capability major >= 8).
# The capability query requires a visible CUDA device, so the check is skipped
# instead of raising when none is available.
if compute_dtype == torch.float16 and use_4bit and torch.cuda.is_available():
    major, _ = torch.cuda.get_device_capability()
    if major >= 8:
        print("=" * 80)
        print("The GPU suporrts bfloat16. You can accelerate the train using bf16=True")
        print("=" * 80)

The GPU suporrts bfloat16. You can accelerate the train using bf16=True


## Load the LLM and the Tokenizer

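Model used below: https://huggingface.co/Qwen/Qwen2.5-7B-Instruct (an earlier version of this notebook used https://huggingface.co/NousResearch/Llama-2-7b-chat-hf)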

In [11]:
# Hub id of the base checkpoint
# (alternative tried earlier: "NousResearch/Llama-2-7b-chat-hf")
llm_name = "Qwen/Qwen2.5-7B-Instruct"

# Tokenizer that belongs to the checkpoint
tokenizer = AutoTokenizer.from_pretrained(llm_name)

# Base model, quantized at load time through bnb_config; device placement is
# delegated to device_map="auto" and use_cache is disabled
model = AutoModelForCausalLM.from_pretrained(
    llm_name,
    device_map="auto",
    quantization_config=bnb_config,
    use_cache=False,
)

Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.
Loading checkpoint shards: 100%|██████████| 4/4 [00:20<00:00,  5.01s/it]


In [12]:
# Pad on the right-hand side of each sequence ...
tokenizer.padding_side = "right"

# ... and reuse the tokenizer's end-of-sequence token as the padding token
tokenizer.pad_token = tokenizer.eos_token

## Configuring LoRA Adapters

Quantization represents data with fewer bits, making it a useful technique for reducing memory usage and speeding up inference, especially in the context of LLMs.  

Once a model is quantized, it is typically not trained **directly** for downstream tasks because training can become unstable due to the reduced precision of weights and activations. However, since PEFT methods only add extra trainable parameters, this allows for training a quantized model with a PEFT adapter on top! Combining quantization with PEFT can be a good strategy to train even the largest models on a single GPU. For example, QLoRA is a method that quantizes a model to 4 bits and then trains it with LoRA. This method enables fine-tuning a 65B parameter model on a single 48GB GPU, for instance.  

The goal of PEFT (Parameter-Efficient Fine-Tuning) is to keep most of the pre-trained model's parameters fixed while adjusting only a small subset of parameters to adapt the model to a specific task.

In [13]:
# LoRA adapter hyperparameters: rank 8, alpha 16, dropout 0.05,
# no bias training, causal language modelling task
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
)

In [14]:
# Pre-process the quantized model for k-bit training before the adapters are attached.
# Note: this rebinds `model`, so the cell is meant to run once per freshly loaded model.
model = prepare_model_for_kbit_training(model=model)

In [15]:
# Wrap the prepared model with the LoRA adapters described by peft_config
# (the adapters are attached here, not merged into the base weights)
model = get_peft_model(model, peft_config)

## Fine-tuning parameters

In [16]:
# Directory name passed to TrainingArguments as output_dir
output_model = 'adjusted_model'

In [17]:
# Hyperparameters for the fine-tuning run.
# - Batch size 1 with 4 gradient-accumulation steps per optimizer update.
# - max_steps is positive, so it takes precedence over num_train_epochs.
# - Loss is logged every 10 steps; checkpoints follow the "epoch" save strategy.
training_arguments = TrainingArguments(
    output_dir=output_model,
    # schedule length
    num_train_epochs=3,
    max_steps=150,
    # batching
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    # optimizer and learning-rate schedule
    optim="paged_adamw_32bit",
    learning_rate=2e-4,
    lr_scheduler_type="cosine",
    # precision, logging, checkpointing
    fp16=True,
    logging_steps=10,
    save_strategy="epoch",
)

In [None]:
# Force the model to allocate memory correctly by moving it to the CUDA device.
# NOTE(review): the model was loaded above with device_map = "auto" and 4-bit
# quantization — confirm that this explicit move is still needed and supported
# for that setup. This cell has no execution count, i.e. it was not part of the
# recorded run.
model = model.to("cuda")

In [18]:
# Supervised fine-tuning trainer: every sample is formatted by create_prompt,
# training uses train_data and test_data is supplied as the evaluation set.
# max_seq_length and packing are not passed, so the library defaults apply.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_arguments,
    peft_config=peft_config,
    formatting_func=create_prompt,
    train_dataset=train_data,
    eval_dataset=test_data,
)

Map: 100%|██████████| 200/200 [00:00<00:00, 1395.70 examples/s]
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


## Fine-tuning training

In [19]:
%%time
# Run the fine-tuning loop; the cell magic on the first line reports the elapsed time
trainer.train()

Step,Training Loss
10,1.4747
20,1.1346


KeyboardInterrupt: 

In [None]:
# Save the trainer's model to the 'final_model' directory
trainer.save_model('final_model')

In [None]:
# Merge the LoRA adapters into the base model and drop the adapter wrapper;
# the result is what the inference pipeline below uses.
# NOTE(review): the base weights were loaded in 4-bit — confirm that merging
# adapters into quantized weights is acceptable for this use case.
merged_model = model.merge_and_unload()

## Building the pipeline of Text Generation with LangChain

In [None]:
# Instruction that opens every inference prompt
pre_prompt = """[INST] <<SYS>>\nAnalyze the question and answer with the best option.\n"""

# Append the question slot and close the instruction block.
# The closing tag is "[/INST]", the same tag create_prompt emits during
# fine-tuning. The earlier spelling with a backslash was both a different tag
# and an invalid string escape sequence.
prompt_text = pre_prompt + "Here is my question {context}" + "[/INST]"

# Wrap the text in a LangChain template with `context` as its only input variable.
# The raw string gets its own name so `prompt` always refers to the template object.
prompt = PromptTemplate(template = prompt_text, input_variables=["context"])

Pipelines are a great and easy way to use models for inference. These pipelines are objects that abstract away most of the complex code in the library, providing a simple API dedicated to a variety of tasks, including named entity recognition, masked language modeling, sentiment analysis, feature extraction, and question answering.

In [None]:
# Text-generation pipeline around the merged model.
# - .eval() puts the model in inference mode; fine-tuning leaves it in training
#   mode (eval() returns the same module, so the pipeline still wraps merged_model).
# - use_cache = True turns the key/value cache back on for generation. The model
#   was loaded with use_cache = False for training; without the cache every new
#   token recomputes attention over the whole sequence, which is very slow for
#   up to 512 new tokens.
# - Sampling settings (do_sample, top_p, temperature) and padding are unchanged.
pipe = pipeline("text-generation",
                 model = merged_model.eval(),
                 tokenizer = tokenizer,
                 max_new_tokens = 512,
                 use_cache = True,
                 do_sample = True,
                 pad_token_id = tokenizer.eos_token_id,
                 top_p = 0.7,
                 temperature = 0.5)

In [None]:
# Wrap the transformers pipeline in LangChain's HuggingFacePipeline so it can be used as the chain's LLM
llm_pipeline = HuggingFacePipeline(pipeline = pipe)

## Creating the LLM Chain

In [None]:
# Create the conversation memory that is attached to the chain.
# NOTE(review): the prompt template only declares the `context` variable, so the
# stored history is presumably never inserted into the prompt — confirm intent.
memory = ConversationBufferMemory()

In [None]:
# Assemble the chain: prompt template -> wrapped pipeline, with conversation
# memory attached and verbose logging off
chat_llm_chain = LLMChain(
    prompt=prompt,
    llm=llm_pipeline,
    memory=memory,
    verbose=False,
)

## Deploying the Model and Using the Question and Answer System

In [None]:
context = '''###Question: All of the following provisions are included in the Primary health care according to the Alma Ata declaration except:
###Options:
A. Adequate supply of safe drinking water
B. Nutrition
C. Provision of free medicines
D. Basic sanitation'''

In [None]:
%%time
# Send the question through the chain; the generated text is stored in `response`
response = chat_llm_chain.predict(context = context)