In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/medical/cleaned_medical.jsonl


In [2]:
%%capture
pip install -q transformers datasets peft bitsandbytes torch accelerate

In [2]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, pipeline, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model
import torch
import os
from collections import Counter
import re
import logging


In [4]:
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:256,expandable_segments:True"
torch.cuda.empty_cache()

In [1]:
dataset_path = "/kaggle/input/medical/cleaned_medical.jsonl"
try:
    dataset = load_dataset("json", data_files=dataset_path)
    print(f"Dataset loaded successfully with {len(dataset['train'])} examples")
except Exception as e:
    print(f"Error loading dataset: {e}")

In [9]:
print(dataset["train"].column_names)


['text']


In [3]:
model_name = "NousResearch/Llama-2-7b-chat-hf"
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token  # Set pad token

In [11]:
def format_prompt_with_system_instruction(text):
    # Inject a domain-specific system prompt for general medical field
    if "[INST]" in text:
        return text.replace(
            "[INST]",
            "[INST] <<SYS>>\nYou are a knowledgeable and reliable AI assistant specialized in the medical domain. "
            "Only answer questions related to medicine, diseases, symptoms, treatments, healthcare, or related topics. "
            "If a question is outside the medical field, respond with: "
            "\"I'm sorry, I can only answer questions related to the medical field.\"\n<</SYS>>\n\n"
        )
    return text


In [4]:
def tokenize_function(examples):
    # Inject prompt engineering system message
    formatted = [format_prompt_with_system_instruction(t) for t in examples["text"]]
    tokenized = tokenizer(
        formatted,
        padding="max_length",
        truncation=True,
        max_length=64,
    )
    tokenized["labels"] = tokenized["input_ids"].copy()
    return tokenized

# Tokenize dataset
tokenized_dataset = dataset.map(tokenize_function, batched=True)
tokenized_dataset = tokenized_dataset["train"].train_test_split(test_size=0.1)



In [13]:
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
import torch

model_name = "NousResearch/Llama-2-7b-chat-hf"

In [14]:
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,  # Enable 4-bit quantization
    bnb_4bit_compute_dtype=torch.float16,  # Compute dtype for 4-bit
    bnb_4bit_quant_type="nf4",  # Use NF4 quantization (normal float 4-bit)
    bnb_4bit_use_double_quant=True  # Enable double quantization for better accuracy
)

In [5]:
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quantization_config,  # Pass quantization config
    device_map="auto",  # Automatically map to available devices
    torch_dtype=torch.float16  # Use FP Laura for FP16
)

In [16]:
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)
model = get_peft_model(model, lora_config)

In [17]:
training_args = TrainingArguments(
    output_dir="/kaggle/working/results",
    num_train_epochs=1,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=1,
    learning_rate=2e-4,
    fp16=True,
    save_steps=200,
    save_total_limit=1,
    eval_strategy="steps",
    eval_steps=200,
    logging_steps=50,
    max_grad_norm=0.3,
    warmup_steps=50,
    report_to="none",
    max_steps=500,
    label_names=["labels"],
    dataloader_num_workers=4
)

In [18]:
class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        # Move all inputs to the model's device
        device = model.device if hasattr(model, 'device') else torch.device("cuda:0")
        inputs = {k: v.to(device) if isinstance(v, torch.Tensor) else v for k, v in inputs.items()}

        # Compute outputs
        outputs = model(**inputs)
        loss = outputs.loss

        return (loss, outputs) if return_outputs else loss

In [19]:
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
)
trainer.train()

  trainer = CustomTrainer(
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism 

Step,Training Loss,Validation Loss
200,0.022,0.022032
400,0.0213,0.022377


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

TrainOutput(global_step=500, training_loss=0.23133858108520508, metrics={'train_runtime': 2806.0643, 'train_samples_per_second': 2.851, 'train_steps_per_second': 0.178, 'total_flos': 2.031064449024e+16, 'train_loss': 0.23133858108520508, 'epoch': 0.056682915769187166})

In [32]:
# Save the full model (including LoRA adapter)
output_dir = "/kaggle/working/medical-llama2"
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

print(f"Model and tokenizer saved to {output_dir}")


Model and tokenizer saved to /kaggle/working/medical-llama2


In [33]:
import pickle
from tqdm import tqdm

output_pkl_path = "/kaggle/working/medical_llama2_meta.pkl"
metadata = {
    "model_path": output_dir,
    "tokenizer_path": output_dir,
    "notes": "LoRA fine-tuned LLaMA2 medical chatbot model."
}

# Save model with tqdm progress bar
print("Pickling metadata...")
with open(output_pkl_path, "wb") as f:
    with tqdm(total=1, desc="Saving metadata") as pbar:
        pickle.dump(metadata, f)
        pbar.update(1)

print(f"Metadata saved to: {output_pkl_path}")


Pickling metadata...


Saving metadata: 100%|██████████| 1/1 [00:00<00:00, 8240.28it/s]

Metadata saved to: /kaggle/working/medical_llama2_meta.pkl





In [20]:
model_path = "/kaggle/working/results/checkpoint-500"
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True
)

In [6]:
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    quantization_config=quantization_config,
    device_map="auto",
    torch_dtype=torch.float16
)
tokenizer = AutoTokenizer.from_pretrained(model_path)
tokenizer.pad_token = tokenizer.eos_token

In [22]:
pipe = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=300,
    do_sample=True,
    temperature=0.7,
    top_k=50,
    repetition_penalty=1.2,
)


Device set to use cuda:0


In [36]:
def generate_response(prompt):
    """Generate a response for the given prompt."""
    formatted_prompt = f"<s>[INST] {prompt} [/INST]"
    result = pipe(formatted_prompt)
    raw_text = result[0]['generated_text']
    if "[/INST]" in raw_text:
        clean_text = raw_text.split("[/INST]")[1].split("</s>")[0].strip()
    else:
        clean_text = raw_text.strip()
    print(f"Question: {prompt}")
    print(f"Answer: {clean_text}")
    print("-" * 50)

In [49]:
print("Please enter your prompt (type 'exit' to quit):")
while True:
    user_prompt = input("Enter your Question: ")
    if user_prompt.lower() == "exit":
        print("Bye!")
        break
    generate_response(user_prompt)

Please enter your prompt (type 'exit' to quit):


Enter your Question:  exit


Bye!
