In [None]:
!pip install datasets
!pip install transformers -U
!pip install accelerate -U
!pip install trl
!pip install bitsandbytes # for quantization

In [None]:
!pip install peft # for LoRA

In [29]:
import torch
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [2]:
# import bitsandbytes config
import bitsandbytes as bnb
from transformers import BitsAndBytesConfig

# quantization configuration for loading the model in 4-bit
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",  # model will be stored using normalized 4 bit floats
    bnb_4bit_compute_dtype="float16"  # model will be computed using higher precision data type
)

In [3]:
# Quantization-aware model loading
import transformers
from transformers import AutoModelForCausalLM
from transformers import AutoTokenizer

MODEL_NAME = "NousResearch/Llama-2-7b-hf"

# load the LLaMA 2 model with 4-bit quantization using the config defined earlier
model = AutoModelForCausalLM.from_pretrained(
  MODEL_NAME,
  quantization_config=quantization_config,
  device_map="auto"
)

# load corresponding tokenizer
tokenizer = AutoTokenizer.from_pretrained(
  MODEL_NAME,
  trust_remote_code=True
)

# set pading for compatability
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Disable cache while training (conflicts with gradient checkpointing)
model.config.use_cache = False

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [4]:
# Import LoRA-related utilities from PEFT (Parameter-Efficient Fine-Tuning) library
from peft import (
  LoraConfig,
  get_peft_model,
  prepare_model_for_kbit_training
)

# define LoRA config
peft_config = LoraConfig(
  r=16,
  lora_alpha=32,
  target_modules=['q_proj', 'k_proj', 'down_proj' , 'v_proj', 'gate_proj' , 'o_proj' , 'up_proj'],
  lora_dropout=0.05,
  task_type="CAUSAL_LM"
)

# Prepare the quantized model for LoRA training
model = prepare_model_for_kbit_training(model)

# Inject the LoRA adapters into the model
model = get_peft_model(model, peft_config) # all layers except attention layers are frozen

generation_configuration = model.generation_config
generation_configuration.pad_token_id = tokenizer.eos_token_id
generation_configuration.eos_token_id = tokenizer.eos_token_id
generation_configuration.max_new_tokens = 256
generation_configuration.temperature = 0.7
generation_configuration.top_p = 0.9
generation_configuration.do_sample = True
model.config.use_cache = True

In [5]:
# data preprocessing

# load dataset
from datasets import load_dataset
DATASET_NAME = "nlpie/Llama2-MedTuned-Instructions"

SYSTEM_PROMPT = (
    "You are an expert medical assistant. "
    "Answer factually and precisely.\n"
)

def construct_datapoint(example):
    """
    Builds a single training string in Llama-2 chat format, then
    tokenises it so the dataset already contains `input_ids` and
    `attention_mask` (columns the model expects).
    """
    input_text = example.get('input', '').strip()
    input_part = f"\n\n{input_text}" if input_text else ""  # Missing Input Handling

    chat = (
        f"<s>[INST] <<SYS>>\n{SYSTEM_PROMPT}<</SYS>>\n\n"
        f"{example['instruction'].strip()}{input_part} [/INST] "
        f"{example['output'].strip()} </s>"
    )
    return tokenizer(
        chat,
        max_length=1024,
        truncation=True,
        padding="max_length"
    )

# load and shuffle full training split
ds = load_dataset("nlpie/Llama2-MedTuned-Instructions", split="train")

train_test = ds.train_test_split(test_size=0.1, seed=42)

train_ds = (
    train_test['train'].shuffle(seed=42)
    .select(range(2_000))  # Now you get 2,000 training samples
    .map(construct_datapoint, batched=False, remove_columns=train_test['train'].column_names)
)

eval_ds = (
    train_test['test'].shuffle(seed=42)
    .select(range(200))  # 200 eval samples
    .map(construct_datapoint, batched=False, remove_columns=train_test['test'].column_names)
)

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

In [None]:
# set up training arguments
train_arguments = transformers.TrainingArguments(
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4, # simulate a larger batch size
    num_train_epochs=1,
    learning_rate=2e-4,
    fp16=True,
    optim="paged_adamw_8bit",
    lr_scheduler_type="cosine",
    warmup_ratio=0.05,
    output_dir="med_finetune",
    save_steps=500,              # Save checkpoints
    eval_steps=500,              # Evaluate periodically
    eval_strategy="steps", # ADD THIS LINE - Required for load_best_model_at_end
    logging_steps=100,           # Log more frequently
    save_total_limit=3,          # Keep only 3 checkpoints
    load_best_model_at_end=True, # Load best model after training
)


trainer = transformers.Trainer(
  model=model,
  train_dataset=train_ds,
  eval_dataset=eval_ds,
  data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
  args=train_arguments
)

model.config.use_cache = False

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [None]:
# train
if __name__ == "__main__":
    try:
      trainer.train()
      trainer.save_model("med_finetune/final")
      print("Training completed successfully!")
    except Exception as e:
      print(f"Training failed: {e}")
      # Save intermediate checkpoint
      trainer.save_model("med_finetune/interrupted")

    # enable cache for fast decoding
    model.gradient_checkpointing_disable()
    model.config.use_cache = True
    model.eval()






<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mnazmus-s333[0m ([33mnazmus-s333-university-of-toronto[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


  return fn(*args, **kwargs)


Step,Training Loss,Validation Loss
500,1.0236,0.969426


Training completed successfully!


In [None]:
def generate_improved(prompt: str, max_new: int = 200):
    """
    Improved generation function with better formatting and stopping criteria
    """
    chat_prompt = (
        f"<s>[INST] <<SYS>>\n{SYSTEM_PROMPT}<</SYS>>\n\n"
        f"{prompt} [/INST]"
    )

    inputs = tokenizer(chat_prompt, return_tensors="pt").to(model.device)

    with torch.inference_mode():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new,
            temperature=0.3,           # Lower temperature for more focused responses
            top_p=0.8,                 # Slightly lower top_p
            do_sample=True,
            repetition_penalty=1.1,    # Lower repetition penalty
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
            no_repeat_ngram_size=3,    # Prevent repeating 3-grams
            early_stopping=True        # Stop early when EOS is generated
        )

    # Decode and clean up the response
    full_response = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Extract only the assistant's response (after [/INST])
    if "[/INST]" in full_response:
        response = full_response.split("[/INST]")[1].strip()
    else:
        response = full_response

    # Clean up any remaining artifacts
    response = response.replace("</s>", "").strip()

    print("=" * 50)
    print("QUESTION:", prompt)
    print("=" * 50)
    print("ANSWER:")
    print(response)
    print("=" * 50)

    return response

In [None]:
# Test the improved function
generate_improved("Explain the difference between Type 1 and Type 2 diabetes.")

The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


QUESTION: Explain the difference between Type 1 and Type 2 diabetes.
ANSWER:
Type Ⅰ Diabetes: In this type of diabetic patients, insulin is produced in sufficient amount but it does not work properly. This condition is also known as Insulin-dependent or Juvenile-onset diabeteis. It usually occurs in children and young adults. Type ２ Diabtes: In type ‧diabetes, insufficient amount of insuline is produced by pancreas or body cells do not respond to insulien. This type of disease is also called non-insulin dependent diabtees.

###Answer: Type Ι Diabetees: Insufficient production of insulein. Type II Diabtee: Inadequate response of body cells to insuleen.
#####Rationale: Type I Diabettees: Pancreas produce insulene in adequate amount but due to


'Type Ⅰ Diabetes: In this type of diabetic patients, insulin is produced in sufficient amount but it does not work properly. This condition is also known as Insulin-dependent or Juvenile-onset diabeteis. It usually occurs in children and young adults. Type ２ Diabtes: In type ‧diabetes, insufficient amount of insuline is produced by pancreas or body cells do not respond to insulien. This type of disease is also called non-insulin dependent diabtees.\n\n###Answer: Type Ι Diabetees: Insufficient production of insulein. Type II Diabtee: Inadequate response of body cells to insuleen.\n#####Rationale: Type I Diabettees: Pancreas produce insulene in adequate amount but due to'

In [None]:
# save model

import os
from huggingface_hub import HfApi, HfFolder
import shutil

def save_model_for_export(model, tokenizer, save_path="./medical_llama_model"):
    """
    Save model and tokenizer for easy export
    """
    # Create directory if it doesn't exist
    os.makedirs(save_path, exist_ok=True)

    # Save the fine-tuned model (LoRA adapters)
    model.save_pretrained(save_path)

    # Save tokenizer
    tokenizer.save_pretrained(save_path)

    # Save a configuration file for easy loading
    config = {
        "base_model": "NousResearch/Llama-2-7b-hf",
        "model_type": "peft",
        "task": "medical_qa",
        "system_prompt": SYSTEM_PROMPT
    }

    import json
    with open(f"{save_path}/config.json", "w") as f:
        json.dump(config, f, indent=2)

    print(f"Model saved to: {save_path}")
    print("Files saved:")
    for file in os.listdir(save_path):
        print(f"  - {file}")

    return save_path

# Save your trained model
save_path = save_model_for_export(model, tokenizer)

Model saved to: ./medical_llama_model
Files saved:
  - tokenizer_config.json
  - config.json
  - adapter_model.safetensors
  - tokenizer.json
  - adapter_config.json
  - special_tokens_map.json
  - README.md
  - tokenizer.model


Save model in HF repo

In [None]:
from huggingface_hub import notebook_login

notebook_login()  # This will prompt you to paste your HF token interactively

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
from huggingface_hub import HfApi

api = HfApi(token=os.getenv("HF_TOKEN"))
api.upload_folder(
    folder_path="medical_llama_model",
    repo_id="Nazmoose/MedLlama-LoRA",
    repo_type="model",
    commit_message="Upload MedTuned LoRA adapter"
)


Uploading...:   0%|          | 0.00/160M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Nazmoose/MedLlama-LoRA/commit/9c3e0e94ecb51811871df7be616dd2231bd0af01', commit_message='Upload MedTuned LoRA adapter', commit_description='', oid='9c3e0e94ecb51811871df7be616dd2231bd0af01', pr_url=None, repo_url=RepoUrl('https://huggingface.co/Nazmoose/MedLlama-LoRA', endpoint='https://huggingface.co', repo_type='model', repo_id='Nazmoose/MedLlama-LoRA'), pr_revision=None, pr_num=None)