# 1. ENVIRONMENT SETUP
Install required libraries, check for GPU availability, and authenticate using environment variables.

In [None]:
# Install runtime dependencies for fine tuning
!pip install -q unsloth==0.5.0 datasets evaluate rouge-score peft accelerate bitsandbytes transformers==4.41.2 trl gradio==4.36.1 --extra-index-url https://download.pytorch.org/whl/cu121

import os
import platform
import torch

# Use the GPU when torch can see one, otherwise fall back to the CPU
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Running on device: {device}")
print(f"Python version: {platform.python_version()}")

# Hugging Face login: the token comes from the environment, never from the notebook
hf_token = os.environ.get("HF_TOKEN", "")
if not hf_token:
    print("HF_TOKEN environment variable not set. Skipping login.")
else:
    from huggingface_hub import login
    login(token=hf_token)

# Weights and Biases login follows the same environment-variable pattern
wandb_api_key = os.environ.get("WANDB_API_KEY", "")
if not wandb_api_key:
    print("WANDB_API_KEY environment variable not set. W&B login skipped.")
else:
    import wandb
    wandb.login(key=wandb_api_key)


# 2. CONFIGURATION
Define base model, Hugging Face repo, Weights and Biases project, and core training hyperparameters.

In [None]:
# 4 bit base checkpoint from Unsloth used for both baseline and fine tuning
MODEL_NAME_BASE = "unsloth/Meta-Llama-3-8B-Instruct-bnb-4bit"
# Target Hub repo and W&B project; both can be overridden through the environment
HF_REPO_ID = os.environ.get("HF_REPO_ID", "your-username/llama3-medical-cot")
WANDB_PROJECT = os.environ.get("WANDB_PROJECT", "llama3-medical-cot")

# Core training settings read by the tokenization and training cells
hyperparameters = dict(
    num_train_epochs=1,
    train_batch_size=4,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    max_length=2048,
    warmup_ratio=0.03,
    weight_decay=0.01,
    logging_steps=10,
)

print(f"Base model: {MODEL_NAME_BASE}")
print(f"Hugging Face repo ID: {HF_REPO_ID}")
print(f"W&B project: {WANDB_PROJECT}")


# 3. DATASET PREPARATION
Load `pubmed_qa` (`pqa_labeled`), build train and validation splits, and format each example into `<think>` and `<response>` text pairs.

In [None]:
from datasets import load_dataset

# Fetch the pqa_labeled configuration of PubMedQA
raw_dataset = load_dataset("pubmed_qa", "pqa_labeled")

# Hold out 100 examples of the train split for validation (seeded so the split is repeatable)
split_dataset = raw_dataset["train"].train_test_split(test_size=100, seed=42)
train_dataset, val_dataset = split_dataset["train"], split_dataset["test"]

# Format each sample into reasoning and answer fields
system_prompt = (
    "You are a medical reasoning assistant. Provide step by step clinical thinking "
    "before giving a concise answer."
)


def _context_to_text(context):
    """Flatten a ``context`` value into a single plain-text string.

    PubMedQA stores ``context`` as a dict whose ``"contexts"`` entry is a list
    of passages; interpolating that dict directly would put its Python repr
    into the prompt. Dicts and lists are therefore joined with spaces, plain
    strings pass through unchanged, and ``None`` becomes an empty string.
    """
    if context is None:
        return ""
    if isinstance(context, dict):
        passages = context.get("contexts") or []
        if isinstance(passages, str):
            return passages
        return " ".join(str(passage) for passage in passages)
    if isinstance(context, (list, tuple)):
        return " ".join(str(passage) for passage in context)
    return str(context)


def format_sample(example):
    """Turn one raw example into prompt, response, and combined training text.

    Args:
        example: Mapping with optional ``"context"``, ``"question"`` and
            ``"long_answer"`` entries. Missing or ``None`` entries are treated
            as empty strings.

    Returns:
        dict with three string fields:
            ``"prompt"``: ``<think>`` tag, system prompt, context and question.
            ``"response"``: ``<response>`` tag followed by the long answer.
            ``"text"``: prompt and response separated by a newline.
    """
    context = _context_to_text(example.get("context", ""))
    question = example.get("question", "") or ""
    long_answer = example.get("long_answer", "") or ""
    thought = f"<think> {system_prompt} Context: {context} Question: {question}".strip()
    response = f"<response> {long_answer}".strip()
    return {
        "prompt": thought,
        "response": response,
        # The newline must be an escape sequence: a physical line break inside
        # a single-quoted f-string is a syntax error.
        "text": f"{thought}\n{response}",
    }

# Swap the raw columns for the prompt/response/text fields on both splits
raw_train_columns = train_dataset.column_names
train_dataset = train_dataset.map(format_sample, remove_columns=raw_train_columns)
raw_val_columns = val_dataset.column_names
val_dataset = val_dataset.map(format_sample, remove_columns=raw_val_columns)

# Show one formatted example and the resulting split sizes
first_example = train_dataset[0]
print(first_example)
print(f"Train size: {len(train_dataset)} | Validation size: {len(val_dataset)}")


# 4. TOKENIZATION
Load tokenizer and tokenize the formatted datasets with padding and truncation.

In [None]:
from transformers import AutoTokenizer

# Fast tokenizer that belongs to the base checkpoint
base_tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME_BASE,
    use_fast=True,
)
# Fall back to the EOS token for padding when no pad token is defined
if base_tokenizer.pad_token is None:
    base_tokenizer.pad_token = base_tokenizer.eos_token

# Tokenization function with truncation and padding

def tokenize_batch(batch):
    """Tokenize the ``"text"`` column of a batch for ``Dataset.map``.

    Args:
        batch: Mapping with a ``"text"`` entry holding a list of strings.

    Returns:
        The tokenizer output for the batch, truncated and padded to
        ``hyperparameters["max_length"]`` tokens per example.
    """
    # No return_tensors here: Dataset.map stores results as Arrow columns, so
    # building torch tensors first would only add a conversion round-trip.
    return base_tokenizer(
        batch["text"],
        max_length=hyperparameters["max_length"],
        truncation=True,
        padding="max_length",
    )

# Tokenize both splits in batches, keeping only the tokenizer outputs
train_tokenized = train_dataset.map(
    tokenize_batch,
    batched=True,
    remove_columns=train_dataset.column_names,
)
val_tokenized = val_dataset.map(
    tokenize_batch,
    batched=True,
    remove_columns=val_dataset.column_names,
)

print(train_tokenized)


# 5. BASELINE EVALUATION
Generate answers with the base model and compute ROUGE-L on a small validation subset.

In [None]:
import evaluate
from transformers import AutoModelForCausalLM, pipeline

# Un-tuned base checkpoint, used as the reference point for generation quality
base_model = AutoModelForCausalLM.from_pretrained(MODEL_NAME_BASE, device_map="auto")

# Greedy decoding capped at 128 new tokens per prompt
text_generation = pipeline(
    task="text-generation",
    model=base_model,
    tokenizer=base_tokenizer,
    max_new_tokens=128,
    do_sample=False,
)

# Keep the evaluation quick: at most the first 20 validation examples
subset = val_dataset.select(range(min(20, len(val_dataset))))
rouge = evaluate.load("rouge")

# Predictions contain only the newly generated text; references are the formatted answers
preds = [
    text_generation(example["prompt"], return_full_text=False)[0]["generated_text"]
    for example in subset
]
refs = [example["response"] for example in subset]

baseline_scores = rouge.compute(predictions=preds, references=refs, use_aggregator=True)
print("Baseline ROUGE-L:", baseline_scores.get("rougeL"))


# 6. LORA FINE TUNING
Load the model with Unsloth, apply LoRA using PEFT, and train with gradient accumulation and mixed precision while logging to Weights and Biases.

In [None]:
from unsloth import FastLanguageModel
from peft import LoraConfig
from trl import SFTTrainer
from transformers import TrainingArguments

# Load model in 4 bit for efficient training
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=MODEL_NAME_BASE,
    max_seq_length=hyperparameters["max_length"],
    load_in_4bit=True,
    dtype=None,
)

# Ensure tokenizer padding token is defined
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Configure LoRA. Unsloth's get_peft_model takes the LoRA settings as keyword
# arguments (not a LoraConfig object), so they are kept in one dict.
lora_kwargs = dict(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    lora_dropout=0.05,
    bias="none",
)
# Equivalent PEFT config object, kept under the original name for reference
lora_config = LoraConfig(task_type="CAUSAL_LM", **lora_kwargs)

# The adapter-wrapped model is the return value; it must be rebound so the
# trainer receives the PEFT model rather than the bare quantized base model.
model = FastLanguageModel.get_peft_model(model, **lora_kwargs)

# Match mixed precision to the hardware: bf16 where the GPU supports it,
# fp16 on other GPUs, full precision when no GPU is available.
use_bf16 = torch.cuda.is_available() and torch.cuda.is_bf16_supported()
use_fp16 = torch.cuda.is_available() and not use_bf16

# The Trainer's W&B integration reads the project name from the environment
if wandb_api_key:
    os.environ["WANDB_PROJECT"] = WANDB_PROJECT

# Training arguments
training_args = TrainingArguments(
    output_dir="outputs",
    num_train_epochs=hyperparameters["num_train_epochs"],
    per_device_train_batch_size=hyperparameters["train_batch_size"],
    gradient_accumulation_steps=hyperparameters["gradient_accumulation_steps"],
    learning_rate=hyperparameters["learning_rate"],
    weight_decay=hyperparameters["weight_decay"],
    warmup_ratio=hyperparameters["warmup_ratio"],
    logging_steps=hyperparameters["logging_steps"],
    fp16=use_fp16,
    bf16=use_bf16,
    report_to=["wandb"] if wandb_api_key else [],
    run_name=WANDB_PROJECT,
    max_grad_norm=1.0,
    save_strategy="no",
)

# Supervised fine tuning trainer; it tokenizes the "text" field itself
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    dataset_text_field="text",
    max_seq_length=hyperparameters["max_length"],
    args=training_args,
)

train_result = trainer.train()
print(train_result)


# 7. SAVE AND PUSH
Save the LoRA adapter and tokenizer, then push them to Hugging Face without the base model weights.

In [None]:
from huggingface_hub import HfApi

adapter_dir = "lora_adapter"
tokenizer_dir = "lora_tokenizer"

# Write the trained adapter and the tokenizer files to local folders
trainer.model.save_pretrained(adapter_dir)
tokenizer.save_pretrained(tokenizer_dir)

# Upload both folders only when a token and a repository ID are available
if not (hf_token and HF_REPO_ID):
    print("Skipping upload because HF_TOKEN or HF_REPO_ID is missing.")
else:
    api = HfApi()
    api.create_repo(HF_REPO_ID, exist_ok=True, token=hf_token)
    for folder in (adapter_dir, tokenizer_dir):
        api.upload_folder(folder_path=folder, repo_id=HF_REPO_ID, token=hf_token)
    print(f"Uploaded adapter and tokenizer to {HF_REPO_ID}")


# 8. POST TRAINING EVALUATION
Reload the base model with the LoRA adapter attached and recompute ROUGE-L to compare against the baseline.

In [None]:
from peft import PeftModel

# Fresh copy of the base checkpoint with the saved LoRA adapter attached on top
base_for_eval = PeftModel.from_pretrained(
    AutoModelForCausalLM.from_pretrained(MODEL_NAME_BASE, device_map="auto"),
    adapter_dir,
)

# Same greedy decoding settings as the baseline pipeline
finetuned_generation = pipeline(
    task="text-generation",
    model=base_for_eval,
    tokenizer=base_tokenizer,
    max_new_tokens=128,
    do_sample=False,
)

# Generate on the same validation subset that produced the baseline references
finetuned_preds = [
    finetuned_generation(example["prompt"], return_full_text=False)[0]["generated_text"]
    for example in subset
]

finetuned_scores = rouge.compute(predictions=finetuned_preds, references=refs, use_aggregator=True)
print("Baseline ROUGE-L:", baseline_scores.get("rougeL"))
print("Finetuned ROUGE-L:", finetuned_scores.get("rougeL"))


# 9. INFERENCE AND GRADIO
Provide a helper to generate medical answers and expose a minimal Gradio interface.

In [None]:
import gradio as gr

def generate_medical_answer(question: str, max_new_tokens: int = 128) -> str:
    """Generate an answer to a medical question with the fine-tuned pipeline.

    Args:
        question: Free-text question placed after the system prompt.
        max_new_tokens: Upper bound on generated tokens. Cast to ``int``
            because UI sliders may deliver the value as a float.

    Returns:
        The generated continuation only, without the prompt text.
    """
    # The newline must be an escape sequence: a physical line break inside a
    # single-quoted f-string is a syntax error.
    formatted_prompt = f"<think> {system_prompt} Question: {question}\n<response>"
    outputs = finetuned_generation(
        formatted_prompt,
        return_full_text=False,
        max_new_tokens=int(max_new_tokens),
    )
    return outputs[0]["generated_text"]

# Input widgets: a multi-line question box and a slider for the generation length
question_box = gr.Textbox(label="Medical Question", lines=4)
token_slider = gr.Slider(32, 256, value=128, step=8, label="Max New Tokens")

# Wire the widgets to the generation helper
interface = gr.Interface(
    fn=generate_medical_answer,
    inputs=[question_box, token_slider],
    outputs=gr.Textbox(label="Model Answer"),
    title="Medical Reasoning with Llama 3",
    description="LoRA finetuned medical reasoning model built with Unsloth.",
)

# Start the app only when this code runs as the main module
if __name__ == "__main__":
    interface.launch(share=False)


# 10. QUALITY CHECK
Remove unused code, keep comments concise, and verify the notebook runs cleanly from top to bottom in a fresh runtime.