In [None]:
# Install required libraries
%pip install -q torch transformers peft bitsandbytes trl datasets pandas accelerate

# Install specific version for compatibility
%pip install -q -U transformers==4.44.0


In [None]:
import gc
import json
import os
import warnings

import pandas as pd
import torch
from datasets import Dataset
from peft import (
    LoraConfig,
    PeftModel,
    TaskType,
    get_peft_model,
    prepare_model_for_kbit_training,
)
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
)
from trl import SFTTrainer
# Silence library warnings for the remainder of the notebook
warnings.filterwarnings('ignore')

# Report which GPU (if any) this session can see
if not torch.cuda.is_available():
    print("WARNING: No GPU detected! This notebook requires a GPU.")
else:
    print(f"GPU Available: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB")
    print(f"CUDA Version: {torch.version.cuda}")

# Device used later when moving tokenized inputs
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")


In [None]:
# Read the question/answer table from the working directory
df = pd.read_csv('medquad.csv')

# Size, schema and a short preview
print(f"Dataset shape: {df.shape}")
print(f"Columns: {df.columns.tolist()}")
print("\nFirst 3 rows:", df.head(3), sep="\n")

# Per-column count of missing values
print("\nMissing values:", df.isnull().sum(), sep="\n")

# Frequency tables for the two columns that could be used for filtering
for _heading, _column in (
    ("\nUnique sources:", 'source'),
    ("\nUnique focus areas:", 'focus_area'),
):
    print(_heading)
    print(df[_column].value_counts())


In [None]:
# Format the dataset for instruction-following
def format_instruction(row):
    """Build the "Instruction / Response" training string for one row.

    Args:
        row: Any object indexable by the keys 'question' and 'answer'
            (the notebook passes a pandas row).

    Returns:
        str: "Instruction: <question>" and "Response: <answer>" joined by a
        single newline, with both values interpolated as-is.
    """
    question, answer = row['question'], row['answer']
    return f"Instruction: {question}\nResponse: {answer}"

# Create formatted dataset
df['text'] = df.apply(format_instruction, axis=1)

# Exclude rows with a missing question or answer from the training pool:
# format_instruction interpolates values as-is, so a missing value would end
# up in the training text as the literal string "nan".
df_valid = df.dropna(subset=['question', 'answer'])
print(f"Dropped {len(df) - len(df_valid)} rows with a missing question or answer")

# Sample the dataset to fit within memory constraints (1000 examples as requested)
df_sample = df_valid.sample(n=min(1000, len(df_valid)), random_state=42)
print(f"Using {len(df_sample)} examples for fine-tuning")

# Display a sample formatted text
print("\nSample formatted text:")
print(df_sample['text'].iloc[0][:500] + "...")


In [None]:
# Model ID
# NOTE(review): confirm this repository id exists on the Hugging Face Hub and
# that the installed transformers release supports its architecture.
model_id = "google/medgemma-4b-multimodal"

# Configure 4-bit quantization (NF4 weights, double quantization, bf16 compute)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id)
# Only fall back to EOS when the tokenizer ships without a pad token.
# Overwriting an existing pad token with EOS makes the language-modeling
# collator (which masks every label equal to pad_token_id) also mask the real
# EOS, so the model is never trained to stop generating.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Load model with quantization
# Note: If model loading fails due to access restrictions, you may need to:
# 1. Accept the model's license agreement on HuggingFace
# 2. Use your HuggingFace token: from huggingface_hub import login; login()
# NOTE(review): trust_remote_code=True executes Python code shipped in the
# model repository; drop it if the architecture is supported natively.
print("Loading model with 4-bit quantization...")
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
)

# Check memory usage
if torch.cuda.is_available():
    print(f"GPU memory allocated: {torch.cuda.memory_allocated() / 1024**3:.2f} GB")
    print(f"GPU memory reserved: {torch.cuda.memory_reserved() / 1024**3:.2f} GB")


In [None]:
# Configure LoRA
lora_config = LoraConfig(
    r=16,  # Rank
    lora_alpha=32,  # Alpha parameter for LoRA scaling
    lora_dropout=0.05,  # Dropout probability
    bias="none",
    task_type=TaskType.CAUSAL_LM,
    target_modules=["q_proj", "v_proj"],  # Target attention modules
    # Note: PEFT does NOT fall back to other modules. If none of these names
    # match a module in the loaded model, get_peft_model raises an error.
    # Inspect `model.named_modules()` to find the projection names to use.
)

# Enable gradient checkpointing to save memory
model.gradient_checkpointing_enable()

# Prepare model for k-bit training
model = prepare_model_for_kbit_training(model)

# Get PEFT model
model = get_peft_model(model, lora_config)

# Print trainable parameters.
# Use PEFT's own counter: summing p.numel() undercounts 4-bit quantized
# weights (they are stored packed), which inflates the trainable percentage.
trainable_params, all_params = model.get_nb_trainable_parameters()
print(f"Trainable parameters: {trainable_params:,} ({100 * trainable_params / all_params:.2f}%)")
print(f"All parameters: {all_params:,}")


In [None]:
# Convert to HuggingFace Dataset.
# df_sample is a random sample, so its index is not a plain range; without
# preserve_index=False that index is stored as an extra
# "__index_level_0__" column next to "text".
dataset = Dataset.from_pandas(df_sample[['text']], preserve_index=False)

# Define a preprocessing function
def preprocess_function(examples):
    """Tokenize a batch of formatted examples for causal language modeling.

    Args:
        examples: Mapping with a 'text' key holding a list of strings.

    Returns:
        The tokenizer output with an added "labels" tensor: a copy of
        "input_ids" in which padded positions are set to -100 so they do not
        contribute to the loss.
    """
    # Add end-of-sequence token to ensure proper generation stopping
    texts = [text + tokenizer.eos_token for text in examples['text']]

    # Tokenize with truncation and padding
    model_inputs = tokenizer(
        texts,
        truncation=True,
        padding=True,
        max_length=512,  # Adjust based on your GPU memory
        return_tensors="pt"
    )

    # Labels mirror input_ids, except on padding. The mask comes from
    # attention_mask rather than pad_token_id because the pad token may share
    # its id with EOS, and the real EOS must remain a training target.
    labels = model_inputs["input_ids"].clone()
    labels[model_inputs["attention_mask"] == 0] = -100
    model_inputs["labels"] = labels

    return model_inputs

# Tokenization is left to SFTTrainer; only the raw dataset size is reported here
print("Dataset prepared with {} examples".format(len(dataset)))


In [None]:
# Training arguments
# remove_unused_columns is left at its default (True): with a non-packed
# dataset SFTTrainer only drops the raw "text" column after tokenizing when
# that flag is on, and the default collator cannot batch string columns.
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=1,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    optim="paged_adamw_32bit",
    save_steps=100,
    logging_steps=25,
    learning_rate=2e-5,
    weight_decay=0.001,
    bf16=True,
    max_grad_norm=0.3,
    max_steps=-1,
    # NOTE: the "constant" schedule applies no warmup, so warmup_ratio has no
    # effect; switch to "constant_with_warmup" if warmup is intended.
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="constant",
    report_to="tensorboard",
    save_total_limit=2,
    dataloader_pin_memory=False,
)

print("Training arguments configured:")
print(f"Batch size: {training_args.per_device_train_batch_size}")
print(f"Gradient accumulation steps: {training_args.gradient_accumulation_steps}")
print(f"Effective batch size: {training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps}")
print(f"Learning rate: {training_args.learning_rate}")
print(f"Number of epochs: {training_args.num_train_epochs}")
print(f"Mixed precision: BF16 = {training_args.bf16}")


In [None]:
# Build the supervised fine-tuning trainer from the objects prepared above
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    tokenizer=tokenizer,
    peft_config=lora_config,
    dataset_text_field="text",
    max_seq_length=512,
    packing=False,
)

# Snapshot GPU memory ahead of the training run
if torch.cuda.is_available():
    _gib = 1024**3
    _allocated_bytes = torch.cuda.memory_allocated()
    _reserved_bytes = torch.cuda.memory_reserved()
    _capacity_bytes = torch.cuda.get_device_properties(0).total_memory
    print("GPU memory before training:")
    print(f"  Allocated: {_allocated_bytes / _gib:.2f} GB")
    print(f"  Reserved: {_reserved_bytes / _gib:.2f} GB")
    print(f"  Available: {(_capacity_bytes - _reserved_bytes) / _gib:.2f} GB")

print("\nStarting training...")
print("Note: Training will take some time. Monitor GPU memory usage with 'nvidia-smi' in another terminal.")


In [None]:
# Start training
try:
    trainer.train()
except Exception as e:
    print(f"Training failed with error: {e}")
    print("This might be due to:")
    print("1. Insufficient GPU memory - try reducing batch_size or max_seq_length")
    print("2. Model access issues - ensure you have access to the MedGemma model")
    print("3. CUDA compatibility issues - check your PyTorch and CUDA versions")
    # Re-raise so "Run All" stops here. Swallowing the error would let the
    # following cells save and evaluate an adapter that was never trained.
    raise
else:
    print("Training completed successfully!")
finally:
    # Check memory after training (reported on success and on failure)
    if torch.cuda.is_available():
        print(f"\nGPU memory after training:")
        print(f"  Allocated: {torch.cuda.memory_allocated() / 1024**3:.2f} GB")
        print(f"  Reserved: {torch.cuda.memory_reserved() / 1024**3:.2f} GB")


In [None]:
# Save the fine-tuned model
output_dir = "./finetuned_medgemma_4b"

# Save the LoRA adapters
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

print(f"Fine-tuned model saved to: {output_dir}")
print("Contents:")
for file in os.listdir(output_dir):
    print(f"  - {file}")

# LoraConfig converts a list of target modules into a set, which json cannot
# serialize; turn it back into a (sorted, hence stable) list before dumping.
target_modules = lora_config.target_modules
if isinstance(target_modules, (set, frozenset, tuple)):
    target_modules = sorted(target_modules)

# Save training arguments for reference
training_config = {
    "model_id": model_id,
    "lora_config": {
        "r": lora_config.r,
        "lora_alpha": lora_config.lora_alpha,
        "lora_dropout": lora_config.lora_dropout,
        "target_modules": target_modules,
    },
    "training_args": {
        "learning_rate": training_args.learning_rate,
        "num_train_epochs": training_args.num_train_epochs,
        "per_device_train_batch_size": training_args.per_device_train_batch_size,
        "gradient_accumulation_steps": training_args.gradient_accumulation_steps,
    },
    "dataset_size": len(dataset),
}

with open(os.path.join(output_dir, "training_config.json"), "w") as f:
    json.dump(training_config, f, indent=2)

print("\nTraining configuration saved to training_config.json")


In [None]:
# Clear GPU memory.
# torch.cuda.empty_cache() can only return memory that nothing references any
# more, so the training objects are released first; otherwise the training
# copy and the inference copy of the base model sit on the GPU together.
# pop() keeps this cell re-runnable after the names are already gone.
# `model` and `trainer` are not used by any later cell; re-run the training
# cells if they are needed again.
for _name in ("trainer", "model"):
    globals().pop(_name, None)
gc.collect()
torch.cuda.empty_cache()

# Load the fine-tuned model for inference
print("Loading fine-tuned model for inference...")

# Load base model with quantization
base_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
)

# Load the fine-tuned LoRA adapters
finetuned_model = PeftModel.from_pretrained(base_model, output_dir)

# Set model to evaluation mode
finetuned_model.eval()

print("Fine-tuned model loaded successfully!")


In [None]:
# Test with a sample question.
# Question and reference answer are read from the SAME row; looking the answer
# up by question text can return a different row when a question is duplicated.
sample_row = df_sample.iloc[0]
sample_question = sample_row['question']
print(f"Sample Question: {sample_question}")
print("\n" + "="*50)

# Format the input
test_input = f"Instruction: {sample_question}\nResponse:"

# Tokenize the input and place it on the same device as the model
inputs = tokenizer(test_input, return_tensors="pt").to(finetuned_model.device)

# Generate response
print("Generating response...")
with torch.no_grad():
    outputs = finetuned_model.generate(
        **inputs,
        max_new_tokens=256,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        pad_token_id=tokenizer.eos_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )

# Decode only the newly generated tokens. Splitting the full text on
# "Response:" would cut the answer short whenever the model itself emits
# that marker.
prompt_length = inputs["input_ids"].shape[1]
generated_response = tokenizer.decode(
    outputs[0][prompt_length:], skip_special_tokens=True
).strip()

print(f"Generated Response: {generated_response}")
print("\n" + "="*50)

# Compare with original answer
original_answer = sample_row['answer']
print(f"Original Answer: {original_answer}")


In [None]:
# Summarise GPU memory at the end of the run and print tuning hints
if not torch.cuda.is_available():
    print("No GPU detected. This notebook requires a CUDA-capable GPU.")
else:
    _bytes_per_gb = 1024**3
    allocated = torch.cuda.memory_allocated() / _bytes_per_gb
    reserved = torch.cuda.memory_reserved() / _bytes_per_gb
    total = torch.cuda.get_device_properties(0).total_memory / _bytes_per_gb

    for _line in (
        "Final GPU Memory Usage:",
        f"  Allocated: {allocated:.2f} GB",
        f"  Reserved: {reserved:.2f} GB",
        f"  Total GPU Memory: {total:.2f} GB",
        f"  Memory Utilization: {(reserved/total)*100:.1f}%",
    ):
        print(_line)

    # Pick the advice that matches the reserved-memory band (in GB)
    if reserved > 20:
        _advice = [
            "⚠️  High memory usage detected (>20GB). Consider:",
            "   - Reducing batch_size from 4 to 2",
            "   - Reducing max_seq_length from 512 to 256",
            "   - Using gradient_accumulation_steps=8 to maintain effective batch size",
        ]
    elif reserved > 15:
        _advice = ["⚡ Good memory usage (15-20GB). Current settings are optimal."]
    else:
        _advice = [
            "✅ Low memory usage (<15GB). You could:",
            "   - Increase batch_size to 8 for faster training",
            "   - Increase max_seq_length to 1024 for longer sequences",
            "   - Use a larger dataset sample",
        ]

    print("\nMemory Optimization Suggestions:")
    for _line in _advice:
        print(_line)

    # Point the reader at nvidia-smi for live monitoring
    print("\nFor real-time monitoring, run this command in terminal:")
    print("nvidia-smi -l 1")


In [None]:
def generate_medical_response(question, model=finetuned_model, tokenizer=tokenizer, max_length=256):
    """
    Generate a medical response for a given question using the fine-tuned model.

    The prompt uses the same "Instruction: ... / Response:" template as the
    training text. Sampling is enabled (temperature 0.7, top_p 0.9), so
    repeated calls can return different answers.

    Args:
        question (str): The medical question to answer
        model: The fine-tuned model (must expose `.device` and `.generate`)
        tokenizer: The tokenizer
        max_length (int): Maximum number of NEW tokens to generate

    Returns:
        str: The generated continuation only (prompt excluded), stripped of
        surrounding whitespace
    """
    # Format the input
    test_input = f"Instruction: {question}\nResponse:"

    # Tokenize and move to the model's own device, so the function does not
    # depend on a notebook-level `device` variable matching the model.
    inputs = tokenizer(test_input, return_tensors="pt").to(model.device)

    # Generate
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_length,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )

    # Decode only the tokens that follow the prompt. Splitting the decoded
    # text on "Response:" would truncate answers that contain that marker.
    prompt_length = inputs["input_ids"].shape[1]
    new_tokens = outputs[0][prompt_length:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

# A few hand-written prompts to try against the fine-tuned model
test_questions = [
    "What are the symptoms of diabetes?",
    "How is hypertension treated?",
    "What causes heart disease?",
]

print("Testing with custom questions:")
print("=" * 60)

# Numbering starts at 1 for display
for i, question in enumerate(test_questions, start=1):
    response = generate_medical_response(question)
    print(f"\nTest {i}:")
    print(f"Question: {question}")
    print(f"Response: {response}")
    print("-" * 40)
