In [None]:
%pip install -q torch transformers peft bitsandbytes datasets pandas accelerate huggingface_hub

%pip install -q -U transformers==4.44.0


In [None]:
# 🔐 Hugging Face authentication - run this before any cell that downloads the model.
# As written, this cell only prints instructions. Uncomment exactly ONE option below.

# Option 1: interactive login (prompts for the token, so it never lands in the notebook)
# from huggingface_hub import login
# login()

# Option 2: pass the token directly (never commit a real token)
# from huggingface_hub import login
# login(token="hf_your_token_here")

# Option 3: environment variable
# NOTE(review): the earlier version of this comment used "HUGGINGFACE_HUB_TOKEN";
# huggingface_hub documents HF_TOKEN as the variable it reads - confirm for your version.
# import os
# os.environ["HF_TOKEN"] = "hf_your_token_here"

_auth_help_lines = (
    "⚠️  IMPORTANT: Uncomment and run ONE of the authentication methods above!",
    "📝 Steps to get your token:",
    "1. Go to https://huggingface.co/google/medgemma-4b-multimodal",
    "2. Click 'Request Access' and accept the license agreement",
    "3. Wait for approval (usually takes a few minutes to hours)",
    "4. Get your token from https://huggingface.co/settings/tokens",
    "5. Uncomment and run one of the login methods above",
)
for _help_line in _auth_help_lines:
    print(_help_line)


In [None]:
import torch
import pandas as pd
import os
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
    pipeline
)
from peft import LoraConfig, get_peft_model, TaskType, PeftModel, prepare_model_for_kbit_training
from datasets import Dataset
from huggingface_hub import login
import warnings
warnings.filterwarnings('ignore')

# Remind the user that the model is gated and how to obtain access.
_rule = "=" * 50
for _banner_line in (
    "🔐 MedGemma Authentication Required",
    _rule,
    "MedGemma is a gated model that requires authentication.",
    "Please follow these steps:",
    "1. Go to https://huggingface.co/google/medgemma-4b-multimodal",
    "2. Click 'Request Access' and accept the license agreement",
    "3. Wait for approval (usually takes a few minutes to hours)",
    "4. Get your HuggingFace token from https://huggingface.co/settings/tokens",
    "5. Run the login command below with your token",
    _rule,
):
    print(_banner_line)

# Enable ONE of the following (all are commented out by default):
#
# Explicit token:
# login(token="your_huggingface_token_here")
#
# Environment variable.
# NOTE(review): huggingface_hub documents HF_TOKEN; the earlier comment used
# "HUGGINGFACE_HUB_TOKEN" - confirm for your version.
# os.environ["HF_TOKEN"] = "your_token_here"
#
# Interactive prompt:
# login()

print("\n⚠️  IMPORTANT: Uncomment one of the login methods above before proceeding!")
print("Without authentication, you'll get 'unauthorized' errors when loading MedGemma.")

# Report the accelerator, then pick the device used for inference inputs later on.
_gpu_present = torch.cuda.is_available()
if not _gpu_present:
    print("WARNING: No GPU detected! This notebook requires a GPU.")
else:
    _gpu_props = torch.cuda.get_device_properties(0)
    print(f"\n✅ GPU Available: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {_gpu_props.total_memory / 1024**3:.2f} GB")
    print(f"CUDA Version: {torch.version.cuda}")

device = torch.device("cuda" if _gpu_present else "cpu")
print(f"Using device: {device}")


In [None]:
# Read the CSV from the current working directory.
df = pd.read_csv('medquad.csv')

# Structural overview: size, column names and a small preview.
print(f"Dataset shape: {df.shape}")
print(f"Columns: {df.columns.tolist()}")
print("\nFirst 3 rows:", df.head(3), sep="\n")

# Per-column count of missing values.
print("\nMissing values:", df.isnull().sum(), sep="\n")

# Frequency tables for the two categorical columns this cell inspects.
for _heading, _column in (
    ("\nUnique sources:", 'source'),
    ("\nUnique focus areas:", 'focus_area'),
):
    print(_heading)
    print(df[_column].value_counts())


In [None]:
def format_instruction(row):
    """Render one question/answer record as a single training string.

    Args:
        row: Mapping-like record (for example a DataFrame row) that exposes
            the keys 'question' and 'answer'.

    Returns:
        str: "Instruction: <question>", a newline, then "Response: <answer>".
    """
    question, answer = row['question'], row['answer']
    return f"Instruction: {question}\nResponse: {answer}"

# Build the training text for every row; the column is kept on `df` itself.
df['text'] = df.apply(format_instruction, axis=1)

# A missing question or answer would be rendered by the f-string in
# format_instruction as the literal text "nan", producing junk training
# examples, so such rows are excluded before sampling.
df_valid = df.dropna(subset=['question', 'answer'])

# Fixed random_state keeps the subset reproducible across runs; at most 1000 rows.
df_sample = df_valid.sample(n=min(1000, len(df_valid)), random_state=42)
print(f"Using {len(df_sample)} examples for fine-tuning")

print("\nSample formatted text:")
print(df_sample['text'].iloc[0][:500] + "...")


In [None]:
model_id = "google/medgemma-4b-multimodal"

# Imported outside the try block so a missing package is not misreported as an
# authentication failure.
from huggingface_hub import HfApi

# Fail fast with actionable guidance if the Hub request for this repo fails.
# NOTE(review): a successful model_info() call may not by itself prove that the
# gated weight files can be downloaded - confirm against the Hub's behavior.
try:
    api = HfApi()
    model_info = api.model_info(model_id)
    print("✅ Authentication successful! Model access confirmed.")
except Exception as e:
    print("❌ Authentication failed!")
    print(f"Error: {e}")
    print("\n🔧 Quick Fix:")
    print("Run this in a new cell:")
    print("from huggingface_hub import login")
    print("login()  # This will prompt for your token")
    print("\nOr set your token directly:")
    print("login(token='your_huggingface_token_here')")
    print("\nThen re-run this cell.")
    # Chain the original error so the traceback still shows the real cause.
    raise RuntimeError("Please authenticate with HuggingFace first!") from e

# 4-bit NF4 quantization with double quantization; matmuls computed in bfloat16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

print("Loading tokenizer...")
try:
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    # Only fall back to EOS when the tokenizer ships without a pad token.
    # Overwriting an existing pad token with EOS makes the two ids identical, and
    # DataCollatorForLanguageModeling masks every label equal to pad_token_id, so
    # the EOS appended to each example would never contribute to the loss.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"
    print("✅ Tokenizer loaded successfully!")
except Exception as e:
    print(f"❌ Failed to load tokenizer: {e}")
    print("This usually means authentication is required.")
    # Bare raise preserves the original traceback.
    raise

print("Loading model with 4-bit quantization...")
try:
    # NOTE(review): trust_remote_code=True allows code from the model repo to be
    # executed; confirm it is actually required for this model.
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        quantization_config=bnb_config,
        device_map="auto",
        trust_remote_code=True,
        torch_dtype=torch.bfloat16,
    )
    print("✅ Model loaded successfully!")
except Exception as e:
    print(f"❌ Failed to load model: {e}")
    print("Common solutions:")
    print("1. Make sure you're authenticated with HuggingFace")
    print("2. Check that you have access to the MedGemma model")
    print("3. Verify your internet connection")
    raise

if torch.cuda.is_available():
    print(f"\n📊 GPU Memory Usage:")
    print(f"  Allocated: {torch.cuda.memory_allocated() / 1024**3:.2f} GB")
    print(f"  Reserved: {torch.cuda.memory_reserved() / 1024**3:.2f} GB")


In [None]:
# LoRA hyper-parameters: rank-16 adapters on the modules named q_proj and v_proj.
_lora_settings = dict(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.CAUSAL_LM,
    target_modules=["q_proj", "v_proj"],
)
lora_config = LoraConfig(**_lora_settings)

# Order matters: enable checkpointing, prepare the quantized model, then wrap it.
model.gradient_checkpointing_enable()
model = get_peft_model(prepare_model_for_kbit_training(model), lora_config)

# Count parameters in a single pass over the wrapped model.
trainable_params = 0
all_params = 0
for _param in model.parameters():
    _size = _param.numel()
    all_params += _size
    if _param.requires_grad:
        trainable_params += _size

print(f"Trainable parameters: {trainable_params:,} ({100 * trainable_params / all_params:.2f}%)")
print(f"All parameters: {all_params:,}")


In [None]:
dataset = Dataset.from_pandas(df_sample[['text']])

def tokenize_function(examples):
    """Tokenize a batch of formatted examples for causal language modeling.

    An EOS token is appended to every text before tokenizing. Sequences are
    truncated to 512 tokens and left unpadded; padding happens per batch in the
    data collator.

    No "labels" key is produced here. With padding=False the per-example label
    lists have different lengths, and the tokenizer's padding step used by
    DataCollatorForLanguageModeling only pads the tokenizer's own fields, so
    ragged labels cannot be stacked into a tensor. The collator (mlm=False)
    derives labels from the padded input_ids instead.

    Args:
        examples: Batch mapping with a 'text' list of strings.

    Returns:
        The tokenizer output for the batch (lists, not tensors).
    """
    texts = [text + tokenizer.eos_token for text in examples['text']]

    return tokenizer(
        texts,
        truncation=True,
        padding=False,
        max_length=512,
        return_tensors=None
    )

# Replace every original column with the tokenizer outputs.
tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=dataset.column_names,
    desc="Tokenizing dataset"
)

print(f"Tokenized dataset: {tokenized_dataset}")
print(f"Sample tokenized example length: {len(tokenized_dataset[0]['input_ids'])}")


In [None]:
# Hyper-parameters gathered in one mapping so they can be inspected before use.
# NOTE(review): lr_scheduler_type is "constant"; confirm whether warmup_ratio has
# any effect with that schedule (a "constant_with_warmup" schedule also exists).
_training_settings = {
    "output_dir": "./results",
    "num_train_epochs": 1,
    "per_device_train_batch_size": 4,
    "gradient_accumulation_steps": 4,
    "optim": "paged_adamw_32bit",
    "save_steps": 100,
    "logging_steps": 25,
    "learning_rate": 2e-5,
    "weight_decay": 0.001,
    "bf16": True,
    "max_grad_norm": 0.3,
    "max_steps": -1,
    "warmup_ratio": 0.03,
    "group_by_length": True,
    "lr_scheduler_type": "constant",
    "report_to": "tensorboard",
    "save_total_limit": 2,
    "dataloader_pin_memory": False,
    "remove_unused_columns": False,
}
training_args = TrainingArguments(**_training_settings)

# Echo the values that most affect memory use and convergence.
_per_device = training_args.per_device_train_batch_size
_accumulation = training_args.gradient_accumulation_steps
print("Training arguments configured:")
print(f"Batch size: {_per_device}")
print(f"Gradient accumulation steps: {_accumulation}")
print(f"Effective batch size: {_per_device * _accumulation}")
print(f"Learning rate: {training_args.learning_rate}")
print(f"Number of epochs: {training_args.num_train_epochs}")
print(f"Mixed precision: BF16 = {training_args.bf16}")


In [None]:
# Causal-LM collator (mlm=False); batches are padded to a multiple of 8.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False, pad_to_multiple_of=8)

# Plain Trainer operating on the pre-tokenized dataset.
_trainer_parts = dict(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)
trainer = Trainer(**_trainer_parts)

# Snapshot of GPU memory before the first optimizer step.
if torch.cuda.is_available():
    _gib = 1024**3
    _allocated_bytes = torch.cuda.memory_allocated()
    _reserved_bytes = torch.cuda.memory_reserved()
    _total_bytes = torch.cuda.get_device_properties(0).total_memory
    print("GPU memory before training:")
    print(f"  Allocated: {_allocated_bytes / _gib:.2f} GB")
    print(f"  Reserved: {_reserved_bytes / _gib:.2f} GB")
    print(f"  Available: {(_total_bytes - _reserved_bytes) / _gib:.2f} GB")

print("\nTrainer initialized with normal Trainer class")
print("Note: Training will take some time. Monitor GPU memory usage with 'nvidia-smi' in another terminal.")


In [None]:
# Run fine-tuning. On failure the troubleshooting hints are printed and the
# exception is re-raised: swallowing it would let "Run All" continue into the
# save and inference cells with an adapter that was never trained.
try:
    trainer.train()
    print("Training completed successfully!")
except Exception as e:
    print(f"Training failed with error: {e}")
    print("This might be due to:")
    print("1. Insufficient GPU memory - try reducing batch_size or max_length")
    print("2. Model access issues - ensure you have access to the MedGemma model")
    print("3. CUDA compatibility issues - check your PyTorch and CUDA versions")
    raise
finally:
    # Report memory whether training succeeded or failed.
    if torch.cuda.is_available():
        print(f"\nGPU memory after training:")
        print(f"  Allocated: {torch.cuda.memory_allocated() / 1024**3:.2f} GB")
        print(f"  Reserved: {torch.cuda.memory_reserved() / 1024**3:.2f} GB")


In [None]:
import json

output_dir = "./finetuned_medgemma_4b"

# Persist the adapter weights and the tokenizer side by side.
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

print(f"Fine-tuned model saved to: {output_dir}")
print("Contents:")
for file in os.listdir(output_dir):
    print(f"  - {file}")

# json cannot serialize a set, and PEFT may hold target_modules as one even
# though a list was passed in. Convert any non-string collection to a sorted
# list; a string or None is written as-is.
_target_modules = lora_config.target_modules
if _target_modules is not None and not isinstance(_target_modules, str):
    _target_modules = sorted(_target_modules)

# Record the key hyper-parameters next to the adapter for later reference.
training_config = {
    "model_id": model_id,
    "trainer_type": "normal_trainer",
    "lora_config": {
        "r": lora_config.r,
        "lora_alpha": lora_config.lora_alpha,
        "lora_dropout": lora_config.lora_dropout,
        "target_modules": _target_modules,
    },
    "training_args": {
        "learning_rate": training_args.learning_rate,
        "num_train_epochs": training_args.num_train_epochs,
        "per_device_train_batch_size": training_args.per_device_train_batch_size,
        "gradient_accumulation_steps": training_args.gradient_accumulation_steps,
    },
    "dataset_size": len(tokenized_dataset),
}

with open(os.path.join(output_dir, "training_config.json"), "w", encoding="utf-8") as f:
    json.dump(training_config, f, indent=2)

print("\nTraining configuration saved to training_config.json")


In [None]:
# Release cached CUDA blocks before loading a second copy of the base model.
torch.cuda.empty_cache()

print("Loading fine-tuned model for inference...")

# Same quantized loading options that were used for training.
_base_load_options = dict(
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
)
base_model = AutoModelForCausalLM.from_pretrained(model_id, **_base_load_options)

# Attach the saved adapter from output_dir and switch to evaluation mode.
finetuned_model = PeftModel.from_pretrained(base_model, output_dir)
finetuned_model.eval()

print("Fine-tuned model loaded successfully!")


In [None]:
# Use the first sampled question as a smoke test.
sample_question = df_sample['question'].iloc[0]
print(f"Sample Question: {sample_question}")
print("\n" + "="*50)

# Same prompt layout as the training text, stopping right after "Response:".
test_input = f"Instruction: {sample_question}\nResponse:"

# Move the inputs to the device the model reports for itself (placement was
# chosen by device_map="auto") instead of the notebook-level `device`.
inputs = tokenizer(test_input, return_tensors="pt").to(finetuned_model.device)
prompt_length = inputs["input_ids"].shape[-1]

print("Generating response...")
with torch.no_grad():
    outputs = finetuned_model.generate(
        **inputs,
        max_new_tokens=256,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        pad_token_id=tokenizer.eos_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )

# Full decoded text (prompt + continuation), kept for inspection.
response = tokenizer.decode(outputs[0], skip_special_tokens=True)

# Decode only the tokens produced after the prompt. Splitting the full text on
# "Response:" and keeping the last piece discards part of the answer whenever
# the generation itself contains that marker.
generated_response = tokenizer.decode(
    outputs[0][prompt_length:], skip_special_tokens=True
).strip()

print(f"Generated Response: {generated_response}")
print("\n" + "="*50)

# Reference answer for the same question.
original_answer = df_sample[df_sample['question'] == sample_question]['answer'].iloc[0]
print(f"Original Answer: {original_answer}")


In [None]:
# Summarize current GPU memory use and print tuning hints based on reserved GB.
if not torch.cuda.is_available():
    print("No GPU detected. This notebook requires a CUDA-capable GPU.")
else:
    _gib = 1024**3
    allocated = torch.cuda.memory_allocated() / _gib
    reserved = torch.cuda.memory_reserved() / _gib
    total = torch.cuda.get_device_properties(0).total_memory / _gib

    for _report_line in (
        "Final GPU Memory Usage:",
        f"  Allocated: {allocated:.2f} GB",
        f"  Reserved: {reserved:.2f} GB",
        f"  Total GPU Memory: {total:.2f} GB",
        f"  Memory Utilization: {(reserved/total)*100:.1f}%",
    ):
        print(_report_line)

    print("\nMemory Optimization Suggestions:")
    if reserved <= 15:
        print("✅ Low memory usage (<15GB). You could:")
        print("   - Increase batch_size to 8 for faster training")
        print("   - Increase max_length to 1024 for longer sequences")
        print("   - Use a larger dataset sample")
    elif reserved <= 20:
        print("⚡ Good memory usage (15-20GB). Current settings are optimal.")
    else:
        print("⚠️  High memory usage detected (>20GB). Consider:")
        print("   - Reducing batch_size from 4 to 2")
        print("   - Reducing max_length from 512 to 256")
        print("   - Using gradient_accumulation_steps=8 to maintain effective batch size")

    print("\nFor real-time monitoring, run this command in terminal:")
    print("nvidia-smi -l 1")


In [None]:
def generate_medical_response(question, model=finetuned_model, tokenizer=tokenizer, max_length=256):
    """
    Generate a medical response for a given question using the fine-tuned model.

    The prompt uses the same "Instruction: ...\\nResponse:" layout as the
    training text. Sampling is enabled (temperature 0.7, top_p 0.9), so repeated
    calls can return different text.

    Args:
        question (str): The medical question to answer
        model: The fine-tuned model
        tokenizer: The tokenizer
        max_length (int): Maximum number of new tokens to generate (forwarded
            to generate() as max_new_tokens)

    Returns:
        str: Generated medical response, without the prompt
    """
    test_input = f"Instruction: {question}\nResponse:"

    # Prefer the device the model reports for itself; fall back to the
    # notebook-level `device` for objects that expose no `.device` attribute.
    target_device = getattr(model, "device", device)
    inputs = tokenizer(test_input, return_tensors="pt").to(target_device)
    prompt_length = inputs["input_ids"].shape[-1]

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_length,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )

    # Decode only the continuation. Splitting the full text on "Response:" and
    # keeping the last piece would truncate any answer containing that marker.
    generated_tokens = outputs[0][prompt_length:]
    return tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()

# A few hand-written prompts that are not taken from the sampled dataset.
test_questions = [
    "What are the symptoms of diabetes?",
    "How is hypertension treated?",
    "What causes heart disease?",
]

print("Testing with custom questions (Normal Trainer):")
print("="*60)

# Number the tests from 1 and print each question with its generated answer.
_divider = "-" * 40
for _number, question in zip(range(1, len(test_questions) + 1), test_questions):
    print(f"\nTest {_number}:")
    print(f"Question: {question}")
    response = generate_medical_response(question)
    print(f"Response: {response}")
    print(_divider)


In [None]:
# 🎉 Wrap-up for the Normal Trainer workflow: this cell only prints a checklist.

_bar = "=" * 50
_summary_lines = (
    "✅ Normal Trainer Implementation Complete!",
    _bar,
    "📋 What you have accomplished:",
    "1. ✅ Environment setup with authentication",
    "2. ✅ Dataset loading and preprocessing",
    "3. ✅ Model configuration with 4-bit quantization",
    "4. ✅ LoRA configuration",
    "5. ✅ Manual tokenization for Normal Trainer",
    "6. ✅ Training arguments optimization",
    "7. ✅ Data collator and trainer initialization",
    "8. ✅ Training execution",
    "9. ✅ Model saving",
    "10. ✅ Inference testing",
    _bar,
    "\n🔧 To fix your 'unauthorized' error:",
    "1. Run the authentication cell (cell 4)",
    "2. Follow the authentication steps",
    "3. Re-run the model loading cell (cell 9)",
    "4. Continue with the rest of the training",
    "\n💡 The Normal Trainer approach gives you:",
    "- Full control over tokenization",
    "- Manual dataset preprocessing",
    "- Explicit label handling",
    "- Better debugging capabilities",
    "- More flexibility for custom training",
    "\n🚀 Ready to train your MedGemma model!",
    "All SFTTrainer cells have been removed for clarity.",
)
for _summary_line in _summary_lines:
    print(_summary_line)


In [None]:
# Placeholder left where a duplicate cell used to be; it only prints a notice.
for _notice in (
    "ℹ️  SFTTrainer import section removed",
    "📋 Please use the Normal Trainer section above instead",
):
    print(_notice)


In [None]:
# 🔧 Troubleshooting text for "unauthorized" errors: this cell only prints guidance.

_wide_bar = "="*60
_help_lines = (
    "❌ Getting 'unauthorized' error when accessing MedGemma?",
    "✅ Here's the solution:",
    "",
    "1. 🌐 Go to: https://huggingface.co/google/medgemma-4b-multimodal",
    "2. 🔑 Click 'Request Access' button",
    "3. ✅ Accept the license agreement",
    "4. ⏳ Wait for approval (usually takes a few minutes to hours)",
    "5. 🎫 Get your token from: https://huggingface.co/settings/tokens",
    "6. 🔐 Run the authentication cell above (cell 4)",
    "7. ▶️ Continue with Normal Trainer section (cells 1-30)",
    "",
    "💡 Most common issue: Forgetting to request access to the model!",
    "📧 You'll get an email when access is approved.",
    "",
    "🚀 Once authenticated, the Normal Trainer section will work perfectly!",
    # Closing banner about the SFTTrainer cells.
    "\n" + _wide_bar,
    "📋 NOTE: All SFTTrainer cells have been removed from this notebook",
    "🎯 Focus on the Normal Trainer approach (cells 1-30) for your fine-tuning",
    _wide_bar,
)
for _help_line in _help_lines:
    print(_help_line)


In [None]:
## 🎯 Summary: Normal Trainer Only

This notebook has been cleaned up to focus **only on the Normal Trainer approach** (cells 1-30).

### ✅ What's Working:
- **Normal Trainer**: Complete implementation with manual tokenization
- **Authentication**: Added proper HuggingFace authentication
- **Memory Optimization**: Configured for RTX 4090 24GB VRAM
- **LoRA Fine-tuning**: Efficient parameter updates

### 🔧 To Fix Your "Unauthorized" Error:

1. **Request Access**: https://huggingface.co/google/medgemma-4b-multimodal
2. **Get Token**: https://huggingface.co/settings/tokens
3. **Run Authentication Cell**: Use cell 4 above
4. **Continue with Normal Trainer**: Cells 1-30 contain everything you need

### 🚀 Next Steps:
1. Authenticate using cell 4
2. Run the Normal Trainer section (cells 1-30)
3. The model will fine-tune successfully once authenticated

### 📋 Note:
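The SFTTrainer cells were meant to be removed for clarity; any cell after this summary that still references `SFTTrainer` is a leftover and is not part of the supported workflow. The Normal Trainer approach is more flexible and gives you full control over the training process.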


In [None]:
# Model ID
model_id = "google/medgemma-4b-multimodal"

# Configure 4-bit quantization (NF4 with double quantization, bfloat16 compute)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id)
# Only fall back to EOS when the tokenizer has no pad token of its own.
# Overwriting an existing pad token makes padding and EOS share one id, so any
# collator that masks padding by id also masks the real end-of-sequence token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Load model with quantization
# Note: If model loading fails due to access restrictions, you may need to:
# 1. Accept the model's license agreement on HuggingFace
# 2. Use your HuggingFace token: from huggingface_hub import login; login()
# NOTE(review): trust_remote_code=True allows code from the model repo to be
# executed; confirm it is actually required for this model.
print("Loading model with 4-bit quantization...")
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
)

# Check memory usage
if torch.cuda.is_available():
    print(f"GPU memory allocated: {torch.cuda.memory_allocated() / 1024**3:.2f} GB")
    print(f"GPU memory reserved: {torch.cuda.memory_reserved() / 1024**3:.2f} GB")


In [None]:
# LoRA adapter settings: rank 16, alpha 32 (scaling), dropout 0.05, applied to
# the modules named q_proj and v_proj.
# NOTE(review): nothing in this notebook shows PEFT picking substitute modules
# automatically; if these names do not exist in the loaded model, expect
# get_peft_model to fail and adjust target_modules explicitly.
_adapter_options = {
    "r": 16,
    "lora_alpha": 32,
    "lora_dropout": 0.05,
    "bias": "none",
    "task_type": TaskType.CAUSAL_LM,
    "target_modules": ["q_proj", "v_proj"],
}
lora_config = LoraConfig(**_adapter_options)

# Gradient checkpointing trades compute for memory.
model.gradient_checkpointing_enable()

# Prepare the quantized model for k-bit training, then wrap it with the adapters.
# (prepare_model_for_kbit_training is already imported with the other peft names.)
_prepared_model = prepare_model_for_kbit_training(model)
model = get_peft_model(_prepared_model, lora_config)

# Report how many parameters will actually be updated.
_sizes = [(p.numel(), p.requires_grad) for p in model.parameters()]
trainable_params = sum(size for size, needs_grad in _sizes if needs_grad)
all_params = sum(size for size, _ in _sizes)
print(f"Trainable parameters: {trainable_params:,} ({100 * trainable_params / all_params:.2f}%)")
print(f"All parameters: {all_params:,}")


In [None]:
# Wrap the sampled text column in a HuggingFace Dataset.
dataset = Dataset.from_pandas(df_sample[['text']])

def preprocess_function(examples):
    """Tokenize a batch of texts into padded tensors with matching labels.

    Every text gets the tokenizer's EOS token appended, is truncated to at most
    512 tokens and padded to the longest sequence in the batch. Labels are a
    clone of input_ids.

    Note: this helper is defined but not called anywhere in this cell.

    Args:
        examples: Batch mapping with a 'text' list of strings.

    Returns:
        The tokenizer output (PyTorch tensors) with an added "labels" entry.
    """
    eos = tokenizer.eos_token
    with_eos = [sample + eos for sample in examples['text']]

    batch = tokenizer(
        with_eos,
        truncation=True,
        padding=True,
        max_length=512,
        return_tensors="pt",
    )
    batch["labels"] = batch["input_ids"].clone()
    return batch

# Tokenization is left to the trainer configured further down.
print(f"Dataset prepared with {len(dataset)} examples")


In [None]:
# Training hyper-parameters, gathered in one mapping before construction.
# NOTE(review): lr_scheduler_type is "constant"; confirm whether warmup_ratio has
# any effect with that schedule (a "constant_with_warmup" schedule also exists).
_training_settings = {
    "output_dir": "./results",
    "num_train_epochs": 1,
    "per_device_train_batch_size": 4,
    "gradient_accumulation_steps": 4,
    "optim": "paged_adamw_32bit",
    "save_steps": 100,
    "logging_steps": 25,
    "learning_rate": 2e-5,
    "weight_decay": 0.001,
    "bf16": True,
    "max_grad_norm": 0.3,
    "max_steps": -1,
    "warmup_ratio": 0.03,
    "group_by_length": True,
    "lr_scheduler_type": "constant",
    "report_to": "tensorboard",
    "save_total_limit": 2,
    "dataloader_pin_memory": False,
    "remove_unused_columns": False,
}
training_args = TrainingArguments(**_training_settings)

# Echo the values that most affect memory use and convergence.
_per_device = training_args.per_device_train_batch_size
_accumulation = training_args.gradient_accumulation_steps
print("Training arguments configured:")
print(f"Batch size: {_per_device}")
print(f"Gradient accumulation steps: {_accumulation}")
print(f"Effective batch size: {_per_device * _accumulation}")
print(f"Learning rate: {training_args.learning_rate}")
print(f"Number of epochs: {training_args.num_train_epochs}")
print(f"Mixed precision: BF16 = {training_args.bf16}")


In [None]:
# Initialize the SFTTrainer on the raw-text dataset (the trainer is given the
# name of the text column and a 512-token limit; packing is disabled).
# NOTE(review): SFTTrainer is not imported anywhere in the visible notebook and
# no package providing it is installed by the first cell, so this cell raises
# NameError on a fresh kernel. Other cells state that the SFTTrainer workflow
# was removed - confirm whether this cell should exist at all.
# NOTE(review): `model` was already wrapped by get_peft_model in an earlier
# cell and peft_config is passed again here - verify that the adapter is not
# applied twice.
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    peft_config=lora_config,
    dataset_text_field="text",
    max_seq_length=512,
    tokenizer=tokenizer,
    args=training_args,
    packing=False,
)

# Check memory before training (allocated, reserved, and total minus reserved)
if torch.cuda.is_available():
    print(f"GPU memory before training:")
    print(f"  Allocated: {torch.cuda.memory_allocated() / 1024**3:.2f} GB")
    print(f"  Reserved: {torch.cuda.memory_reserved() / 1024**3:.2f} GB")
    print(f"  Available: {(torch.cuda.get_device_properties(0).total_memory - torch.cuda.memory_reserved()) / 1024**3:.2f} GB")

# Training itself is started by a separate trainer.train() call.
print("\nStarting training...")
print("Note: Training will take some time. Monitor GPU memory usage with 'nvidia-smi' in another terminal.")


In [None]:
# Start training. On failure the troubleshooting hints are printed and the
# exception is re-raised: swallowing it would let "Run All" continue into the
# save and inference cells with an adapter that was never trained.
try:
    trainer.train()
    print("Training completed successfully!")
except Exception as e:
    print(f"Training failed with error: {e}")
    print("This might be due to:")
    print("1. Insufficient GPU memory - try reducing batch_size or max_seq_length")
    print("2. Model access issues - ensure you have access to the MedGemma model")
    print("3. CUDA compatibility issues - check your PyTorch and CUDA versions")
    raise
finally:
    # Check memory after training, whether it succeeded or failed.
    if torch.cuda.is_available():
        print(f"\nGPU memory after training:")
        print(f"  Allocated: {torch.cuda.memory_allocated() / 1024**3:.2f} GB")
        print(f"  Reserved: {torch.cuda.memory_reserved() / 1024**3:.2f} GB")


In [None]:
import json

# Save the fine-tuned model
output_dir = "./finetuned_medgemma_4b"

# Save the LoRA adapters and the tokenizer side by side
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

print(f"Fine-tuned model saved to: {output_dir}")
print("Contents:")
for file in os.listdir(output_dir):
    print(f"  - {file}")

# json cannot serialize a set, and PEFT may hold target_modules as one even
# though a list was passed in. Convert any non-string collection to a sorted
# list; a string or None is written as-is.
_target_modules = lora_config.target_modules
if _target_modules is not None and not isinstance(_target_modules, str):
    _target_modules = sorted(_target_modules)

# Save training arguments for reference
training_config = {
    "model_id": model_id,
    "lora_config": {
        "r": lora_config.r,
        "lora_alpha": lora_config.lora_alpha,
        "lora_dropout": lora_config.lora_dropout,
        "target_modules": _target_modules,
    },
    "training_args": {
        "learning_rate": training_args.learning_rate,
        "num_train_epochs": training_args.num_train_epochs,
        "per_device_train_batch_size": training_args.per_device_train_batch_size,
        "gradient_accumulation_steps": training_args.gradient_accumulation_steps,
    },
    "dataset_size": len(dataset),
}

with open(os.path.join(output_dir, "training_config.json"), "w", encoding="utf-8") as f:
    json.dump(training_config, f, indent=2)

print("\nTraining configuration saved to training_config.json")


In [None]:
# Release cached CUDA blocks before loading a second copy of the base model.
torch.cuda.empty_cache()

print("Loading fine-tuned model for inference...")

# Base model, loaded with the same quantized options that were used for training.
_base_load_options = dict(
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
)
base_model = AutoModelForCausalLM.from_pretrained(model_id, **_base_load_options)

# Attach the saved LoRA adapter from output_dir and switch to evaluation mode.
finetuned_model = PeftModel.from_pretrained(base_model, output_dir)
finetuned_model.eval()

print("Fine-tuned model loaded successfully!")


In [None]:
# Test with a sample question
sample_question = df_sample['question'].iloc[0]
print(f"Sample Question: {sample_question}")
print("\n" + "="*50)

# Format the input with the same layout as the training text
test_input = f"Instruction: {sample_question}\nResponse:"

# Tokenize the input and move it to the device the model reports for itself
# (placement was chosen by device_map="auto") instead of the notebook-level `device`.
inputs = tokenizer(test_input, return_tensors="pt").to(finetuned_model.device)
prompt_length = inputs["input_ids"].shape[-1]

# Generate response
print("Generating response...")
with torch.no_grad():
    outputs = finetuned_model.generate(
        **inputs,
        max_new_tokens=256,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        pad_token_id=tokenizer.eos_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )

# Full decoded text (prompt + continuation), kept for inspection
response = tokenizer.decode(outputs[0], skip_special_tokens=True)

# Extract only the generated part by decoding the tokens after the prompt.
# Splitting the full text on "Response:" and keeping the last piece discards
# part of the answer whenever the generation itself contains that marker.
generated_response = tokenizer.decode(
    outputs[0][prompt_length:], skip_special_tokens=True
).strip()

print(f"Generated Response: {generated_response}")
print("\n" + "="*50)

# Compare with original answer
original_answer = df_sample[df_sample['question'] == sample_question]['answer'].iloc[0]
print(f"Original Answer: {original_answer}")


In [None]:
# Summarize current GPU memory use and print tuning hints based on reserved GB.
if not torch.cuda.is_available():
    print("No GPU detected. This notebook requires a CUDA-capable GPU.")
else:
    _gib = 1024**3
    allocated = torch.cuda.memory_allocated() / _gib
    reserved = torch.cuda.memory_reserved() / _gib
    total = torch.cuda.get_device_properties(0).total_memory / _gib

    for _report_line in (
        "Final GPU Memory Usage:",
        f"  Allocated: {allocated:.2f} GB",
        f"  Reserved: {reserved:.2f} GB",
        f"  Total GPU Memory: {total:.2f} GB",
        f"  Memory Utilization: {(reserved/total)*100:.1f}%",
    ):
        print(_report_line)

    # Suggestions are chosen by how many GB are currently reserved.
    print("\nMemory Optimization Suggestions:")
    if reserved <= 15:
        print("✅ Low memory usage (<15GB). You could:")
        print("   - Increase batch_size to 8 for faster training")
        print("   - Increase max_seq_length to 1024 for longer sequences")
        print("   - Use a larger dataset sample")
    elif reserved <= 20:
        print("⚡ Good memory usage (15-20GB). Current settings are optimal.")
    else:
        print("⚠️  High memory usage detected (>20GB). Consider:")
        print("   - Reducing batch_size from 4 to 2")
        print("   - Reducing max_seq_length from 512 to 256")
        print("   - Using gradient_accumulation_steps=8 to maintain effective batch size")

    # Pointer to live monitoring outside the notebook.
    print("\nFor real-time monitoring, run this command in terminal:")
    print("nvidia-smi -l 1")


In [None]:
def generate_medical_response(question, model=finetuned_model, tokenizer=tokenizer, max_length=256):
    """
    Generate a medical response for a given question using the fine-tuned model.

    The prompt uses the same "Instruction: ...\\nResponse:" layout as the
    training text. Sampling is enabled (temperature 0.7, top_p 0.9), so repeated
    calls can return different text.

    Args:
        question (str): The medical question to answer
        model: The fine-tuned model
        tokenizer: The tokenizer
        max_length (int): Maximum number of new tokens to generate (forwarded
            to generate() as max_new_tokens)

    Returns:
        str: Generated medical response, without the prompt
    """
    # Format the input
    test_input = f"Instruction: {question}\nResponse:"

    # Tokenize. Prefer the device the model reports for itself; fall back to the
    # notebook-level `device` for objects that expose no `.device` attribute.
    target_device = getattr(model, "device", device)
    inputs = tokenizer(test_input, return_tensors="pt").to(target_device)
    prompt_length = inputs["input_ids"].shape[-1]

    # Generate
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_length,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )

    # Decode only the continuation. Splitting the full text on "Response:" and
    # keeping the last piece would truncate any answer containing that marker.
    generated_tokens = outputs[0][prompt_length:]
    return tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()

# A few hand-written prompts that are not taken from the sampled dataset.
test_questions = [
    "What are the symptoms of diabetes?",
    "How is hypertension treated?",
    "What causes heart disease?",
]

print("Testing with custom questions:")
print("="*60)

# Number the tests from 1 and print each question with its generated answer.
_divider = "-" * 40
for _number, question in zip(range(1, len(test_questions) + 1), test_questions):
    print(f"\nTest {_number}:")
    print(f"Question: {question}")
    response = generate_medical_response(question)
    print(f"Response: {response}")
    print(_divider)
