In [None]:
import numpy as np
import pandas as pd 
import os
# Walk the Kaggle input mount and echo the full path of every attached file.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

In [None]:
!pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl triton unsloth
!pip install sentencepiece protobuf "datasets>=3.4.1,<4.0.0" "huggingface_hub>=0.34.0" hf_transfer
!pip install --quiet huggingface_hub
!pip install unsloth_zoo

1️⃣ Load LoRA fine-tuned model

In [None]:
import os
from datetime import datetime, timezone

from datasets import load_dataset
from huggingface_hub import HfApi, upload_folder
from unsloth import FastModel
from unsloth.chat_templates import get_chat_template, train_on_responses_only
from trl import SFTTrainer, SFTConfig






In [None]:
# GPU detection. Defines `device`, `gpu_name` and `is_p100` for the cells below.
# sys and torch are imported here because no earlier visible cell brings them
# into scope; without these imports the cell fails with NameError on a fresh kernel.
import sys

import torch

# Stop right away when no CUDA device is attached.
if not torch.cuda.is_available():
    print("‚ùå No GPU available ‚Äî exiting notebook")
    sys.exit(1)  # Exit with failure

device = torch.device("cuda")
gpu_name = torch.cuda.get_device_name(0)
print(f"‚úÖ GPU detected: {gpu_name}")

# Detect if we're on P100; later cells branch on this flag.
is_p100 = "P100" in gpu_name.upper()
if is_p100:
    print("‚ö†Ô∏è P100 GPU detected ‚Äî applying compatibility settings")
    # CRITICAL: Disable torch.compile for P100 (CUDA capability 6.0)
    # P100 is not supported by Triton compiler
    import torch._dynamo
    torch._dynamo.config.suppress_errors = True
    torch._dynamo.config.disable = True
    # NOTE(review): DISABLE_COMPILE is set after unsloth was imported; confirm
    # that whichever library reads it still picks it up at this point.
    os.environ["DISABLE_COMPILE"] = "1"
    print("üîß Disabled torch.compile for P100 compatibility")
else:
    print("‚úÖ Modern GPU detected ‚Äî using standard settings")


In [None]:
from kaggle_secrets import UserSecretsClient

# Hub repository the model is loaded from and, later, pushed back to.
HF_REPO_ID = "HamzaKhan-03/Hamza_the_doctor.ai"

# The access token comes from the notebook's Kaggle secrets, not from the source.
user_secrets = UserSecretsClient()
HF_TOKEN = user_secrets.get_secret("HF_TOKEN")

print(f"‚úÖ Using Hugging Face repo: {HF_REPO_ID}")

In [None]:

print("‚¨áÔ∏è Loading model + tokenizer from Hugging Face...")

# Arguments shared by both hardware profiles.
load_kwargs = {
    "load_in_4bit": True,
    "token": HF_TOKEN,
    "dtype": None,  # let the loader pick the dtype
}

if is_p100:
    print("üîß Using P100-compatible settings...")
    # Disable compilation for unsloth.
    # NOTE(review): set after unsloth has already been imported; confirm it is still honoured.
    os.environ["UNSLOTH_COMPILE"] = "0"
    load_kwargs.update(
        device_map="cuda:0",
        max_seq_length=2048,  # reduced for P100 memory
        use_gradient_checkpointing="unsloth",
    )
else:
    print("üîß Using standard settings...")
    load_kwargs["max_seq_length"] = 4096

model, tokenizer = FastModel.from_pretrained(HF_REPO_ID, **load_kwargs)

model.to("cuda")
print("‚úÖ Model loaded successfully!")



2️⃣ Prepare Dataset

In [None]:
# Fetch the train split of the ChatDoctor HealthCareMagic corpus from the Hub.
print("‚¨áÔ∏è Loading dataset (lavita/ChatDoctor-HealthCareMagic-100k)...")
dataset = load_dataset(
    "lavita/ChatDoctor-HealthCareMagic-100k",
    split="train",
)

def convert_to_chatml(example):
    """Turn one dataset row into a three-message conversation.

    The row's "instruction", "input" and "output" fields become the content of
    the system, user and assistant messages respectively.

    Args:
        example: Mapping with "instruction", "input" and "output" keys.

    Returns:
        A dict with a single "conversations" key holding the list of
        {"role", "content"} messages in system, user, assistant order.

    Raises:
        KeyError: If one of the three fields is missing from `example`.
    """
    role_to_field = (
        ("system", "instruction"),
        ("user", "input"),
        ("assistant", "output"),
    )
    messages = [
        {"role": role, "content": example[field]}
        for role, field in role_to_field
    ]
    return {"conversations": messages}

# Add a "conversations" column to every row (one row at a time).
dataset = dataset.map(function=convert_to_chatml)
print("‚úÖ Converted dataset to chatML format")

# Switch the tokenizer over to the Gemma 3 chat template.
tokenizer = get_chat_template(
    tokenizer,
    chat_template="gemma3",
)

def formatting_prompts_func(examples):
    """Render a batch of conversations into plain training text.

    Each conversation is passed through the notebook-level `tokenizer`'s chat
    template without tokenizing and without a trailing generation prompt. A
    leading "<bos>" is stripped from each rendered string when present
    (presumably because it is added again at tokenization time; confirm).

    Args:
        examples: Batch mapping whose "conversations" entry is a list of
            message lists.

    Returns:
        A dict with a "text" key holding one rendered string per conversation.
    """
    rendered = []
    for conversation in examples["conversations"]:
        prompt = tokenizer.apply_chat_template(
            conversation,
            tokenize=False,
            add_generation_prompt=False,
        )
        rendered.append(prompt.removeprefix("<bos>"))
    return {"text": rendered}

# Batched pass that adds the rendered "text" column used for training.
dataset = dataset.map(
    formatting_prompts_func,
    batched=True,
)
print("‚úÖ Applied chat template")


3️⃣ Fine-tuning Configuration

In [None]:

print("‚öôÔ∏è Setting up trainer for 1 epoch run...")

# Pick batch size, gradient accumulation and sequence length per GPU type.
# On P100 the per-device batch is halved and accumulation doubled, so the
# effective batch (batch_size * gradient_accum) is 8 in both cases.
if is_p100:
    batch_size, gradient_accum, max_seq_len = 4, 2, 2048
    print(f"üîß P100 settings: batch_size={batch_size}, grad_accum={gradient_accum}, max_seq={max_seq_len}")
else:
    batch_size, gradient_accum, max_seq_len = 8, 1, 4096
    print(f"üîß Standard settings: batch_size={batch_size}, grad_accum={gradient_accum}, max_seq={max_seq_len}")

# Training configuration. batch_size, gradient_accum, max_seq_len and is_p100
# are set by earlier cells according to the detected GPU.
training_args = SFTConfig(
    # Data
    dataset_text_field="text",
    max_seq_length=max_seq_len,
    dataloader_pin_memory=True,
    # Batching and schedule
    per_device_train_batch_size=batch_size,
    gradient_accumulation_steps=gradient_accum,
    num_train_epochs=1,
    warmup_steps=5,
    lr_scheduler_type="linear",
    # Optimizer
    optim="adamw_8bit",
    learning_rate=5e-5,
    weight_decay=0.01,
    seed=3407,
    # Precision and memory: fp16 only on non-P100 GPUs, bf16 never.
    # NOTE(review): on P100 both flags are False; confirm this matches the
    # dtype the model was loaded with.
    fp16=not is_p100,
    bf16=False,
    gradient_checkpointing=is_p100,
    # Logging and checkpoints. This directory is the same path the merged
    # model is later exported to.
    output_dir="/kaggle/working/my_lora_model",
    logging_steps=10,
    report_to="none",
    save_strategy="steps",
    save_steps=1000,
    save_total_limit=1,
    load_best_model_at_end=False,
)

# Subset for quick daily run: 2000 rows drawn with a fixed shuffle seed.
train_subset = dataset.shuffle(seed=42).select(range(2000))

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=train_subset,
    eval_dataset=None,
)

# Wrap the trainer with Unsloth's train_on_responses_only, passing the Gemma
# turn markers that open a user turn and a model turn.
trainer = train_on_responses_only(
    trainer,
    instruction_part="<start_of_turn>user\n",
    response_part="<start_of_turn>model\n",
)


4️⃣ Train and Save Model as .safetensors

In [None]:
# Run the training loop on the trainer built in the previous cell
# (configured there for a single epoch).
print("üöÄ Starting fine-tuning (1 epoch)...")
trainer.train()
print("‚úÖ Training complete!")


In [None]:
# Export the merged model and write a small text file describing the run.
SAVE_DIR = "/kaggle/working/my_lora_model"
print(f"üíæ Saving merged model to {SAVE_DIR}...")

# Export with the "merged_16bit" save method (16-bit weights, not float32).
model.save_pretrained_merged(
    SAVE_DIR,
    tokenizer,
    save_method="merged_16bit",
)
print("‚úÖ Model saved successfully!")

# Save GPU info for future reference.
gpu_info_path = os.path.join(SAVE_DIR, "training_gpu_info.txt")
# Timezone-aware UTC timestamp; datetime.utcnow() is deprecated. The formatted
# string is the same as before.
timestamp = datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M UTC')
with open(gpu_info_path, "w", encoding="utf-8") as f:
    f.write(f"Trained on: {gpu_name}\n")
    f.write(f"Date: {timestamp}\n")
    f.write("P100 Compatible: Yes\n")
    f.write(f"Batch Size: {batch_size}\n")
    f.write(f"Gradient Accumulation: {gradient_accum}\n")
print(f"üìù Saved GPU info to {gpu_info_path}")



5️⃣ Upload to Hugging Face Hub (for GitHub sync)

In [None]:
# Push the contents of SAVE_DIR to the Hugging Face repo as a single commit.
print("‚¨ÜÔ∏è Uploading updated weights to Hugging Face...")

api = HfApi()
# Timezone-aware UTC timestamp; datetime.utcnow() is deprecated. The formatted
# string is the same as before.
timestamp = datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M UTC')
commit_message = f"Daily fine-tune update ({gpu_name}) ‚Äî {timestamp}"

api.upload_folder(
    repo_id=HF_REPO_ID,
    folder_path=SAVE_DIR,
    commit_message=commit_message,
    token=HF_TOKEN,
    # SAVE_DIR is also the trainer's output_dir, so it can contain
    # checkpoint-* folders. Keep those out of the model repo.
    ignore_patterns=["checkpoint-*/*"],
)
print(f"‚úÖ Successfully pushed new version: https://huggingface.co/{HF_REPO_ID}")


In [None]:
import shutil
# Delete the local export directory now that it has been uploaded.
# ignore_errors=True keeps this from raising if removal fails, for example
# when the directory does not exist.
shutil.rmtree(SAVE_DIR, ignore_errors=True)
print("üßπ Cleaned temporary model files. All done!")