In [None]:
import numpy as np
import pandas as pd 
import os
# Print the full path of every file found under the Kaggle input mount so the
# available datasets are visible in the cell output.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

In [None]:
# Install the fine-tuning stack into the notebook environment.
# The first command uses --no-deps, so only the packages named on that line are
# installed by it (xformers is the only one pinned to an exact version).
!pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl triton unsloth
# Supporting libraries, with explicit version bounds for datasets and huggingface_hub.
!pip install sentencepiece protobuf "datasets>=3.4.1,<4.0.0" "huggingface_hub>=0.34.0" hf_transfer
# NOTE(review): huggingface_hub is already requested (with a version bound) by the
# previous command, so this line looks redundant — confirm before removing.
!pip install --quiet huggingface_hub
# Installed without --no-deps, so pip resolves this package's dependencies as well.
!pip install unsloth_zoo

1️⃣ Load LoRA fine-tuned model

In [None]:
# Standard library
import os
import sys
import time
from datetime import datetime, timezone

# Third-party (original relative order preserved: import order can matter for
# libraries that patch other packages at import time)
from datasets import load_dataset
from huggingface_hub import HfApi, upload_folder
from unsloth import FastModel
from unsloth.chat_templates import get_chat_template, train_on_responses_only
from trl import SFTTrainer, SFTConfig
import torch

# Switches read by the GPU pre-flight cell that follows.
# AUTO_RESTART_ON_P100: when True and a P100 is detected, that cell exits the
# kernel process instead of training.
# MAX_RESTART_ATTEMPTS: bound compared against the persisted restart counter;
# once it is reached, training proceeds on the P100.
AUTO_RESTART_ON_P100 = True  # Set to False to train on P100 anyway
MAX_RESTART_ATTEMPTS = 10     # Maximum number of restart attempts

In [None]:
# GPU pre-flight check.
#
# * No GPU            -> exit with a failure status.
# * P100 + restart on -> bump a counter file and hard-exit the kernel, up to
#                        MAX_RESTART_ATTEMPTS times; after that, train on the
#                        P100 with torch.compile disabled.
# * P100 + restart off-> train on the P100 with torch.compile disabled.
# * Any other GPU     -> proceed normally.
#
# Requires `os`, `sys`, `time` and `torch` from the import cell.

# The counter lives on disk so that it survives the kernel process being killed.
restart_file = "/kaggle/working/.restart_counter"


def _read_restart_count(path):
    """Return the restart count stored in ``path``.

    A missing, empty, or non-numeric file is treated as 0 so that a corrupt
    counter cannot crash the notebook before training starts.
    """
    try:
        with open(path, "r") as f:
            return max(int(f.read().strip()), 0)
    except (FileNotFoundError, ValueError):
        return 0


def _clear_restart_counter(path):
    """Delete the restart counter file if it exists."""
    if os.path.exists(path):
        os.remove(path)


def _enable_p100_compat_mode():
    """Turn off torch.compile / dynamo so training can run on a P100."""
    # CRITICAL: Disable torch.compile for P100 (CUDA capability 6.0)
    import torch._dynamo
    torch._dynamo.config.suppress_errors = True
    torch._dynamo.config.disable = True
    # NOTE(review): this variable is set after the training libraries were
    # imported; confirm they read it lazily rather than at import time.
    os.environ["DISABLE_COMPILE"] = "1"
    print("üîß Disabled torch.compile for P100 compatibility\n")


if not torch.cuda.is_available():
    print("‚ùå No GPU available ‚Äî exiting notebook")
    sys.exit(1)  # Exit with failure

device = torch.device("cuda")
gpu_name = torch.cuda.get_device_name(0)
gpu_props = torch.cuda.get_device_properties(0)

print("="*60)
print(f"üñ•Ô∏è  GPU: {gpu_name}")
print(f"   CUDA Capability: {gpu_props.major}.{gpu_props.minor}")
print(f"   Total Memory: {gpu_props.total_memory / 1024**3:.2f} GB")
print("="*60)

# Detect if we're on P100
is_p100 = "P100" in gpu_name.upper()

if not is_p100:
    print("\n‚úÖ Excellent GPU for training!")
    print("All optimizations enabled.\n")
    _clear_restart_counter(restart_file)

elif not AUTO_RESTART_ON_P100:
    print("\n‚ö†Ô∏è  P100 detected but auto-restart is disabled")
    print("Training will proceed with P100 compatibility mode...\n")
    _enable_p100_compat_mode()
    _clear_restart_counter(restart_file)

else:
    print("\n‚ùå P100 GPU detected!")
    print("\n‚ö†Ô∏è  P100 Issues:")
    print("  ‚Ä¢ 3-5x slower than T4/V100/A100")
    print("  ‚Ä¢ No Tensor Cores")
    print("  ‚Ä¢ No Triton compiler support")
    print("  ‚Ä¢ May timeout on long training")

    # Number of restarts already performed by earlier runs of this cell.
    restarts_done = _read_restart_count(restart_file)

    if restarts_done >= MAX_RESTART_ATTEMPTS:
        print(f"\n‚ö†Ô∏è  Reached maximum restart attempts ({MAX_RESTART_ATTEMPTS})")
        print("Proceeding with P100 training...")
        # Training continues on the P100, so it needs the same compatibility
        # settings as the "auto-restart disabled" path.
        _enable_p100_compat_mode()
        _clear_restart_counter(restart_file)
    else:
        restart_count = restarts_done + 1
        # Persist the counter before exiting; in-memory state is lost.
        with open(restart_file, 'w') as f:
            f.write(str(restart_count))

        print(f"\nüîÑ Restarting kernel (Attempt {restart_count}/{MAX_RESTART_ATTEMPTS})...")
        print("Waiting 3 seconds before restart...")
        time.sleep(3)

        # Hard-exit the kernel process (skips normal interpreter shutdown).
        # NOTE(review): whether the platform then starts a new session on a
        # different GPU is outside this code's control — confirm.
        os._exit(0)



In [None]:
from kaggle_secrets import UserSecretsClient

# Hugging Face repository used by this notebook: the model is loaded from it
# and the retrained weights are pushed back to it.
HF_REPO_ID = "HamzaKhan-03/Hamza_the_doctor.ai"

# The access token is read from the Kaggle secrets store rather than being
# written into the notebook.
user_secrets = UserSecretsClient()
HF_TOKEN = user_secrets.get_secret("HF_TOKEN")

print(f"‚úÖ Using Hugging Face repo: {HF_REPO_ID}")

In [None]:
print("‚¨áÔ∏è Loading model + tokenizer from Hugging Face...")

# Keyword options for the loader, kept together so they are easy to tweak.
load_options = {
    "load_in_4bit": True,   # Efficient low-bit training
    "token": HF_TOKEN,
}
model, tokenizer = FastModel.from_pretrained(HF_REPO_ID, **load_options)

# NOTE(review): an explicit .to("cuda") on a model loaded in 4-bit may be
# redundant or unsupported depending on library versions — confirm.
model.to("cuda")
print("‚úÖ Model loaded successfully!")

2️⃣ Prepare Dataset

In [None]:
# Fetch the "train" split of the ChatDoctor HealthCareMagic dataset.
# The rows are read later via their "instruction", "input" and "output" fields.
print("‚¨áÔ∏è Loading dataset (lavita/ChatDoctor-HealthCareMagic-100k)...")
dataset = load_dataset("lavita/ChatDoctor-HealthCareMagic-100k", split="train")

def convert_to_chatml(example):
    """Turn one dataset row into a three-turn conversation.

    Args:
        example: Mapping with "instruction", "input" and "output" keys.

    Returns:
        ``{"conversations": [...]}`` where the list holds a system turn
        (instruction), a user turn (input) and an assistant turn (output),
        in that order.

    Raises:
        KeyError: If any of the three fields is missing from ``example``.
    """
    role_to_field = (
        ("system", "instruction"),
        ("user", "input"),
        ("assistant", "output"),
    )
    turns = [
        {"role": role, "content": example[field]}
        for role, field in role_to_field
    ]
    return {"conversations": turns}

# Run convert_to_chatml over the dataset; the function returns a
# "conversations" entry for each row.
dataset = dataset.map(convert_to_chatml)
print("‚úÖ Converted dataset to chatML format")

# Apply Gemma chat template
# (rebinds `tokenizer` to the object returned by get_chat_template for "gemma3").
tokenizer = get_chat_template(tokenizer, chat_template="gemma3")

def formatting_prompts_func(examples):
    """Render a batch of conversations into plain training strings.

    Args:
        examples: Batch mapping whose "conversations" entry is a list of
            conversations (each a list of role/content turns).

    Returns:
        ``{"text": [...]}`` with one rendered string per conversation.

    Uses the module-level ``tokenizer`` and its chat template. A leading
    "<bos>" is stripped from every rendered string — presumably because the
    BOS token is added again when the text is tokenized; confirm.
    """
    rendered = []
    for convo in examples["conversations"]:
        text = tokenizer.apply_chat_template(
            convo,
            tokenize=False,
            add_generation_prompt=False,
        )
        rendered.append(text.removeprefix("<bos>"))
    return {"text": rendered}

# Batched map: formatting_prompts_func receives many rows at once and returns
# a "text" list, which the trainer configuration later reads as its text field.
dataset = dataset.map(formatting_prompts_func, batched=True)
print("‚úÖ Applied chat template")

3️⃣ Fine-tuning Configuration

In [None]:
print("‚öôÔ∏è Setting up trainer for 1 epoch run...")

# Subset for quick daily run: shuffle with a fixed seed, keep the first 2000 rows.
# NOTE(review): because the shuffle seed is fixed, every run selects the same
# 2000 rows — confirm that is intended for a recurring job.
train_subset = dataset.shuffle(seed=42).select(range(2000))

# Hyper-parameters for a single short epoch. Checkpoints go to output_dir.
sft_config = SFTConfig(
    dataset_text_field="text",
    per_device_train_batch_size=8,
    gradient_accumulation_steps=1,
    warmup_steps=5,
    num_train_epochs=1,
    learning_rate=5e-5,
    logging_steps=10,
    optim="adamw_8bit",
    weight_decay=0.01,
    lr_scheduler_type="linear",
    seed=3407,
    output_dir="/kaggle/working/my_lora_model",
    report_to="none",
    save_strategy="steps",
    save_steps=1000,
    save_total_limit=1,
    load_best_model_at_end=False,
)

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_subset,
    eval_dataset=None,
    args=sft_config,
)

# Wrap the trainer using the Gemma turn markers that delimit the user prompt
# and the model reply in the rendered text.
trainer = train_on_responses_only(
    trainer,
    instruction_part="<start_of_turn>user\n",
    response_part="<start_of_turn>model\n",
)

4️⃣ Train and Save Model as .safetensors

In [None]:
# Run the training loop on the trainer configured in the previous cell.
# The return value of trainer.train() is not kept.
print("üöÄ Starting fine-tuning (1 epoch)...")
trainer.train()
print("‚úÖ Training complete!")

In [None]:
# Directory the merged model is written to; the upload cell pushes its contents.
SAVE_DIR = "/kaggle/working/my_lora_model"
print(f"üíæ Saving merged model to {SAVE_DIR}...")

# Merge the LoRA weights into the base model and write the result (weights plus
# tokenizer files) to SAVE_DIR. This uses Unsloth's documented instance method
# rather than a call on the FastModel loader class.
# NOTE(review): SAVE_DIR is later uploaded to the same repo this notebook loads
# from, so the next run will load merged weights — confirm that is intended.
model.save_pretrained_merged(
    SAVE_DIR,
    tokenizer,
    save_method="merged_16bit",
)
print("‚úÖ Model saved successfully!")

5️⃣ Upload to Hugging Face Hub

In [None]:
print("‚¨ÜÔ∏è Uploading updated weights to Hugging Face...")

api = HfApi()

# Timezone-aware replacement for the deprecated datetime.utcnow(); the
# formatted string is the same.
timestamp = datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M UTC')
commit_message = f"Daily fine-tune update ‚Äî {timestamp}"

api.upload_folder(
    repo_id=HF_REPO_ID,
    folder_path=SAVE_DIR,
    commit_message=commit_message,
    token=HF_TOKEN,
    # SAVE_DIR is also the trainer's output_dir, so it can contain trainer
    # checkpoint folders; keep those out of the published model repo.
    ignore_patterns=["checkpoint-*"],
)
print(f"‚úÖ Successfully pushed new version: https://huggingface.co/{HF_REPO_ID}")

In [None]:
# Remove the local model directory now that the upload cell has run.
# ignore_errors=True means a missing directory (or a failed delete) does not raise.
import shutil
shutil.rmtree(SAVE_DIR, ignore_errors=True)
print("üßπ Cleaned temporary model files. All done!")