In [1]:
"""
Qwen2.5 Medical Diagnostic Agent Fine-tuning
FIXED VERSION - Completely bypasses Triton compilation issues
"""

# ============================================
# COMPILATION OPT-OUT (must run before torch is imported)
# ============================================
import os
import warnings

# These flags are exported ahead of every other import in the notebook so that
# they are already present in the process environment when torch is loaded.
# The notebook's intent is to keep torch from compiling kernels through Triton.
os.environ.update(
    {
        "TORCH_COMPILE_DISABLE": "1",
        "TORCHINDUCTOR_DISABLE": "1",
        "PYTORCH_ENABLE_MPS_FALLBACK": "1",
    }
)

print("🔧 Disabling torch compilation to avoid Triton issues...")

🔧 Disabling torch compilation to avoid Triton issues...


In [2]:
# ============================================
# INSTALLATION
# ============================================
import os
import re

print("📦 Installing dependencies (this may take a few minutes)...")

# Commands whose exit status was non-zero. pip output is silenced below, so
# without this record a failed install would be indistinguishable from success.
failed_commands = []


def _run_quietly(command):
    """Run a shell command with stdout/stderr discarded.

    Args:
        command: Shell command line, without any output redirection.

    Returns:
        The raw status returned by ``os.system``. Non-zero statuses are also
        appended to ``failed_commands`` so the cell can report them at the end.
    """
    status = os.system(f"{command} >/dev/null 2>&1")
    if status != 0:
        failed_commands.append(command)
    return status


# Uninstall problematic triton first
_run_quietly("pip uninstall -y triton triton-nightly")

if "COLAB_" not in "".join(os.environ.keys()):
    _run_quietly("pip install unsloth")
else:
    import torch

    # Leading numeric part of the torch version (e.g. "2.8.0" from "2.8.0+cu126").
    # Fall back to an empty string instead of crashing if the version string has
    # an unexpected shape; the default xformers pin is then used.
    version_match = re.match(r"[0-9\.]{3,}", str(torch.__version__))
    v = version_match.group(0) if version_match else ""
    xformers = "xformers==" + ("0.0.32.post2" if v == "2.8.0" else "0.0.29.post3")

    # Install without triton
    _run_quietly(f"pip install --no-deps bitsandbytes accelerate {xformers} peft trl cut_cross_entropy unsloth_zoo")
    _run_quietly("pip install sentencepiece protobuf 'datasets>=3.4.1,<4.0.0' 'huggingface_hub>=0.34.0' hf_transfer")
    _run_quietly("pip install --no-deps unsloth")

_run_quietly("pip install transformers==4.56.2")
_run_quietly("pip install --no-deps trl==0.22.2")

if failed_commands:
    # Report instead of raising: later cells may still work if only an optional
    # step failed, but the reader must not be told everything succeeded.
    print("⚠️ Some installation commands failed (re-run them without output redirection to see why):")
    for failed_command in failed_commands:
        print(f"   {failed_command}")
else:
    print("✅ Dependencies installed")

📦 Installing dependencies (this may take a few minutes)...
✅ Dependencies installed


In [3]:
# ============================================
# IMPORT AND CONFIGURE TORCH
# ============================================
import torch
import torch._dynamo

# cuDNN autotuning is switched off, and TorchDynamo is told to suppress its own
# errors rather than surface them to the caller.
torch.backends.cudnn.benchmark = False
dynamo_config = torch._dynamo.config
dynamo_config.suppress_errors = True

cuda_ready = torch.cuda.is_available()
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {cuda_ready}")

PyTorch version: 2.8.0+cu126
CUDA available: True


In [5]:
!pip install triton

Collecting triton
  Downloading triton-3.5.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (1.7 kB)
Downloading triton-3.5.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (170.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m170.5/170.5 MB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: triton
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
unsloth 2025.10.9 requires tyro, which is not installed.
unsloth-zoo 2025.10.10 requires msgspec, which is not installed.
unsloth-zoo 2025.10.10 requires tyro, which is not installed.
unsloth-zoo 2025.10.10 requires torchao>=0.13.0, but you have torchao 0.10.0 which is incompatible.
torch 2.8.0+cu126 requires triton==3.4.0; platform_system == "Linux" and platform_machine == "x86_64", but you have triton 3.5.0 which is incompatible.[

In [6]:
# ============================================
# MODEL SETUP
# ============================================
from unsloth import FastLanguageModel

# Loader settings; max_seq_length is reused later when the trainer is built.
max_seq_length = 4096  # Increased for longer reasoning chains
dtype = None
load_in_4bit = True

print("\n🔬 Loading Qwen2.5-7B for Medical Agent Training...")

base_model_options = {
    "model_name": "unsloth/Qwen2.5-7B",
    "max_seq_length": max_seq_length,
    "dtype": dtype,
    "load_in_4bit": load_in_4bit,
}
model, tokenizer = FastLanguageModel.from_pretrained(**base_model_options)

print("✅ Model loaded successfully")



Please restructure your imports with 'import unsloth' at the top of your file.
  from unsloth import FastLanguageModel


🦥 Unsloth Zoo will now patch everything to make training faster!

🔬 Loading Qwen2.5-7B for Medical Agent Training...
==((====))==  Unsloth 2025.10.9: Fast Qwen2 patching. Transformers: 4.56.2.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.5.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.32.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors.index.json: 0.00B [00:00, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/172 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/605 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/617 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

✅ Model loaded successfully


In [7]:
# ============================================
# LORA CONFIGURATION
# ============================================
print("\n⚙️ Configuring LoRA adapters...")

# Adapters are attached to every attention and MLP projection.
attention_projections = ["q_proj", "k_proj", "v_proj", "o_proj"]
mlp_projections = ["gate_proj", "up_proj", "down_proj"]

lora_options = {
    "r": 32,
    "target_modules": attention_projections + mlp_projections,
    "lora_alpha": 32,
    # NOTE(review): Unsloth's log for this cell says only dropout = 0 is
    # supported for fast patching and that 0.05 causes a performance hit.
    "lora_dropout": 0.05,
    "bias": "none",
    "use_gradient_checkpointing": "unsloth",
    "random_state": 3407,
    "use_rslora": True,
    "loftq_config": None,
}
model = FastLanguageModel.get_peft_model(model, **lora_options)

print("✅ LoRA adapters configured")

Unsloth: Dropout = 0 is supported for fast patching. You are using dropout = 0.05.
Unsloth will patch all other layers, except LoRA matrices, causing a performance hit.



⚙️ Configuring LoRA adapters...


Unsloth 2025.10.9 patched 28 layers with 0 QKV layers, 0 O layers and 0 MLP layers.


✅ LoRA adapters configured


In [10]:
# ============================================
# DATASET PREPARATION
# ============================================
from datasets import load_dataset

print("\n📚 Loading medical agent dataset...")

# Single source of truth for the JSONL location (a Google Drive path).
dataset_path = "/content/drive/MyDrive/medical_agent_dataset.jsonl"

# Fail early with a readable hint when the preprocessing output is missing.
dataset_is_present = os.path.exists(dataset_path)
if not dataset_is_present:
    print("❌ ERROR: medical_agent_dataset.jsonl not found!")
    print("Please run the dataset preprocessing script first.")
    raise FileNotFoundError("medical_agent_dataset.jsonl")

dataset = load_dataset("json", data_files=dataset_path, split="train")
print(f"Dataset size: {len(dataset)}")

# End-of-sequence marker appended to every formatted training example.
EOS_TOKEN = tokenizer.eos_token

def format_medical_agent(examples, eos_token=None):
    """Render batched chat transcripts into ChatML-style training strings.

    Each conversation becomes one string made of ``<|im_start|>role ...
    <|im_end|>`` segments followed by the end-of-sequence token. A system
    segment is written once, immediately before the first user or assistant
    turn, using the most recent system message seen up to that point (empty if
    there was none). Messages with any other role are skipped.

    Args:
        examples: Batch mapping with a ``"messages"`` key holding a list of
            conversations; each conversation is a list of dicts with
            ``"role"`` and ``"content"`` keys.
        eos_token: Optional end-of-sequence string. When omitted, the
            module-level ``EOS_TOKEN`` is used.

    Returns:
        ``{"text": [...]}`` with one formatted string per conversation.
    """
    eos = EOS_TOKEN if eos_token is None else eos_token

    texts = []
    for messages in examples["messages"]:
        segments = []
        system_content = ""

        for message in messages:
            role = message["role"]
            content = message["content"]

            if role == "system":
                system_content = content
            elif role in ("user", "assistant"):
                if not segments:
                    # Emit the system header before the first turn of either
                    # kind, so an assistant-first conversation keeps its
                    # system prompt instead of silently dropping it.
                    segments.append(f"<|im_start|>system\n{system_content}<|im_end|>\n")
                segments.append(f"<|im_start|>{role}\n{content}<|im_end|>\n")

        segments.append(eos)
        texts.append("".join(segments))

    return {"text": texts}

# Turn every conversation into a single "text" column, dropping the originals.
print("🔄 Formatting dataset...")
source_columns = dataset.column_names
dataset = dataset.map(
    format_medical_agent,
    batched=True,
    remove_columns=source_columns,
    desc="Formatting for training",
)

print(f"✅ Dataset formatted: {len(dataset)} examples")

# Preview the start of the first formatted example as a sanity check.
print("\n📋 Sample formatted example (first 300 chars):")
sample_preview = dataset[0]["text"][:300]
print(sample_preview + "...")


📚 Loading medical agent dataset...


Generating train split: 0 examples [00:00, ? examples/s]

Dataset size: 276623
🔄 Formatting dataset...


Formatting for training:   0%|          | 0/276623 [00:00<?, ? examples/s]

✅ Dataset formatted: 276623 examples

📋 Sample formatted example (first 300 chars):
<|im_start|>system
You are an intelligent medical diagnostic agent. Your role is to:
1. Receive preliminary disease identification from image analysis
2. Ask relevant follow-up questions about symptoms
3. Retrieve and analyze medical knowledge from the database
4. Engage in self-reflective reasoning...


In [11]:
# ============================================
# TRAINING CONFIGURATION (COMPILATION DISABLED)
# ============================================
from trl import SFTConfig, SFTTrainer

print("\n🏋️ Setting up training configuration...")

# Use bf16 when the GPU supports it, otherwise fall back to fp16.
bf16_available = torch.cuda.is_bf16_supported()

training_args = SFTConfig(
    # Output / checkpointing
    output_dir="outputs",
    save_strategy="steps",
    save_steps=500,
    save_total_limit=3,

    # Schedule and batch shape
    num_train_epochs=3,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    warmup_steps=100,
    learning_rate=2e-5,
    lr_scheduler_type="cosine",

    # Optimizer
    optim="adamw_8bit",
    weight_decay=0.01,
    seed=3407,

    # Mixed precision
    bf16=bf16_available,
    fp16=not bf16_available,

    # Compilation is explicitly turned off
    torch_compile=False,
    torch_compile_backend=None,
    torch_compile_mode=None,

    # Data loading
    dataloader_pin_memory=False,

    # Logging
    logging_steps=10,
    report_to="none",
    disable_tqdm=False,
)

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    packing=False,
    args=training_args,
)

print("✅ Trainer configured")


🏋️ Setting up training configuration...


Unsloth: Tokenizing ["text"] (num_proc=6):   0%|          | 0/276623 [00:00<?, ? examples/s]

✅ Trainer configured


In [12]:
# ============================================
# MEMORY CHECK
# ============================================
# Baseline taken before training; the training cell subtracts it from the peak.
bytes_per_gib = 1024 ** 3

gpu_stats = torch.cuda.get_device_properties(0)
max_memory = round(gpu_stats.total_memory / bytes_per_gib, 3)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / bytes_per_gib, 3)

print(f"\n💾 GPU: {gpu_stats.name}")
print(f"💾 Max memory: {max_memory} GB")
print(f"💾 Reserved: {start_gpu_memory} GB")


💾 GPU: Tesla T4
💾 Max memory: 14.741 GB
💾 Reserved: 7.805 GB


In [None]:
# ============================================
# TRAINING (WITH ERROR HANDLING)
# ============================================
import gc

print("\n🚀 Starting training...")
print("=" * 60)

# Only the error *message* is kept. Retrying inside the ``except`` block would
# keep the exception and its traceback alive during the retry, which can pin
# memory still referenced by the failed step's frames.
training_error = None
try:
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore")
        trainer_stats = trainer.train()

    print("\n✅ Training complete!")
except Exception as e:
    training_error = str(e)

if training_error is not None:
    print(f"\n❌ Training failed with error: {training_error}")
    print("\nTrying alternative training approach...")

    # Release whatever the failed attempt left behind before trying again.
    gc.collect()
    torch.cuda.empty_cache()

    # Fallback: Reduce batch size and disable more features
    trainer.args.per_device_train_batch_size = 1
    trainer.args.gradient_accumulation_steps = 8
    trainer.args.max_grad_norm = 0.3
    # NOTE(review): the Hugging Face Trainer caches the batch size at
    # construction time, so changing ``args`` alone may leave the dataloader at
    # the old size. Sync the cached value when the attribute exists — confirm
    # against the installed transformers/Unsloth versions.
    if hasattr(trainer, "_train_batch_size"):
        trainer._train_batch_size = trainer.args.train_batch_size

    print("Retrying with smaller batch size...")
    trainer_stats = trainer.train()
    print("✅ Training completed with fallback settings")

# Timing and memory are reported for whichever attempt succeeded.
print(f"⏱️  Training time: {trainer_stats.metrics['train_runtime']:.2f} seconds")
print(f"⏱️  Training time: {trainer_stats.metrics['train_runtime']/60:.2f} minutes")

# Memory statistics (peak reserved, and the increase over the pre-training baseline)
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory / max_memory * 100, 3)
print(f"\n💾 Peak memory: {used_memory} GB")
print(f"💾 Memory for training: {used_memory_for_lora} GB")
print(f"💾 Peak memory %: {used_percentage}%")

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None}.



🚀 Starting training...


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 276,623 | Num Epochs = 3 | Total steps = 103,734
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 80,740,352 of 7,696,356,864 (1.05% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
10,2.8509
20,2.6982
30,2.4771
40,2.0734
50,1.9288
60,1.8568
70,1.7879
80,1.8257
90,1.8058
100,1.7733


In [None]:
# ============================================
# INFERENCE TEST
# ============================================
from transformers import TextStreamer

print("\n🧪 Testing inference...")
FastLanguageModel.for_inference(model)

# Hand-written prompt in the same <|im_start|>/<|im_end|> layout used for the
# training text, ending on an open assistant turn for the model to complete.
test_prompt = """<|im_start|>system
You are an intelligent medical diagnostic agent. Your role is to:
1. Receive preliminary disease identification from image analysis
2. Ask relevant follow-up questions about symptoms
3. Retrieve and analyze medical knowledge from the database
4. Engage in self-reflective reasoning to arrive at accurate diagnoses
5. Provide clear, evidence-based medical advice

Always think step-by-step, question your assumptions, and prioritize patient safety.<|im_end|>
<|im_start|>user
[IMAGE_ANALYSIS: Detected skin lesion with irregular borders and color variation]

I noticed this mole on my back has been changing. Should I be concerned?<|im_end|>
<|im_start|>assistant
"""

inputs = tokenizer([test_prompt], return_tensors="pt").to("cuda")

# Stream tokens to stdout as they are produced, omitting the prompt itself.
text_streamer = TextStreamer(tokenizer, skip_prompt=True)

separator = "=" * 60
print("\n" + separator)
print("INFERENCE TEST OUTPUT:")
print(separator)

sampling_options = {
    "streamer": text_streamer,
    "max_new_tokens": 512,
    "temperature": 0.7,
    "do_sample": True,
}
with torch.no_grad():
    _ = model.generate(**inputs, **sampling_options)


In [None]:
# ============================================
# SAVE MODEL
# ============================================
print("\n💾 Saving model...")

# Save LoRA adapters locally
model.save_pretrained("medical_agent_lora")
tokenizer.save_pretrained("medical_agent_lora")
print("✅ Saved LoRA adapters to 'medical_agent_lora'")

# Save merged 16-bit model for inference.
# Success is tracked with an explicit flag: checking for the directory
# afterwards would also report success for a partially written or stale
# "medical_agent_merged" folder when the save itself raised.
merged_saved = False
try:
    model.save_pretrained_merged(
        "medical_agent_merged",
        tokenizer,
        save_method="merged_16bit"
    )
    merged_saved = True
    print("✅ Saved merged 16-bit model to 'medical_agent_merged'")
except Exception as e:
    # Best effort: the adapters saved above remain usable without the merge.
    print(f"⚠️  Merged model save failed: {e}")
    print("ℹ️  LoRA adapters are still saved and can be used")

# Optional: Push to Hugging Face Hub
print("\n📤 To upload to HuggingFace Hub, run:")
print("""
from huggingface_hub import login
login()  # Enter your token

model.push_to_hub("your-username/qwen2.5-7b-medical-agent", tokenizer=tokenizer)
""")

print("\n" + "=" * 60)
print("🎉 TRAINING PIPELINE COMPLETE!")
print("=" * 60)
print("\n📁 Model saved to:")
print("   ✅ medical_agent_lora/ (LoRA adapters)")
if merged_saved:
    print("   ✅ medical_agent_merged/ (Full merged model)")
print("\n🔗 Next steps:")
print("   1. Upload to HuggingFace Hub")
print("   2. Set up API endpoint (use vLLM or HF Inference)")
print("   3. Integrate with Swin Transformer vision model")
print("   4. Connect to RAG/vector database")
print("\n💡 API Integration Example:")
print("""
from transformers import AutoModelForCausalLM, AutoTokenizer

model = AutoModelForCausalLM.from_pretrained("your-username/qwen2.5-7b-medical-agent")
tokenizer = AutoTokenizer.from_pretrained("your-username/qwen2.5-7b-medical-agent")

# Use in your agent
prompt = f"[IMAGE_ANALYSIS: {vision_result}]\\n\\n{user_query}"
inputs = tokenizer(prompt, return_tensors="pt")
output = model.generate(**inputs)
response = tokenizer.decode(output[0])
""")