In [1]:
import subprocess
import sys

# Downgrade NumPy to 1.x for PyTorch compatibility
# (--force-reinstall replaces whatever NumPy build is currently installed)
!pip install "numpy<2.0" --force-reinstall

# Remove all conflicting packages
# These are uninstalled first so the pinned versions below are installed
# from scratch rather than layered on top of the preinstalled ones.
packages_to_remove = [
    'transformers', 'accelerate', 'torch', 'torchvision', 'torchaudio',
    'tokenizers', 'scikit-learn', 'bitsandbytes', 'deepspeed', 'peft'
]

for package in packages_to_remove:
    # Best-effort uninstall: check=False ignores a non-zero exit status
    # and capture_output=True keeps pip's output out of the cell.
    subprocess.run([sys.executable, '-m', 'pip', 'uninstall', package, '-y'],
                  check=False, capture_output=True)

# Clear all caches
subprocess.run([sys.executable, '-m', 'pip', 'cache', 'purge'], check=False)

# PyTorch stack, pulled from the CUDA 11.8 wheel index
!pip install torch==2.3.1 torchvision==0.18.1 torchaudio==2.3.1 --index-url https://download.pytorch.org/whl/cu118

# Pinned Hugging Face stack
!pip install accelerate==0.34.2

!pip install transformers==4.44.2

!pip install tokenizers==0.19.1 scikit-learn==1.3.2

!pip install pydicom

!pip install peft==0.10.0

!pip install trl==0.7.10

# NOTE(review): the remaining packages are unpinned, so the versions installed
# may differ between runs — consider pinning them as well.
!pip install pandas

!pip install matplotlib

!pip install tqdm

!pip install bitsandbytes


# Smoke-test the freshly installed stack: import each package in turn and
# report its version. Any failure is reported with a full traceback instead
# of aborting the cell.
try:
    import numpy as np
    print(f"NumPy: {np.__version__}")

    import torch
    print(f"PyTorch: {torch.__version__}")

    if not torch.cuda.is_available():
        print("GPU is not available. Go to Runtime > Change runtime type > Select GPU.")
    else:
        print("GPU is available:", torch.cuda.get_device_name(0))

    import transformers
    print(f"Transformers: {transformers.__version__}")

    import accelerate
    print(f"Accelerate: {accelerate.__version__}")

    # This import is the specific compatibility check the pinned versions target.
    from accelerate.utils.memory import clear_device_cache
    print("clear_device_cache import successful!")

    print("All packages installed successfully!")
    print("Now restart your runtime and proceed with MedTrinity installation.")

except Exception as err:
    import traceback

    print(f"Verification failed: {err}")
    traceback.print_exc()

Collecting numpy<2.0
  Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/61.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.3/18.3 MB[0m [31m96.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 2.0.2
    Uninstalling numpy-2.0.2:
      Successfully uninstalled numpy-2.0.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
thinc 8.3.6 requires numpy<3.0.0,>=2.0.0, but you ha

In [1]:
# Run AFTER restarting runtime
# Verify our packages are still working
try:
    import torch
    import transformers
    import accelerate
    from accelerate.utils.memory import clear_device_cache
    print("All packages verified after restart")
    print(f"PyTorch: {torch.__version__}")
    print(f"Transformers: {transformers.__version__}")
    print(f"Accelerate: {accelerate.__version__}")
except ImportError as e:
    print(f"Package verification failed: {e}")
    print("Please re-run the compatibility fix script")
    exit()

# Install system dependencies
!apt-get update -y && apt-get install -y git

# Clone MedTrinity
print("\n=== Cloning MedTrinity ===")
!git clone https://github.com/UCSC-VLAA/MedTrinity-25M.git
%cd MedTrinity-25M

# Install MedTrinity
print("\n=== Installing MedTrinity ===")
!pip install -e . --no-deps

# Install only the additional packages we absolutely need
!pip install Pillow requests tqdm

# Try to install the scaling package (optional)
try:
    !pip install git+https://github.com/bfshi/scaling_on_scales.git --no-deps
    print("✓ Scaling package installed")
except:
    print("Scaling package failed - continuing without it")

print("Ready to load your model.")

All packages verified after restart
PyTorch: 2.3.1+cu118
Transformers: 4.44.2
Accelerate: 0.34.2
Hit:1 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Get:2 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]
Get:3 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Hit:4 http://archive.ubuntu.com/ubuntu jammy InRelease
Hit:5 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Get:6 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Hit:7 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Get:8 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]
Hit:9 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Get:10 https://r2u.stat.illinois.edu/ubuntu jammy/main all Packages [9,021 kB]
Get:11 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [127 kB]
Get:12 http://security.ubuntu.com/ubuntu jammy-security/universe a

In [2]:
from google.colab import drive
import os
import zipfile

#Mount Google Drive
print("=== Mounting Google Drive ===")
drive.mount('/content/drive')

#Define paths
zip_path = "/content/drive/MyDrive/BaseModelLLaVa.zip"
extract_dir = "/content/llava_model"

#Unzip the model
print(f"=== Extracting {zip_path} to {extract_dir} ===")
os.makedirs(extract_dir, exist_ok=True)

with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_dir)

print("Model unzipped successfully!")
print("\n=== Extracted files ===")

extract_dir = "/content/llava_model/BaseModelLLaVa"

# Load the model
# Check if we have any model files
if os.path.exists(extract_dir) and os.listdir(extract_dir):
    print(f"\n=== Model files found in {extract_dir} ===")
    !ls -la "{extract_dir}"

    # Try to load the model
    try:
        from llava.model.builder import load_pretrained_model

        print("=== Loading model ===")
        tokenizer, model, image_processor, context_len = load_pretrained_model(
            model_path=extract_dir,
            model_base=None,
            model_name="llava-v1.5",
            load_8bit=False,
            load_4bit=False,
            device_map="auto"
        )

        print("Model loaded successfully!")
        print(f"Context length: {context_len}")

    except Exception as e:
        print(f"Model loading failed: {e}")
        print("The model files might be incomplete or in a different format")
else:
    print(f"\nNo model files found in {extract_dir}")
    print("Please try the alternative methods mentioned above")

=== Mounting Google Drive ===
Mounted at /content/drive
=== Extracting /content/drive/MyDrive/BaseModelLLaVa.zip to /content/llava_model ===
Model unzipped successfully!

=== Extracted files ===

=== Model files found in /content/llava_model/BaseModelLLaVa ===
total 16343420
drwxr-xr-x 2 root root       4096 Jun 11 02:46 .
drwxr-xr-x 4 root root       4096 Jun 11 02:43 ..
-rw-r--r-- 1 root root       1323 Jun 11 02:45 config.json
-rw-r--r-- 1 root root        240 Jun 11 02:45 generation_config.json
-rw-r--r-- 1 root root 4976706864 Jun 11 02:45 model-00001-of-00004.safetensors
-rw-r--r-- 1 root root 4999802720 Jun 11 02:48 model-00002-of-00004.safetensors
-rw-r--r-- 1 root root 4915916176 Jun 11 02:46 model-00003-of-00004.safetensors
-rw-r--r-- 1 root root 1833960344 Jun 11 02:43 model-00004-of-00004.safetensors
-rw-r--r-- 1 root root      73152 Jun 11 02:46 model.safetensors.index.json
-rw-r--r-- 1 root root        439 Jun 11 02:45 special_tokens_map.json
-rw-r--r-- 1 root root      5

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/4.76k [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Some weights of the model checkpoint at /content/llava_model/BaseModelLLaVa were not used when initializing LlavaLlamaForCausalLM: ['model.vision_tower.vision_tower.vision_model.embeddings.class_embedding', 'model.vision_tower.vision_tower.vision_model.embeddings.patch_embedding.weight', 'model.vision_tower.vision_tower.vision_model.embeddings.position_embedding.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm1.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm1.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm2.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm2.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc1.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc1.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc2.bias', 'model.vision_tower.vision_tower.vision_model.encoder.laye

Adding pad token as '<pad>'
tokenizer's pad token id is:  128256


preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.71G [00:00<?, ?B/s]

Model loaded successfully!
Context length: 2048


In [None]:
import os
import zipfile

# Dataset import
# Extract ODIR_Split.zip from Drive, but only when the target folder does not
# exist yet, so re-running the cell is cheap.
zip_path = "/content/drive/MyDrive/ODIR_Split.zip"
extract_path = "/content/ODIR_Split"

if not os.path.exists(extract_path):
    zip_ref = zipfile.ZipFile(zip_path, 'r')
    try:
        zip_ref.extractall(extract_path)
    finally:
        # Same cleanup the context-manager form would perform.
        zip_ref.close()

print("Unzipped ODIR_Split")

✅ Unzipped ODIR_Split


In [None]:
import json
import torch
from torch.utils.data import Dataset, DataLoader
from PIL import Image
from llava.mm_utils import process_images
from llava.conversation import conv_templates
from peft import prepare_model_for_kbit_training, get_peft_model, LoraConfig, TaskType
from torch.nn import CrossEntropyLoss
from tqdm import tqdm
import os

# Load data
# The file is JSON Lines: one JSON object per line. Blank lines (for example
# a trailing newline at the end of the file) are skipped, because
# json.loads("") raises.
json_path = "/content/fundus_prompt_label_triples_v3.jsonl"
with open(json_path, 'r', encoding='utf-8') as f:
    lines = [json.loads(line) for line in f if line.strip()]
data = lines  # the full training set (slice here, e.g. lines[:10], for a quick overfit check)

# Define custom dataset
class FundusDataset(Dataset):
    """Map-style dataset that turns prompt/label records into training tensors.

    Each record in ``data`` is a dict read with the keys ``"image_path"``,
    ``"prompt"`` and ``"label"``.

    Args:
        data: Sequence of record dicts.
        tokenizer: Callable tokenizer; called with ``return_tensors="pt"`` and
            expected to return an object exposing ``input_ids``.
        image_processor: Passed through to ``process_images``.
        model_config: Passed through to ``process_images``.
    """

    def __init__(self, data, tokenizer, image_processor, model_config):
        self.data = data
        self.tokenizer = tokenizer
        self.image_processor = image_processor
        self.model_config = model_config

    def __len__(self):
        """Return the number of records."""
        return len(self.data)

    def __getitem__(self, idx):
        """Build one training example.

        Returns:
            dict with
            ``input_ids``: prompt tokens followed by label tokens (long),
            ``labels``: same length, ``-100`` over the prompt and the label
            token ids over the answer, so only the answer contributes to
            the loss,
            ``image_tensor``: first element of ``process_images`` output.
        """
        entry = self.data[idx]

        # convert() returns an independent, fully loaded copy, so the file
        # handle can be released as soon as the conversion is done.
        with Image.open(entry["image_path"]) as raw_image:
            image = raw_image.convert("RGB")
        image_tensor = process_images([image], self.image_processor, self.model_config)[0]

        # Build conversation prompt (user turn + empty assistant turn)
        # NOTE(review): the prompt is tokenized as plain text; if the model
        # expects an image placeholder token in input_ids, none is added
        # here — confirm the image features are actually consumed.
        conv = conv_templates["llava_v1"].copy()
        conv.append_message(conv.roles[0], entry["prompt"])
        conv.append_message(conv.roles[1], None)
        input_text = conv.get_prompt()

        # Tokenize prompt and label separately, then concatenate
        prompt_ids = self.tokenizer(input_text, return_tensors="pt").input_ids[0]
        label_text = json.dumps(entry["label"])  # e.g., ["Diabetes"]
        label_ids = self.tokenizer(label_text, return_tensors="pt", add_special_tokens=False).input_ids[0]

        input_ids = torch.cat([prompt_ids, label_ids], dim=0)

        # Supervise only the label span. Indexing from the prompt length
        # (rather than with a negative label length) stays correct even if
        # the label tokenizes to zero tokens.
        labels = torch.full_like(input_ids, -100)
        labels[len(prompt_ids):] = label_ids

        return {
            "input_ids": input_ids.to(torch.long),
            "labels": labels.to(torch.long),
            "image_tensor": image_tensor
        }

# Load dataset
# batch_size=1 sidesteps padding: every example has a different length.
dataset = FundusDataset(data, tokenizer, image_processor, model.config)
dataloader = DataLoader(dataset, shuffle=True, batch_size=1)

# Prepare model with LoRA
# NOTE(review): re-running this cell wraps the already-wrapped model again;
# reload the base model first if you need to retrain.
model = prepare_model_for_kbit_training(model)
peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    target_modules=["q_proj", "v_proj"],
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
)
model = get_peft_model(model, peft_config)
model.train()

# Optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
# NOTE(review): loss_fn is not used by the loop below, which takes the loss
# from the model's own output.
loss_fn = CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)

device = model.device
for epoch in range(3):
    total_loss = 0
    for batch in tqdm(dataloader, desc=f"Epoch {epoch+1}"):
        # Move the (single-example) batch onto the model's device.
        images = batch["image_tensor"].unsqueeze(0).to(device, dtype=torch.float32)
        labels = batch["labels"].to(device)
        input_ids = batch["input_ids"].to(device)

        outputs = model(labels=labels, images=images, input_ids=input_ids)
        loss = outputs.loss

        # One optimizer step per example, then reset gradients.
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        total_loss = total_loss + loss.item()

    avg_loss = total_loss / len(dataloader)
    print(f"Epoch {epoch+1} - Loss: {avg_loss:.4f}")

print("\nDone training. Let's run inference on same samples.")

# Save trained model (adapter weights only)
model.save_pretrained("/content/llava_peft_adapter")

In [None]:
import json
import torch
from torch.utils.data import Dataset, DataLoader
from PIL import Image
from llava.mm_utils import process_images, tokenizer_image_token
from llava.conversation import conv_templates
import pandas as pd
from tqdm import tqdm

# ---- LOAD TEST JSONL ----
# JSON Lines input: one object per line. Blank lines are skipped because
# json.loads("") raises.
json_path = "/content/fundus_prompt_label_triples.jsonl"
with open(json_path, 'r', encoding='utf-8') as f:
    lines = [json.loads(line) for line in f if line.strip()]
data = lines  # use all test samples

# ---- CHAT FUNCTION ----
def chat(model, image, prompt, tokenizer, image_processor, device, max_new_tokens=100):
    """Generate the model's answer to ``prompt`` for one image (greedy decoding).

    Args:
        model: Model wrapper exposing ``.config`` and ``.model.generate``.
        image: Image object accepted by ``process_images``.
        prompt: User-turn text inserted into the ``llava_v1`` conversation template.
        tokenizer: Tokenizer used for encoding the prompt and decoding the output.
        image_processor: Passed through to ``process_images``.
        device: Device the input tensors are moved to.
        max_new_tokens: Upper bound on generated tokens.

    Returns:
        The decoded generation with special tokens stripped.
    """
    # 1. Preprocess image (float32, with an extra leading dimension added)
    image_tensor = process_images([image], image_processor, model.config).to(device, dtype=torch.float32).unsqueeze(0)
    # NOTE(review): `image_previous` is set on the wrapped model; its consumer
    # is not visible in this notebook — confirm it is still required.
    model.model.image_previous = image_tensor

    # 2. Format prompt (user turn followed by an empty assistant turn)
    conv = conv_templates["llava_v1"].copy()
    conv.append_message(conv.roles[0], prompt)
    conv.append_message(conv.roles[1], None)
    raw_prompt = conv.get_prompt()

    # 3. Tokenize, mapping any image placeholder to the library's own
    #    sentinel index. The previous hard-coded 32000 is an ordinary
    #    vocabulary id for this tokenizer, so a placeholder would have been
    #    encoded as regular text.
    # NOTE(review): if `raw_prompt` contains no image placeholder, no image
    # token is inserted at all — confirm the prompts include one.
    prompt_ids = tokenizer_image_token(raw_prompt, tokenizer)
    input_ids = torch.tensor([prompt_ids]).to(device)

    # 4. Generate output
    output_ids = model.model.generate(
        inputs=input_ids,
        images=image_tensor,
        do_sample=False,
        max_new_tokens=max_new_tokens
    )
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)

# ---- INFERENCE LOOP ----
# The training cell leaves the model in train mode, which keeps LoRA dropout
# active; switch to eval mode so greedy decoding is deterministic.
model.eval()

results = []
for entry in tqdm(data, desc="Running Inference"):
    # convert() yields an independent copy, so the file can be closed right away.
    with Image.open(entry["image_path"]) as raw_image:
        image = raw_image.convert("RGB")
    prompt = entry["prompt"]

    try:
        response = chat(model, image, prompt, tokenizer, image_processor, model.device)
    except Exception as e:
        # Best-effort: record the failure in the results instead of aborting
        # the whole run; failed rows are identifiable by the "Error:" prefix.
        response = f"Error: {e}"

    results.append({
        "image_path": entry["image_path"],
        "true_label": entry["label"],
        "generated": response.strip()
    })

# ---- SAVE TO CSV ----
df = pd.DataFrame(results)
df.to_csv("/content/llava_peft_test_results.csv", index=False)
print("Inference complete! Results saved to llava_peft_test_results.csv")


Running Inference:   0%|          | 0/1822 [00:00<?, ?it/s]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Running Inference:   0%|          | 1/1822 [00:00<25:27,  1.19it/s]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Running Inference:   0%|          | 2/1822 [00:01<23:55,  1.27it/s]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Running Inference:   0%|          | 3/1822 [00:02<

✅ Inference complete! Results saved to llava_peft_test_results.csv





In [None]:
import pandas as pd

df = pd.DataFrame(results)
# A bare expression is only rendered when it is the last line of a cell, so
# the preview has to be displayed explicitly.
display(df.head(10))
# Show one full row without truncating long text columns.
with pd.option_context('display.max_colwidth', None):
    display(df.iloc[2])

Unnamed: 0,2
image_path,/content/ODIR_Split/ODIR_Split/test/3399_left.jpg
true_label,[Normal]
generated,"[""Normal""]"


In [None]:
# Inspect the concrete class of `model`; as the cell's last expression the
# value is rendered as the cell output.
type(model)

In [None]:
from peft import PeftModel

# Merge adapter weights into base model
# NOTE(review): presumably merge_and_unload() folds the LoRA weights into the
# base layers and returns the unwrapped model — confirm against the peft docs.
merged_model = model.merge_and_unload()

# Save the fully merged model
# The tokenizer is written to the same folder; its return value (the saved
# file paths) is the cell's last expression and is shown as output.
merged_model.save_pretrained("/content/llava_peft_merged")
tokenizer.save_pretrained("/content/llava_peft_merged")

('/content/llava_peft_merged/tokenizer_config.json',
 '/content/llava_peft_merged/special_tokens_map.json',
 '/content/llava_peft_merged/tokenizer.json')

In [None]:
import zipfile

model_folder = "/content/llava_peft_merged"
zip_filename = "/content/llava_peft_merged.zip"

# Walk the model folder and add every file to the archive under its path
# relative to the folder, so the zip has no "/content/..." prefix inside.
zipf = zipfile.ZipFile(zip_filename, 'w', zipfile.ZIP_DEFLATED)
try:
    for root, _subdirs, files in os.walk(model_folder):
        for file_path in (os.path.join(root, name) for name in files):
            zipf.write(file_path, os.path.relpath(file_path, model_folder))
finally:
    # Finalizes the archive, exactly as leaving a `with` block would.
    zipf.close()

print("Merged model zipped successfully!")

✅ Merged model zipped successfully!


In [None]:
import os

# Report the archive size in mebibytes (bytes / 1024 / 1024).
file_path = "/content/llava_peft_merged.zip"
file_size_mb = os.stat(file_path).st_size / (1024 * 1024)
print(f"📦 File size: {file_size_mb:.2f} MB")

📦 File size: 16287.23 MB


In [None]:
import shutil

# Persist the archive on Google Drive. This is a move, so the local copy
# under /content no longer exists afterwards.
destination_path = "/content/drive/MyDrive/llava_peft_merged.zip"
shutil.move(src="/content/llava_peft_merged.zip", dst=destination_path)

print(f"Merged model zip moved to {destination_path}")

✅ Merged model zip moved to /content/drive/MyDrive/llava_peft_merged.zip


In [None]:
from peft import PeftModel, PeftConfig
import torch

# === Path to your trained LoRA adapter ===
# NOTE(review): the training cell saves the adapter to
# "/content/llava_peft_adapter"; this path differs — confirm the adapter was
# copied here. Also assumes `model` is a freshly loaded base model, not one
# already wrapped by PEFT — verify before running.
adapter_path = "/content/peft_adapter"

# === Load the adapter into the base model ===
print("🔧 Loading PEFT adapter into base model...")
# eval() returns the module itself, so wrapping and switching to inference
# mode can be chained.
model = PeftModel.from_pretrained(model, adapter_path).eval()

# === Test ===
print("PEFT adapter successfully loaded into base model!")
print("Adapter path:", adapter_path)

# Optional: Check if model is now wrapped in PEFT
print("🔍 Model class:", type(model))

In [None]:
import json
import torch
from torch.utils.data import Dataset, DataLoader
from PIL import Image
from llava.mm_utils import process_images, tokenizer_image_token
from llava.conversation import conv_templates
import pandas as pd
from tqdm import tqdm

# ---- LOAD TEST JSONL ----
# JSON Lines input: one object per line. Blank lines are skipped because
# json.loads("") raises.
json_path = "/content/fundus_prompt_label_triples.jsonl"
with open(json_path, 'r', encoding='utf-8') as f:
    lines = [json.loads(line) for line in f if line.strip()]
data = lines[:10]  # quick smoke test: only the first 10 test samples

# ---- CHAT FUNCTION ----
def chat(model, image, prompt, tokenizer, image_processor, device, max_new_tokens=100, verbose=True):
    """Generate the model's answer to ``prompt`` for one image (greedy decoding).

    Args:
        model: Model exposing ``.config`` and ``.generate``.
        image: Image object accepted by ``process_images``.
        prompt: User-turn text inserted into the ``llava_v1`` conversation template.
        tokenizer: Tokenizer used for encoding the prompt and decoding the output.
        image_processor: Passed through to ``process_images``.
        device: Device the input tensors are moved to.
        max_new_tokens: Upper bound on generated tokens.
        verbose: When True (the default, matching the previous behavior),
            print the tokenized prompt and the raw generation for debugging.

    Returns:
        The decoded generation with special tokens stripped.
    """
    # 1. Preprocess image (half precision, with an extra leading dimension added)
    image_tensor = process_images([image], image_processor, model.config).to(device).half().unsqueeze(0)

    # 2. Format prompt (user turn followed by an empty assistant turn)
    conv = conv_templates["llava_v1"].copy()
    conv.append_message(conv.roles[0], prompt)
    conv.append_message(conv.roles[1], None)
    raw_prompt = conv.get_prompt()

    # 3. Tokenize, mapping any image placeholder to the library's own
    #    sentinel index. The previous hard-coded 32000 is an ordinary
    #    vocabulary id for this tokenizer, so a placeholder would have been
    #    encoded as regular text.
    # NOTE(review): if `raw_prompt` contains no image placeholder, no image
    # token is inserted at all — confirm the prompts include one.
    prompt_ids = tokenizer_image_token(raw_prompt, tokenizer)
    if verbose:
        print(f"Tokenized prompt IDs: {prompt_ids}")
        print(f"Decoded prompt: {tokenizer.decode(prompt_ids)}")
    input_ids = torch.tensor([prompt_ids]).to(device)

    # 4. Generate output
    output_ids = model.generate(
        inputs=input_ids,
        images=image_tensor,
        do_sample=False,
        max_new_tokens=max_new_tokens
    )
    if verbose:
        print(f"\nPrompt: {prompt}")
        print(f"Output IDs: {output_ids}")
        print(f"Decoded: {tokenizer.decode(output_ids[0])}")
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)

# ---- INFERENCE LOOP ----
# Make sure dropout layers are disabled before generating, regardless of
# which cells were run earlier in this session.
model.eval()

results = []
for entry in tqdm(data, desc="Running Inference"):
    # convert() yields an independent copy, so the file can be closed right away.
    with Image.open(entry["image_path"]) as raw_image:
        image = raw_image.convert("RGB")
    prompt = entry["prompt"]

    try:
        response = chat(model, image, prompt, tokenizer, image_processor, model.device)
    except Exception as e:
        # Best-effort: record the failure in the results instead of aborting
        # the whole run; failed rows are identifiable by the "Error:" prefix.
        response = f"Error: {e}"

    results.append({
        "image_path": entry["image_path"],
        "true_label": entry["label"],
        "generated": response.strip()
    })

# ---- SAVE TO CSV ----
# NOTE(review): this writes to the same CSV path as the earlier inference
# cell, so it replaces those results — confirm that is intended.
df = pd.DataFrame(results)
df.to_csv("/content/llava_peft_test_results.csv", index=False)
print("Inference complete! Results saved to llava_peft_test_results.csv")


Running Inference:   0%|          | 0/10 [00:00<?, ?it/s]

Tokenized prompt IDs: [128000, 32, 6369, 1990, 264, 22999, 3823, 323, 459, 21075, 11478, 18328, 13, 578, 18328, 6835, 11190, 11, 11944, 11, 323, 48887, 11503, 311, 279, 3823, 596, 4860, 13, 14194, 25, 1472, 527, 264, 6593, 11376, 44658, 6335, 13, 38527, 3059, 279, 3984, 3887, 355, 2217, 323, 10765, 902, 315, 279, 2768, 18274, 1299, 4787, 527, 3118, 11, 422, 904, 382, 66322, 4787, 512, 12, 18944, 320, 45, 340, 12, 53689, 320, 35, 340, 12, 8444, 2933, 82945, 320, 38, 340, 12, 356, 6526, 533, 320, 34, 340, 12, 13381, 14228, 7553, 1299, 58337, 17699, 320, 32, 340, 12, 39515, 531, 2711, 320, 39, 340, 12, 8092, 5848, 3092, 30651, 320, 44, 340, 12, 7089, 19338, 477, 75815, 320, 46, 696, 96830, 2317, 25, 24070, 2536, 43036, 1413, 2160, 258, 54042, 271, 861, 682, 4787, 13468, 304, 279, 2217, 13, 1442, 279, 8071, 374, 9498, 11, 6013, 449, 25, 4482, 12484, 5638, 1442, 5361, 4787, 527, 3118, 11, 1160, 682, 315, 1124, 304, 264, 13325, 1160, 11, 384, 1326, 2637, 4482, 22427, 16629, 498, 330, 32641, 

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Running Inference:  10%|█         | 1/10 [00:00<00:04,  1.88it/s]


Prompt: You are a medical vision-language expert. Analyze the provided fundus image and identify which of the following ocular conditions are present, if any.

Possible conditions:
- Normal (N)
- Diabetes (D)
- Glaucoma (G)
- Cataract (C)
- Age-related Macular Degeneration (A)
- Hypertension (H)
- Pathological Myopia (M)
- Other diseases or abnormalities (O)

Clinical context: moderate non proliferative retinopathy

List all conditions observed in the image. If the eye is healthy, respond with: ["Normal"]. If multiple conditions are present, list all of them in a Python list, e.g., ["Diabetes", "Glaucoma"].
Output IDs: tensor([[128009]], device='cuda:0')
Decoded: <|eot_id|>
Tokenized prompt IDs: [128000, 32, 6369, 1990, 264, 22999, 3823, 323, 459, 21075, 11478, 18328, 13, 578, 18328, 6835, 11190, 11, 11944, 11, 323, 48887, 11503, 311, 279, 3823, 596, 4860, 13, 14194, 25, 1472, 527, 264, 6593, 11376, 44658, 6335, 13, 38527, 3059, 279, 3984, 3887, 355, 2217, 323, 10765, 902, 315, 279, 2

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Running Inference:  20%|██        | 2/10 [00:01<00:04,  2.00it/s]


Prompt: You are a medical vision-language expert. Analyze the provided fundus image and identify which of the following ocular conditions are present, if any.

Possible conditions:
- Normal (N)
- Diabetes (D)
- Glaucoma (G)
- Cataract (C)
- Age-related Macular Degeneration (A)
- Hypertension (H)
- Pathological Myopia (M)
- Other diseases or abnormalities (O)

List all conditions observed in the image. If the eye is healthy, respond with: ["Normal"]. If multiple conditions are present, list all of them in a Python list, e.g., ["Diabetes", "Glaucoma"].
Output IDs: tensor([[128009]], device='cuda:0')
Decoded: <|eot_id|>
Tokenized prompt IDs: [128000, 32, 6369, 1990, 264, 22999, 3823, 323, 459, 21075, 11478, 18328, 13, 578, 18328, 6835, 11190, 11, 11944, 11, 323, 48887, 11503, 311, 279, 3823, 596, 4860, 13, 14194, 25, 1472, 527, 264, 6593, 11376, 44658, 6335, 13, 38527, 3059, 279, 3984, 3887, 355, 2217, 323, 10765, 902, 315, 279, 2768, 18274, 1299, 4787, 527, 3118, 11, 422, 904, 382, 6632

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Running Inference:  30%|███       | 3/10 [00:01<00:03,  2.04it/s]


Prompt: You are a medical vision-language expert. Analyze the provided fundus image and identify which of the following ocular conditions are present, if any.

Possible conditions:
- Normal (N)
- Diabetes (D)
- Glaucoma (G)
- Cataract (C)
- Age-related Macular Degeneration (A)
- Hypertension (H)
- Pathological Myopia (M)
- Other diseases or abnormalities (O)

List all conditions observed in the image. If the eye is healthy, respond with: ["Normal"]. If multiple conditions are present, list all of them in a Python list, e.g., ["Diabetes", "Glaucoma"].
Output IDs: tensor([[128009]], device='cuda:0')
Decoded: <|eot_id|>
Tokenized prompt IDs: [128000, 32, 6369, 1990, 264, 22999, 3823, 323, 459, 21075, 11478, 18328, 13, 578, 18328, 6835, 11190, 11, 11944, 11, 323, 48887, 11503, 311, 279, 3823, 596, 4860, 13, 14194, 25, 1472, 527, 264, 6593, 11376, 44658, 6335, 13, 38527, 3059, 279, 3984, 3887, 355, 2217, 323, 10765, 902, 315, 279, 2768, 18274, 1299, 4787, 527, 3118, 11, 422, 904, 382, 6632

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Running Inference:  40%|████      | 4/10 [00:01<00:02,  2.06it/s]


Prompt: You are a medical vision-language expert. Analyze the provided fundus image and identify which of the following ocular conditions are present, if any.

Possible conditions:
- Normal (N)
- Diabetes (D)
- Glaucoma (G)
- Cataract (C)
- Age-related Macular Degeneration (A)
- Hypertension (H)
- Pathological Myopia (M)
- Other diseases or abnormalities (O)

List all conditions observed in the image. If the eye is healthy, respond with: ["Normal"]. If multiple conditions are present, list all of them in a Python list, e.g., ["Diabetes", "Glaucoma"].
Output IDs: tensor([[128009]], device='cuda:0')
Decoded: <|eot_id|>
Tokenized prompt IDs: [128000, 32, 6369, 1990, 264, 22999, 3823, 323, 459, 21075, 11478, 18328, 13, 578, 18328, 6835, 11190, 11, 11944, 11, 323, 48887, 11503, 311, 279, 3823, 596, 4860, 13, 14194, 25, 1472, 527, 264, 6593, 11376, 44658, 6335, 13, 38527, 3059, 279, 3984, 3887, 355, 2217, 323, 10765, 902, 315, 279, 2768, 18274, 1299, 4787, 527, 3118, 11, 422, 904, 382, 6632

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Running Inference:  50%|█████     | 5/10 [00:02<00:02,  2.07it/s]


Prompt: You are a medical vision-language expert. Analyze the provided fundus image and identify which of the following ocular conditions are present, if any.

Possible conditions:
- Normal (N)
- Diabetes (D)
- Glaucoma (G)
- Cataract (C)
- Age-related Macular Degeneration (A)
- Hypertension (H)
- Pathological Myopia (M)
- Other diseases or abnormalities (O)

Clinical context: severe nonproliferative retinopathy

List all conditions observed in the image. If the eye is healthy, respond with: ["Normal"]. If multiple conditions are present, list all of them in a Python list, e.g., ["Diabetes", "Glaucoma"].
Output IDs: tensor([[128009]], device='cuda:0')
Decoded: <|eot_id|>
Tokenized prompt IDs: [128000, 32, 6369, 1990, 264, 22999, 3823, 323, 459, 21075, 11478, 18328, 13, 578, 18328, 6835, 11190, 11, 11944, 11, 323, 48887, 11503, 311, 279, 3823, 596, 4860, 13, 14194, 25, 1472, 527, 264, 6593, 11376, 44658, 6335, 13, 38527, 3059, 279, 3984, 3887, 355, 2217, 323, 10765, 902, 315, 279, 2768

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Running Inference:  60%|██████    | 6/10 [00:02<00:01,  2.08it/s]


Prompt: You are a medical vision-language expert. Analyze the provided fundus image and identify which of the following ocular conditions are present, if any.

Possible conditions:
- Normal (N)
- Diabetes (D)
- Glaucoma (G)
- Cataract (C)
- Age-related Macular Degeneration (A)
- Hypertension (H)
- Pathological Myopia (M)
- Other diseases or abnormalities (O)

Clinical context: severe nonproliferative retinopathy

List all conditions observed in the image. If the eye is healthy, respond with: ["Normal"]. If multiple conditions are present, list all of them in a Python list, e.g., ["Diabetes", "Glaucoma"].
Output IDs: tensor([[128009]], device='cuda:0')
Decoded: <|eot_id|>
Tokenized prompt IDs: [128000, 32, 6369, 1990, 264, 22999, 3823, 323, 459, 21075, 11478, 18328, 13, 578, 18328, 6835, 11190, 11, 11944, 11, 323, 48887, 11503, 311, 279, 3823, 596, 4860, 13, 14194, 25, 1472, 527, 264, 6593, 11376, 44658, 6335, 13, 38527, 3059, 279, 3984, 3887, 355, 2217, 323, 10765, 902, 315, 279, 2768

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Running Inference:  70%|███████   | 7/10 [00:03<00:01,  2.09it/s]


Prompt: You are a medical vision-language expert. Analyze the provided fundus image and identify which of the following ocular conditions are present, if any.

Possible conditions:
- Normal (N)
- Diabetes (D)
- Glaucoma (G)
- Cataract (C)
- Age-related Macular Degeneration (A)
- Hypertension (H)
- Pathological Myopia (M)
- Other diseases or abnormalities (O)

List all conditions observed in the image. If the eye is healthy, respond with: ["Normal"]. If multiple conditions are present, list all of them in a Python list, e.g., ["Diabetes", "Glaucoma"].
Output IDs: tensor([[128009]], device='cuda:0')
Decoded: <|eot_id|>
Tokenized prompt IDs: [128000, 32, 6369, 1990, 264, 22999, 3823, 323, 459, 21075, 11478, 18328, 13, 578, 18328, 6835, 11190, 11, 11944, 11, 323, 48887, 11503, 311, 279, 3823, 596, 4860, 13, 14194, 25, 1472, 527, 264, 6593, 11376, 44658, 6335, 13, 38527, 3059, 279, 3984, 3887, 355, 2217, 323, 10765, 902, 315, 279, 2768, 18274, 1299, 4787, 527, 3118, 11, 422, 904, 382, 6632

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Running Inference:  80%|████████  | 8/10 [00:03<00:00,  2.10it/s]


Prompt: You are a medical vision-language expert. Analyze the provided fundus image and identify which of the following ocular conditions are present, if any.

Possible conditions:
- Normal (N)
- Diabetes (D)
- Glaucoma (G)
- Cataract (C)
- Age-related Macular Degeneration (A)
- Hypertension (H)
- Pathological Myopia (M)
- Other diseases or abnormalities (O)

List all conditions observed in the image. If the eye is healthy, respond with: ["Normal"]. If multiple conditions are present, list all of them in a Python list, e.g., ["Diabetes", "Glaucoma"].
Output IDs: tensor([[128009]], device='cuda:0')
Decoded: <|eot_id|>
Tokenized prompt IDs: [128000, 32, 6369, 1990, 264, 22999, 3823, 323, 459, 21075, 11478, 18328, 13, 578, 18328, 6835, 11190, 11, 11944, 11, 323, 48887, 11503, 311, 279, 3823, 596, 4860, 13, 14194, 25, 1472, 527, 264, 6593, 11376, 44658, 6335, 13, 38527, 3059, 279, 3984, 3887, 355, 2217, 323, 10765, 902, 315, 279, 2768, 18274, 1299, 4787, 527, 3118, 11, 422, 904, 382, 6632

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Running Inference:  90%|█████████ | 9/10 [00:04<00:00,  2.10it/s]


Prompt: You are a medical vision-language expert. Analyze the provided fundus image and identify which of the following ocular conditions are present, if any.

Possible conditions:
- Normal (N)
- Diabetes (D)
- Glaucoma (G)
- Cataract (C)
- Age-related Macular Degeneration (A)
- Hypertension (H)
- Pathological Myopia (M)
- Other diseases or abnormalities (O)

List all conditions observed in the image. If the eye is healthy, respond with: ["Normal"]. If multiple conditions are present, list all of them in a Python list, e.g., ["Diabetes", "Glaucoma"].
Output IDs: tensor([[128009]], device='cuda:0')
Decoded: <|eot_id|>
Tokenized prompt IDs: [128000, 32, 6369, 1990, 264, 22999, 3823, 323, 459, 21075, 11478, 18328, 13, 578, 18328, 6835, 11190, 11, 11944, 11, 323, 48887, 11503, 311, 279, 3823, 596, 4860, 13, 14194, 25, 1472, 527, 264, 6593, 11376, 44658, 6335, 13, 38527, 3059, 279, 3984, 3887, 355, 2217, 323, 10765, 902, 315, 279, 2768, 18274, 1299, 4787, 527, 3118, 11, 422, 904, 382, 6632

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Running Inference: 100%|██████████| 10/10 [00:04<00:00,  2.08it/s]


Prompt: You are a medical vision-language expert. Analyze the provided fundus image and identify which of the following ocular conditions are present, if any.

Possible conditions:
- Normal (N)
- Diabetes (D)
- Glaucoma (G)
- Cataract (C)
- Age-related Macular Degeneration (A)
- Hypertension (H)
- Pathological Myopia (M)
- Other diseases or abnormalities (O)

List all conditions observed in the image. If the eye is healthy, respond with: ["Normal"]. If multiple conditions are present, list all of them in a Python list, e.g., ["Diabetes", "Glaucoma"].
Output IDs: tensor([[128009]], device='cuda:0')
Decoded: <|eot_id|>
✅ Inference complete! Results saved to llava_peft_test_results.csv



