In [4]:
!pip3 install bitsandbytes peft trl



In [5]:
!pip install git+https://github.com/huggingface/transformers accelerate



Collecting git+https://github.com/huggingface/transformers
  Cloning https://github.com/huggingface/transformers to /tmp/pip-req-build-jhkhzgr1
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers /tmp/pip-req-build-jhkhzgr1

  Resolved https://github.com/huggingface/transformers to commit d1b92369ca193da49f9f7ecd01b08ece45c2c9aa
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone


In [6]:
!pip install qwen-vl-utils

Note: you may need to restart the kernel to use updated packages.


In [7]:
import csv
import os
import torch
import pandas as pd
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info  # Stelle sicher, dass dieses Modul im PYTHONPATH ist
from datasets import load_dataset
from pathlib import Path

In [8]:
# Load the path-vqa dataset and pull out its training and validation splits.
dataset = load_dataset("flaviagiammarino/path-vqa")
train_dataset, val_dataset = (
    dataset[split_name] for split_name in ("train", "validation")
)

# Confirm the load and report how many rows each split holds.
print(
    "Datasets wurden geladen.",
    f"Trainingsgröße: {len(train_dataset)}",
    f"Validierungsgröße: {len(val_dataset)}",
    sep="\n",
)

Datasets wurden geladen.
Trainingsgröße: 19654
Validierungsgröße: 6259


In [9]:
# Cell 2: prepare the few-shot examples
# Fixed training-set row indices whose samples serve as in-prompt demonstrations.
few_shot_indices = [10658, 18497, 8273, 16324, 10392, 9073, 4623, 10336]

# Fetch every selected row exactly once and keep only the three fields
# that the prompt needs.
few_shot_examples = [
    {"question": row["question"], "answer": row["answer"], "image": row["image"]}
    for row in (train_dataset[row_idx] for row_idx in few_shot_indices)
]

print("Few-Shot-Beispiele vorbereitet. Anzahl:", len(few_shot_examples))


Few-Shot-Beispiele vorbereitet. Anzahl: 8


In [10]:
# Cell 3: build the message list (prompt)

# Pick one example from the validation split (here: the first sample).
sample = val_dataset[0]
val_image = sample["image"]
val_question = sample["question"]

# System message that constrains the model to short, image-grounded answers.
system_message = (
    "You are a medical pathology expert. Your task is to answer medical questions "
    "based solely on the visual information in the provided pathology image. "
    "Focus only on what is visible in the image — do not rely on prior medical knowledge, "
    "assumptions, or external information. Your responses should be short, factual, "
    "and medically precise, using appropriate terminology. "
    "Do not include any explanations, reasoning, or additional text. "
    "Use a consistent format, without punctuation, and avoid capitalisation unless medically required. "
    "Only return the exact answer."
)

# The conversation starts with the system message.
messages = [
    {
        "role": "system",
        "content": [{"type": "text", "text": system_message}]
    }
]

# Add every few-shot example as one user message that combines the image,
# the question and the reference answer.
for ex in few_shot_examples:
    messages.append({
        "role": "user",
        "content": [
            {"type": "image", "image": ex["image"]},
            {"type": "text", "text": "question: " + ex["question"] + "\nanswer: " + ex["answer"]}
        ]
    })

# Add the validation sample with the answer left open for the model to fill in.
# The text uses the same "question: ...\nanswer: " layout as the few-shot
# examples. Previously the question and an "Answer: " cue were two separate
# text items; the chat template renders content items back-to-back (see how the
# image placeholder is followed directly by the text in the rendered prompt),
# so the cue ended up glued to the question without a newline and with
# different capitalisation than the demonstrations.
messages.append({
    "role": "user",
    "content": [
        {"type": "image", "image": val_image},
        {"type": "text", "text": "question: " + val_question + "\nanswer: "}
    ]
})

# Show the message list (numbered from 1).
print("Nachrichtenliste für den Prompt:")
for i, msg in enumerate(messages, start=1):
    print(f"Nachricht {i}: {msg}")


Nachrichtenliste für den Prompt:
Nachricht 1: {'role': 'system', 'content': [{'type': 'text', 'text': 'You are a medical pathology expert. Your task is to answer medical questions based solely on the visual information in the provided pathology image. Focus only on what is visible in the image — do not rely on prior medical knowledge, assumptions, or external information. Your responses should be short, factual, and medically precise, using appropriate terminology. Do not include any explanations, reasoning, or additional text. Use a consistent format, without punctuation, and avoid capitalisation unless medically required. Only return the exact answer.'}]}
Nachricht 2: {'role': 'user', 'content': [{'type': 'image', 'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=792x528 at 0x7FAD1569B460>}, {'type': 'text', 'text': 'question: where is this?\nanswer: urinary'}]}
Nachricht 3: {'role': 'user', 'content': [{'type': 'image', 'image': <PIL.JpegImagePlugin.JpegImageFile image

In [11]:
# Cell 4: build the final prompt and model inputs, then generate answers
# for the first few validation examples.

MODEL_ID = "Qwen/Qwen2.5-VL-3B-Instruct"
NUM_EVAL_EXAMPLES = 10   # how many validation samples (from index 0) to run
MAX_NEW_TOKENS = 128     # upper bound on generated tokens per answer

# Load model and processor.
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    MODEL_ID, torch_dtype="auto", device_map="auto"
)
model.eval()  # switch the model to evaluation mode

processor = AutoProcessor.from_pretrained(MODEL_ID)

# The system message is identical for every sample, so it is defined once
# instead of being rebuilt on every loop iteration.
system_message = (
    "You are a medical pathology expert. Your task is to answer medical questions "
    "based solely on the visual information in the provided pathology image. "
    "Focus only on what is visible in the image — do not rely on prior medical knowledge, "
    "assumptions, or external information. Your responses should be short, factual, "
    "and medically precise, using appropriate terminology. "
    "Do not include any explanations, reasoning, or additional text. "
    "Use a consistent format, without punctuation, and avoid capitalisation unless medically required. "
    "Only return the exact answer."
)


def build_few_shot_messages(image, question):
    """Build the chat message list for one query sample.

    The list consists of the system message, one user message per entry in
    ``few_shot_examples`` (image + question + reference answer), and a final
    user message with the query image and question whose answer is left open.

    Args:
        image: Image of the sample to be answered.
        question: Question text of the sample to be answered.

    Returns:
        A freshly built list of message dicts in the structure passed to
        ``processor.apply_chat_template`` and ``process_vision_info`` below.
    """
    chat = [
        {
            "role": "system",
            "content": [{"type": "text", "text": system_message}]
        }
    ]

    # One user message per demonstration: image, question and answer combined.
    for ex in few_shot_examples:
        chat.append({
            "role": "user",
            "content": [
                {"type": "image", "image": ex["image"]},
                {"type": "text", "text": "question: " + ex["question"] + "\nanswer: " + ex["answer"]}
            ]
        })

    # Query sample in the same "question: ...\nanswer: " layout as the
    # demonstrations. Separate text items are rendered back-to-back by the chat
    # template, so the former standalone "Answer: " item was glued to the
    # question without a newline and did not match the few-shot format.
    chat.append({
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": "question: " + question + "\nanswer: "}
        ]
    })
    return chat


# Iterate over the first NUM_EVAL_EXAMPLES samples of the validation split.
for idx in range(NUM_EVAL_EXAMPLES):
    sample = val_dataset[idx]
    val_image = sample["image"]
    val_question = sample["question"]

    messages = build_few_shot_messages(val_image, val_question)

    # Render the message list into the text prompt.
    text_prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    print("\nBeispiel", idx + 1)
    print("Erzeugter Text-Prompt:\n", text_prompt)

    # Extract the vision inputs (images and videos) from the messages.
    image_inputs, video_inputs = process_vision_info(messages)

    # Assemble the final model inputs.
    inputs = processor(
        text=[text_prompt],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the device the model was actually placed on by device_map="auto"
    # instead of assuming that a CUDA device is present.
    inputs = inputs.to(model.device)

    # Generate the answer without gradient tracking.
    with torch.no_grad():
        generated_ids = model.generate(**inputs, max_new_tokens=MAX_NEW_TOKENS)

    # Strip the prompt tokens so that only the newly generated answer remains.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )

    generated_answer = output_text[0]
    print("Generierte Antwort:", generated_answer)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.



Beispiel 1
Erzeugter Text-Prompt:
 <|im_start|>system
You are a medical pathology expert. Your task is to answer medical questions based solely on the visual information in the provided pathology image. Focus only on what is visible in the image — do not rely on prior medical knowledge, assumptions, or external information. Your responses should be short, factual, and medically precise, using appropriate terminology. Do not include any explanations, reasoning, or additional text. Use a consistent format, without punctuation, and avoid capitalisation unless medically required. Only return the exact answer.<|im_end|>
<|im_start|>user
<|vision_start|><|image_pad|><|vision_end|>question: where is this?
answer: urinary<|im_end|>
<|im_start|>user
<|vision_start|><|image_pad|><|vision_end|>question: what is present?
answer: female reproductive<|im_end|>
<|im_start|>user
<|vision_start|><|image_pad|><|vision_end|>question: does infarction secondary to shock show normal pancreas?
answer: no<|i