In [1]:
pip install transformers accelerate pillow torch torchvision bitsandbytes

Collecting bitsandbytes
  Downloading bitsandbytes-0.49.0-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting sympy==1.13.1 (from torch)
  Downloading sympy-1.13.1-py3-none-any.whl.metadata (12 kB)
Downloading sympy-1.13.1-py3-none-any.whl (6.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.2/6.2 MB[0m [31m52.3 MB/s[0m  [33m0:00:00[0m
[?25hDownloading bitsandbytes-0.49.0-py3-none-manylinux_2_24_x86_64.whl (59.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.1/59.1 MB[0m [31m51.0 MB/s[0m  [33m0:00:01[0m eta [36m0:00:01[0m
[?25hInstalling collected packages: sympy, bitsandbytes
[2K  Attempting uninstall: sympy
[2K    Found existing installation: sympy 1.14.0
[2K    Uninstalling sympy-1.14.0:
[2K      Successfully uninstalled sympy-1.14.0━━━━━[0m [32m0/2[0m [sympy]
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2/2[0m [bitsandbytes][0m [bitsandbytes]
[1A[2KSuccessfully installed bitsandbytes-0.49.

In [1]:
import torch
from torchvision import transforms, models
from PIL import Image
from transformers import AutoProcessor, AutoModelForVision2Seq

# -------------------------------------------------------
# 1. ResNet18 laden
# -------------------------------------------------------
device = "cuda" if torch.cuda.is_available() else "cpu"

num_classes = 13
class_names = [
    "ABAP Dictionary",
    "BW4Cockpit (Stammdaten)",
    "Bewegungsdaten",
    "Composite Provider",
    "DTP",
    "Data Flow Object",
    "Data Mart",
    "Data Source",
    "Data Store Object",
    "Datenvorschau",
    "Excel",
    "Query",
    "Transformationen"
]

resnet = models.resnet18(weights=None)
resnet.fc = torch.nn.Linear(resnet.fc.in_features, num_classes)
resnet.load_state_dict(torch.load("outputs/resnet_best.pt", map_location=device))
resnet = resnet.to(device)
resnet.eval()

# -------------------------------------------------------
# 2. Qwen laden
# -------------------------------------------------------
model_id = "Qwen/Qwen2.5-VL-7B-Instruct"

processor = AutoProcessor.from_pretrained(model_id, token="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")

qwen = AutoModelForVision2Seq.from_pretrained(
    model_id,
    torch_dtype=torch.float16 if device == "cuda" else torch.float32,
    device_map="auto",
    token="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"
)

# -------------------------------------------------------
# 3. Bildvorverarbeitung für ResNet
# -------------------------------------------------------
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406],
                         [0.229, 0.224, 0.225])
])

def classify_image(image_path):
    img = Image.open(image_path).convert("RGB")
    tensor = transform(img).unsqueeze(0).to(device)

    with torch.no_grad():
        outputs = resnet(tensor)
        _, pred = torch.max(outputs, 1)

    return class_names[pred.item()], img

# -------------------------------------------------------
# 4. Prompts pro Klasse
# -------------------------------------------------------
PROMPTS = {
    "Excel": """
Extrahiere die tabellarischen Daten aus dem Excel-Screenshot.
Ignoriere UI-Elemente wie Menüleisten, Filterbereiche, Metadaten oder Seitentitel.
Konzentriere dich ausschließlich auf die sichtbare Tabelle.

Gib NUR gültiges JSON zurück.
KEIN Markdown.
KEINE ```json Blöcke.
KEINE Erklärungen.
KEINE Kommentare.
KEINE zusätzlichen Texte.

Format:

{
  "columns": ["Spalte1", "Spalte2", ...],
  "rows": [
    ["Wert1", "Wert2", ...],
    ["Wert1", "Wert2", ...]
  ]
}
""",

    "DTP": """
Analysiere den Screenshot eines Data Transfer Process (DTP) in SAP BW/4HANA.

Extrahiere ausschließlich die für einen DTP relevanten Informationen:

- DTP-Name bzw. technische ID
- Ausführungsmodus (z. B. Serial SAP HANA Execution, Dialog, Hintergrund)
- Quelle des DTP (z. B. DataSource, RSDS, Datei)
- Zielobjekt (z. B. ADSO, InfoCube)
- Sichtbare Prozessschritte (z. B. Fill data transfer intermediate storage, Prepare for Extraction, Data Package Loop)
- Hinweise auf die Extraktionsart (z. B. Datei-Extraktion)
- Sichtbare Systemmeldungen oder Pop-ups (z. B. SAP GUI Security File Access)

Gib NUR gültiges JSON zurück.
KEIN Markdown.
KEINE ```json Blöcke.
KEINE Erklärungen.
KEINE Kommentare.
KEINE zusätzlichen Texte.

Format:

{
  "dtp_name": "",
  "execution_mode": "",
  "source": "",
  "target": "",
  "process_steps": [],
  "extraction_type": "",
  "system_messages": []
}
""",

        "Transformationen": """
Analysiere den Screenshot einer Transformation in SAP BW/4HANA.

Extrahiere ausschließlich die variablen, inhaltlichen Informationen der Transformation:

- Name bzw. technische ID der Quelle
- Name bzw. technische ID des Ziels
- Alle Quellfelder (Name + Datentyp)
- Alle Zielfelder (Name + Datentyp)
- Alle sichtbaren Feldzuordnungen (source_field → target_field)
- Nur echte Mappings, keine Linieninterpretation
- Behandle auch farbige, dünne oder schwach sichtbare Linien als gültige Mappings, wenn sie eine Verbindung zwischen zwei Feldern darstellen.
- Keine automatisch generierten Felder erfinden

Gib NUR gültiges JSON zurück.
KEIN Markdown.
KEINE ```json Blöcke.
KEINE Erklärungen.
KEINE Kommentare.
KEINE zusätzlichen Texte.

Format:

{
  "source": {
    "name": "",
    "fields": [
      {"name": "", "type": ""}
    ]
  },
  "target": {
    "name": "",
    "fields": [
      {"name": "", "type": ""}
    ]
  },
  "mappings": [
    {"source_field": "", "target_field": ""}
  ]
}
"""
}

# -------------------------------------------------------
# 5. Qwen JSON-Extraktion (KORREKT!)
# -------------------------------------------------------
def qwen_extract(image, prompt):
    print("[QWEN] Baue messages...")
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": prompt}
            ]
        }
    ]

    print("[QWEN] Wende chat template an...")
    text_prompt = processor.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )

    print("[QWEN] Erzeuge Inputs...")
    inputs = processor(
        text=text_prompt,
        images=image,
        return_tensors="pt"
    ).to(device)

    for k, v in inputs.items():
        print(f"[QWEN] {k} shape:", tuple(v.shape))

    print("[QWEN] Starte generate()...")
    output = qwen.generate(
        **inputs,
        max_new_tokens=512,   # jetzt aktiv
        do_sample=False,
        temperature=0.0,
        eos_token_id=None      # jetzt aktiv
    )
    print("[QWEN] generate() fertig.")

    decoded = processor.batch_decode(output, skip_special_tokens=True)[0]
    print("[QWEN] Decoding fertig.")
    return decoded

# -------------------------------------------------------
# 6. Pipeline ausführen
# -------------------------------------------------------
def process_image(image_path):
    pred_class, img = classify_image(image_path)
    print(f"ResNet-Klasse: {pred_class}")

    prompt = PROMPTS[pred_class]
    json_output = qwen_extract(img, prompt)

    print("\nJSON-Ausgabe:")
    print(json_output)
    return json_output

# -------------------------------------------------------
# 7. Beispielaufruf
# -------------------------------------------------------
if __name__ == "__main__":
    process_image("/workspace/Projekt/Testdaten/Data/Ungelabelt/58c4a60002ea47ce84086a31e8410242.png")

2026-01-12 09:28:05.679330: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
  resnet.load_state_dict(torch.load("outputs/resnet_best.pt", map_location=device))
The image processor of type `Qwen2VLImageProcessor` is now loaded as a fast processor by default, even if the model checkpoint was saved with a slow processor. This is a breaking change and may produce slightly different outputs. To continue using the slow processor, instantiate this class with `use_fast=False`. Note that this behavior will be extended to all models in a future release.
`torch_dtype` is deprecated! Use `dtype` instead!


Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

Some parameters are on the meta device because they were offloaded to the cpu.


ResNet-Klasse: Datenvorschau


KeyError: 'Datenvorschau'