# Automatische Bildunterschriftengenerierung - Evaluation
## Masterarbeit: Multimodale KI-Modelle für Barrierefreiheit

Dieses Notebook implementiert die Methodik aus Kapitel 4:
- Datensätze: MS-COCO, Flickr30k, VizWiz
- Modelle: BLIP-1, BLIP-2, GPT-4V
- Metriken: BLEU, CIDEr
- WCAG-basierte qualitative Bewertung

## 1. Setup und Installation

In [None]:
%%javascript
// Colab Pro keep-alive (more aggressive variant)
var KEEP_ALIVE_INTERVAL_MS = 30000;

/** Request the kernel listing endpoint and log when the ping was sent. */
function KeepAlive() {
  fetch('/api/kernels').catch((err) => {
    console.log(err);
  });
  var sentAt = new Date().toLocaleTimeString();
  console.log('Keep-alive ping sent at ' + sentAt);
}

// Ping every 30 seconds
setInterval(KeepAlive, KEEP_ALIVE_INTERVAL_MS);
console.log('‚úì Pro Keep-Alive aktiviert (30s interval)');

// Additionally ping as soon as the tab becomes visible again
function onVisibilityChange() {
  if (document.hidden) {
    return;
  }
  console.log('Tab visible again - sending keep-alive');
  KeepAlive();
}

document.addEventListener('visibilitychange', onVisibilityChange);

In [None]:

# GPU check
!nvidia-smi

# Core packages (the torch wheels are pulled from the CUDA 11.8 index)
!pip install -q torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
!pip install -q transformers accelerate pillow
!pip install -q pycocoevalcap
!pip install -q datasets
!pip install -q openai anthropic  # F√ºr GPT-4V API
!pip install -q nltk
!pip install -q scipy
!pip install -q scikit-image

# BLIP-2 specific
!pip install -q salesforce-lavis

print("\n‚úì Installation abgeschlossen")

In [None]:
# Recovery

import json
import pickle
from pathlib import Path
import os

class NotebookState:
    """Track which notebook stages have completed so a restarted session can recover.

    The instance attributes are persisted with pickle in ``state_file``
    (``notebook_state.pkl`` relative to the current working directory).
    """

    def __init__(self):
        self.state_file = Path('notebook_state.pkl')
        self.datasets_loaded = False
        self.models_loaded = False
        self.current_dataset = None
        self.current_model = None

    def save_datasets(self, datasets):
        """Write a summary of the loaded datasets and mark them as loaded.

        Args:
            datasets: Mapping of dataset name to a sequence of sample dicts.
                Only the sample count and the keys of the first sample are
                written to ``datasets_info.json``; the samples themselves
                are not stored.
        """
        dataset_info = {
            name: {
                'count': len(samples),
                'sample_keys': list(samples[0].keys()) if samples else [],
            }
            for name, samples in datasets.items()
        }

        with open('datasets_info.json', 'w') as f:
            json.dump(dataset_info, f)

        self.datasets_loaded = True
        self.save_state()
        print(f"üíæ Datasets gespeichert: {list(datasets.keys())}")

    def save_models(self, models):
        """Write the display name of every model and mark models as loaded.

        Args:
            models: Mapping of model key to an object with a ``name`` attribute.
        """
        model_info = {name: model.name for name, model in models.items()}

        with open('models_info.json', 'w') as f:
            json.dump(model_info, f)

        self.models_loaded = True
        self.save_state()
        print(f"üíæ Models gespeichert: {list(models.keys())}")

    def save_state(self):
        """Persist the instance attributes to ``state_file``.

        The pickle is written to a sibling temporary file and then moved into
        place, so an interrupted write cannot leave a truncated state file.
        """
        tmp_file = self.state_file.with_name(self.state_file.name + '.tmp')
        with open(tmp_file, 'wb') as f:
            pickle.dump(self.__dict__, f)
        tmp_file.replace(self.state_file)

    def load_state(self):
        """Restore attributes from ``state_file``.

        Returns:
            True when a state file existed and was applied; False when the
            file is missing, unreadable as a pickle, or does not contain a
            dict. An unusable file is reported and otherwise ignored so the
            notebook can start from a clean state.
        """
        if not self.state_file.exists():
            return False

        # NOTE: pickle must only be used on files this notebook wrote itself.
        try:
            with open(self.state_file, 'rb') as f:
                saved = pickle.load(f)
        except (pickle.UnpicklingError, EOFError, AttributeError,
                ImportError, IndexError) as e:
            print(f"Zustandsdatei {self.state_file} nicht lesbar - wird ignoriert: {e}")
            return False

        if not isinstance(saved, dict):
            print(f"Zustandsdatei {self.state_file} hat unerwartetes Format - wird ignoriert")
            return False

        self.__dict__.update(saved)
        return True

    def check_prerequisites(self):
        """Return a list of messages for every stage that has not run yet.

        An empty list means both datasets and models are marked as loaded.
        """
        issues = []

        if not self.datasets_loaded:
            issues.append("‚ùå Datasets nicht geladen - f√ºhre Dataset-Zelle aus!")

        if not self.models_loaded:
            issues.append("‚ùå Models nicht geladen - f√ºhre Model-Zelle aus!")

        return issues

# Global state instance; restore whatever an earlier session saved.
# The state file is read once and the result reused for the report below.
notebook_state = NotebookState()
state_restored = notebook_state.load_state()

print("‚úì Crash Recovery System aktiviert")

if state_restored:
    print(f"üìÇ Zustand wiederhergestellt:")
    print(f"   Datasets geladen: {notebook_state.datasets_loaded}")
    print(f"   Models geladen: {notebook_state.models_loaded}")

In [None]:
import torch
import numpy as np
import pandas as pd
from PIL import Image
import json
import os
from typing import List, Dict, Tuple
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
from pathlib import Path

from google.colab import drive
import os
from datetime import datetime

# Mount Google Drive
drive.mount('/content/drive')

# Create the results folder for this run
# NOTE(review): the folder name says "100samples" while CONFIG['sample_size']
# is 300 - confirm which one is intended.
run_dir = '/content/drive/MyDrive/masterarbeit_results/run_20260102_100samples'
os.makedirs(run_dir, exist_ok=True)

print(f"‚úì Ergebnisse werden gespeichert in: {run_dir}")

# Transformers f√ºr BLIP-2 und CLIP
from transformers import (
    BlipProcessor,
    BlipForConditionalGeneration,
    Blip2Processor,
    Blip2ForConditionalGeneration,
    CLIPProcessor,
    CLIPModel,
    GPT2Tokenizer,
    AutoProcessor,
    AutoModelForCausalLM
)

# Datasets
from datasets import load_dataset

# Evaluation Metriken
from pycocoevalcap.bleu.bleu import Bleu
from pycocoevalcap.meteor.meteor import Meteor
from pycocoevalcap.cider.cider import Cider
from pycocoevalcap.spice.spice import Spice

# NLTK resources used for text processing
import nltk
for nltk_resource in ('punkt', 'wordnet', 'omw-1.4'):
    nltk.download(nltk_resource, quiet=True)

# Select the compute device: CUDA when available, otherwise CPU
cuda_ready = torch.cuda.is_available()
device = torch.device("cuda" if cuda_ready else "cpu")
print(f"Verwende Ger√§t: {device}")
if cuda_ready:
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    total_vram_bytes = torch.cuda.get_device_properties(0).total_memory
    print(f"Verf√ºgbarer VRAM: {total_vram_bytes / 1e9:.2f} GB")

## 2. Konfiguration

In [None]:
# Configuration shared by the cells below
CONFIG = {
    'sample_size': 300,  # Number of images per dataset for initial tests
    'full_sample_size': 300,  # For the full evaluation
    'batch_size': 8,
    'max_length': 100,  # Passed as max_length to model.generate()
    'num_beams': 5,  # Passed as num_beams to model.generate()
    'random_seed': 42,
    # NOTE(review): the folder name says "100samples" although sample_size is 300 - confirm.
    'output_dir': '/content/drive/MyDrive/masterarbeit_results/run_20260102_100samples',
    'datasets': ['coco', 'flickr30k', 'vizwiz'],
    # NOTE(review): the model cell of this notebook also registers 'blip1'
    # (as the CLIPCap replacement), which is not listed here - confirm.
    'models': ['blip2', 'gpt4v'],  # CLIPCap implemented separately
}

# Create the output directory
os.makedirs(CONFIG['output_dir'], exist_ok=True)

# Seed torch and numpy for reproducible results
torch.manual_seed(CONFIG['random_seed'])
np.random.seed(CONFIG['random_seed'])

print("‚úì Konfiguration geladen")

## 3. Datensätze laden

In [None]:

# DATENS√ÑTZE AUS GOOGLE DRIVE LADEN

import numpy as np
from PIL import Image
from tqdm.auto import tqdm
import os
import json
import pandas as pd
from pathlib import Path

print("="*80)
print("LADE DATENS√ÑTZE AUS GOOGLE DRIVE")
print("="*80)

# 1. Mount Google Drive (skipped when /content/drive already exists)
from google.colab import drive

if not os.path.exists('/content/drive'):
    print("\nüìÅ Mounte Google Drive...")
    drive.mount('/content/drive')
    print("‚úì Drive gemountet")
else:
    print("\n‚úì Drive bereits gemountet")

# 2. Paths to the COCO data

# Base path of the data folder
DRIVE_BASE = '/content/drive/MyDrive/data'

# COCO folder
COCO_DIR = f'{DRIVE_BASE}/coco2017'

print(f"\nüìÇ Pr√ºfe COCO-Pfad: {COCO_DIR}")

# Check whether the folder exists
if os.path.exists(COCO_DIR):
    print("‚úì COCO-Ordner gefunden!")

    # Show the first 10 entries of the folder
    contents = os.listdir(COCO_DIR)
    print(f"\nüìã Inhalt von coco2017/:")
    for item in contents[:10]:
        print(f"   ‚Ä¢ {item}")
    if len(contents) > 10:
        print(f"   ... und {len(contents) - 10} weitere")
else:
    print(f"‚ùå Ordner nicht gefunden!")
    print(f"\nüí° Bitte pr√ºfe den Pfad. M√∂gliche Alternativen:")
    print(f"   ‚Ä¢ /content/drive/MyDrive/data/coco2017")
    print(f"   ‚Ä¢ /content/drive/MyDrive/data/COCO")
    print(f"   ‚Ä¢ /content/drive/MyDrive/coco2017")

    # Try to locate a COCO folder at most two levels below DRIVE_BASE.
    # os.walk() has no depth argument, so the traversal is pruned by
    # emptying `dirs` in place once the depth limit is reached.
    print("\nüîç Suche nach COCO-Ordnern...")
    max_depth = 2
    base_depth = DRIVE_BASE.rstrip(os.sep).count(os.sep)
    for root, dirs, files in os.walk(DRIVE_BASE):
        for d in dirs:
            if 'coco' in d.lower():
                print(f"   Gefunden: {os.path.join(root, d)}")
        if root.count(os.sep) - base_depth >= max_depth - 1:
            dirs[:] = []

# 3. COCO ANNOTATIONS LADEN

print("\n" + "="*80)
print("LADE COCO CAPTIONS")
print("="*80)

# Sampling configuration (falls back to 100 when CONFIG is not defined)
sample_size = CONFIG.get('sample_size', 100) if 'CONFIG' in globals() else 100
np.random.seed(42)

datasets = {}

# Candidate folder layouts, tried in order; the first one whose annotation
# file and image folder both exist (and which yields captioned images) wins.
possible_paths = [
    # Standard COCO structure
    {
        'images': f'{COCO_DIR}/val2017/val2017',
        'annotations': f'{COCO_DIR}/annotations/captions_val2017.json'
    },
    # The train captions must be paired with the train images; the file
    # names in captions_train2017.json do not exist in a test2017 folder.
    {
        'images': f'{COCO_DIR}/train2017',
        'annotations': f'{COCO_DIR}/annotations/captions_train2017.json'
    },
    # Alternative structures
    {
        'images': f'{COCO_DIR}/images/val2017/val2017',
        'annotations': f'{COCO_DIR}/annotations/captions_val2017.json'
    },
    {
        'images': f'{COCO_DIR}/val2017/val2017',
        'annotations': f'{COCO_DIR}/captions_val2017.json'
    },
]

coco_data = None

for path_config in possible_paths:
    ann_path = path_config['annotations']
    img_dir = path_config['images']

    if not (os.path.exists(ann_path) and os.path.exists(img_dir)):
        continue

    print(f"\n‚úì Gefunden:")
    print(f"   Annotations: {ann_path}")
    print(f"   Bilder: {img_dir}")

    # Load the annotation JSON
    print(f"\nüì• Lade Captions aus JSON...")
    with open(ann_path, 'r', encoding='utf-8') as f:
        coco_data = json.load(f)

    print(f"‚úì {len(coco_data.get('images', []))} Bilder")
    print(f"‚úì {len(coco_data.get('annotations', []))} Captions")

    # Build the image-id -> captions mapping
    print("\nüîó Erstelle Caption-Mapping...")
    image_to_captions = {}
    for ann in coco_data['annotations']:
        image_to_captions.setdefault(ann['image_id'], []).append(ann['caption'])

    # Sample among the images that have at least one caption
    print(f"\nüé≤ Sample {sample_size} zuf√§llige Bilder...")
    available_images = [img for img in coco_data['images'] if img['id'] in image_to_captions]

    if not available_images:
        print("‚ùå Keine Bilder mit Captions gefunden!")
        continue

    sampled_images = np.random.choice(
        available_images,
        min(sample_size, len(available_images)),
        replace=False
    )

    # Load the sampled images into memory
    coco_samples = []
    missing_files = 0
    print(f"\nüì∏ Lade Bilder aus Drive...")

    for img_info in tqdm(sampled_images, desc="COCO"):
        img_id = img_info['id']
        img_filename = img_info['file_name']
        img_path = os.path.join(img_dir, img_filename)

        # Skip (but count) images listed in the JSON that are not on disk
        if not os.path.exists(img_path):
            missing_files += 1
            continue

        try:
            # convert() returns an independent copy, so the file handle can
            # be closed right away
            with Image.open(img_path) as img_file:
                image = img_file.convert('RGB')

            captions = image_to_captions.get(img_id, [f"COCO image {img_id}"])

            coco_samples.append({
                'image': image,
                'image_id': img_id,
                'captions': captions[:5],  # At most 5 captions
                'dataset': 'coco'
            })

        except Exception as e:
            # Best effort: one unreadable image must not abort the whole load
            print(f"\n‚ö†Ô∏è  Fehler bei {img_filename}: {str(e)[:50]}")
            continue

    datasets['coco'] = coco_samples
    print(f"\n‚úì {len(coco_samples)} COCO-Bilder geladen")
    if missing_files:
        print(f"   Hinweis: {missing_files} Bilddateien nicht gefunden in {img_dir}")

    if coco_samples:
        print(f"üìù Beispiel Caption: '{coco_samples[0]['captions'][0][:60]}...'")

    break

# No candidate layout produced a dataset (nothing found, or no captioned images)
if 'coco' not in datasets:
    print("\n‚ùå Konnte COCO-Daten nicht finden!")
    print("\nüí° HILFE:")
    print("   1. √ñffne deinen Drive-Ordner 'data/coco2017'")
    print("   2. Pr√ºfe die Struktur:")
    print("      ‚îú‚îÄ‚îÄ coco2017/")
    print("      ‚îÇ   ‚îú‚îÄ‚îÄ val2017/  (oder train2017/)")
    print("      ‚îÇ   ‚îî‚îÄ‚îÄ annotations/")
    print("      ‚îÇ       ‚îî‚îÄ‚îÄ captions_val2017.json")
    print("   3. Passe COCO_DIR im Script an falls n√∂tig")

    datasets['coco'] = []

# 4. VIZWIZ (aus HuggingFace)

print("\n" + "="*80)
print("LADE VIZWIZ (aus HuggingFace)")
print("="*80)
print("üí° VizWiz ist klein (~100MB), laden aus HF ist schneller als Drive")

try:
    # The module import is unaffected by the notebook-level `datasets` dict
    from datasets import load_dataset

    vizwiz_dataset = load_dataset("lmms-lab/VizWiz-VQA", split="val")

    indices = np.random.choice(
        len(vizwiz_dataset),
        min(sample_size, len(vizwiz_dataset)),
        replace=False
    )

    vizwiz_samples = []
    for idx in tqdm(indices, desc="VizWiz"):
        # Use a plain int so no numpy scalar ends up in the stored samples
        idx = int(idx)
        item = vizwiz_dataset[idx]

        # NOTE(review): this uses up to three VQA *answers* (and, failing
        # that, the question) as reference "captions" - confirm that this is
        # the intended ground truth for the caption metrics.
        captions = []
        answers = item.get('answers', [])
        for ans in answers[:3]:
            if isinstance(ans, dict) and 'answer' in ans:
                captions.append(ans['answer'])
            elif isinstance(ans, str):
                captions.append(ans)

        if not captions and 'question' in item:
            captions = [item['question']]
        if not captions:
            captions = [f"VizWiz image {idx}"]

        vizwiz_samples.append({
            'image': item['image'],
            'image_id': item.get('image_id', idx),
            'captions': captions,
            'dataset': 'vizwiz'
        })

    datasets['vizwiz'] = vizwiz_samples
    print(f"‚úì {len(vizwiz_samples)} VizWiz-Bilder geladen")

except Exception as e:
    # Best effort: a download or parsing failure leaves VizWiz empty
    print(f"‚ùå VizWiz: {str(e)[:100]}")
    datasets['vizwiz'] = []

# ZUSAMMENFASSUNG

print("\n" + "="*80)
print("‚úÖ DATENS√ÑTZE GELADEN")
print("="*80)

# Total number of images across all non-empty datasets
total_images = sum(len(ds) for ds in datasets.values() if ds)
print(f"\nüìä Gesamt: {total_images} Bilder")

# Per-dataset statistics; empty datasets are skipped
for name, samples in datasets.items():
    if not samples:
        continue
    print(f"\nüìÅ {name.upper()}:")
    print(f"   ‚Ä¢ Anzahl: {len(samples)}")
    mean_captions = np.mean([len(s['captions']) for s in samples])
    print(f"   ‚Ä¢ Durchschn. Captions/Bild: {mean_captions:.1f}")
    print(f"   ‚Ä¢ Beispiel: '{samples[0]['captions'][0][:60]}...'")

print("\n" + "="*80)

In [None]:

# FLICKR30K

print("\n3Ô∏è‚É£ Lade Flickr30k von Google Drive...\n")

import os
from google.colab import drive

# Mount Drive (skipped when /content/drive already exists)
if not os.path.exists('/content/drive'):
    drive.mount('/content/drive')
    print("‚úì Drive gemountet\n")

# Location of the Flickr30k data on Drive
drive_flickr_path = '/content/drive/MyDrive/data/archive/Images'

print(f"üìÅ Verwende Pfad: {drive_flickr_path}\n")

# Abort with a directory listing when the path does not exist
if not os.path.exists(drive_flickr_path):
    print(f"‚ùå FEHLER: Pfad existiert nicht!")
    print(f"\nPfad: {drive_flickr_path}")
    print("\nüîç Zeige MyDrive-Inhalt:")

    myDrive = '/content/drive/MyDrive'
    if os.path.exists(myDrive):
        print(f"   MyDrive: {os.listdir(myDrive)}")

    raise FileNotFoundError("Bitte korrigiere den Pfad oben!")

print("‚úì Pfad existiert")

# Show the folder content
contents = os.listdir(drive_flickr_path)
print(f"\nüìÇ Inhalt ({len(contents)} Elemente):")

# Show only the important entries. The whole listing is scanned because
# os.listdir() returns entries in arbitrary order, so they need not be among
# the first few.
for item in contents:
    if item.endswith('.csv') or item == 'flickr30k_images':
        print(f"   ‚úì {item}")

# Images: either in a flickr30k_images/ subfolder or directly in the folder
if 'flickr30k_images' in contents:
    drive_images_path = os.path.join(drive_flickr_path, 'flickr30k_images')
    jpg_count = len([f for f in os.listdir(drive_images_path) if f.endswith('.jpg')])
    print(f"\n‚úì Bilder-Ordner: flickr30k_images/ ({jpg_count:,} JPGs)")
else:
    drive_images_path = drive_flickr_path
    jpg_count = sum(1 for f in contents if f.endswith('.jpg'))
    print(f"\n‚úì Bilder direkt im Ordner ({jpg_count:,} JPGs)")

# Caption CSV
if 'results.csv' in contents:
    results_csv_path = os.path.join(drive_flickr_path, 'results.csv')
    size_mb = os.path.getsize(results_csv_path) / (1024*1024)
    print(f"‚úì results.csv ({size_mb:.1f} MB)")
else:
    results_csv_path = None
    print("‚ö†Ô∏è results.csv nicht gefunden")

# Notebook-level variables read by the Flickr30k loading cell

print("\n" + "="*70)
print("SETZE VARIABLEN")
print("="*70)

flickr_dir = drive_flickr_path
flickr_images_dir = drive_images_path
flickr_results_csv = results_csv_path

print(f"flickr_dir          = '{drive_flickr_path}'")
print(f"flickr_images_dir   = '{drive_images_path}'")
print(f"flickr_results_csv  = '{results_csv_path}'")

print("="*70)

In [None]:

# FLICKR30K - CAPTIONS & BILDER LADEN (VON DRIVE)

print("\nLade Flickr30k Captions & Bilder...\n")

import pandas as pd
from PIL import Image
from tqdm.auto import tqdm
import os
import numpy as np

try:

    # Reuse the notebook-level datasets dict, or start a new one
    if 'datasets' not in globals():
        print("‚ö†Ô∏è datasets Dictionary nicht gefunden - erstelle neues")
        datasets = {}
    else:
        print(f"‚úì datasets gefunden mit {len(datasets)} bestehenden Datens√§tzen")

    # Paths set by the Flickr30k locate cell
    flickr_dir = globals().get('flickr_dir')
    flickr_images_dir = globals().get('flickr_images_dir')
    flickr_results_csv = globals().get('flickr_results_csv')

    if not flickr_dir:
        raise ValueError("flickr_dir nicht gefunden - bitte vorherige Zelle ausf√ºhren!")

    print(f"üìÅ Arbeite mit:")
    print(f"   Hauptordner: {flickr_dir}")
    print(f"   Bilder: {flickr_images_dir}")
    print(f"   Captions: {flickr_results_csv or 'Nicht gefunden'}")

    # Load captions: image file name -> list of captions

    caption_dict = {}

    if flickr_results_csv and os.path.exists(flickr_results_csv):
        print("\nüìù Lade Captions aus results.csv...")

        try:
            captions_df = pd.read_csv(flickr_results_csv, delimiter='|')
            captions_df.columns = captions_df.columns.str.strip()

            # Fall back to positional column names when the header differs
            if 'image_name' not in captions_df.columns:
                captions_df.columns = ['image_name', 'comment_number', 'comment']

            print(f"   ‚úì {len(captions_df):,} Caption-Zeilen geladen")

            # Rows without a name or caption would otherwise turn into the
            # literal string "nan" and be used as a reference caption
            captions_df = captions_df.dropna(subset=['image_name', 'comment'])
            names = captions_df['image_name'].astype(str).str.strip()
            comments = captions_df['comment'].astype(str).str.strip()

            # Group captions by image name, keeping the CSV row order
            for img_name, caption in zip(names, comments):
                caption_dict.setdefault(img_name, []).append(caption)

            print(f"   ‚úì {len(caption_dict):,} Bilder mit Captions")

            if caption_dict:
                avg_captions = sum(len(caps) for caps in caption_dict.values()) / len(caption_dict)
                print(f"   ‚úì √ò {avg_captions:.1f} Captions pro Bild")

        except Exception as e:
            # Best effort: continue with placeholder captions
            print(f"   ‚ö†Ô∏è Fehler beim Laden von Captions: {e}")
            print("   ‚Üí Verwende Platzhalter-Captions")
            caption_dict = {}
    else:
        print("\n‚ö†Ô∏è Keine results.csv gefunden ‚Üí Verwende Platzhalter-Captions")

    # Index all JPG files once: file name -> full path

    print("\nüñºÔ∏è Suche JPG-Dateien...")

    image_paths = {}

    for root, dirs, files in os.walk(flickr_images_dir):
        jpg_files = [f for f in files if f.endswith('.jpg')]
        if jpg_files:
            print(f"   üì∏ {root}: {len(jpg_files):,} JPGs")
            for jpg_name in jpg_files:
                # Keep the first location found for a given file name
                image_paths.setdefault(jpg_name, os.path.join(root, jpg_name))

    available_images = list(image_paths)

    print(f"\n   ‚úì {len(available_images):,} Bilder gefunden")

    if len(available_images) == 0:
        raise FileNotFoundError(f"Keine JPG-Dateien in {flickr_images_dir}")

    # Build the sample

    print("\nüìä Erstelle Sample...")

    # Restrict to images that have captions (when captions were loaded)
    if caption_dict:
        valid_images = [img for img in available_images if img in caption_dict]
        print(f"   {len(valid_images):,} Bilder haben Captions")

        if len(valid_images) == 0:
            print("   ‚ö†Ô∏è Keine √úbereinstimmungen - verwende alle Bilder")
            valid_images = available_images
    else:
        valid_images = available_images

    # Sample size and seed come from CONFIG when it is defined
    # (300 and 42 otherwise)
    config = globals().get('CONFIG', {})
    sample_size_flickr = min(config.get('sample_size', 300), len(valid_images))

    random_seed = config.get('random_seed', 42)
    np.random.seed(random_seed)
    sample_images = np.random.choice(valid_images, sample_size_flickr, replace=False)

    print(f"   ‚Üí Sample: {sample_size_flickr} Bilder")

    # Load the sampled images into memory

    print("\n‚è≥ Lade Bilder ins Memory...")

    flickr_samples = []
    failed = 0

    for img_name in tqdm(sample_images, desc="   Flickr30k"):
        # np.random.choice yields numpy strings; use plain str from here on
        img_name = str(img_name)
        img_path = image_paths[img_name]

        try:
            # convert() returns an independent copy, so the file handle can
            # be closed right away
            with Image.open(img_path) as img_file:
                image = img_file.convert('RGB')

            # Real captions when available, otherwise a placeholder
            if caption_dict and img_name in caption_dict:
                captions = caption_dict[img_name]
            else:
                captions = [f'Flickr image: {img_name.replace(".jpg", "")}']

            flickr_samples.append({
                'image': image,
                'image_id': img_name.replace('.jpg', ''),
                'captions': captions,
                'dataset': 'flickr30k'
            })

        except Exception as e:
            # Best effort: count the failure and report only the first three
            failed += 1
            if failed <= 3:
                print(f"\n   ‚ö†Ô∏è Fehler bei {img_name}: {str(e)[:50]}")


    datasets['flickr30k'] = flickr_samples


    print("\n" + "="*80)
    print("="*80)
    print(f"\n  Bilder geladen:        {len(flickr_samples)}")
    print(f"  Bilder fehlgeschlagen: {failed}")

    if flickr_samples:
        total_caps = sum(len(s['captions']) for s in flickr_samples)
        avg_caps = total_caps / len(flickr_samples)

        print(f"  Total Captions:        {total_caps}")
        print(f"  √ò Captions/Bild:       {avg_caps:.1f}")

        # Example
        example = flickr_samples[0]
        print(f"\nüìù BEISPIEL (Bild: {example['image_id']})")
        print(f"   Anzahl Captions: {len(example['captions'])}")

        for i, cap in enumerate(example['captions'][:3], 1):
            display_cap = cap[:70] + "..." if len(cap) > 70 else cap
            print(f"   {i}. {display_cap}")

        if len(example['captions']) > 3:
            print(f"   ... und {len(example['captions']) - 3} weitere")

    print("="*80)

except Exception as e:
    # Any failure leaves Flickr30k empty instead of stopping the notebook
    print(f"\n‚ùå Flickr30k Fehler: {str(e)}")

    import traceback
    print("\nDetaillierter Fehler:")
    traceback.print_exc()

    datasets['flickr30k'] = []

In [None]:

# ZUSAMMENFASSUNG ALLER DATENS√ÑTZE

print("\n" + "="*80)
print("ZUSAMMENFASSUNG ALLER GELADENEN DATENS√ÑTZE")
print("="*80)

if 'datasets' in globals():
    total = 0

    # Overview table: one row per dataset
    print(f"\n √úbersicht:")
    print("\nDataset      | Bilder | √ò Captions | Status")
    print("-" * 50)

    for name, samples in datasets.items():
        count = len(samples)
        total += count

        if count == 0:
            print(f"{name:12s} | {count:6d} | {'N/A':>10s} | ‚ùå")
            continue

        # A dataset is flagged when any sample has no captions
        with_captions = sum(1 for s in samples if s.get('captions') and len(s['captions']) > 0)
        avg_caps = sum(len(s['captions']) for s in samples) / count
        if with_captions == count:
            status = "‚úì"
        else:
            status = "‚ö†Ô∏è"

        print(f"{name:12s} | {count:6d} | {avg_caps:10.1f} | {status}")

    print("-" * 50)
    print(f"{'GESAMT':12s} | {total:6d} |")
    print("\n" + "="*80)

    print("="*80)

    # Per-dataset details, only for non-empty datasets
    if total > 0:
        print("\n DETAILS PRO DATENSATZ:\n")

        for name, samples in datasets.items():
            if len(samples) == 0:
                continue

            print(f"üìÅ {name.upper()}")
            print(f"   ‚Ä¢ Bilder: {len(samples)}")

            # Caption statistics
            caption_counts = [len(s['captions']) for s in samples]
            avg = sum(caption_counts) / len(caption_counts)
            min_caps, max_caps = min(caption_counts), max(caption_counts)

            print(f"   ‚Ä¢ Captions: √ò {avg:.1f} (min: {min_caps}, max: {max_caps})")

            # First sample as an example
            example = samples[0]
            print(f"   ‚Ä¢ Beispiel-ID: {example.get('image_id', 'N/A')}")
            if example.get('captions'):
                first_cap = example['captions'][0]
                if len(first_cap) > 60:
                    display_cap = first_cap[:60] + "..."
                else:
                    display_cap = first_cap
                print(f"   ‚Ä¢ Beispiel-Caption: \"{display_cap}\"")
            print()
else:
    # The dataset loading cells have not been run in this session
    print("\nFEHLER: datasets Dictionary nicht gefunden!")
    print("   Bitte f√ºhre die Datensatz-Lade-Zellen aus:")
    print("   ‚Ä¢ Zelle 9: COCO + VizWiz")
    print("   ‚Ä¢ Zelle 12: Flickr30k")

In [None]:

# DATASETS PERSISTENT SPEICHERN - EINMALIG AUSF√úHREN!
# Diese Zelle NACH dem Laden der Datasets ausf√ºhren

import pickle
import json
import os
from pathlib import Path

print("="*80)
print(" SPEICHERE DATASETS PERSISTENT")
print("="*80)

# Fail early when the dataset loading cells have not been run
try:
    datasets
except NameError:
    print("\n‚ùå Variable 'datasets' nicht gefunden!")
    print("   ‚Üí F√ºhre zuerst die Dataset-Lade-Zelle aus!")
    raise
else:
    print(f"\n‚úì Datasets gefunden: {list(datasets.keys())}")

# Target path on Drive
datasets_file = Path('/content/drive/MyDrive/caption_generation_workspace/state/datasets_persistent.pkl')
datasets_file.parent.mkdir(parents=True, exist_ok=True)

print(f"\nSpeichere nach: {datasets_file.resolve()}")

# Pickle the complete datasets dict
with datasets_file.open('wb') as pkl_handle:
    pickle.dump(datasets, pkl_handle, protocol=pickle.HIGHEST_PROTOCOL)

# Also store a small JSON summary next to the pickle (on Drive as well)
datasets_info = {
    name: {
        'count': len(samples),
        'first_keys': list(samples[0].keys()) if samples else [],
    }
    for name, samples in datasets.items()
}

datasets_info_file = datasets_file.with_name('datasets_info.json')
with datasets_info_file.open('w') as json_handle:
    json.dump(datasets_info, json_handle, indent=2)

size_mb = datasets_file.stat().st_size / (1024**2)

print(f"\n‚úÖ DATASETS GESPEICHERT!")
print(f"   Datei:  {datasets_file.name}")
print(f"   Gr√∂√üe:  {size_mb:.1f} MB")
print(f"   Info:   {datasets_info_file.name}")

print(f"\nüìä Gespeichert:")
for name, info in datasets_info.items():
    print(f"   - {name}: {info['count']} Bilder")

print(f"\nüí° Ab jetzt kannst du datasets aus Drive laden (Recovery nach Crash):")
print(f"   {datasets_file}")


## 4. Modelle initialisieren

In [None]:

# MODELL-DEFINITIONEN - BLIP-1 STATT CLIPCAP

class CaptioningModel:
    """Common interface shared by all captioning models."""

    def __init__(self, name: str):
        # Display name plus the notebook-level torch device
        self.name, self.device = name, device

    def generate_caption(self, image: Image.Image) -> str:
        """Return a caption for one image; subclasses must override this."""
        raise NotImplementedError

    def generate_batch(self, images: List[Image.Image]) -> List[str]:
        """Caption the images one after another while showing a progress bar."""
        captions = []
        for picture in tqdm(images, desc=f"{self.name} generiert"):
            captions.append(self.generate_caption(picture))
        return captions


class BLIP2Model(CaptioningModel):
    """BLIP-2 captioning model (checkpoint ``Salesforce/blip2-flan-t5-xl``)."""

    def __init__(self):
        super().__init__("BLIP-2")
        print(f"Lade {self.name}...")

        from transformers import Blip2Processor, Blip2ForConditionalGeneration

        model_name = "Salesforce/blip2-flan-t5-xl"

        # Half precision only on CUDA; the notebook falls back to CPU when no
        # GPU is available, where float32 is used instead.
        on_cuda = torch.device(self.device).type == "cuda"
        self.dtype = torch.float16 if on_cuda else torch.float32

        self.processor = Blip2Processor.from_pretrained(model_name)
        self.model = Blip2ForConditionalGeneration.from_pretrained(
            model_name,
            device_map="auto",
            torch_dtype=self.dtype
        )

        print(f"‚úì {self.name} geladen")

    def generate_caption(self, image: Image.Image, prompt: str = "") -> str:
        """Generate a caption for ``image``.

        Args:
            image: The image to describe.
            prompt: Text prompt passed to the processor; an empty prompt is
                replaced by a default description request.

        Returns:
            The decoded caption with surrounding whitespace removed.
        """
        if not prompt:
            prompt = "A detailed description of this image:"

        # Inputs must match the device and dtype the model was loaded with
        inputs = self.processor(image, text=prompt, return_tensors="pt").to(
            self.device, self.dtype
        )

        with torch.no_grad():
            generated_ids = self.model.generate(
                **inputs,
                max_length=CONFIG['max_length'],
                num_beams=CONFIG['num_beams'],
                early_stopping=True
            )

        caption = self.processor.decode(generated_ids[0], skip_special_tokens=True)
        return caption.strip()


class BLIP1Model(CaptioningModel):
    """BLIP-1 captioning model (predecessor of BLIP-2); used here in place of CLIPCap."""

    def __init__(self):
        super().__init__("BLIP-1")
        print(f"Lade {self.name}...")

        from transformers import BlipProcessor, BlipForConditionalGeneration

        # Large BLIP-1 captioning checkpoint
        model_name = "Salesforce/blip-image-captioning-large"

        # Half precision only on CUDA; the notebook falls back to CPU when no
        # GPU is available, where float32 is used instead.
        on_cuda = torch.device(self.device).type == "cuda"
        self.dtype = torch.float16 if on_cuda else torch.float32

        self.processor = BlipProcessor.from_pretrained(model_name)
        self.model = BlipForConditionalGeneration.from_pretrained(
            model_name,
            torch_dtype=self.dtype
        ).to(self.device)

        print(f"‚úì {self.name} geladen")

    def generate_caption(self, image: Image.Image) -> str:
        """Generate an unconditional caption for ``image``.

        Generation length and beam count are read from ``CONFIG`` (defaults:
        50 and 5).

        Returns:
            The decoded caption with surrounding whitespace removed.
        """
        # Inputs must match the device and dtype the model was loaded with
        inputs = self.processor(image, return_tensors="pt").to(
            self.device, self.dtype
        )

        with torch.no_grad():
            generated_ids = self.model.generate(
                **inputs,
                max_length=CONFIG.get('max_length', 50),
                num_beams=CONFIG.get('num_beams', 5),
                early_stopping=True
            )

        caption = self.processor.decode(generated_ids[0], skip_special_tokens=True)
        return caption.strip()


class GPT4VModel(CaptioningModel):
    """Captioning through the OpenAI chat completions API.

    The display name is "GPT-4V", but the request is sent to the model id
    given by ``model`` (default ``"gpt-4o"``).
    """

    def __init__(self, api_key: str = None, model: str = "gpt-4o", max_tokens: int = 100):
        """Create the API client when a key is supplied.

        Args:
            api_key: OpenAI API key. Without a key the model is disabled and
                ``generate_caption`` returns a fixed placeholder string.
            model: Model id sent with every request.
            max_tokens: Upper bound on generated tokens per caption; longer
                descriptions are cut off at this limit.
        """
        super().__init__("GPT-4V")

        self.model_id = model
        self.max_tokens = max_tokens

        if api_key:
            import openai
            self.client = openai.OpenAI(api_key=api_key)
            self.enabled = True
            print(f"‚úì {self.name} API initialisiert")
        else:
            self.enabled = False
            print(f"‚ö† {self.name} ben√∂tigt API-Key")

    def generate_caption(self, image: Image.Image) -> str:
        """Request an accessibility-oriented description of ``image``.

        Returns:
            The stripped response text. Two sentinel strings are returned
            instead of raising: ``"GPT-4V not available (no API key)"`` when
            the model is disabled and ``"Error generating caption"`` when the
            request fails.
        """
        if not self.enabled:
            return "GPT-4V not available (no API key)"

        import base64
        from io import BytesIO
        import time

        # JPEG cannot store other modes such as RGBA or P
        if image.mode != 'RGB':
            image = image.convert('RGB')

        # Encode the image as a base64 JPEG for the data URL
        buffered = BytesIO()
        image.save(buffered, format="JPEG", quality=95)
        img_str = base64.b64encode(buffered.getvalue()).decode()
        buffered.close()

        try:
            response = self.client.chat.completions.create(
                model=self.model_id,
                messages=[
                    {
                        "role": "user",
                        "content": [
                            {"type": "text", "text": "Provide a detailed, accessible description of this image suitable for visually impaired users. Focus on the main content, objects, people, actions, and context."},
                            {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{img_str}"}}
                        ]
                    }
                ],
                max_tokens=self.max_tokens
            )

            time.sleep(0.5)  # Rate limiting
            return response.choices[0].message.content.strip()

        except Exception as e:
            # Best effort: report the failure and return the sentinel so a
            # batch run continues with the next image
            print(f"GPT-4V Fehler: {e}")
            return "Error generating caption"


# Initialise the models

import os

print("\n" + "="*80)
print("INITIALISIERE MODELLE")
print("="*80 + "\n")

models = {}

# BLIP-2
models['blip2'] = BLIP2Model()

# BLIP-1 (replaces CLIPCap)
models['blip1'] = BLIP1Model()

# GPT-4V (optional)
# SECURITY: the API key must never be stored in the notebook. A key that was
# committed here earlier has to be treated as leaked and revoked. The key is
# read from the OPENAI_API_KEY environment variable or, failing that, from a
# Colab secret of the same name; without a key the model stays disabled.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
if not OPENAI_API_KEY:
    try:
        from google.colab import userdata
        OPENAI_API_KEY = userdata.get("OPENAI_API_KEY")
    except Exception as e:
        # Best effort: a missing secret, denied access or a non-Colab
        # runtime all simply mean that no key is available
        print(f"OPENAI_API_KEY nicht verfuegbar: {e}")
        OPENAI_API_KEY = None
models['gpt4v'] = GPT4VModel(api_key=OPENAI_API_KEY)

print("\n" + "="*80)
print("‚úÖ ALLE MODELLE GELADEN")
print("="*80)

print("\nüìã Verf√ºgbare Modelle:")
for name, model in models.items():
    print(f"  ‚Ä¢ {name}: {model.name}")


## 5. Caption-Generierung

In [None]:

# CAPTION GENERATION - √úBERLEBT ALLES!

# Speichert ALLES in Google Drive

import json
import os
from pathlib import Path
import time
import gc
import traceback
from typing import List, Dict
from tqdm import tqdm
import torch
from PIL import Image
import pandas as pd
import pickle

# 1. SETUP DRIVE PATHS

print("="*80)
print("üîß SETUP DRIVE-BASED SYSTEM")
print("="*80)

from google.colab import drive

# Mount only when /content/drive is not already a mount point
drive_is_mounted = os.path.ismount('/content/drive')
if not drive_is_mounted:
    print("\nMounte Google Drive...")
    drive.mount('/content/drive')
    time.sleep(2)

# Workspace root on Drive
DRIVE_WORKSPACE = '/content/drive/MyDrive/caption_generation_workspace'
os.makedirs(DRIVE_WORKSPACE, exist_ok=True)

# Subfolders of the workspace
workspace_subdirs = ('checkpoints', 'results', 'logs', 'state')
CHECKPOINTS_DIR, RESULTS_DIR, LOGS_DIR, STATE_DIR = (
    os.path.join(DRIVE_WORKSPACE, subdir) for subdir in workspace_subdirs
)

for dir_path in (CHECKPOINTS_DIR, RESULTS_DIR, LOGS_DIR, STATE_DIR):
    os.makedirs(dir_path, exist_ok=True)

print(f"\n‚úì Drive Workspace: {DRIVE_WORKSPACE}")
for subdir in workspace_subdirs:
    print(f"  - {subdir}/")

# 2. CHECKPOINT FUNCTIONS (DRIVE-BASED)

def save_checkpoint_to_drive(results, dataset_name, model_name):
    """Write the checkpoint for one dataset/model pair into ``CHECKPOINTS_DIR``.

    The data is first serialised into a temporary file. Only after that
    succeeded is the previous checkpoint moved to ``*_backup.json`` and the
    temporary file moved into place, so a failed or interrupted save never
    replaces a good checkpoint (or a good backup) with a truncated file.
    A small ``*_meta.json`` with count, dataset, model and timestamp is
    written next to the checkpoint.

    Args:
        results: List of result dicts. NumPy scalars and arrays nested inside
            dicts, lists and tuples are converted to plain Python values.
        dataset_name: Dataset id, used as the first part of the file name.
        model_name: Model id, used as the second part of the file name.

    Returns:
        True when checkpoint and metadata were written, False otherwise
        (the error is printed, not raised).
    """
    import numpy as np

    def convert_to_json_serializable(obj):
        if isinstance(obj, dict):
            return {k: convert_to_json_serializable(v) for k, v in obj.items()}
        if isinstance(obj, (list, tuple)):
            return [convert_to_json_serializable(item) for item in obj]
        if isinstance(obj, np.bool_):
            return bool(obj)
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return obj

    checkpoint_file = os.path.join(CHECKPOINTS_DIR, f"{dataset_name}_{model_name}.json")
    backup_file = checkpoint_file.replace('.json', '_backup.json')
    meta_file = checkpoint_file.replace('.json', '_meta.json')
    # The '.tmp' suffix keeps the file out of every '*.json' glob over the folder.
    tmp_file = checkpoint_file + '.tmp'

    try:
        # Serialise first: if this fails, the existing checkpoint is untouched.
        serializable = convert_to_json_serializable(results)
        with open(tmp_file, 'w') as f:
            json.dump(serializable, f)

        # Keep the previous version as backup (best effort).
        if os.path.exists(checkpoint_file):
            try:
                os.replace(checkpoint_file, backup_file)
            except OSError as e:
                print(f"‚ö†Ô∏è Backup failed: {e}")

        os.replace(tmp_file, checkpoint_file)

        # Metadata next to the checkpoint
        with open(meta_file, 'w') as f:
            json.dump({
                'count': len(results),
                'dataset': dataset_name,
                'model': model_name,
                'timestamp': time.strftime('%Y-%m-%d %H:%M:%S')
            }, f)

        return True
    except Exception as e:
        # Boundary handler: callers rely on the boolean result, not on exceptions.
        print(f"‚ö†Ô∏è Save failed: {e}")
        return False


def load_checkpoint_from_drive(dataset_name, model_name):
    """Load the checkpoint for one dataset/model pair from ``CHECKPOINTS_DIR``.

    The primary ``<dataset>_<model>.json`` is tried first. When it is missing
    (for example after an interrupted save) or cannot be parsed, the
    ``*_backup.json`` copy is tried. An unreadable file is reported and
    skipped instead of raising.

    Args:
        dataset_name: Dataset id used in the checkpoint file name.
        model_name: Model id used in the checkpoint file name.

    Returns:
        The parsed checkpoint content, or an empty list when neither file
        could be loaded.
    """
    checkpoint_file = os.path.join(CHECKPOINTS_DIR, f"{dataset_name}_{model_name}.json")
    backup_file = checkpoint_file.replace('.json', '_backup.json')

    for candidate, is_backup in ((checkpoint_file, False), (backup_file, True)):
        if not os.path.exists(candidate):
            continue
        try:
            with open(candidate, 'r') as f:
                results = json.load(f)
        except (OSError, ValueError) as e:
            # json.JSONDecodeError and UnicodeDecodeError are ValueError subclasses.
            print(f"‚ö†Ô∏è Checkpoint corrupt: {e}")
            continue

        if is_backup:
            print(f"   ‚úì Backup geladen: {len(results)} Captions")
        else:
            print(f"üìÇ Checkpoint geladen: {len(results)} Captions")
        return results

    return []


def save_state_to_drive(state):
    """Write *state* as JSON to ``current_state.json`` inside ``STATE_DIR``.

    An existing state file is overwritten.
    """
    target = os.path.join(STATE_DIR, 'current_state.json')
    with open(target, 'w') as handle:
        json.dump(state, handle)


def load_state_from_drive():
    """Return the state stored in ``STATE_DIR/current_state.json``.

    Returns None when the file does not exist.
    """
    source = os.path.join(STATE_DIR, 'current_state.json')
    if not os.path.exists(source):
        return None
    with open(source, 'r') as handle:
        return json.load(handle)


# 3. AUTO-RECOVERY + PRE-FLIGHT CHECK

# Restore `datasets` from the pickle kept in Drive when the variable is missing.
# NOTE(review): pickle.load can execute arbitrary code; only load a
# datasets_persistent.pkl that this notebook wrote itself.
datasets_pkl = os.path.join(STATE_DIR, 'datasets_persistent.pkl')
if 'datasets' in globals():
    # Already loaded in this session - nothing to recover.
    pass
elif not os.path.exists(datasets_pkl):
    print("‚ö†Ô∏è datasets fehlt UND keine datasets_persistent.pkl in Drive gefunden.")
    print(f"   Erwarteter Pfad: {datasets_pkl}")
else:
    try:
        with open(datasets_pkl, 'rb') as f:
            datasets = pickle.load(f)
        print(f"‚úÖ datasets aus Drive geladen: {list(datasets.keys())}")
    except Exception as e:
        print(f"‚ö†Ô∏è Konnte datasets_persistent.pkl nicht laden: {e}")
        print("   ‚Üí Bitte Dataset-Lade-Zellen neu ausf√ºhren oder eine g√ºltige datasets_persistent.pkl in Drive ablegen.")

# models cannot be recovered automatically: print a clear hint instead of crashing later.
if 'models' not in globals():
    print("‚ö†Ô∏è models fehlen: bitte Model-Zelle erneut ausf√ºhren (l√§dt die Modelle neu).")

def preflight_check():
    """Check that datasets, models and the Drive workspace are available.

    Prints a short report, including up to eight existing checkpoints with
    their entry counts. Checkpoint files that cannot be read are reported
    instead of being skipped silently.

    Returns:
        True when no problem was found, False otherwise (the problems are
        printed).
    """
    print("\n" + "="*80)
    print("üîç PRE-FLIGHT CHECK")
    print("="*80)

    issues = []

    # Check datasets
    try:
        if 'datasets' not in globals() or not isinstance(datasets, dict) or len(datasets) == 0:
            issues.append("‚ùå datasets fehlt/leer")
        else:
            # Only datasets that actually hold samples count as usable
            non_empty = {k: len(v) for k, v in datasets.items() if isinstance(v, list) and len(v) > 0}
            print(f"‚úì Datasets: {list(datasets.keys())}")
            if non_empty:
                print("  Nicht-leere Datasets:", non_empty)
            else:
                issues.append("‚ùå datasets enth√§lt keine Samples")
    except Exception:
        issues.append("‚ùå datasets Error")

    # Check models
    try:
        if 'models' not in globals():
            issues.append("‚ùå models fehlt")
        elif not isinstance(models, dict) or len(models) == 0:
            issues.append(f"‚ùå models ist {type(models)} oder leer")
        else:
            print(f"‚úì Models: {list(models.keys())}")
    except Exception:
        issues.append("‚ùå models Error")

    # Check Drive
    if os.path.exists(DRIVE_WORKSPACE):
        print(f"‚úì Drive Workspace: {DRIVE_WORKSPACE}")

        # List existing checkpoints (metadata and backup files excluded);
        # sorted so that the shown subset is the same on every run.
        checkpoints = sorted(
            c for c in Path(CHECKPOINTS_DIR).glob('*.json')
            if not c.name.endswith(('_meta.json', '_backup.json'))
        )

        if checkpoints:
            print(f"‚úì Gefundene Checkpoints: {len(checkpoints)}")
            for cp in checkpoints[:8]:
                try:
                    with open(cp) as f:
                        data = json.load(f)
                except (OSError, ValueError) as e:
                    # An unreadable checkpoint is worth knowing about before a long run.
                    print(f"  - {cp.name}: ‚ö†Ô∏è nicht lesbar ({e})")
                else:
                    print(f"  - {cp.name}: {len(data)} Captions")
    else:
        issues.append("‚ùå Drive Workspace nicht erreichbar")

    print("="*80)

    if issues:
        print("\nüö® PROBLEME:")
        for issue in issues:
            print(f"   {issue}")
        return False

    print("\n‚úÖ ALLES BEREIT!")
    return True

# 4. HAUPTFUNKTION (DRIVE-BASED)

def _get_image_from_sample(sample: Dict):
    """Return the image that belongs to *sample*.

    Two sample layouts are accepted:
    - ``sample['image_path']``: the file is opened and converted to RGB
      (preferred, keeps images out of RAM until they are needed)
    - ``sample['image']``: returned unchanged when it already is a PIL image,
      otherwise passed to ``Image.fromarray``

    Raises:
        KeyError: when the sample has neither a non-empty 'image_path' nor
            an 'image' entry.
    """
    path = sample.get('image_path')
    if not path:
        # Fallback: image kept in memory
        if 'image' not in sample:
            raise KeyError("Sample hat weder 'image_path' noch 'image'.")
        in_memory = sample['image']
        return in_memory if isinstance(in_memory, Image.Image) else Image.fromarray(in_memory)

    # Open the file only here; convert() runs before the file handle is closed.
    with Image.open(path) as opened:
        return opened.convert('RGB')


def _save_model_checkpoint(results, dataset_name, model_name):
    """Save the results of one dataset/model pair; return (saved_ok, model_results)."""
    model_results = [r for r in results if r['model'] == model_name and r['dataset'] == dataset_name]
    saved = save_checkpoint_to_drive(model_results, dataset_name, model_name)
    return saved, model_results


def _log_generation_error(dataset_name, model_name, sample, error, error_trace):
    """Append one failure record to the error log in ``LOGS_DIR`` (best effort)."""
    log_file = os.path.join(LOGS_DIR, f'errors_{dataset_name}_{model_name}.log')
    image_ref = sample.get('image_id', '?') if isinstance(sample, dict) else '?'
    try:
        with open(log_file, 'a') as f:
            f.write(f"\n{'='*60}\n")
            f.write(f"Image: {image_ref}\n")
            f.write(f"Error: {str(error)}\n")
            f.write(f"Trace:\n{error_trace}\n")
    except OSError as log_error:
        # A failing log write must not abort caption generation.
        print(f"‚ö†Ô∏è Error-Log nicht schreibbar: {log_error}")


def generate_captions_for_dataset(dataset_name: str, dataset_samples: List[Dict], models: Dict):
    """Generate captions for every sample with every model, checkpointing to Drive.

    For each model the existing checkpoint is loaded and samples whose
    ``image_id`` is already present are skipped. New results are saved after
    every third new caption and additionally when more than 90 seconds have
    passed since the last successful save. Five consecutive failures stop the
    current model; a keyboard interrupt saves and is re-raised.

    Args:
        dataset_name: Dataset id stored in every result; 'vizwiz' results also
            carry the sample's 'question' and 'answers'.
        dataset_samples: Sample dicts. Each needs 'image_path' or 'image';
            'image_id' and 'captions' are optional.
        models: Mapping of model id to an object with ``generate_caption(image)``.

    Returns:
        List of result dicts for all models, including results loaded from
        existing checkpoints.
    """
    results = []

    print(f"\n{'='*60}")
    print(f"DATASET: {dataset_name.upper()}")
    print(f"{'='*60}\n")

    for model_name, model in models.items():
        display_name = getattr(model, 'name', model_name)
        print(f"\n--- {display_name} ---")

        # Resume from the Drive checkpoint
        existing_results = load_checkpoint_from_drive(dataset_name, model_name)
        processed_ids = {r['image_id'] for r in existing_results}
        results.extend(existing_results)

        if len(processed_ids) > 0:
            print(f"Bereits verarbeitet: {len(processed_ids)}/{len(dataset_samples)}")
            if len(processed_ids) == len(dataset_samples):
                print(f"‚úì {display_name} komplett!")
                continue

        # Counters
        error_count = 0
        max_consecutive_errors = 5
        save_interval = 3     # save after every 3 new results
        processed_in_session = 0
        last_save_time = time.time()

        for idx, sample in enumerate(tqdm(dataset_samples,
                                          desc=f"{display_name}",
                                          total=len(dataset_samples))):
            try:
                image_id = str(sample.get('image_id', f'img_{idx}'))
                if image_id in processed_ids:
                    continue

                image = _get_image_from_sample(sample)
                generated_caption = model.generate_caption(image)

                result = {
                    'dataset': dataset_name,
                    'image_id': image_id,
                    'model': model_name,
                    'generated_caption': generated_caption,
                    'reference_captions': sample.get('captions', []),
                }

                if dataset_name == 'vizwiz':
                    result['question'] = sample.get('question', '')
                    result['answers'] = sample.get('answers', [])

                results.append(result)
                # Remember the id so that a duplicate id later in this run is
                # skipped exactly as it would be after a resume.
                processed_ids.add(image_id)
                processed_in_session += 1
                error_count = 0

                # Save after every N new images
                if processed_in_session % save_interval == 0:
                    saved, model_results = _save_model_checkpoint(results, dataset_name, model_name)
                    if saved:
                        last_save_time = time.time()
                        print(f"\nüíæ Drive-Checkpoint: {len(model_results)}/{len(dataset_samples)}")

                    if torch.cuda.is_available():
                        torch.cuda.empty_cache()
                    gc.collect()

                # Time-based auto-save every 90 seconds
                if time.time() - last_save_time > 90:
                    saved, _ = _save_model_checkpoint(results, dataset_name, model_name)
                    if saved:
                        last_save_time = time.time()
                        print(f"\n‚è∞ Auto-save (90s)")

            except KeyboardInterrupt:
                print("\n\n‚ö†Ô∏è BENUTZER-ABBRUCH!")
                saved, _ = _save_model_checkpoint(results, dataset_name, model_name)
                if saved:
                    print("üíæ In Drive gespeichert")
                raise

            except Exception as e:
                error_count += 1
                _log_generation_error(dataset_name, model_name, sample, e, traceback.format_exc())

                print(f"\n‚ö†Ô∏è Error: {str(e)[:120]}")

                if error_count >= max_consecutive_errors:
                    print(f"\n‚ùå Zu viele Errors!")
                    # The final save right below the loop persists the results.
                    break

                continue

        # Final save for this model
        _, model_results = _save_model_checkpoint(results, dataset_name, model_name)
        print(f"\n‚úì {display_name} fertig: {len(model_results)} Captions")

        # Cleanup
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        gc.collect()
        time.sleep(2)

    return results


# 5. MAIN LOOP

# Stop with a clear message when the model cell has not been run yet.
if 'models' not in globals():
    raise RuntimeError(
        "models fehlt. Bitte die Model-Initialisierungszelle ausf√ºhren (die models = {...} setzt) "
        "und dann diese Zelle erneut starten."
    )

if preflight_check():
    print(f"\n{'='*80}")
    print("üöÄ STARTE CAPTION GENERATION (DRIVE-BASED)")
    print("="*80)

    all_results = []

    for dataset_name, dataset_samples in datasets.items():
        # Skip anything that is not a non-empty list of samples.
        if not (isinstance(dataset_samples, list) and dataset_samples):
            continue

        print(f"\n{'='*80}")
        print(f"DATASET: {dataset_name.upper()} ({len(dataset_samples)} Bilder)")
        print(f"{'='*80}")

        # Cap the number of samples per dataset
        max_samples = 1000
        if len(dataset_samples) > max_samples:
            print(f"‚ö†Ô∏è Limitiere auf {max_samples}")
            dataset_samples = dataset_samples[:max_samples]

        try:
            results = generate_captions_for_dataset(dataset_name, dataset_samples, models)
            all_results.extend(results)

            # Progressive CSV in Drive, rewritten after every dataset
            if all_results:
                df = pd.DataFrame(all_results)
                csv_path = os.path.join(RESULTS_DIR, 'caption_results_progressive.csv')
                df.to_csv(csv_path, index=False)
                print(f"\nüíæ CSV in Drive: {len(df)} Captions")

        except Exception as e:
            print(f"\n‚ùå Error: {e}")
            traceback.print_exc()

            # Emergency save of everything collected so far
            if all_results:
                df = pd.DataFrame(all_results)
                csv_path = os.path.join(RESULTS_DIR, 'caption_results_emergency.csv')
                df.to_csv(csv_path, index=False)
                print(f"üíæ Emergency CSV: {len(df)} Captions")

            print("\nüí° KEIN PROBLEM!")
            print("   ‚Üí Alle Checkpoints sind in Drive gespeichert")
            print("   ‚Üí Einfach diese Zelle NEU AUSF√úHREN")
            print("   ‚Üí Macht automatisch weiter!")
            break

        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    # FINAL SUMMARY

    print(f"\n{'='*80}")
    print("‚úÖ ABGESCHLOSSEN")
    print(f"{'='*80}")

    if all_results:
        df = pd.DataFrame(all_results)

        # Final CSV in Drive
        final_csv = os.path.join(RESULTS_DIR, 'caption_results_FINAL.csv')
        df.to_csv(final_csv, index=False)

        print(f"\n‚úì Total: {len(df)} Captions")
        print(f"\nüìä √úbersicht:")
        print(df.groupby(['dataset', 'model']).size())

        print(f"\nüíæ ALLE DATEIEN IN DRIVE:")
        print(f"   {DRIVE_WORKSPACE}")
        print(f"\nüìÇ checkpoints/ - Alle Checkpoints")
        print(f"üìÇ results/ - CSV Dateien")
        print(f"üìÇ logs/ - Error Logs")

        # Also copy to /content so the file can be downloaded from Colab
        import shutil
        local_csv = '/content/caption_results_FINAL.csv'
        shutil.copy(final_csv, local_csv)
        print(f"\nüì• Auch kopiert nach: {local_csv}")

    print(f"\n{'='*80}")
    print("üéâ FERTIG!")
    print("="*80)
else:
    print("\nüõë Behebe Probleme oben!")


## 6. Automatische Evaluation (BLEU, CIDEr)

In [None]:

# EVALUATION - AUTOMATISCHE METRIKEN

import pandas as pd
from typing import Dict
import warnings
warnings.filterwarnings('ignore')

try:
    from pycocoevalcap.bleu.bleu import Bleu
    from pycocoevalcap.meteor.meteor import Meteor
    from pycocoevalcap.cider.cider import Cider
except ImportError:
    print("Installiere pycocoevalcap...")
    !pip install -q pycocoevalcap
    from pycocoevalcap.bleu.bleu import Bleu
    from pycocoevalcap.meteor.meteor import Meteor
    from pycocoevalcap.cider.cider import Cider


class CaptionEvaluator:
    """Evaluate generated captions with BLEU-1..4, METEOR and CIDEr.

    The scorers come from ``pycocoevalcap``. METEOR is computed through
    ``safe_meteor_compute`` because its return value is handled defensively;
    a metric that fails is reported and stored as 0.0.
    """

    def __init__(self):
        self.scorers = {
            'BLEU': Bleu(4),
            'METEOR': Meteor(),
            'CIDEr': Cider(),
        }

    @staticmethod
    def _as_reference_list(refs):
        """Return *refs* as a list, decoding list literals that went through CSV.

        ``DataFrame.to_csv`` stores a list cell as its ``repr``; after
        ``read_csv`` the cell is a string such as ``"['a cat', 'a dog']"``.
        Without decoding, every such row would be scored against the
        placeholder reference. Anything that is not a list and not a list
        literal yields an empty list.
        """
        import ast

        if isinstance(refs, str):
            text = refs.strip()
            if text.startswith('['):
                try:
                    refs = ast.literal_eval(text)
                except (ValueError, SyntaxError, TypeError):
                    return []
        return refs if isinstance(refs, list) else []

    def prepare_data(self, results_df: pd.DataFrame):
        """Build the ``gts``/``res`` dicts expected by the scorers.

        Args:
            results_df: Needs the columns 'dataset', 'image_id', 'model',
                'reference_captions' and 'generated_caption'.

        Returns:
            ``(gts, res)``: both keyed by "<dataset>_<image_id>_<model>".
            ``gts`` maps to the reference list (a one-element placeholder list
            when no reference exists), ``res`` to a one-element list with the
            generated caption (empty string when the caption is not a string,
            e.g. NaN after a CSV round trip).
        """
        gts = {}
        res = {}

        for _, row in results_df.iterrows():
            img_id = f"{row['dataset']}_{row['image_id']}_{row['model']}"

            refs = self._as_reference_list(row['reference_captions'])
            gts[img_id] = refs if refs else ['No reference available']

            caption = row['generated_caption']
            res[img_id] = [caption if isinstance(caption, str) else '']

        return gts, res

    @staticmethod
    def _parse_meteor_text(text, label):
        """Parse the leading number of a textual METEOR result; None on failure."""
        if 'error' in text.lower():
            print(f"      METEOR Error: {text}")
            return None
        try:
            return float(text.split()[0])
        except (ValueError, IndexError):
            print(f"      ‚Üí Konnte {label} nicht parsen: {text}")
            return None

    def safe_meteor_compute(self, meteor_scorer, gts, res):
        """Compute METEOR and never raise.

        Handles a score that arrives as list (the maximum is used), bytes,
        string or number. Values greater than 1.0 are divided by 100.

        Returns:
            ``(success, score)``; ``score`` is None when ``success`` is False.
        """
        try:
            score, _ = meteor_scorer.compute_score(gts, res)

            if isinstance(score, list):
                print(f"      METEOR gab Liste zur√ºck: {score}")
                try:
                    final_score = max(float(s) for s in score)
                except (TypeError, ValueError):
                    # ValueError also covers an empty list
                    print(f"      ‚Üí Konnte Liste nicht in Float konvertieren")
                    return False, None
                print(f"      ‚Üí Verwende maximalen Wert: {final_score:.4f}")
                if final_score > 1.0:
                    final_score = final_score / 100.0
                return True, final_score

            if isinstance(score, bytes):
                score = self._parse_meteor_text(score.decode('utf-8').strip(), 'Bytes')
                if score is None:
                    return False, None
            elif isinstance(score, str):
                score = self._parse_meteor_text(score, 'String')
                if score is None:
                    return False, None
            else:
                try:
                    score = float(score)
                except (TypeError, ValueError):
                    print(f"      ‚Üí Unerwarteter Typ: {type(score)}, Wert: {score}")
                    return False, None

            # Normalisation
            if score > 1.0:
                score = score / 100.0

            return True, score

        except Exception as e:
            # Boundary: METEOR problems must not stop the other metrics.
            print(f"      ‚ùå METEOR Exception: {str(e)[:100]}")
            return False, None

    def compute_metrics(self, gts: Dict, res: Dict) -> Dict:
        """Run every scorer and collect the scores in one flat dict.

        Returns:
            Dict with the keys 'BLEU-1'..'BLEU-4' (only 'BLEU-4' when the BLEU
            scorer returns a single number), 'METEOR' and 'CIDEr'. A metric
            whose computation fails is stored as 0.0.
        """
        scores = {}

        for name, scorer in self.scorers.items():

            # METEOR goes through its own defensive wrapper
            if name == 'METEOR':
                print(f"Berechne {name}...")
                success, meteor_score = self.safe_meteor_compute(scorer, gts, res)

                if success and meteor_score is not None:
                    scores['METEOR'] = meteor_score
                    print(f"   ‚úì METEOR: {meteor_score:.4f}")
                else:
                    scores['METEOR'] = 0.0
                    print(f"   ‚ö†Ô∏è  METEOR: Berechnung fehlgeschlagen ‚Üí 0.0")

                continue

            # All other metrics (BLEU, CIDEr)
            try:
                print(f"Berechne {name}...")
                score, _ = scorer.compute_score(gts, res)

                if name == 'BLEU':
                    if isinstance(score, (list, tuple)):
                        for i, s in enumerate(score, 1):
                            scores[f'BLEU-{i}'] = float(s)
                        # A missing order raises KeyError and is handled below.
                        for i in range(1, 5):
                            print(f"   ‚úì BLEU-{i}: {scores[f'BLEU-{i}']:.4f}")
                    else:
                        scores['BLEU-4'] = float(score)
                        print(f"   ‚úì BLEU-4: {scores['BLEU-4']:.4f}")

                else:  # CIDEr etc.
                    if isinstance(score, (list, tuple)):
                        scores[name] = float(score[0])
                    else:
                        scores[name] = float(score)
                    print(f"   ‚úì {name}: {scores[name]:.4f}")

            except Exception as e:
                print(f"   ‚ùå FEHLER bei {name}: {str(e)[:100]}")
                if name == 'BLEU':
                    for i in range(1, 5):
                        scores[f'BLEU-{i}'] = 0.0
                else:
                    scores[name] = 0.0

        return scores

    def evaluate_by_group(self, results_df: pd.DataFrame) -> pd.DataFrame:
        """Evaluate each (dataset, model) group separately.

        Returns:
            DataFrame with one row per group: 'dataset', 'model', 'n_samples'
            and one column per metric.
        """
        evaluation_results = []

        for (dataset, model), group in results_df.groupby(['dataset', 'model']):
            print(f"\n{'='*60}")
            print(f"Evaluiere {model} auf {dataset} ({len(group)} Bilder)")
            print(f"{'='*60}")

            gts, res = self.prepare_data(group)
            scores = self.compute_metrics(gts, res)

            evaluation_results.append({
                'dataset': dataset,
                'model': model,
                'n_samples': len(group),
                **scores
            })

        return pd.DataFrame(evaluation_results)


# ============================================================
# FAST EVALUATOR (ohne METEOR)
# ============================================================
class FastCaptionEvaluator:
    """Fast evaluation without METEOR: BLEU-1..4 and CIDEr only.

    A metric whose computation fails is reported and stored as 0.0.
    """

    def __init__(self):
        self.scorers = {
            'BLEU': Bleu(4),
            'CIDEr': Cider(),
        }

    @staticmethod
    def _as_reference_list(refs):
        """Return *refs* as a list, decoding list literals that went through CSV.

        ``DataFrame.to_csv`` stores a list cell as its ``repr``; after
        ``read_csv`` the cell is a string such as ``"['a cat', 'a dog']"``.
        Without decoding, every such row would be scored against the
        placeholder reference. Anything that is not a list and not a list
        literal yields an empty list.
        """
        import ast

        if isinstance(refs, str):
            text = refs.strip()
            if text.startswith('['):
                try:
                    refs = ast.literal_eval(text)
                except (ValueError, SyntaxError, TypeError):
                    return []
        return refs if isinstance(refs, list) else []

    def prepare_data(self, results_df: pd.DataFrame):
        """Build the ``gts``/``res`` dicts expected by the scorers.

        Both dicts are keyed by "<dataset>_<image_id>_<model>". ``gts`` maps
        to the reference list (a one-element placeholder list when no
        reference exists), ``res`` to a one-element list with the generated
        caption (empty string when the caption is not a string, e.g. NaN
        after a CSV round trip).
        """
        gts = {}
        res = {}
        for _, row in results_df.iterrows():
            img_id = f"{row['dataset']}_{row['image_id']}_{row['model']}"
            refs = self._as_reference_list(row['reference_captions'])
            gts[img_id] = refs if refs else ['No reference']
            caption = row['generated_caption']
            res[img_id] = [caption if isinstance(caption, str) else '']
        return gts, res

    def compute_metrics(self, gts: Dict, res: Dict) -> Dict:
        """Run BLEU and CIDEr; return a dict with 'BLEU-1'..'BLEU-4' and 'CIDEr'."""
        scores = {}
        for name, scorer in self.scorers.items():
            try:
                print(f"Berechne {name}...")
                score, _ = scorer.compute_score(gts, res)

                if name == 'BLEU':
                    for i, s in enumerate(score, 1):
                        scores[f'BLEU-{i}'] = float(s)
                    # A missing order raises KeyError and is handled below.
                    for i in range(1, 5):
                        print(f"   ‚úì BLEU-{i}: {scores[f'BLEU-{i}']:.4f}")
                else:
                    scores[name] = float(score[0] if isinstance(score, (list, tuple)) else score)
                    print(f"   ‚úì {name}: {scores[name]:.4f}")

            except Exception as e:
                print(f"   ‚ùå FEHLER bei {name}: {str(e)[:100]}")
                if name == 'BLEU':
                    for i in range(1, 5):
                        scores[f'BLEU-{i}'] = 0.0
                else:
                    scores[name] = 0.0
        return scores

    def evaluate_by_group(self, results_df: pd.DataFrame) -> pd.DataFrame:
        """Evaluate each (dataset, model) group; one result row per group."""
        evaluation_results = []

        for (dataset, model), group in results_df.groupby(['dataset', 'model']):
            print(f"\n{'='*60}")
            print(f"Evaluiere {model} auf {dataset} ({len(group)} Bilder)")
            print(f"{'='*60}")

            gts, res = self.prepare_data(group)
            scores = self.compute_metrics(gts, res)

            evaluation_results.append({
                'dataset': dataset,
                'model': model,
                'n_samples': len(group),
                **scores
            })

        return pd.DataFrame(evaluation_results)


# START EVALUATION

print(f"\n{'='*70}")
print("AUTOMATISCHE EVALUATION")
print("="*70)

# Pick the results source: a non-empty results_df wins over a non-empty all_results.
_have_results_df = 'results_df' in globals() and isinstance(results_df, pd.DataFrame) and len(results_df) > 0
_have_all_results = 'all_results' in globals() and isinstance(all_results, list) and len(all_results) > 0

if not (_have_results_df or _have_all_results):
    print("\n‚ùå FEHLER: Keine Results gefunden!")
    raise NameError("Results-Variable nicht gefunden")

if _have_results_df:
    print(f"\n‚úì Verwende: results_df ({len(results_df)} Zeilen)")
    eval_df = results_df
else:
    print(f"\n‚úì Verwende: all_results ({len(all_results)} Eintr√§ge)")
    eval_df = pd.DataFrame(all_results)

# Info
print(f"\nüìä Datens√§tze: {list(eval_df['dataset'].unique())}")
print(f"ü§ñ Modelle: {list(eval_df['model'].unique())}")
print(f"üìù Gesamt: {len(eval_df)} Caption-Paare")

# Default is FAST (without METEOR); set to False to try METEOR as well.
USE_FAST_MODE = True

if not USE_FAST_MODE:
    print("\n‚ö†Ô∏è  Verwende: CaptionEvaluator (inkl. METEOR - kann instabil sein)")
    evaluator = CaptionEvaluator()
    output_file = "automatic_metrics_with_meteor.csv"
else:
    print("\nüöÄ Verwende: FastCaptionEvaluator (BLEU + CIDEr)")
    evaluator = FastCaptionEvaluator()
    output_file = "automatic_metrics.csv"

# Run the evaluation
print(f"\n{'='*70}")
print("STARTE BERECHNUNG")
print("="*70)

eval_results = evaluator.evaluate_by_group(eval_df)

# Results
print(f"\n{'='*70}")
print("‚úÖ EVALUATION ABGESCHLOSSEN")
print("="*70)
print("\nüìä ERGEBNISSE:\n")
print(eval_results.to_string(index=False))

# Save
output_path = f"{CONFIG['output_dir']}/{output_file}"
eval_results.to_csv(output_path, index=False)
print(f"\nüíæ Gespeichert: {output_path}")

# Comparison
print(f"\n{'='*70}")
print("üìà MODELL-VERGLEICH")
print("="*70)

# One table per dataset, best BLEU-4 first
_metric_columns = ('BLEU-1', 'BLEU-2', 'BLEU-3', 'BLEU-4', 'CIDEr')
for dataset in eval_results['dataset'].unique():
    dataset_results = eval_results[eval_results['dataset'] == dataset].sort_values('BLEU-4', ascending=False)

    print(f"\nüéØ {dataset.upper()}:")
    print(f"{'Modell':<15} " + " ".join(f"{metric:>8}" for metric in _metric_columns))
    print("-" * 65)

    for _, row in dataset_results.iterrows():
        print(f"{row['model']:<15} " + " ".join(f"{row[metric]:>8.4f}" for metric in _metric_columns))

# Best models
print(f"\n{'='*70}")
best_bleu = eval_results.loc[eval_results['BLEU-4'].idxmax()]
best_cider = eval_results.loc[eval_results['CIDEr'].idxmax()]

print(f"üèÜ BESTES MODELL (BLEU-4):")
print(f"   {best_bleu['model']} auf {best_bleu['dataset']}")
print(f"   Score: {best_bleu['BLEU-4']:.4f}")

print(f"\nüèÜ BESTES MODELL (CIDEr):")
print(f"   {best_cider['model']} auf {best_cider['dataset']}")
print(f"   Score: {best_cider['CIDEr']:.4f}")

print(f"\n{'='*70}")
print("‚úÖ FERTIG!")
print("="*70)

## 7. WCAG-basierte qualitative Bewertung

In [None]:

# LOAD RESULTS FROM DRIVE - BEFORE EVALUATION!

import ast
import pandas as pd
import json
import os
from pathlib import Path

print("="*80)
print("üìÇ LADE CAPTION RESULTS AUS DRIVE")
print("="*80)

# Drive workspace
DRIVE_WORKSPACE = '/content/drive/MyDrive/caption_generation_workspace'
CHECKPOINTS_DIR = os.path.join(DRIVE_WORKSPACE, 'checkpoints')
RESULTS_DIR = os.path.join(DRIVE_WORKSPACE, 'results')

final_csv = os.path.join(RESULTS_DIR, 'caption_results_FINAL.csv')
progressive_csv = os.path.join(RESULTS_DIR, 'caption_results_progressive.csv')

# Source priority: FINAL CSV, then progressive CSV, then the JSON checkpoints.
if os.path.exists(final_csv):
    print(f"\n‚úì Lade aus FINAL CSV...")
    results_df = pd.read_csv(final_csv)
    print(f"   {len(results_df)} Captions geladen")

elif os.path.exists(progressive_csv):
    print(f"\n‚úì Lade aus PROGRESSIVE CSV...")
    results_df = pd.read_csv(progressive_csv)
    print(f"   {len(results_df)} Captions geladen")

else:

    print(f"\nüìÇ Keine CSV gefunden - lade aus Checkpoints...")

    all_results = []

    checkpoint_files = list(Path(CHECKPOINTS_DIR).glob('*.json'))
    checkpoint_files = [f for f in checkpoint_files if not f.name.endswith('_meta.json')
                       and not f.name.endswith('_backup.json')]

    print(f"   Gefunden: {len(checkpoint_files)} Checkpoint-Dateien\n")

    for checkpoint_file in sorted(checkpoint_files):
        try:
            with open(checkpoint_file) as f:
                data = json.load(f)

            print(f"   ‚úì {checkpoint_file.name}: {len(data)} Captions")
            all_results.extend(data)

        except Exception as e:
            print(f"   ‚ö†Ô∏è {checkpoint_file.name}: Fehler - {e}")

    if all_results:
        # Deduplicate on (dataset, model, image_id); the first occurrence wins
        seen = set()
        unique_results = []
        for r in all_results:
            key = (r['dataset'], r['model'], r['image_id'])
            if key not in seen:
                seen.add(key)
                unique_results.append(r)

        results_df = pd.DataFrame(unique_results)

        # Also store the reconstruction as CSV
        csv_path = os.path.join(RESULTS_DIR, 'caption_results_reconstructed.csv')
        results_df.to_csv(csv_path, index=False)

        print(f"\n‚úì {len(results_df)} eindeutige Captions rekonstruiert")
        print(f"üíæ Gespeichert: caption_results_reconstructed.csv")
    else:
        print("\n‚ùå Keine Checkpoints gefunden!")
        print(f"   Pr√ºfe: {CHECKPOINTS_DIR}")
        raise FileNotFoundError("Keine Caption-Daten gefunden")


def _restore_list_cell(value):
    """Turn a list literal read back from CSV into a list; leave anything else as is."""
    if isinstance(value, str) and value.strip().startswith('['):
        try:
            parsed = ast.literal_eval(value)
        except (ValueError, SyntaxError, TypeError):
            return value
        return parsed if isinstance(parsed, list) else value
    return value


# to_csv writes list cells as their repr, so read_csv returns them as strings.
# Decode them, otherwise the evaluators find no usable references.
for _list_column in ('reference_captions', 'answers'):
    if _list_column in results_df.columns:
        results_df[_list_column] = results_df[_list_column].apply(_restore_list_cell)

# Overview

print("\n" + "="*80)
print("üìä CAPTION RESULTS √úBERSICHT")
print("="*80)

print(f"\nTotal Captions: {len(results_df)}")
print(f"\nNach Dataset & Model:")
print(results_df.groupby(['dataset', 'model']).size())

print(f"\nSpalten: {list(results_df.columns)}")

# Check for missing captions
empty_count = results_df['generated_caption'].isna().sum()
if empty_count > 0:
    print(f"\n‚ö†Ô∏è {empty_count} leere Captions gefunden")

print("\n" + "="*80)
print("‚úÖ results_df BEREIT F√úR EVALUATION!")
print("="*80)

In [None]:
class WCAGEvaluator:
    """Score captions with simple heuristics along three accessibility criteria.

    The three criteria (keys of ``self.criteria``) are ``verstaendlichkeit``
    (comprehensibility), ``informationsgehalt`` (information content) and
    ``kontextadaequanz`` (context adequacy). All scores are plain heuristics
    computed from the caption text only; no image or reference is consulted.
    """

    # Verb forms whose presence is taken as a sign of a complete sentence.
    _VERB_MARKERS = frozenset({'is', 'are', 'shows', 'contains', 'depicts'})
    # Hedging phrases; a caption containing one gets a lower adequacy score.
    _UNCERTAINTY_TERMS = ('probably', 'seems', 'appears to be', 'might be')
    # Characters stripped from both ends of a token before the verb lookup.
    _TOKEN_PUNCTUATION = '.,;:!?"\'()[]{}'
    # Score columns, in the order they appear in every result.
    _SCORE_KEYS = ('verstaendlichkeit', 'informationsgehalt',
                   'kontextadaequanz', 'overall_wcag')
    # Identifier columns copied from the input rows.
    _ID_KEYS = ('dataset', 'model', 'image_id')

    def __init__(self):
        # Human-readable name and description of each criterion.
        self.criteria = {
            'verstaendlichkeit': {
                'name': 'Verst√§ndlichkeit',
                'description': 'Grammatikalisch korrekt, klar formuliert'
            },
            'informationsgehalt': {
                'name': 'Informationsgehalt',
                'description': 'Vollst√§ndige aber pr√§gnante Wiedergabe der Bildinhalte'
            },
            'kontextadaequanz': {
                'name': 'Kontextad√§quanz',
                'description': 'Semantisch konsistent, frei von Stereotypen'
            }
        }

    def analyze_caption_quality(self, caption: str) -> Dict:
        """Return heuristic quality scores for a single caption.

        Args:
            caption: The generated caption. Anything that is not a non-blank
                string (None, NaN, numbers, '', whitespace) scores 0.0
                everywhere.

        Returns:
            Dict with the keys ``verstaendlichkeit``, ``informationsgehalt``,
            ``kontextadaequanz`` and ``overall_wcag`` (mean of the other three).
        """
        if not isinstance(caption, str) or not caption.strip():
            return dict.fromkeys(self._SCORE_KEYS, 0.0)

        caption = caption.strip()
        lowered = caption.lower()
        raw_tokens = caption.split()
        word_count = len(raw_tokens)

        scores = {}

        # --- Comprehensibility: length, verb presence, sentence punctuation ---
        # Compare whole words, not substrings: a substring test would find
        # "is" inside "this" or "are" inside "bare" and report a verb that is
        # not there.
        words = {token.strip(self._TOKEN_PUNCTUATION) for token in lowered.split()}
        has_verb = not words.isdisjoint(self._VERB_MARKERS)

        verstaendlichkeit_score = 0
        if 5 <= word_count <= 30:  # reasonable length
            verstaendlichkeit_score += 1
        if has_verb:
            verstaendlichkeit_score += 1
        # caption is non-empty here, so caption[0] is safe.
        if caption[0].isupper() and caption.endswith('.'):
            verstaendlichkeit_score += 1

        scores['verstaendlichkeit'] = min(verstaendlichkeit_score / 3, 1.0)

        # --- Information content: any longer token, plus a bonus for length ---
        has_specific_terms = any(len(token) > 4 for token in raw_tokens)
        informationsgehalt_score = 0.5 if has_specific_terms else 0.3
        if word_count >= 10:  # detailed enough
            informationsgehalt_score += 0.3

        scores['informationsgehalt'] = min(informationsgehalt_score, 1.0)

        # --- Context adequacy: penalise hedging language ---
        # NOTE: this only detects uncertainty phrases; it is not a real
        # bias/stereotype detector despite the criterion's description.
        has_uncertainty = any(term in lowered for term in self._UNCERTAINTY_TERMS)
        scores['kontextadaequanz'] = 0.4 if has_uncertainty else 0.7

        # Overall score is the unweighted mean of the three criteria.
        scores['overall_wcag'] = np.mean(list(scores.values()))

        return scores

    def evaluate_dataset(self, results_df: pd.DataFrame) -> pd.DataFrame:
        """Score every row of ``results_df``.

        Args:
            results_df: Frame with at least the columns ``generated_caption``,
                ``dataset``, ``model`` and ``image_id``.

        Returns:
            One row per input row holding the four score columns followed by
            ``dataset``, ``model`` and ``image_id``. The columns are present
            even when ``results_df`` has no rows.
        """
        wcag_scores = []

        for _, row in tqdm(results_df.iterrows(), total=len(results_df), desc="WCAG-Evaluation"):
            scores = self.analyze_caption_quality(row['generated_caption'])
            scores.update({key: row[key] for key in self._ID_KEYS})
            wcag_scores.append(scores)

        # Passing the column list keeps the schema stable for an empty input.
        return pd.DataFrame(wcag_scores, columns=list(self._SCORE_KEYS + self._ID_KEYS))

    def aggregate_scores(self, wcag_df: pd.DataFrame) -> pd.DataFrame:
        """Return the mean of each score column per (dataset, model) pair.

        Args:
            wcag_df: Frame as produced by ``evaluate_dataset``.

        Returns:
            Frame with the columns ``dataset``, ``model`` and the four mean
            scores.
        """
        mean_per_score = dict.fromkeys(self._SCORE_KEYS, 'mean')
        return wcag_df.groupby(['dataset', 'model']).agg(mean_per_score).reset_index()


# Run the heuristic WCAG evaluation on all generated captions.
wcag_evaluator = WCAGEvaluator()

# Section banner (blank line, rule, title, rule, blank line).
print(
    "\n" + "="*60,
    "WCAG-BASIERTE QUALITATIVE BEWERTUNG",
    "="*60 + "\n",
    sep="\n",
)

# Per-caption scores first, then their mean per (dataset, model).
wcag_df = wcag_evaluator.evaluate_dataset(results_df)
wcag_summary = wcag_evaluator.aggregate_scores(wcag_df)

print("\nWCAG-Scores (aggregiert):", wcag_summary.to_string(index=False), sep="\n")

# Persist both the detailed and the aggregated table as CSV.
wcag_df.to_csv(f"{CONFIG['output_dir']}/wcag_detailed.csv", index=False)
wcag_summary.to_csv(f"{CONFIG['output_dir']}/wcag_summary.csv", index=False)

print("\n‚úì WCAG-Ergebnisse gespeichert")

## 8. Visualisierung und Beispiele

In [None]:
# ============================================================
# METRIC VISUALISATION (adapts to runs with/without METEOR)
# ============================================================
# Draws one grouped bar chart per metric (models on the x axis, one bar per
# dataset) and saves the figure as metrics_comparison.png.

import matplotlib.pyplot as plt
import numpy as np

# Detect which optional metrics are available in this session.
has_meteor = 'METEOR' in eval_results.columns
has_wcag = 'wcag_summary' in globals() and isinstance(wcag_summary, pd.DataFrame)

print("\n" + "="*70)
print("ERSTELLE VISUALISIERUNGEN")
print("="*70)
print(f"üìä METEOR verf√ºgbar: {has_meteor}")
print(f"üìä WCAG verf√ºgbar: {has_wcag}")

# ============================================================
# CHOOSE PLOT LAYOUT
# ============================================================
# BLEU-4 and CIDEr are always requested; METEOR and the overall WCAG score
# are added only when available. plot_titles and plot_metrics stay aligned
# by position.
plot_titles = ['BLEU-4']
plot_metrics = ['BLEU-4']
if has_meteor:
    plot_titles.append('METEOR')
    plot_metrics.append('METEOR')
plot_titles.append('CIDEr')
plot_metrics.append('CIDEr')
if has_wcag:
    plot_titles.append('WCAG Overall')
    plot_metrics.append('overall_wcag')

n_plots = len(plot_metrics)
if n_plots == 4:
    fig, axes = plt.subplots(2, 2, figsize=(15, 10))
elif n_plots == 3:
    fig, axes = plt.subplots(1, 3, figsize=(18, 5))
else:
    fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# Flatten so that panels can be addressed with a single index regardless of
# the grid shape.
axes = np.atleast_1d(axes).ravel()

fig.suptitle('Evaluation Metriken - Vergleich der Modelle',
             fontsize=16, fontweight='bold', y=0.98)

# ============================================================
# CREATE PLOTS
# ============================================================
# One colour per dataset in eval_results (also used by later plots).
colors = plt.cm.Set2(np.linspace(0, 1, len(eval_results['dataset'].unique())))

for ax, title, metric in zip(axes, plot_titles, plot_metrics):
    # The WCAG score lives in wcag_summary, everything else in eval_results.
    if metric == 'overall_wcag' and has_wcag:
        data_source = wcag_summary
    else:
        data_source = eval_results

    # Show a placeholder panel if the metric column is missing.
    if metric not in data_source.columns:
        ax.text(0.5, 0.5, f'{metric}\nnicht verf√ºgbar',
               ha='center', va='center', fontsize=12)
        ax.set_title(title)
        ax.axis('off')
        continue

    datasets = data_source['dataset'].unique()
    models = data_source['model'].unique()
    n_datasets = len(datasets)
    n_models = len(models)

    # The shared palette is sized for eval_results. If this data source has
    # more datasets, zip() below would silently drop the surplus ones, so
    # build a palette of the right size for this panel instead.
    if n_datasets <= len(colors):
        panel_colors = colors
    else:
        panel_colors = plt.cm.Set2(np.linspace(0, 1, n_datasets))

    # Bar positions: one group per model, one bar per dataset inside it.
    x = np.arange(n_models)
    width = 0.8 / n_datasets

    for i, (dataset, color) in enumerate(zip(datasets, panel_colors)):
        subset = data_source[data_source['dataset'] == dataset]

        # Align rows with the model order used on the x axis; models missing
        # for this dataset become NaN.
        subset = subset.set_index('model').reindex(models).reset_index()

        # Centre the group of bars around each x tick.
        offset = (i - n_datasets/2 + 0.5) * width
        bars = ax.bar(x + offset, subset[metric], width,
                     label=dataset, alpha=0.85, color=color)

        # Print the value above each bar (skipped for NaN and non-positive).
        for bar, val in zip(bars, subset[metric]):
            if not pd.isna(val) and val > 0:
                height = bar.get_height()
                ax.text(bar.get_x() + bar.get_width()/2., height,
                       f'{val:.3f}' if val < 1 else f'{val:.1f}',
                       ha='center', va='bottom', fontsize=8)

    ax.set_title(title, fontweight='bold', fontsize=12)
    ax.set_ylabel('Score', fontsize=10)
    ax.set_xlabel('Modell', fontsize=10)
    ax.set_xticks(x)
    ax.set_xticklabels(models, rotation=0, ha='center')
    ax.legend(loc='best', fontsize=9)
    ax.grid(axis='y', alpha=0.3, linestyle='--')
    ax.set_axisbelow(True)

    # Leave 15% headroom for the value labels. Skipped when the maximum is
    # NaN or not positive, because that would collapse the axis to (0, 0).
    max_val = data_source[metric].max()
    if not pd.isna(max_val) and max_val > 0:
        ax.set_ylim(0, max_val * 1.15)

plt.tight_layout()

# Save
output_path = f"{CONFIG['output_dir']}/metrics_comparison.png"
plt.savefig(output_path, dpi=300, bbox_inches='tight')
print(f"\nüíæ Gespeichert: {output_path}")

plt.show()

# ============================================================
# ADDITIONAL DETAILED PLOTS
# ============================================================
print("\n" + "="*70)
print("ERSTELLE DETAILLIERTE BLEU-SCORES")
print("="*70)

# One panel per BLEU-N column found in eval_results.
bleu_metrics = [name for name in eval_results.columns if name.startswith('BLEU-')]

if bleu_metrics:
    n_panels = len(bleu_metrics)
    fig, axes = plt.subplots(1, n_panels, figsize=(5*n_panels, 5))

    # With a single panel plt.subplots returns a bare Axes object; wrap it so
    # the loop below can treat both cases alike.
    if n_panels == 1:
        axes = [axes]

    fig.suptitle('BLEU Scores (N-gram Overlap)', fontsize=14, fontweight='bold')

    # Bar geometry is the same for every panel, so it is computed once.
    datasets = eval_results['dataset'].unique()
    models = eval_results['model'].unique()
    n_datasets = len(datasets)
    n_models = len(models)
    x = np.arange(n_models)
    width = 0.8 / n_datasets

    for ax, metric in zip(axes, bleu_metrics):
        for i, (dataset, color) in enumerate(zip(datasets, colors)):
            # Rows of this dataset, re-ordered to match the x axis.
            per_dataset = eval_results[eval_results['dataset'] == dataset]
            per_dataset = per_dataset.set_index('model').reindex(models).reset_index()
            values = per_dataset[metric]

            offset = (i - n_datasets/2 + 0.5) * width
            bars = ax.bar(x + offset, values, width,
                          label=dataset, alpha=0.85, color=color)

            # Value labels above the bars; NaN and non-positive are skipped.
            for bar, val in zip(bars, values):
                if pd.isna(val) or val <= 0:
                    continue
                ax.text(bar.get_x() + bar.get_width()/2., bar.get_height(),
                        f'{val:.3f}', ha='center', va='bottom', fontsize=8)

        ax.set_title(metric, fontweight='bold')
        ax.set_ylabel('Score')
        ax.set_xlabel('Modell')
        ax.set_xticks(x)
        ax.set_xticklabels(models, rotation=0)
        ax.legend(loc='best', fontsize=9)
        ax.grid(axis='y', alpha=0.3, linestyle='--')

        # 15% headroom above the tallest bar.
        max_val = eval_results[metric].max()
        if not pd.isna(max_val):
            ax.set_ylim(0, max_val * 1.15)

    plt.tight_layout()

    output_path = f"{CONFIG['output_dir']}/bleu_detailed.png"
    plt.savefig(output_path, dpi=300, bbox_inches='tight')
    print(f"üíæ Gespeichert: {output_path}")

    plt.show()

# ============================================================
# HEATMAP FOR MODEL COMPARISON
# ============================================================
# One row per (model, dataset) entry of eval_results, one column per metric.
print("\n" + "="*70)
print("ERSTELLE HEATMAP")
print("="*70)

# Candidate metric columns; METEOR is placed before CIDEr when available.
metrics_for_heatmap = ['BLEU-1', 'BLEU-2', 'BLEU-3', 'BLEU-4', 'CIDEr']
if has_meteor:
    metrics_for_heatmap.insert(4, 'METEOR')

available_metrics = [m for m in metrics_for_heatmap if m in eval_results.columns]

if len(available_metrics) > 0:
    # Build the row labels on a copy (assign returns a new frame) so that the
    # shared eval_results table is never modified, even if plotting fails.
    heatmap_data = eval_results.assign(
        model_dataset=eval_results['model'] + '\n(' + eval_results['dataset'] + ')'
    ).set_index('model_dataset')[available_metrics]

    # imshow needs numeric data; convert explicitly in case the columns were
    # loaded with object dtype.
    heatmap_values = heatmap_data.to_numpy(dtype=float)

    fig, ax = plt.subplots(figsize=(10, 6))

    im = ax.imshow(heatmap_values, cmap='YlOrRd', aspect='auto')

    # Axis labels
    ax.set_xticks(np.arange(len(available_metrics)))
    ax.set_yticks(np.arange(len(heatmap_data)))
    ax.set_xticklabels(available_metrics, fontsize=10)
    ax.set_yticklabels(heatmap_data.index, fontsize=9)

    # Rotate the x labels
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")

    # Write each value into its cell (NaN cells stay blank).
    for i in range(heatmap_values.shape[0]):
        for j in range(heatmap_values.shape[1]):
            val = heatmap_values[i, j]
            if not pd.isna(val):
                ax.text(j, i, f'{val:.3f}',
                        ha="center", va="center", color="black", fontsize=9)

    ax.set_title('Metriken-√úbersicht aller Modelle', fontweight='bold', fontsize=12, pad=20)

    # Colorbar
    cbar = plt.colorbar(im, ax=ax)
    cbar.set_label('Score', rotation=270, labelpad=20)

    plt.tight_layout()

    output_path = f"{CONFIG['output_dir']}/metrics_heatmap.png"
    plt.savefig(output_path, dpi=300, bbox_inches='tight')
    print(f"üíæ Gespeichert: {output_path}")

    plt.show()

print("\n" + "="*70)
print("‚úÖ ALLE VISUALISIERUNGEN ERSTELLT")
print("="*70)

In [None]:
# Show example captions: for each known dataset, print the reference captions
# and every model's generated caption for the first two images.
print("\n" + "="*80)
print("BEISPIEL-CAPTIONS")
print("="*80 + "\n")

for dataset_name in ['coco', 'flickr30k', 'vizwiz']:
    subset = results_df[results_df['dataset'] == dataset_name]

    # Skip datasets that have no rows in results_df.
    if len(subset) == 0:
        continue

    print(f"\n{'='*80}")
    print(f"Dataset: {dataset_name.upper()}")
    print(f"{'='*80}\n")

    # Show the first 2 examples
    for image_id in subset['image_id'].unique()[:2]:
        examples = subset[subset['image_id'] == image_id]

        print(f"\nBild ID: {image_id}")
        print("-" * 80)

        # Reference captions. .get() tolerates a results_df without the
        # 'reference_captions' column; the isinstance check skips values
        # that are not lists (e.g. NaN or a string).
        ref = examples.iloc[0].get('reference_captions')
        if isinstance(ref, list) and len(ref) > 0:
            print(f"\nReferenz-Caption(s):")
            for i, r in enumerate(ref[:3], 1):  # show at most 3
                print(f"  {i}. {r}")

        # Generated captions, one line per model
        print(f"\nGenerierte Captions:")
        for _, row in examples.iterrows():
            print(f"  [{row['model']}]: {row['generated_caption']}")

        print()

print("\n" + "="*80)
print("‚úì Evaluation abgeschlossen")
print("="*80)

## 9. Export und Zusammenfassung

In [None]:
# Build the combined result table: metric scores joined with the aggregated
# WCAG scores. The outer join keeps (dataset, model) pairs that appear in only
# one of the two tables.
final_results = eval_results.merge(
    wcag_summary,
    on=['dataset', 'model'],
    how='outer'
)

# Sort
final_results = final_results.sort_values(['dataset', 'model'])

# Round numeric columns for readability. 'n_samples' is left untouched
# (presumably a sample count - confirm against the evaluation cell).
numeric_cols = final_results.select_dtypes(include=[np.number]).columns
for col in numeric_cols:
    if col != 'n_samples':
        final_results[col] = final_results[col].round(4)

print("\n" + "="*100)
print("FINALE ERGEBNISTABELLE")
print("="*100 + "\n")
print(final_results.to_string(index=False))


def _excel_safe(frame):
    """Return a copy of *frame* whose list/tuple cells are joined into strings.

    Excel cells cannot hold Python lists (openpyxl rejects them), so columns
    such as 'reference_captions' are flattened to 'a | b | c' before export.
    """
    safe = frame.copy()
    for column in safe.columns:
        if safe[column].dtype == object:
            safe[column] = safe[column].map(
                lambda cell: ' | '.join(map(str, cell))
                if isinstance(cell, (list, tuple)) else cell
            )
    return safe


# Export to Excel: summary, all captions and per-caption WCAG scores.
with pd.ExcelWriter(f"{CONFIG['output_dir']}/evaluation_results.xlsx") as writer:
    final_results.to_excel(writer, sheet_name='Summary', index=False)
    _excel_safe(results_df).to_excel(writer, sheet_name='All_Captions', index=False)
    wcag_df.to_excel(writer, sheet_name='WCAG_Detailed', index=False)

print(f"\n‚úì Excel-Datei erstellt: {CONFIG['output_dir']}/evaluation_results.xlsx")

# Summary
print("\n" + "="*100)
print("ZUSAMMENFASSUNG")
print("="*100 + "\n")

print(f"Evaluierte Datens√§tze: {', '.join(results_df['dataset'].unique())}")
print(f"Getestete Modelle: {', '.join(results_df['model'].unique())}")
print(f"Gesamt Captions generiert: {len(results_df)}")

# Report only the metrics that are really present in eval_results; METEOR in
# particular is optional in this notebook.
computed_metrics = [
    str(col) for col in eval_results.columns
    if str(col).startswith('BLEU-') or col in ('METEOR', 'CIDEr')
]
print(f"\nMetriken berechnet: {', '.join(computed_metrics)}")
print(f"WCAG-Kriterien: Verst√§ndlichkeit, Informationsgehalt, Kontextad√§quanz")

print("\n" + "="*100)
print("‚úì EVALUATION ERFOLGREICH ABGESCHLOSSEN")
print("="*100)

# Download section for Colab
print("\nDateien zum Download:")
from google.colab import files
import zipfile

# ============================================================
# CAPTION EXPORT - ALLE FORMATE
# ============================================================

import pandas as pd
import json
from pathlib import Path
from typing import Dict, List
import os

class CaptionExporter:
    """Write the generated captions to disk in several formats.

    All files are created inside ``output_dir``. The input frame is expected
    to provide the columns ``dataset``, ``model``, ``image_id`` and
    ``generated_caption``; ``reference_captions`` (a list per row) is optional.
    """

    def __init__(self, output_dir: str):
        """Remember the target directory and create it if necessary."""
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)

    def export_all_formats(self, results_df: pd.DataFrame):
        """Run every exporter of this class on ``results_df``."""
        print("\n" + "="*80)
        print("EXPORTIERE CAPTIONS IN ALLE FORMATE")
        print("="*80 + "\n")

        self.export_csv(results_df)
        self.export_excel(results_df)
        self.export_json(results_df)
        self.export_txt(results_df)
        self.export_html(results_df)
        self.export_model_summaries(results_df)

        print("\n‚úì ALLE CAPTION-EXPORTS ABGESCHLOSSEN")

    def export_csv(self, results_df: pd.DataFrame):
        """Write the full table and a four-column compact table as CSV."""
        csv_path = self.output_dir / "all_generated_captions.csv"
        results_df.to_csv(csv_path, index=False, encoding='utf-8')
        print(f"‚úì CSV: {csv_path}")

        compact_cols = ['dataset', 'model', 'image_id', 'generated_caption']
        compact_path = self.output_dir / "captions_compact.csv"
        results_df[compact_cols].to_csv(compact_path, index=False, encoding='utf-8')
        print(f"‚úì Kompakte CSV: {compact_path}")

    @staticmethod
    def _excel_safe(frame: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of *frame* whose list/tuple cells are joined into strings.

        Excel cells cannot hold Python lists (openpyxl rejects them), so such
        values are flattened to ``'a | b | c'``. The input is not modified.
        """
        safe = frame.copy()
        for column in safe.columns:
            if safe[column].dtype == object:
                safe[column] = safe[column].map(
                    lambda cell: ' | '.join(map(str, cell))
                    if isinstance(cell, (list, tuple)) else cell
                )
        return safe

    @staticmethod
    def _json_value(value):
        """Map missing scalars (None/NaN) to None so the output is strict JSON."""
        if pd.api.types.is_scalar(value) and pd.isna(value):
            return None
        return value

    def export_excel(self, results_df: pd.DataFrame):
        """Write an Excel workbook with several sheets.

        Sheets: all captions, one sheet per model, one sheet per dataset and a
        side-by-side comparison. Sheet names are cut to 31 characters, the
        limit Excel imposes.
        """
        excel_path = self.output_dir / "captions_complete.xlsx"
        # Flatten list-valued cells once; all caption sheets are slices of it.
        sheet_source = self._excel_safe(results_df)

        with pd.ExcelWriter(excel_path, engine='openpyxl') as writer:
            sheet_source.to_excel(writer, sheet_name='Alle_Captions', index=False)

            for model in results_df['model'].unique():
                model_data = sheet_source[sheet_source['model'] == model]
                sheet_name = f'{model}_Captions'[:31]
                model_data.to_excel(writer, sheet_name=sheet_name, index=False)

            for dataset in results_df['dataset'].unique():
                dataset_data = sheet_source[sheet_source['dataset'] == dataset]
                sheet_name = f'{dataset}_Captions'[:31]
                dataset_data.to_excel(writer, sheet_name=sheet_name, index=False)

            # The comparison view needs the original list values.
            comparison_df = self.create_comparison_view(results_df)
            comparison_df.to_excel(writer, sheet_name='Vergleich_Side_by_Side', index=False)

        print(f"‚úì Excel (mehrere Sheets): {excel_path}")

    def create_comparison_view(self, results_df: pd.DataFrame) -> pd.DataFrame:
        """Build one row per (dataset, image_id) with all models side by side.

        Each row holds up to three reference captions (``reference_1..3``,
        padded with '' when fewer exist) and one ``<model>_caption`` column
        per model that produced a caption for the image.
        """
        comparison_rows = []

        for (dataset, image_id), group in results_df.groupby(['dataset', 'image_id']):
            row = {'dataset': dataset, 'image_id': image_id}

            if 'reference_captions' in group.columns:
                refs = group.iloc[0]['reference_captions']
                # Reference columns are only added for a non-empty list.
                if isinstance(refs, list) and len(refs) > 0:
                    for position in range(3):
                        row[f'reference_{position + 1}'] = (
                            refs[position] if position < len(refs) else ''
                        )

            for _, item in group.iterrows():
                row[f"{item['model']}_caption"] = item['generated_caption']

            comparison_rows.append(row)

        return pd.DataFrame(comparison_rows)

    def export_json(self, results_df: pd.DataFrame):
        """Write a nested JSON file: dataset -> image id -> captions.

        Missing captions (NaN/None) are written as ``null``; a bare ``NaN``
        token would not be valid JSON.
        """
        json_data = {}

        for dataset in results_df['dataset'].unique():
            json_data[dataset] = {}
            dataset_data = results_df[results_df['dataset'] == dataset]

            for image_id in dataset_data['image_id'].unique():
                image_data = dataset_data[dataset_data['image_id'] == image_id]
                entry = {
                    'reference_captions': [],
                    'generated_captions': {}
                }

                first_row = image_data.iloc[0]
                if 'reference_captions' in first_row:
                    refs = first_row['reference_captions']
                    if isinstance(refs, list):
                        entry['reference_captions'] = refs

                for _, row in image_data.iterrows():
                    entry['generated_captions'][row['model']] = self._json_value(
                        row['generated_caption']
                    )

                # JSON object keys must be strings.
                json_data[dataset][str(image_id)] = entry

        json_path = self.output_dir / "captions_structured.json"
        with open(json_path, 'w', encoding='utf-8') as f:
            json.dump(json_data, f, indent=2, ensure_ascii=False)

        print(f"‚úì JSON: {json_path}")

    def export_txt(self, results_df: pd.DataFrame):
        """Write a human-readable text report (first 10 images per dataset)."""
        txt_path = self.output_dir / "captions_readable.txt"

        with open(txt_path, 'w', encoding='utf-8') as f:
            f.write("="*80 + "\n")
            f.write("GENERIERTE BILDUNTERSCHRIFTEN - √úBERSICHT\n")
            f.write("="*80 + "\n\n")

            for dataset in sorted(results_df['dataset'].unique()):
                f.write(f"\n{'='*80}\n")
                f.write(f"DATASET: {dataset.upper()}\n")
                f.write(f"{'='*80}\n\n")

                dataset_data = results_df[results_df['dataset'] == dataset]

                for image_id in sorted(dataset_data['image_id'].unique())[:10]:
                    image_data = dataset_data[dataset_data['image_id'] == image_id]

                    f.write(f"\n{'-'*80}\n")
                    f.write(f"Bild ID: {image_id}\n")
                    f.write(f"{'-'*80}\n\n")

                    first_row = image_data.iloc[0]
                    if 'reference_captions' in first_row:
                        refs = first_row['reference_captions']
                        if isinstance(refs, list) and len(refs) > 0:
                            f.write("REFERENZ-CAPTIONS:\n")
                            # At most five references per image.
                            for i, ref in enumerate(refs[:5], 1):
                                f.write(f"  {i}. {ref}\n")
                            f.write("\n")

                    f.write("GENERIERTE CAPTIONS:\n")
                    for _, row in image_data.iterrows():
                        model = row['model']
                        caption = row['generated_caption']
                        # Model name padded to 10 characters for alignment.
                        f.write(f"  [{model:10s}]: {caption}\n")
                    f.write("\n")

        print(f"‚úì TXT (lesbar): {txt_path}")

    def export_html(self, results_df: pd.DataFrame):
        """Write a simple static HTML page (first 5 images per dataset).

        All values taken from the data (dataset, image id, model, caption) are
        HTML-escaped: captions are model output and may contain '<' or '&'.
        """
        # Local import; the name 'html' is deliberately not used for a variable.
        from html import escape

        html_path = self.output_dir / "captions_interactive.html"

        # Fragments are collected in a list and joined once at the end.
        parts = ["""<!DOCTYPE html>
<html lang="de">
<head>
    <meta charset="UTF-8">
    <title>Generierte Bildunterschriften</title>
    <style>
        body { font-family: Arial, sans-serif; max-width: 1200px; margin: 0 auto; padding: 20px; background: #f5f5f5; }
        h1 { color: #2c3e50; text-align: center; }
        .card { background: white; margin: 15px 0; padding: 15px; border-radius: 8px; box-shadow: 0 2px 4px rgba(0,0,0,0.1); }
        .caption { margin: 10px 0; padding: 10px; background: #f9f9f9; border-left: 3px solid #3498db; }
        .model { font-weight: bold; color: #3498db; }
    </style>
</head>
<body>
    <h1>üìä Generierte Bildunterschriften</h1>
"""]

        for dataset in sorted(results_df['dataset'].unique()):
            parts.append(f'<h2>üìÅ {escape(str(dataset).upper())}</h2>')
            dataset_data = results_df[results_df['dataset'] == dataset]

            for image_id in sorted(dataset_data['image_id'].unique())[:5]:
                image_data = dataset_data[dataset_data['image_id'] == image_id]
                parts.append(
                    f'<div class="card"><strong>Bild ID: {escape(str(image_id))}</strong>'
                )

                for _, row in image_data.iterrows():
                    model_label = escape(str(row["model"]).upper())
                    caption_text = escape(str(row["generated_caption"]))
                    parts.append(
                        f'<div class="caption"><span class="model">{model_label}:</span> '
                        f'{caption_text}</div>'
                    )

                parts.append('</div>')

        parts.append('</body></html>')

        with open(html_path, 'w', encoding='utf-8') as f:
            f.write(''.join(parts))

        print(f"‚úì HTML (interaktiv): {html_path}")

    def export_model_summaries(self, results_df: pd.DataFrame):
        """Write one CSV per model into the ``model_summaries`` sub-directory."""
        summaries_dir = self.output_dir / "model_summaries"
        summaries_dir.mkdir(exist_ok=True)

        for model in results_df['model'].unique():
            model_data = results_df[results_df['model'] == model]
            model_csv = summaries_dir / f"{model}_captions.csv"
            model_data.to_csv(model_csv, index=False, encoding='utf-8')

        print(f"‚úì Modell-Zusammenfassungen: {summaries_dir}")


# ============================================================
# RUN THE EXPORT
# ============================================================

print("\n" + "="*80)
print("CAPTION-EXPORT IN ALLE FORMATE")
print("="*80)

exporter = CaptionExporter(CONFIG['output_dir'])
exporter.export_all_formats(results_df)

print(f"""
‚ïî‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïó
‚ïë                    CAPTION-EXPORT ABGESCHLOSSEN                              ‚ïë
‚ïö‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïù

Erstellt:
  ‚úì all_generated_captions.csv
  ‚úì captions_compact.csv
  ‚úì captions_complete.xlsx (mehrere Sheets!)
  ‚úì captions_structured.json
  ‚úì captions_readable.txt
  ‚úì captions_interactive.html
  ‚úì model_summaries/

Alle Dateien in: {CONFIG['output_dir']}
""")

# Zip all results. Entries are stored under their path relative to the output
# directory, so sub-directories such as model_summaries/ are preserved and
# files with equal names in different folders do not collide.
zip_path = "/content/evaluation_results.zip"
with zipfile.ZipFile(zip_path, 'w', compression=zipfile.ZIP_DEFLATED) as zipf:
    for root, dirs, files_list in os.walk(CONFIG['output_dir']):
        for file in files_list:
            file_path = os.path.join(root, file)
            # Never add the archive to itself (possible if the output
            # directory contains zip_path).
            if os.path.abspath(file_path) == os.path.abspath(zip_path):
                continue
            zipf.write(file_path, os.path.relpath(file_path, CONFIG['output_dir']))

print(f"\n‚úì Alle Ergebnisse als ZIP-Datei bereit")
print(f"  Download: {zip_path}")

# Optional: trigger the download automatically
# files.download(zip_path)