# Visual Art Interpreter - Phase 1: Zero-Shot Baseline

Model: Qwen3-VL-8B-Instruct
Task: Zero-shot art/aesthetic analysis

## 1. Setup & Dependencies

In [None]:
# Install dependencies
!pip install -q transformers accelerate bitsandbytes datasets
!pip install -q pillow matplotlib

In [None]:
import torch
from transformers import Qwen3VLForConditionalGeneration, AutoProcessor
from datasets import load_dataset
from PIL import Image
import matplotlib.pyplot as plt
import json
from pathlib import Path

## 2. Load Model (Qwen3-VL-8B-Instruct)

In [None]:
# Model configuration
MODEL_NAME = "Qwen/Qwen3-VL-8B-Instruct"

# Load in bfloat16 (A100 has enough VRAM). `dtype` is the current keyword;
# `torch_dtype` is deprecated in the transformers releases that ship Qwen3-VL.
# For smaller GPUs, pass quantization_config=BitsAndBytesConfig(load_in_4bit=True)
# (from transformers) instead of loading full-precision weights.
model = Qwen3VLForConditionalGeneration.from_pretrained(
    MODEL_NAME,
    dtype=torch.bfloat16,
    device_map="auto",  # let accelerate decide weight placement
)

# The processor bundles the chat template, tokenizer and image preprocessing.
processor = AutoProcessor.from_pretrained(MODEL_NAME)

print(f"Model loaded: {MODEL_NAME}")
print(f"Device: {model.device}")

## 3. Data Loading

In [None]:
# Output directory (local - no Google Drive)
# Relative path, so it resolves against the notebook's working directory.
OUTPUT_DIR = Path("./outputs")
# exist_ok=True makes re-running this cell safe when the directory already exists.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

In [None]:
# Load AVA dataset from Hugging Face (small batch for testing)
# Dataset: image_id, image, mean_score (1-10), rating_counts, total_votes, etc.
# NOTE: split="train[:20]" downloads ALL 61 shards (~30GB). Load parquet directly
# to fetch only the first shard (~500MB) and bypass split metadata.
BATCH_SIZE = 20  # Adjust for quick testing

# Load single parquet file via generic loader (avoids ExpectedMoreSplitsError)
AVA_PARQUET_URL = "https://huggingface.co/datasets/trojblue/AVA-Huggingface/resolve/main/data/train-00000-of-00061.parquet"
ava_dataset = load_dataset("parquet", data_files=AVA_PARQUET_URL, split="train")
# Clamp to the shard length so an oversized BATCH_SIZE does not make select()
# fail with an out-of-range index.
ava_dataset = ava_dataset.select(range(min(BATCH_SIZE, len(ava_dataset))))

print(f"Loaded {len(ava_dataset)} images from AVA-Huggingface (~500MB download)")
print(f"Columns: {ava_dataset.column_names}")
# Report the real min/max of the batch: the first and last rows are arbitrary
# samples, not the extremes. Skipped when the batch is empty (BATCH_SIZE = 0).
mean_scores = list(ava_dataset["mean_score"])
if mean_scores:
    print(f"Sample mean_score range: {min(mean_scores):.2f} - {max(mean_scores):.2f}")

In [None]:
# Convert AVA dataset to list of dicts for evaluation
def prepare_ava_batch(dataset):
    """Flatten AVA rows into plain evaluation records.

    Each returned dict holds exactly three keys: ``image_id``, ``image`` and
    ``mean_score``. An image object exposing a ``convert`` method is converted
    to RGB; any other value is passed through unchanged. ``mean_score`` is
    coerced to ``float``.
    """
    records = []
    for row in dataset:
        image_id = row["image_id"]
        picture = row["image"]
        if hasattr(picture, "convert"):
            picture = picture.convert("RGB")
        records.append(
            {
                "image_id": image_id,
                "image": picture,
                "mean_score": float(row["mean_score"]),
            }
        )
    return records

test_images = prepare_ava_batch(ava_dataset)

In [None]:
print(f"Prepared {len(test_images)} images for evaluation")
print(f"Sample: image_id={test_images[0]['image_id']}, mean_score={test_images[0]['mean_score']:.2f}")

## 4. Evaluation Prompts (8-Question Framework)

In [None]:
# Prompts based on Padó & Thomas (2025)
# Maps a short prompt key to the question text sent to the model.
# analyze_image() looks its prompt_key up here and, when the key is absent,
# uses the given string itself as the prompt text.
EVAL_PROMPTS = {
    # Asks for the numeric score before the explanation — presumably so the
    # score can be parsed and compared with AVA mean_score (TODO confirm; no
    # parser is visible in this section).
    "aesthetic": "Rate this photograph's aesthetic quality from 1-10 (1=low, 10=high). Give your score as a single number first, then briefly explain.",
    "content": "Describe what you see in this artwork.",
    "type": "What type of artwork is this (painting, photograph, etc.)?",
    "emotion": "What emotion does this artwork convey?",
    "polarity": "Is the overall emotion positive or negative?",
    "specific_emotion": "What specific emotion is depicted (joy, grief, awe, etc.)?",
    "expression": "How is this emotion expressed (color, composition, subject)?",
    "symbol": "Are there any symbols used to convey meaning?",
}

## 5. Inference Function

In [None]:
def analyze_image(image, prompt_key="content", image_id=None, max_new_tokens=512):
    """
    Run zero-shot inference on a single image.

    Args:
        image: Path to an image file (str or pathlib.Path) or a PIL Image.
            Paths are opened and converted to RGB; objects exposing
            ``convert`` are converted to RGB; anything else is passed to the
            processor unchanged.
        prompt_key: Key from EVAL_PROMPTS dict. A value that is not a key is
            used verbatim as the prompt text.
        image_id: Optional identifier (copied into the result).
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        dict with keys ``image_id``, ``prompt_key``, ``prompt`` and
        ``response`` (the decoded text of the newly generated tokens only).
    """
    # Load image if path given
    if isinstance(image, (str, Path)):
        # Image.open keeps the file handle open; the context manager closes it,
        # while convert() has already produced an independent in-memory copy.
        with Image.open(image) as opened:
            image = opened.convert("RGB")
    elif hasattr(image, "convert"):
        image = image.convert("RGB")

    # Get prompt (unknown keys fall back to being the prompt text itself)
    prompt = EVAL_PROMPTS.get(prompt_key, prompt_key)

    # Prepare message (Qwen3-VL format)
    messages = [
        {
            "role": "system",
            "content": "You are an expert art historian and critic."
        },
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": prompt}
            ]
        }
    ]

    # Process with Qwen3-VL API
    inputs = processor.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_dict=True,
        return_tensors="pt",
    )
    # Dropped before generate(); presumably the model does not accept this
    # key — TODO confirm against the processor/model versions in use.
    inputs.pop("token_type_ids", None)

    # Move inputs to model device (handles nested pixel_values)
    def to_device(obj, device):
        if hasattr(obj, "to"):
            return obj.to(device)
        if isinstance(obj, (list, tuple)):
            return type(obj)(to_device(x, device) for x in obj)
        if isinstance(obj, dict):
            return {k: to_device(v, device) for k, v in obj.items()}
        return obj
    inputs = to_device(inputs, model.device)

    # Generate (greedy decoding, no gradient tracking)
    with torch.no_grad():
        generated_ids = model.generate(
            **inputs, max_new_tokens=max_new_tokens, do_sample=False
        )

    # Decode only new tokens (trim input from output)
    input_length = inputs["input_ids"].shape[1]
    generated_ids_trimmed = generated_ids[:, input_length:]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]

    return {
        "image_id": image_id,
        "prompt_key": prompt_key,
        "prompt": prompt,
        "response": response
    }

## 6. Test Run

In [None]:
# Test on first image
# Single-image check with the "aesthetic" prompt before running a batch.
if test_images:  # nothing to test when no images were prepared
    test_img = test_images[0]
    print(f"Testing on: image_id={test_img['image_id']} (mean_score={test_img['mean_score']:.2f})")
    
    result = analyze_image(test_img["image"], prompt_key="aesthetic", image_id=test_img["image_id"])
    result["mean_score"] = test_img["mean_score"]  # Ground truth for evaluation
    # Show the exact prompt sent and the model's decoded reply.
    print(f"\nPrompt: {result['prompt']}")
    print(f"\nResponse: {result['response']}")

## 7. Batch Evaluation (Small Scale)

In [None]:
# Run on subset for quick validation
def run_subset_evaluation(images, max_images=10, prompt_key="content"):
    """Evaluate the first ``max_images`` entries and return the successful results.

    Each entry must provide ``image_id``, ``image`` and ``mean_score``. Progress
    is printed per image. An entry whose analysis raises is reported on stdout
    and left out of the returned list, so one failure does not abort the run.
    """
    collected = []
    subset = images[:max_images]

    for entry in subset:
        print(f"Processing: {entry['image_id']}...", end=" ")
        try:
            outcome = analyze_image(entry["image"], prompt_key, image_id=entry["image_id"])
            # Attach the ground-truth score for later correlation evaluation.
            outcome.update(mean_score=entry["mean_score"])
            collected.append(outcome)
            print("OK")
        except Exception as err:
            print(f"Error: {err}")

    return collected

In [None]:
# Run on 5 images as smoke test (aesthetic prompt for AVA scoring)
results = run_subset_evaluation(test_images, max_images=5, prompt_key="aesthetic")

In [None]:
# Save results
output_file = OUTPUT_DIR / "phase1_pilot_results.json"
# Explicit UTF-8 plus ensure_ascii=False keeps non-ASCII characters in model
# responses readable in the file and avoids depending on the platform's
# default text encoding. The parsed JSON content is unchanged.
with open(output_file, "w", encoding="utf-8") as f:
    json.dump(results, f, indent=2, ensure_ascii=False)

print(f"Results saved to: {output_file}")