# Visual Art Interpreter - Phase 1: Zero-Shot Baseline

- **Model:** Qwen3-VL-8B-Instruct
- **Task:** Zero-shot art/aesthetic analysis

## 1. Setup & Dependencies

In [None]:
# Install dependencies
!pip install -q transformers accelerate bitsandbytes
!pip install -q pillow matplotlib

In [None]:
import torch
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
from PIL import Image
import matplotlib.pyplot as plt
import json
from pathlib import Path

## 2. Load Model (Qwen3-VL-8B-Instruct)

In [None]:
# Model configuration
MODEL_NAME = "Qwen/Qwen3-VL-8B-Instruct"

# Imported in this cell so the cell is self-contained.
from transformers import AutoModelForImageTextToText, BitsAndBytesConfig

# Load in 4-bit to save VRAM.
# The Auto class resolves the architecture from the checkpoint's config, so a
# Qwen3-VL checkpoint is not forced through the Qwen2.5-VL model class.
# Quantization is requested through an explicit BitsAndBytesConfig rather than
# the deprecated bare `load_in_4bit=True` keyword.
model = AutoModelForImageTextToText.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    quantization_config=BitsAndBytesConfig(load_in_4bit=True),
)

processor = AutoProcessor.from_pretrained(MODEL_NAME)

print(f"Model loaded: {MODEL_NAME}")
print(f"Device: {model.device}")

## 3. Data Loading

In [None]:
# Mount Google Drive (for data storage)
# NOTE(review): `google.colab` is presumably only importable inside a Colab
# runtime; running this cell elsewhere is expected to fail on the import.
from google.colab import drive
# Mount point is /content/drive; the data/output paths defined below live
# under this directory.
drive.mount('/content/drive')

In [None]:
# Define data paths - modify these
# DATA_DIR is the root that holds the per-category image folders.
DATA_DIR = Path("/content/drive/MyDrive/art_data")  # Update with your path
# OUTPUT_DIR receives the JSON result files.
OUTPUT_DIR = Path("/content/drive/MyDrive/outputs")
# exist_ok=True makes re-running this cell harmless. Parent directories are
# NOT created, so this raises if the parent folder does not exist yet.
OUTPUT_DIR.mkdir(exist_ok=True)

In [None]:
# Simple image loader
def load_test_images(
    data_dir,
    categories=("photos", "paintings", "abstract"),
    patterns=("*.jpg",),
):
    """Collect image files from per-category subdirectories.

    Expected structure:
        data_dir/
            photos/
            paintings/
            abstract/

    Args:
        data_dir: Root directory, as a ``str`` or ``Path``.
        categories: Subdirectory names to scan, in output order. Missing
            subdirectories are skipped silently.
        patterns: Glob patterns matched inside each category directory.

    Returns:
        List of dicts with keys ``"path"`` (str), ``"category"`` and
        ``"name"`` (file stem). Entries are grouped by category in the given
        order and sorted by path within each category.
    """
    root = Path(data_dir)
    images = []

    for category in categories:
        cat_dir = root / category
        if not cat_dir.is_dir():
            continue

        # A set removes duplicates when several patterns match the same file.
        matched = {
            path
            for pattern in patterns
            for path in cat_dir.glob(pattern)
            if path.is_file()
        }

        # glob() yields entries in arbitrary filesystem order; sorting keeps
        # any "first N images" subset reproducible across runs and machines.
        for img_path in sorted(matched):
            images.append({
                "path": str(img_path),
                "category": category,
                "name": img_path.stem,
            })

    return images

In [None]:
# Build the test set from DATA_DIR and report what was found
test_images = load_test_images(DATA_DIR)
found_categories = {entry['category'] for entry in test_images}
print(f"Loaded {len(test_images)} images")
print(f"Categories: {found_categories}")

## 4. Evaluation Prompts (8-Question Framework)

In [None]:
# Prompts based on Padó & Thomas (2025)
# Maps a short prompt key to the question text sent to the model.
EVAL_PROMPTS = dict(
    content="Describe what you see in this artwork.",
    type="What type of artwork is this (painting, photograph, etc.)?",
    emotion="What emotion does this artwork convey?",
    polarity="Is the overall emotion positive or negative?",
    specific_emotion="What specific emotion is depicted (joy, grief, awe, etc.)?",
    expression="How is this emotion expressed (color, composition, subject)?",
    symbol="Are there any symbols used to convey meaning?",
    aesthetic="Rate this artwork aesthetically from 1-10 and explain why.",
)

## 5. Inference Function

In [None]:
def analyze_image(image_path, prompt_key="content"):
    """
    Run zero-shot inference on an image.

    Args:
        image_path: Path to image file
        prompt_key: Key from EVAL_PROMPTS dict. A value that is not a known
            key is used verbatim as the prompt text.

    Returns:
        dict with keys "image_path", "prompt_key", "prompt" and "response".
        "response" holds only the newly generated text, not the echoed prompt.
    """
    # Load image; convert() returns an independent copy, so the underlying
    # file handle can be closed as soon as the block exits.
    with Image.open(image_path) as raw_image:
        image = raw_image.convert("RGB")

    # Get prompt (falls back to treating prompt_key as free-form text)
    prompt = EVAL_PROMPTS.get(prompt_key, prompt_key)

    # Prepare message
    messages = [
        {
            "role": "system",
            "content": "You are an expert art historian and critic."
        },
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": prompt}
            ]
        }
    ]

    # Process
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(text=[text], images=[image], return_tensors="pt").to(model.device)

    # Generate (greedy decoding, so repeated runs give the same output)
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=512,
            do_sample=False
        )

    # generate() returns prompt tokens followed by the continuation; drop the
    # prompt part so the decoded response does not repeat the system/user text.
    prompt_length = inputs["input_ids"].shape[1]
    generated_tokens = outputs[:, prompt_length:]

    # Decode
    response = processor.batch_decode(generated_tokens, skip_special_tokens=True)[0].strip()

    return {
        "image_path": str(image_path),
        "prompt_key": prompt_key,
        "prompt": prompt,
        "response": response
    }

## 6. Test Run

In [None]:
# Test on first image
# Skipped entirely when no images were loaded, so an empty dataset does not
# raise an IndexError here.
if test_images:
    test_img = test_images[0]
    # Announce the image before inference so a failure can be attributed to it.
    print(f"Testing on: {test_img['name']} ({test_img['category']})")
    
    # Single call with the "content" prompt.
    result = analyze_image(test_img['path'], prompt_key="content")
    print(f"\nPrompt: {result['prompt']}")
    print(f"\nResponse: {result['response']}")

## 7. Batch Evaluation (Small Scale)

In [None]:
# Quick validation pass over the leading images
def run_subset_evaluation(images, max_images=10, prompt_key="content"):
    """Analyze the first `max_images` entries of `images` with one prompt.

    Each entry is a dict providing 'name', 'path' and 'category'. Progress is
    printed per image. An image whose analysis raises is reported and skipped,
    so one failure does not abort the batch.

    Returns:
        List of result dicts from analyze_image, each extended with the
        entry's 'category'. Failed images are absent.
    """
    collected = []
    subset = images[:max_images]

    for entry in subset:
        print(f"Processing: {entry['name']}...", end=" ")
        try:
            analysis = analyze_image(entry['path'], prompt_key)
            analysis['category'] = entry['category']
            collected.append(analysis)
            print("OK")
        except Exception as err:
            print(f"Error: {err}")

    return collected

In [None]:
# Smoke test: evaluate at most the first 5 images with the "content" prompt
results = run_subset_evaluation(
    test_images,
    prompt_key="content",
    max_images=5,
)

In [None]:
# Save results
output_file = OUTPUT_DIR / "phase1_pilot_results.json"
# Write UTF-8 explicitly instead of relying on the platform default encoding,
# and keep non-ASCII characters unescaped so the saved responses stay readable.
# The file parses to the same data either way.
with open(output_file, "w", encoding="utf-8") as f:
    json.dump(results, f, indent=2, ensure_ascii=False)

print(f"Results saved to: {output_file}")

## Next Steps

1. Validate that the model works on your test images.
2. Add more prompts: test all 8 questions.
3. Expand the test set: run on the full evaluation dataset.
4. Add metrics: consistency checks and comparison with ground truth.