In [1]:
import os
import re
import csv
import requests
from urllib.parse import urlparse

# === Configuration ===
# The API key is read from the environment so the credential never lives in
# the notebook or its version-control history.
# NOTE(review): the key that used to be committed on this line should be
# considered leaked and rotated.
API_KEY = os.environ.get('HARVARD_ART_MUSEUMS_API_KEY', '')
if not API_KEY:
    raise RuntimeError(
        "Set the HARVARD_ART_MUSEUMS_API_KEY environment variable to your "
        "Harvard Art Museums API key before running this cell."
    )
BASE_URL = 'https://api.harvardartmuseums.org/object'
cultures = ["Byzantine", "Greek", "Roman", "Egyptian"]
classifications = [
    "Architecture Elements", "Coins", "Paintings", "Sculpture",
    "Seals", "Vessels", "Weapons and Ammunitions"
]
# Upper bound on the number of images kept per (culture, classification) pair.
max_per_combo = 150
IMAGE_ROOT = 'images'
# The metadata lives next to the images: the post-processing cell further
# down reads and rewrites images/metadata.csv, so writing it anywhere else
# breaks a fresh Restart & Run All.
OUTPUT_CSV = os.path.join(IMAGE_ROOT, 'metadata.csv')

# Initialize CSV (truncates any previous run's file and writes the header row)
os.makedirs(IMAGE_ROOT, exist_ok=True)
with open(OUTPUT_CSV, 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(['id','title','classification','century','culture','image_path'])

# Helpers
def sanitize(name):
    """Return `name` with every filesystem-reserved character replaced by '_'.

    The characters replaced are: < > : " / \\ | ? *
    All other characters are passed through unchanged.
    """
    reserved_chars = re.compile(r'[<>:"/\\|?*]')
    placeholder = '_'
    return reserved_chars.sub(placeholder, name)

def get_extension_from_url(url):
    """Pick a file extension for the image located at `url`.

    The extension is taken from the path component of the URL (query string
    and fragment are ignored). If the path has no extension, or the extension
    including its leading dot is longer than 5 characters, '.jpg' is returned
    instead.
    """
    _, suffix = os.path.splitext(urlparse(url).path)
    if not suffix or len(suffix) > 5:
        return '.jpg'
    return suffix

# Summary counters
total_downloaded = 0
breakdown = {}

# Seconds to wait on any single HTTP request before giving up, so one stalled
# connection cannot hang the whole run.
REQUEST_TIMEOUT = 15


def _fetch_candidates(culture, cls, limit):
    """Collect up to `limit` API records for one culture/classification pair.

    Pages through BASE_URL 100 records at a time and keeps only the records
    that carry a non-empty 'primaryimageurl'. Paging stops when `limit`
    records have been kept, a page comes back empty, or the response reports
    no next page.

    Raises:
        requests.RequestException: on a network failure, timeout, or an
            error status from the API.
    """
    items = []
    page = 1
    while len(items) < limit:
        params = {
            'apikey': API_KEY,
            'q': f'culture:"{culture}" AND classification:"{cls}"',
            'hasimage': 1,
            'size': 100,
            'page': page
        }
        resp = requests.get(BASE_URL, params=params, timeout=REQUEST_TIMEOUT)
        resp.raise_for_status()
        data = resp.json()
        # `or` guards against the key being present with a null value.
        recs = data.get('records') or []
        if not recs:
            break
        items.extend(rec for rec in recs if rec.get('primaryimageurl'))
        if not (data.get('info') or {}).get('next'):
            break
        page += 1
    return items[:limit]


def _select_balanced(items, limit):
    """Pick at most `limit` records, spread evenly across centuries.

    Records are grouped by their 'century' field (missing or empty values
    are grouped under 'Unknown'). Groups are then visited round-robin, in
    first-seen order, taking one record per group per pass, so exhausting a
    small group never causes another group to miss its turn.
    """
    if limit <= 0:
        return []
    if len(items) <= limit:
        return list(items)

    by_century = {}
    for rec in items:
        by_century.setdefault(rec.get('century') or 'Unknown', []).append(rec)

    groups = list(by_century.values())
    selected = []
    for depth in range(max(len(group) for group in groups)):
        for group in groups:
            if depth < len(group):
                selected.append(group[depth])
                if len(selected) >= limit:
                    return selected
    return selected


def _download_image(url, dest_path):
    """Download `url` to `dest_path`, raising on any HTTP error status.

    The status check keeps error pages (404, 500, ...) from being saved to
    disk as if they were images.
    """
    resp = requests.get(url, timeout=REQUEST_TIMEOUT)
    resp.raise_for_status()
    with open(dest_path, 'wb') as f_img:
        f_img.write(resp.content)


# Main loop
for culture in cultures:
    for cls in classifications:
        print(f"Processing {culture} / {cls}...")
        # Fetch twice as many candidates as needed so the century balancing
        # below has something to choose from.
        items = _fetch_candidates(culture, cls, max_per_combo * 2)

        # Select up to max_per_combo, balanced by century
        selected = _select_balanced(items, max_per_combo)

        # Download images and write metadata
        folder = os.path.join(IMAGE_ROOT, sanitize(culture), sanitize(cls))
        count = 0
        for rec in selected:
            obj_id = rec.get('id')
            img_url = rec.get('primaryimageurl')
            # `or ''` covers records where the API returns an explicit null.
            title = (rec.get('title') or '').replace('\n', ' ').strip()
            century = rec.get('century') or ''
            if not img_url:
                continue
            filename = f"{sanitize(str(obj_id))}{get_extension_from_url(img_url)}"
            os.makedirs(folder, exist_ok=True)
            path = os.path.join(folder, filename)
            try:
                _download_image(img_url, path)
            except (requests.RequestException, OSError) as e:
                # Best effort: report the failure and move on to the next record.
                print(f"Failed downloading {img_url}: {e}")
                continue
            # Append metadata; the file is reopened per row so every completed
            # download is on disk before the next request starts.
            with open(OUTPUT_CSV, 'a', newline='', encoding='utf-8') as f:
                writer = csv.writer(f)
                writer.writerow([obj_id, title, cls, century, culture, path])
            count += 1

        breakdown[(culture, cls)] = count
        total_downloaded += count
        print(f"Downloaded {count} images for {culture} / {cls}\n")

# Final summary: grand total first, then one line per (culture, classification)
# pair in the order the pairs were processed.
print(f"All done. Total images downloaded: {total_downloaded}")
print("Breakdown by culture and classification:")
for combo, cnt in breakdown.items():
    culture, cls = combo
    print(f" - {culture} / {cls}: {cnt}")


Processing Byzantine / Architecture Elements...
Downloaded 1 images for Byzantine / Architecture Elements

Processing Byzantine / Coins...
Downloaded 150 images for Byzantine / Coins

Processing Byzantine / Paintings...
Downloaded 24 images for Byzantine / Paintings

Processing Byzantine / Sculpture...
Downloaded 36 images for Byzantine / Sculpture

Processing Byzantine / Seals...
Downloaded 150 images for Byzantine / Seals

Processing Byzantine / Vessels...
Downloaded 8 images for Byzantine / Vessels

Processing Byzantine / Weapons and Ammunitions...
Downloaded 0 images for Byzantine / Weapons and Ammunitions

Processing Greek / Architecture Elements...
Downloaded 9 images for Greek / Architecture Elements

Processing Greek / Coins...
Downloaded 150 images for Greek / Coins

Processing Greek / Paintings...
Downloaded 5 images for Greek / Paintings

Processing Greek / Sculpture...
Downloaded 150 images for Greek / Sculpture

Processing Greek / Seals...
Downloaded 6 images for Greek / Seals

In [11]:
import pandas as pd

# Rewrite the image_path column so every path is relative to the images/
# folder and uses forward slashes, regardless of which OS produced the CSV.
df = pd.read_csv("images/metadata.csv")

# Normalise separators first: that way a single anchored pattern strips the
# leading folder for both the Windows form (images\...) and the POSIX form
# (images/...). Anchoring with ^ also guarantees only a true prefix is removed.
df['image_path'] = (
    df['image_path']
    .str.replace('\\', '/', regex=False)
    .str.replace(r'^images/', '', regex=True)
)

df.to_csv('images/metadata.csv', index=False)