In [7]:
import requests
import time
import csv
import re
from pathlib import Path
from tqdm import tqdm

In [8]:
# BASE_URL = "https://collectionapi.metmuseum.org/public/collection/v1"

# # 1. Look up the departmentId for “Greek and Roman Art”
# deps = requests.get(f"{BASE_URL}/departments").json()["departments"]
# dept_id = next(d["departmentId"] for d in deps if d["displayName"] == "Greek and Roman Art")

# # 2. Search for "Statue" objects with images in that department
# params = {
#     "departmentId": dept_id,
#     "artistOrCulture": True,
#     "q": "Statue",
#     "hasImages": True
# }
# search = requests.get(f"{BASE_URL}/search", params=params).json()
# object_ids = search.get("objectIDs", [])

# print(f"Found {len(object_ids)} Greek objects with images in Dept. {dept_id}")

# # 3. Download each primaryImage
# out_dir = Path("met_greek_images")
# out_dir.mkdir(exist_ok=True)

# # OPTIONAL: set to None or an integer to cap total downloads
# MAX_DOWNLOADS = None  # e.g. 500 to stop after 500 images

# count = 0
# for obj_id in tqdm(object_ids, desc="Downloading images"):
#     if MAX_DOWNLOADS and count >= MAX_DOWNLOADS:
#         break

#     # 3a. Fetch metadata
#     try:
#         obj = requests.get(f"{BASE_URL}/objects/{obj_id}").json()
#     except Exception as e:
#         # skip if something goes wrong with JSON decoding
#         tqdm.write(f"[{obj_id}] metadata fetch failed: {e}")
#         continue

#     img_url = obj.get("primaryImage")
#     if not img_url:
#         continue  # skip no-image records

#     # 3b. Download the image
#     try:
#         resp = requests.get(img_url, timeout=10)
#         resp.raise_for_status()
#     except Exception as e:
#         tqdm.write(f"[{obj_id}] image download failed: {e}")
#         continue

#     # 3c. Write to disk
#     ext = img_url.split(".")[-1].split("?")[0]  # handle query strings
#     filename = out_dir / f"{obj_id}.{ext}"
#     filename.write_bytes(resp.content)
#     count += 1

#     # 3d. Throttle (optional, adjust as needed)
#     time.sleep(0.2)

# print(f"Done — downloaded {count} images to {out_dir}/")


In [9]:
# Base API URL
BASE_URL = "https://collectionapi.metmuseum.org/public/collection/v1"

# 1. Look up the departmentId for "Greek and Roman Art".
#    The timeout keeps a stalled connection from hanging the cell forever, and
#    raise_for_status() surfaces HTTP errors here instead of letting an error
#    payload be parsed as if it were the department list.
deps_resp = requests.get(f"{BASE_URL}/departments", timeout=10)
deps_resp.raise_for_status()
deps = deps_resp.json().get("departments", [])
dept_id = next(
    (d["departmentId"] for d in deps if d["displayName"] == "Greek and Roman Art"),
    None,
)
if dept_id is None:
    raise ValueError("Department 'Greek and Roman Art' not found")

# 2. Search that department for objects that have images.
#    Workaround: a single-space query is sent as a catch-all search term.
#    NOTE(review): the output folder is named "highlights", but no isHighlight
#    filter is sent with this request -- confirm whether one was intended.
params = {
    "departmentId": dept_id,
    "hasImages": True,
    "q": " "  # single space used as a catch-all query
}
search_resp = requests.get(f"{BASE_URL}/search", params=params, timeout=10)
search_resp.raise_for_status()
search = search_resp.json()
# `or []` also covers an explicit null value for "objectIDs".
object_ids = search.get("objectIDs", []) or []
print(f"Found {len(object_ids)} objects with images in Dept {dept_id}")

# 3. Prepare output directories and CSV
base_dir = Path("met_highlights_greek_roman")
base_dir.mkdir(exist_ok=True)

csv_path = base_dir / "metadata.csv"
# Column order of the metadata CSV; "imageFiles" holds the saved image paths.
fieldnames = [
    "objectID", "title", "objectName", "objectDate", "period", "culture",
    "medium", "classification", "artistDisplayName", "repository", "objectURL", "imageFiles"
]

# Utility to sanitize folder names
def sanitize(name):
    """Return ``name`` reduced to characters that are safe in a folder name.

    Every run of characters that are not word characters, hyphens, or spaces
    is collapsed into a single underscore; surrounding whitespace is then
    stripped from the result.
    """
    unsafe_run = re.compile(r"[^\w\- ]+")
    cleaned = unsafe_run.sub("_", name)
    return cleaned.strip()

# Open CSV for writing
with open(csv_path, 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()

    # 4. Iterate and download
    for obj_id in tqdm(object_ids, desc="Downloading"):
        # Fetch object metadata. The timeout prevents a stalled request from
        # blocking the loop, and raise_for_status() makes an HTTP error skip
        # the object instead of producing a blank "Unknown" row in the CSV.
        try:
            meta_resp = requests.get(f"{BASE_URL}/objects/{obj_id}", timeout=10)
            meta_resp.raise_for_status()
            obj = meta_resp.json()
        except Exception as exc:
            # Still best-effort, but report the skip rather than hiding it.
            tqdm.write(f"[{obj_id}] metadata fetch failed: {exc}")
            continue

        # 4a. Classify by objectName (e.g., "Vase", "Statue")
        obj_type = obj.get("objectName") or "Unknown"
        # A name made only of whitespace sanitizes to "", which would place
        # the files directly in base_dir; fall back to "Unknown" instead.
        folder_name = sanitize(obj_type) or "Unknown"
        dest_dir = base_dir / folder_name
        dest_dir.mkdir(exist_ok=True)

        # 4b. Collect image URLs: primary + additional
        image_urls = []
        if obj.get("primaryImage"):
            image_urls.append(obj["primaryImage"])
        # `or []` tolerates a present-but-null "additionalImages" value.
        image_urls.extend(obj.get("additionalImages") or [])

        downloaded_files = []
        for idx, url in enumerate(image_urls, start=1):
            try:
                resp = requests.get(url, timeout=10)
                resp.raise_for_status()
                # Drop any query string from the suffix; default to .jpg.
                ext = Path(url).suffix.split('?')[0] or ".jpg"
                fout = dest_dir / f"{obj_id}_{idx}{ext}"
                fout.write_bytes(resp.content)
            except Exception as exc:
                # One failed image must not abort the object or the run.
                tqdm.write(f"[{obj_id}] image {idx} download failed: {exc}")
                continue
            downloaded_files.append(str(fout))
            time.sleep(0.1)  # brief pause between successful downloads

        # 4c. Write metadata row; imageFiles lists the saved paths joined by "|"
        writer.writerow({
            "objectID": obj_id,
            "title": obj.get("title", ""),
            "objectName": obj_type,
            "objectDate": obj.get("objectDate", ""),
            "period": obj.get("period", ""),
            "culture": obj.get("culture", ""),
            "medium": obj.get("medium", ""),
            "classification": obj.get("classification", ""),
            "artistDisplayName": obj.get("artistDisplayName", ""),
            "repository": obj.get("repository", ""),
            "objectURL": obj.get("objectURL", ""),
            "imageFiles": "|".join(downloaded_files)
        })

print(f"Done! Metadata CSV saved to {csv_path}")

Found 32 objects with images in Dept 13


Downloading: 100%|██████████| 32/32 [00:23<00:00,  1.38it/s]

Done! Metadata CSV saved to met_highlights_greek_roman\metadata.csv



