In [6]:
import os
import requests
import time
import csv
import json
from datetime import datetime
from urllib.parse import urlparse, unquote

# ─────────────────────────────────────────────────────────────────
# CONFIGURATION
# ─────────────────────────────────────────────────────────────────
CATEGORY_TITLE       = "Category:Ancient_Greek_architecture"
API_ENDPOINT         = "https://commons.wikimedia.org/w/api.php"
OUTPUT_CSV           = "ancient_greek_architecture_metadata.csv"
OUTPUT_JSON          = "ancient_greek_architecture_metadata.json"
DOWNLOAD_BASE_FOLDER = "downloaded_images"

# Your custom User-Agent per Wikimedia policy:
USER_AGENT = "AncientGreekArchDownloader/1.0 (kzhang83@student.ubc.ca)"

IIPROP = [
    "timestamp",   # upload date/time
    "user",        # who uploaded
    "url",         # direct file URL
    "size",        # file size in bytes
    "mime",        # MIME type
    "metadata",    # EXIF / embedded metadata if any
]

FETCH_CATEGORIES = True


# ─────────────────────────────────────────────────────────────────
# UTILITY FUNCTIONS
# ─────────────────────────────────────────────────────────────────
def fetch_category_members(category, session, cmcontinue=None):
    params = {
        "action": "query",
        "list": "categorymembers",
        "cmtitle": category,
        "cmlimit": "500",
        "format": "json",
    }
    if cmcontinue:
        params["cmcontinue"] = cmcontinue

    resp = session.get(API_ENDPOINT, params=params)
    resp.raise_for_status()
    return resp.json()

def fetch_imageinfo(titles, session):
    params = {
        "action": "query",
        "prop": "imageinfo",
        "titles": "|".join(titles),
        "iiprop": "|".join(IIPROP),
        "format": "json",
    }
    resp = session.get(API_ENDPOINT, params=params)
    resp.raise_for_status()
    return resp.json()

def fetch_categories_for_pages(pageids, session):
    params = {
        "action": "query",
        "prop": "categories",
        "pageids": "|".join(str(pid) for pid in pageids),
        "cllimit": "max",
        "format": "json",
    }
    resp = session.get(API_ENDPOINT, params=params)
    resp.raise_for_status()
    return resp.json()

def sanitize_category_name(cat_title):
    if cat_title.startswith("Category:"):
        cat_title = cat_title[len("Category:") :]
    return cat_title.replace(" ", "_").replace("/", "_")

def sanitize_filename_from_url(url):
    path = urlparse(url).path
    name = os.path.basename(path)
    return unquote(name)

def download_image(url, save_path, session):
    if os.path.exists(save_path):
        return
    headers = {"User-Agent": USER_AGENT}
    resp = session.get(url, headers=headers, stream=True)
    resp.raise_for_status()
    with open(save_path, "wb") as fd:
        for chunk in resp.iter_content(1024 * 8):
            fd.write(chunk)


# ─────────────────────────────────────────────────────────────────
# MAIN
# ─────────────────────────────────────────────────────────────────
if __name__ == "__main__":
    # (1) Create a Session with custom User-Agent
    session = requests.Session()
    session.headers.update({"User-Agent": USER_AGENT})

    # (2) Fetch all category members
    print(f"Fetching members of {CATEGORY_TITLE} ...")
    cmcontinue = None
    all_members = []
    while True:
        data = fetch_category_members(CATEGORY_TITLE, session, cmcontinue=cmcontinue)
        all_members.extend(data["query"]["categorymembers"])
        if "continue" in data:
            cmcontinue = data["continue"]["cmcontinue"]
            time.sleep(0.1)
        else:
            break

    print(f"  → Found {len(all_members)} total entries.")
    file_members = [m for m in all_members if m["ns"] == 6]
    print(f"  → {len(file_members)} are actual files (ns=6).")

    # (3) Batch‐query imageinfo & categories
    metadata_records = []
    chunk_size = 50
    for i in range(0, len(file_members), chunk_size):
        chunk = file_members[i : i + chunk_size]
        titles = [m["title"] for m in chunk]
        pageid_list = [m["pageid"] for m in chunk]

        img_data = fetch_imageinfo(titles, session)
        pages = img_data["query"]["pages"]

        if FETCH_CATEGORIES:
            cat_data = fetch_categories_for_pages(pageid_list, session)
            cat_pages = cat_data["query"]["pages"]
        else:
            cat_pages = {}

        for pid, pdata in pages.items():
            record = {
                "pageid": int(pid),
                "title": pdata.get("title", ""),
                "timestamp": None,
                "uploader": None,
                "file_url": None,
                "filesize": None,
                "mime": None,
                "exif": {},
                "categories": [],
            }

            if "imageinfo" in pdata:
                ii = pdata["imageinfo"][0]
                record["timestamp"] = ii.get("timestamp")
                record["uploader"]  = ii.get("user")
                record["file_url"]  = ii.get("url")
                record["filesize"]  = ii.get("size")
                record["mime"]      = ii.get("mime")

                md_list = ii.get("metadata", [])
                exif_dict = {}
                for md in md_list:
                    name = md.get("name")
                    val  = md.get("value")
                    if name and val:
                        exif_dict[name] = val
                record["exif"] = exif_dict

            if FETCH_CATEGORIES and pid in cat_pages:
                cats = cat_pages[pid].get("categories", [])
                record["categories"] = [c["title"] for c in cats]

            metadata_records.append(record)

        time.sleep(0.2)

    # (4) Compute “days_since_upload”
    for rec in metadata_records:
        ts = rec["timestamp"]
        if ts:
            dt = datetime.strptime(ts, "%Y-%m-%dT%H:%M:%SZ")
            delta = datetime.utcnow() - dt
            rec["days_since_upload"] = delta.days
        else:
            rec["days_since_upload"] = None

    # (5) Write metadata CSV & JSON
    print("Writing metadata to CSV and JSON...")
    with open(OUTPUT_CSV, mode="w", newline="", encoding="utf-8") as fh:
        fieldnames = [
            "pageid",
            "title",
            "timestamp",
            "days_since_upload",
            "uploader",
            "file_url",
            "filesize",
            "mime",
            "categories",
            "exif",
        ]
        writer = csv.DictWriter(fh, fieldnames=fieldnames)
        writer.writeheader()
        for rec in metadata_records:
            rec_row = {
                "pageid": rec["pageid"],
                "title": rec["title"],
                "timestamp": rec["timestamp"],
                "days_since_upload": rec["days_since_upload"],
                "uploader": rec["uploader"],
                "file_url": rec["file_url"],
                "filesize": rec["filesize"],
                "mime": rec["mime"],
                "categories": json.dumps(rec["categories"], ensure_ascii=False),
                "exif": json.dumps(rec["exif"], ensure_ascii=False),
            }
            writer.writerow(rec_row)

    with open(OUTPUT_JSON, "w", encoding="utf-8") as fh:
        json.dump(metadata_records, fh, ensure_ascii=False, indent=2)

    # (6) Prepare base download folder
    if not os.path.exists(DOWNLOAD_BASE_FOLDER):
        os.makedirs(DOWNLOAD_BASE_FOLDER)

    # ──────────────────────────────────────────────────────────────
    # (7) Download each image and copy into category folders
    # ──────────────────────────────────────────────────────────────
    print("Downloading images and organizing into category folders…")
    for rec in metadata_records:
        url = rec["file_url"]
        if not url:
            continue

        local_filename = sanitize_filename_from_url(url)
        temp_path = os.path.join(DOWNLOAD_BASE_FOLDER, local_filename)

        # (1) Download the image once (with proper User-Agent).
        try:
            download_image(url, temp_path, session)
        except Exception as e:
            print(f"  [!] Failed to download {url}: {e}")
            continue

        # (2) If download didn’t produce a file, skip copying entirely.
        if not os.path.exists(temp_path):
            # Download must have quietly failed; don’t attempt any folder/copy.
            continue

        # (3) Now iterate over each category; ensure the folder exists before copying.
        for cat_full in rec["categories"]:
            folder_name = sanitize_category_name(cat_full)
            target_dir = os.path.join(DOWNLOAD_BASE_FOLDER, folder_name)

            # ─── Create the category directory before opening the file ───
            try:
                os.makedirs(target_dir, exist_ok=True)
            except Exception as e:
                print(f"  [!] Could not create folder {target_dir}: {e}")
                # If for some reason mkdir fails, skip copying into this category.
                continue

            target_path = os.path.join(target_dir, local_filename)

            # (4) Finally, copy the downloaded file into that folder.
            #     We wrap this in try/except so that even if something odd happens,
            #     the script will not crash out with FileNotFoundError.
            if not os.path.exists(target_path):
                try:
                    with open(temp_path, "rb") as src_fd, open(target_path, "wb") as dst_fd:
                        dst_fd.write(src_fd.read())
                except Exception as e:
                    print(f"  [!] Could not copy {local_filename} → {target_dir}: {e}")
                    # Continue on to the next category or image
                    continue

print("All done.")
print(f"  • Metadata CSV: {OUTPUT_CSV}")
print(f"  • Metadata JSON: {OUTPUT_JSON}")
print(f"  • Downloaded images folder: {DOWNLOAD_BASE_FOLDER}/")


Fetching members of Category:Ancient_Greek_architecture ...
  → Found 59 total entries.
  → 20 are actual files (ns=6).


  delta = datetime.utcnow() - dt


Writing metadata to CSV and JSON...
Downloading images and organizing into category folders…
  [!] Could not copy 1047_-_Keramikos_Museum,_Athens_-_Vase_shaped_as_a_grain_silos,_700-650_BC_-_Photo_by_Giovanni_Dall'Orto_Nov_12_2009.jpg → downloaded_images\Ancient_Greek_terracotta_figurines_in_the_Kerameikos_Archaeological_Museum_(Athens): [Errno 2] No such file or directory: "downloaded_images\\Ancient_Greek_terracotta_figurines_in_the_Kerameikos_Archaeological_Museum_(Athens)\\1047_-_Keramikos_Museum,_Athens_-_Vase_shaped_as_a_grain_silos,_700-650_BC_-_Photo_by_Giovanni_Dall'Orto_Nov_12_2009.jpg"
  [!] Could not copy 1047_-_Keramikos_Museum,_Athens_-_Vase_shaped_as_a_grain_silos,_700-650_BC_-_Photo_by_Giovanni_Dall'Orto_Nov_12_2009.jpg → downloaded_images\Attic_Geometric_pottery_in_the_Kerameikos_Archaeological_Museum_(Athens): [Errno 2] No such file or directory: "downloaded_images\\Attic_Geometric_pottery_in_the_Kerameikos_Archaeological_Museum_(Athens)\\1047_-_Keramikos_Museum,_Athe