# Manual Curation Workflow

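Curate roughly 20 high-quality Harvard landmarks (buildings, statues, gates, a fountain, and a bridge), matching each to its Wikipedia page and pairing it with manually downloaded images.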

In [7]:
import csv
import hashlib
import time
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List, Optional, Tuple

import pandas as pd
import requests
from PIL import Image

In [None]:
# Landmarks to curate. Each entry carries:
#   "name" - the text used as the Wikipedia search query for the landmark
#   "type" - a free-form category label copied onto the result as "manual_type"
# An entry may also carry an optional "pageid" key; the search cell checks for it
# and, when present, fetches that page directly instead of prompting for a match.
CURATED_ITEMS = [
    {"name": "Widener Library", "type": "building"},
    {"name": "Massachusetts Hall", "type": "building"},
    {"name": "University Hall", "type": "building"},
    {"name": "Memorial Church", "type": "building"},
    {"name": "Sever Hall", "type": "building"},
    {"name": "Harvard Hall", "type": "building"},
    {"name": "Harvard Science Center", "type": "building"},
    {"name": "Memorial Hall", "type": "building"},
    {"name": "John Harvard Statue", "type": "statue"},
    {"name": "Johnston Gate", "type": "gate"},
    {"name": "Dexter Gate", "type": "gate"},
    {"name": "Tanner Fountain", "type": "fountain"},
    {"name": "Meyer Gate", "type": "gate"},
    {"name": "Harvard Art Museums", "type": "building"},
    {"name": "Harvard Lampoon Building", "type": "building"},
    {"name": "Lowell House", "type": "building"},
    {"name": "Weld Boathouse", "type": "building"},
    {"name": "Weeks Footbridge", "type": "bridge"},
    {"name": "Langdell Hall", "type": "building"},
    {"name": "Smith Campus Center", "type": "building"},
    {"name": "Science and Engineering Complex", "type": "building"},
]

# Root directory that receives the generated CSVs and the images/ tree.
# NOTE(review): this is an absolute, machine-specific path; the notebook will not
# run unchanged on another machine.
OUTPUT_DIR = Path("/Users/hughv/Documents/Harvard/AC215/ac215_HistoriCam/data_manual")
# Sent as the User-Agent header on every Wikipedia API request made through the shared session.
USER_AGENT = "HistoriCam/1.0 (Educational project; contact: hughvandeventer@g.harvard.edu)"
# Endpoint used by all search / query helpers below.
WIKIPEDIA_API = "https://en.wikipedia.org/w/api.php"
print(f"‚úì {len(CURATED_ITEMS)} items to curate")

‚úì 1 items to curate


In [9]:
# Cell 3: Wikipedia Search Functions

def search_wikipedia(query: str, session: requests.Session, limit: int = 5) -> List[Tuple[str, str]]:
    """Run an opensearch query against Wikipedia.

    Args:
        query: Free-text search string.
        session: HTTP session used for the request.
        limit: Maximum number of suggestions to request.

    Returns:
        A list of (title, description) pairs in the order the API returned them.
        Any failure (network, HTTP status, malformed payload) is reported with a
        printed message and yields an empty list.
    """
    request_params = {
        "action": "opensearch",
        "search": query,
        "limit": limit,
        "format": "json",
        "namespace": 0,
    }
    try:
        response = session.get(WIKIPEDIA_API, params=request_params, timeout=30)
        response.raise_for_status()
        payload = response.json()

        # The payload is positional: index 1 holds titles, index 2 descriptions.
        if len(payload) > 1:
            titles = payload[1]
        else:
            titles = []
        if len(payload) > 2:
            descriptions = payload[2]
        else:
            descriptions = [""] * len(titles)

        return [(title, description) for title, description in zip(titles, descriptions)]
    except Exception as e:
        print(f"  Error searching: {e}")
        return []

def get_pageid_from_title(title: str, session: requests.Session) -> Optional[int]:
    """Look up the numeric page id for a Wikipedia page title.

    Redirects are resolved by the API, so a title that is only a redirect
    yields the id of the article it points to rather than the id of the
    redirect stub (which carries no coordinates or Wikidata item).

    Args:
        title: Page title to resolve.
        session: HTTP session used for the request.

    Returns:
        The page id as an int, or None when the title does not exist, is
        invalid, or the request fails (failures are reported with a printed
        message).
    """
    params = {"action": "query", "titles": title, "format": "json", "redirects": 1}
    try:
        r = session.get(WIKIPEDIA_API, params=params, timeout=30)
        r.raise_for_status()
        pages = r.json()["query"]["pages"]
        # Missing / invalid titles are reported under negative placeholder keys
        # ("-1", "-2", ...); only a positive key is a real page id.
        for key in pages:
            pageid = int(key)
            if pageid > 0:
                return pageid
        return None
    except Exception as e:
        print(f"  Error getting pageid: {e}")
        return None

def fetch_page_details(pageid: int, session: requests.Session) -> Optional[Dict]:
    """Retrieve title, coordinates, Wikidata item and aliases for one page.

    Args:
        pageid: Wikipedia page id to look up.
        session: HTTP session used for the request.

    Returns:
        A dict with keys "title", "pageid", "url", "lat", "lon", "qid" and
        "aliases" (pipe-joined string, empty when there are none). "lat",
        "lon" and "qid" are None when the page does not provide them.
        Returns None, after printing the error, if anything goes wrong.
    """
    page_key = str(pageid)
    query = {
        "action": "query",
        "format": "json",
        "prop": "coordinates|pageprops|pageterms",
        "pageids": page_key,
        "coprop": "type|dim|name|country|region|globe",
        "ppprop": "wikibase_item",
        "wbptterms": "alias",
    }
    try:
        response = session.get(WIKIPEDIA_API, params=query, timeout=30)
        response.raise_for_status()
        page = response.json()["query"]["pages"][page_key]

        # Only the first coordinate entry is used; pages without any fall back to {}.
        first_coord = page.get("coordinates", [{}])[0]
        alias_terms = page.get("terms", {}).get("alias", [])
        if alias_terms:
            alias_text = "|".join(alias_terms)
        else:
            alias_text = ""

        details = {
            "title": page["title"],
            "pageid": pageid,
            "url": f"https://en.wikipedia.org/?curid={pageid}",
            "lat": first_coord.get("lat"),
            "lon": first_coord.get("lon"),
            "qid": page.get("pageprops", {}).get("wikibase_item"),
            "aliases": alias_text,
        }
        return details
    except Exception as e:
        print(f"  Error fetching details: {e}")
        return None

def interactive_search_and_select(item: Dict, session: requests.Session) -> Optional[Dict]:
    """Show Wikipedia matches for an item and let the user choose one.

    Args:
        item: Curated entry; "name" is the search query and "type" (optional)
            is copied onto the result as "manual_type".
        session: HTTP session used for all requests.

    Returns:
        The page-details dict for the chosen result, or None when there are no
        results, the user enters 's', or the chosen page cannot be resolved.
    """
    candidates = search_wikipedia(item["name"], session, limit=5)
    if not candidates:
        print(f"  No results for '{item['name']}'")
        return None

    print(f"\n  Found {len(candidates)} results:")
    for position, (title, desc) in enumerate(candidates):
        print(f"    [{position}] {title}")
        if desc:
            print(f"        {desc[:80]}...")

    prompt = f"\n  Select [0-{len(candidates)-1}] or 's' to skip: "
    while True:
        answer = input(prompt).strip().lower()
        if answer == 's':
            return None

        # Anything that is not an in-range integer re-prompts.
        try:
            picked = int(answer)
        except ValueError:
            picked = None

        if picked is not None and 0 <= picked < len(candidates):
            selected_title = candidates[picked][0]
            print(f"  Selected: {selected_title}")
            pageid = get_pageid_from_title(selected_title, session)
            if not pageid:
                return None
            details = fetch_page_details(pageid, session)
            if not details:
                return None
            details["manual_type"] = item.get("type", "unknown")
            return details

        print("  Invalid choice")

# Confirmation message so running this definitions-only cell produces visible output.
print("‚úì Wikipedia search functions defined")

‚úì Wikipedia search functions defined


In [10]:
# Cell 4: Execute Wikipedia Search

# One shared session so every request carries the project User-Agent header.
session = requests.Session()
session.headers.update({'User-Agent': USER_AGENT})

# Results are rebuilt from scratch each time this cell runs.
found_pages = []
skipped_items = []

print(f"Starting Wikipedia search for {len(CURATED_ITEMS)} items...\n")
print("="*60)

for idx, item in enumerate(CURATED_ITEMS, 1):
    print(f"\n[{idx}/{len(CURATED_ITEMS)}] Searching for: {item['name']}")

    if 'pageid' in item:
        # A manually supplied page id bypasses the interactive search.
        print(f"  Using manually provided pageid: {item['pageid']}")
        details = fetch_page_details(item['pageid'], session)
        if details:
            details['manual_type'] = item.get('type', 'unknown')
            found_pages.append(details)
            print(f"  ‚úì Found: {details['title']}")
        else:
            print(f"  ‚úó Failed to fetch details")
            skipped_items.append(item)
    else:
        details = interactive_search_and_select(item, session)
        if details:
            found_pages.append(details)
            print(f"  ‚úì Added to curated list")
        else:
            print(f"  ‚úó Skipped")
            skipped_items.append(item)

    # Small pause between items to stay polite towards the API.
    time.sleep(0.5)

print("\n" + "="*60)
print("SEARCH COMPLETE")
print("="*60)
print(f"‚úì Found: {len(found_pages)} pages")
print(f"‚úó Skipped: {len(skipped_items)} items")

if skipped_items:
    print("\nSkipped items:")
    for item in skipped_items:
        print(f"  - {item['name']}")

if found_pages:
    print("\nFound pages:")
    for p in found_pages:
        # Compare against None so a legitimate 0.0 coordinate is not reported as missing.
        has_coords = p.get('lat') is not None and p.get('lon') is not None
        coord_str = f"({p['lat']:.4f}, {p['lon']:.4f})" if has_coords else "(no coords)"
        # The "qid" key is always present (possibly None), so a .get() default would
        # never apply; fall back explicitly instead of printing "None".
        print(f"  {p['title']} - {coord_str} - {p.get('qid') or 'no QID'}")

Starting Wikipedia search for 1 items...


[1/1] Searching for: Science and Engineering Complex
  No results for 'Science and Engineering Complex'
  ‚úó Skipped

SEARCH COMPLETE
‚úì Found: 0 pages
‚úó Skipped: 1 items

Skipped items:
  - Science and Engineering Complex


In [5]:
# Cell 5: Generate CSV Files

def _write_csv_rows(path: Path, header: List[str], rows: List[list]) -> None:
    """Write `header` followed by `rows` to `path` as a UTF-8 CSV file."""
    with open(path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(header)
        writer.writerows(rows)


def generate_buildings_csvs(pages: List[Dict], output_dir: Path) -> Tuple[Path, Path, Path]:
    """Write the three buildings CSVs for the curated pages.

    Row ids are the 1-based position of each page in `pages`, and are the same
    across all three files.

    Args:
        pages: Page-details dicts; "title" and "url" are required, "lat",
            "lon", "aliases" and "qid" are optional (None is written as an
            empty cell).
        output_dir: Destination directory; created (with parents) if missing.

    Returns:
        Paths of buildings_names.csv, buildings_names_metadata.csv and
        buildings_info.csv, in that order.
    """
    output_dir.mkdir(parents=True, exist_ok=True)
    # datetime.utcnow() is deprecated; take an aware UTC "now" and drop the tzinfo
    # so the serialized timestamp keeps the same naive ISO format as before.
    now = datetime.now(timezone.utc).replace(tzinfo=None).isoformat()

    # buildings_names.csv
    names_path = output_dir / "buildings_names.csv"
    _write_csv_rows(
        names_path,
        ["id", "name", "source_url", "last_seen", "source"],
        [[idx, page['title'], page['url'], now, "wikipedia"]
         for idx, page in enumerate(pages, start=1)],
    )
    print(f"‚úì Created {names_path.name}")

    # buildings_names_metadata.csv
    metadata_path = output_dir / "buildings_names_metadata.csv"
    _write_csv_rows(
        metadata_path,
        ["id", "name", "source_url", "last_seen", "source", "latitude", "longitude", "aliases", "wikibase_item"],
        [[idx, page['title'], page['url'], now, "wikipedia",
          page.get('lat', ''), page.get('lon', ''), page.get('aliases', ''), page.get('qid', '')]
         for idx, page in enumerate(pages, start=1)],
    )
    print(f"‚úì Created {metadata_path.name}")

    # buildings_info.csv (stub): only id, name and URL are filled; the ten
    # descriptive columns are left empty.
    info_path = output_dir / "buildings_info.csv"
    _write_csv_rows(
        info_path,
        ["id", "name", "source_url", "built_year", "architect", "architectural_style",
         "location", "materials", "building_type", "owner", "height", "construction_cost", "unstructured_info"],
        [[idx, page['title'], page['url']] + [''] * 10
         for idx, page in enumerate(pages, start=1)],
    )
    print(f"‚úì Created {info_path.name} (stub)")

    return names_path, metadata_path, info_path

# Write the CSVs only when the search cell produced at least one page.
if found_pages:
    print("Generating CSV files...\n")
    csv_paths = generate_buildings_csvs(found_pages, OUTPUT_DIR)
    print(f"\n‚úì All CSVs generated in {OUTPUT_DIR}", f"  Total buildings: {len(found_pages)}", sep="\n")
else:
    print("‚ö† No pages found. Run Cell 4 first.")

Generating CSV files...

‚úì Created buildings_names.csv
‚úì Created buildings_names_metadata.csv
‚úì Created buildings_info.csv (stub)

‚úì All CSVs generated in /Users/hughv/Documents/Harvard/AC215/ac215_HistoriCam/data_manual
  Total buildings: 17


  now = datetime.utcnow().isoformat()


In [6]:
# Cell 6: Setup Image Directories

def setup_image_directories(num_buildings: int, output_dir: Path) -> Path:
    """Create one empty image directory per building id.

    Creates `output_dir/images/` and, inside it, sub-directories named
    "1" .. str(num_buildings). Existing directories are left untouched, and
    missing parents of `output_dir` are created as needed.

    Args:
        num_buildings: Number of building ids to create directories for.
        output_dir: Root output directory.

    Returns:
        Path of the `images` directory.
    """
    images_dir = output_dir / "images"
    # parents=True so this works even if output_dir itself has not been created yet.
    images_dir.mkdir(parents=True, exist_ok=True)
    for building_id in range(1, num_buildings + 1):
        (images_dir / str(building_id)).mkdir(exist_ok=True)
    return images_dir

# Create the per-building folders, then print the manual download instructions.
if found_pages:
    images_dir = setup_image_directories(len(found_pages), OUTPUT_DIR)
    print("‚úì Created image directories\n")
    print("="*60, "MANUAL STEP: Download Images", "="*60, sep="\n")
    print(f"\nDirectories: {images_dir}")
    print(
        "\nNext steps:",
        "1. Navigate to each numbered directory (1/, 2/, etc.)",
        "2. Download images for corresponding building",
        "3. Any filename works - code will rename later",
        "4. Formats: JPEG, PNG, WebP",
        "5. Min: 512x512px, Max: 10MB per image",
        sep="\n",
    )
    print("\nBuilding ID to Name mapping:", "-" * 60, sep="\n")
    # Directory number == 1-based position of the page in found_pages.
    for idx, page in enumerate(found_pages, start=1):
        print(f"  {idx}/ -> {page['title']}")
    print("-" * 60, "\nRun Cell 7 after downloading images.", sep="\n")
else:
    print("‚ö† No pages found. Run Cell 4 first.")

‚úì Created image directories

MANUAL STEP: Download Images

Directories: /Users/hughv/Documents/Harvard/AC215/ac215_HistoriCam/data_manual/images

Next steps:
1. Navigate to each numbered directory (1/, 2/, etc.)
2. Download images for corresponding building
3. Any filename works - code will rename later
4. Formats: JPEG, PNG, WebP
5. Min: 512x512px, Max: 10MB per image

Building ID to Name mapping:
------------------------------------------------------------
  1/ -> Widener Library
  2/ -> Massachusetts Hall (Harvard University)
  3/ -> Memorial Church of Harvard University
  4/ -> Sever Hall
  5/ -> Harvard Hall
  6/ -> Harvard Science Center
  7/ -> Memorial Hall (Harvard University)
  8/ -> John Harvard Statue
  9/ -> Johnston Gate
  10/ -> Meyer Gate
  11/ -> Harvard Art Museums
  12/ -> Harvard Lampoon Building
  13/ -> Lowell House
  14/ -> Weld Boathouse
  15/ -> Weeks Footbridge
  16/ -> Langdell Hall
  17/ -> Smith Campus Center
----------------------------------------------

In [16]:
# Cell 7: Process Manual Images (from existing CSVs)

def validate_image_file(img_path: Path) -> Optional[Dict]:
    """Check that a file is a readable JPEG, PNG or WebP image.

    Only the decoded image format is checked; pixel dimensions and file size
    are reported but not enforced.

    Args:
        img_path: Path of the candidate image file.

    Returns:
        A dict with 'width', 'height', 'format' (as reported by Pillow) and
        'size_bytes', or None (after printing the reason) when the file
        cannot be opened as an image or has an unsupported format.
    """
    try:
        with Image.open(img_path) as img:
            # Pillow reports WebP as "WEBP"; normalise case so WebP images are
            # not rejected by a case-sensitive comparison.
            detected_format = (img.format or "").upper()
            if detected_format not in ('JPEG', 'PNG', 'WEBP'):
                print(f"  ‚úó {img_path.name}: Invalid format ({img.format})")
                return None
            size_bytes = img_path.stat().st_size
            return {'width': img.width, 'height': img.height, 'format': img.format, 'size_bytes': size_bytes}

    except Exception as e:
        print(f"  ‚úó {img_path.name}: Error - {e}")
        return None

def compute_sha256(file_path: Path) -> str:
    """Return the hex SHA-256 digest of a file's contents.

    The file is read in 4096-byte chunks so it never has to fit in memory.
    """
    digest = hashlib.sha256()
    with open(file_path, "rb") as stream:
        while True:
            chunk = stream.read(4096)
            if not chunk:
                break
            digest.update(chunk)
    return digest.hexdigest()

def _list_image_files(building_dir: Path) -> List[Path]:
    """Return the image files directly inside `building_dir`, each exactly once, sorted by name.

    Files are selected by extension (.jpg, .jpeg, .png, .webp) compared
    case-insensitively.
    """
    image_suffixes = {'.jpg', '.jpeg', '.png', '.webp'}
    return sorted(
        entry for entry in building_dir.iterdir()
        if entry.is_file() and entry.suffix.lower() in image_suffixes
    )


def process_manual_images_from_csv(output_dir: Path) -> Tuple[List[Dict], Dict]:
    """Scan, validate, hash, and rename images using existing CSV data.

    Buildings are read from `output_dir/buildings_names_metadata.csv`. For each
    building id, every image file in `output_dir/images/<id>/` is validated,
    hashed with SHA-256 and renamed to `<sha256><ext>` (".jpeg" is normalised
    to ".jpg"). Files whose content duplicates an image already recorded for
    the same building are removed and not listed twice.

    Args:
        output_dir: Root directory holding the CSVs and the images/ tree.

    Returns:
        (manifest_records, stats). `manifest_records` has one dict per kept
        image. `stats` holds the counters 'total_buildings',
        'buildings_with_images', 'total_images', 'valid_images',
        'invalid_images', 'renamed_count' and 'duplicate_images'. When the
        metadata CSV is missing, a warning is printed and ([], {}) is returned.
    """
    # Read buildings from CSV
    buildings_csv = output_dir / "buildings_names_metadata.csv"
    if not buildings_csv.exists():
        print(f"‚ö† {buildings_csv} not found")
        return [], {}

    buildings_df = pd.read_csv(buildings_csv)

    manifest_records = []
    stats = {
        'total_buildings': len(buildings_df),
        'buildings_with_images': 0,
        'total_images': 0,
        'valid_images': 0,
        'invalid_images': 0,
        'renamed_count': 0,
        'duplicate_images': 0
    }

    images_dir = output_dir / "images"
    # The MIME type is derived from the (normalised) file extension.
    mime_map = {'.jpg': 'image/jpeg', '.png': 'image/png', '.webp': 'image/webp'}

    for _, row in buildings_df.iterrows():
        building_id = row['id']
        building_name = row['name']
        qid = row.get('wikibase_item', '')

        building_dir = images_dir / str(building_id)

        if not building_dir.exists():
            continue

        # A single directory listing avoids visiting a file twice (which overlapping
        # glob patterns can do) and also picks up upper-case variants such as .JPEG.
        image_files = _list_image_files(building_dir)

        if not image_files:
            print(f"[{building_id}] {building_name}: No images")
            continue

        print(f"\n[{building_id}] {building_name}: Processing {len(image_files)} images...")
        stats['buildings_with_images'] += 1

        # Canonical filenames already recorded for this building.
        recorded_names = set()

        for img_path in image_files:
            stats['total_images'] += 1
            metadata = validate_image_file(img_path)
            if not metadata:
                stats['invalid_images'] += 1
                continue

            img_hash = compute_sha256(img_path)
            ext = img_path.suffix.lower()
            if ext == '.jpeg':
                ext = '.jpg'
            new_filename = f"{img_hash}{ext}"
            new_path = building_dir / new_filename
            original_filename = img_path.name

            if new_filename in recorded_names:
                # Identical content was already recorded under this name. Drop the
                # extra copy, unless this path *is* the canonical file itself.
                is_canonical_file = new_path.exists() and img_path.samefile(new_path)
                if not is_canonical_file:
                    img_path.unlink()
                stats['duplicate_images'] += 1
                print(f"  - {original_filename}: duplicate of {new_filename}, skipped")
                continue
            recorded_names.add(new_filename)

            if img_path.name != new_filename:
                # replace() overwrites an existing target on every platform; a target
                # with this name necessarily holds the same bytes.
                img_path.replace(new_path)
                stats['renamed_count'] += 1

            manifest_records.append({
                'building_id': building_id,
                'building_name': building_name,
                'qid': qid if pd.notna(qid) else '',
                'image_hash': img_hash,
                'filename': new_filename,
                'original_filename': original_filename,
                'local_path': f"/data_manual/images/{building_id}/{new_filename}",
                'url': '',
                'width': metadata['width'],
                'height': metadata['height'],
                'size_bytes': metadata['size_bytes'],
                'mime_type': mime_map.get(ext, 'image/jpeg')
            })
            stats['valid_images'] += 1
            print(f"  ‚úì {new_filename} ({metadata['width']}x{metadata['height']}px)")

    return manifest_records, stats

def generate_image_manifest(manifest_records: List[Dict], output_dir: Path) -> Path:
    """Write `output_dir/images/image_manifest.csv` from the manifest records.

    The file has a header row followed by one row per record, using a fixed
    twelve-column layout. Returns the path of the written file.
    """
    columns = [
        'building_id', 'building_name', 'qid', 'image_hash', 'filename', 'original_filename',
        'local_path', 'url', 'width', 'height', 'size_bytes', 'mime_type',
    ]
    manifest_path = output_dir / "images" / "image_manifest.csv"
    with open(manifest_path, 'w', newline='', encoding='utf-8') as handle:
        manifest_writer = csv.DictWriter(handle, fieldnames=columns)
        manifest_writer.writeheader()
        for record in manifest_records:
            manifest_writer.writerow(record)
    return manifest_path

# Main execution
# Process the downloaded images, write the manifest, then print a summary.
if not OUTPUT_DIR.exists():
    print(f"‚ö† {OUTPUT_DIR} doesn't exist")
else:
    print("Processing manual images from existing CSVs...\n" + "="*60)
    manifest_records, image_stats = process_manual_images_from_csv(OUTPUT_DIR)

    if manifest_records:
        manifest_path = generate_image_manifest(manifest_records, OUTPUT_DIR)
        print(f"\n‚úì Created {manifest_path.name} with {len(manifest_records)} images")
    else:
        print("\n‚ö† No valid images found")

    # The processing function returns an empty stats dict when the buildings CSV is
    # missing (it has already printed a warning); skip the summary in that case
    # instead of failing on a missing key.
    if image_stats:
        print("\n" + "="*60)
        print("IMAGE PROCESSING SUMMARY")
        print("="*60)
        print(f"Buildings with images: {image_stats['buildings_with_images']}/{image_stats['total_buildings']}")
        print(f"Total images: {image_stats['total_images']}")
        print(f"Valid images: {image_stats['valid_images']}")
        print(f"Invalid images: {image_stats['invalid_images']}")
        print(f"Images renamed: {image_stats['renamed_count']}")
        if image_stats['valid_images'] > 0:
            # max(1, ...) guards the division even though valid images imply at least one building.
            avg = image_stats['valid_images'] / max(1, image_stats['buildings_with_images'])
            print(f"Avg per building: {avg:.1f}")

Processing manual images from existing CSVs...

[1] Widener Library: Processing 16 images...
  ‚úì 1cd72fd7ea8f57379d39bf4fe3d5d3079de69b5115802b94741c76b967b6b882.jpg (3872x1803px)
  ‚úì 4a83717a1b8fb0ff94eeffe0d8a2d779ddb0ec4fe5ecfd4d881aa56bb7c01374.jpg (2000x1499px)
  ‚úì be61ea6c5eadf07012b337e7ab25c6a9981c80ee2a97878bee20fb917fd23830.jpg (1920x1080px)
  ‚úì 7c321bcc1aa7f59970153ae2137f9ca92b8f0d1742fa09d3002bf05f2760e537.jpg (1024x683px)
  ‚úì ae50df860efa2142c49723adf99ef87387a7de188cecbfa92ea7bc554be551cb.jpg (1024x682px)
  ‚úì 8ea002a486a7bbc3e16ae788b9ad29d3716f7db4c232bd434058efb1374e5a97.jpg (3600x2316px)
  ‚úì 292000428a9cb8786761539b4ec978da7c66d72e897fc9632bf71f422b5d25de.jpg (2000x1348px)
  ‚úì 68cf501e6d230fb73c140b01910a97dfdac0177c19f972a40503c79cc5699180.jpg (1500x1000px)
  ‚úì 33855b2cf4523db5a5a1d1d3812d32ea3987f3e9d5c4e6e478c0e5ef17bb3dbc.jpg (1024x679px)
  ‚úì cd547c195fe85ea5f62ffe679643536b11d26e81f83e2a21f7ebd8c722267848.jpg (640x428px)
  ‚úì 0228ac4955deb73c

In [17]:
# Cell 8: Validation and Summary

def validate_csv_schemas(output_dir: Path) -> Dict:
    """Validate that the generated CSVs exist and have the expected columns.

    Checks buildings_names.csv (columns and 1..N sequential ids),
    buildings_names_metadata.csv (columns) and images/image_manifest.csv
    (columns).

    Args:
        output_dir: Root directory holding the CSVs and the images/ tree.

    Returns:
        A dict with:
          'valid'           - True when no issue was found
          'issues'          - list of human-readable problem descriptions
          'buildings_count' - row count of buildings_names.csv (0 if missing)
          'images_count'    - row count of the image manifest (0 if missing)
    """
    issues = []

    names_path = output_dir / "buildings_names.csv"
    names_df = None
    if not names_path.exists():
        issues.append("buildings_names.csv not found")
    else:
        names_df = pd.read_csv(names_path)
        expected = ["id", "name", "source_url", "last_seen", "source"]
        if list(names_df.columns) != expected:
            issues.append(f"buildings_names.csv: column mismatch")
        # Only check ids when the column exists; a file with the wrong columns has
        # already been reported above and must not crash the validation.
        if 'id' in names_df.columns:
            if names_df['id'].tolist() != list(range(1, len(names_df) + 1)):
                issues.append("buildings_names.csv: IDs not sequential")

    metadata_path = output_dir / "buildings_names_metadata.csv"
    if not metadata_path.exists():
        issues.append("buildings_names_metadata.csv not found")
    else:
        metadata_df = pd.read_csv(metadata_path)
        expected2 = ["id", "name", "source_url", "last_seen", "source", "latitude", "longitude", "aliases", "wikibase_item"]
        if list(metadata_df.columns) != expected2:
            issues.append("buildings_names_metadata.csv: column mismatch")

    manifest_path = output_dir / "images" / "image_manifest.csv"
    manifest_df = None
    if not manifest_path.exists():
        issues.append("images/image_manifest.csv not found")
    else:
        manifest_df = pd.read_csv(manifest_path)
        expected3 = ["building_id", "building_name", "qid", "image_hash", "filename", "original_filename",
                    "local_path", "url", "width", "height", "size_bytes", "mime_type"]
        if list(manifest_df.columns) != expected3:
            issues.append("image_manifest.csv: column mismatch")

    return {
        'valid': len(issues) == 0,
        'issues': issues,
        'buildings_count': len(names_df) if names_df is not None else 0,
        'images_count': len(manifest_df) if manifest_df is not None else 0
    }

def generate_summary_report(pages, validation_results, output_dir: Path):
    """Print a human-readable summary of the curation run.

    Args:
        pages: Page-details dicts collected by the search step (may be empty).
            "manual_type", "lat", "lon" and "qid" are read when present.
        validation_results: Dict with 'valid', 'issues', 'buildings_count'
            and 'images_count'.
        output_dir: Output directory shown in the report.

    Returns:
        None; the report is written to stdout.
    """
    print("\n" + "="*60)
    print("MANUAL CURATION SUMMARY")
    print("="*60)
    print(f"\nüìç Output: {output_dir}")
    print(f"\nüèõÔ∏è  Buildings/Statues: {validation_results['buildings_count']}")

    if pages:
        # Tally pages per manual type, keeping first-seen order.
        types = {}
        for p in pages:
            t = p.get('manual_type', 'unknown')
            types[t] = types.get(t, 0) + 1
        for t, count in types.items():
            print(f"    - {t}: {count}")

    print(f"\nüì∏ Images: {validation_results['images_count']}")
    if validation_results['buildings_count'] > 0 and validation_results['images_count'] > 0:
        avg = validation_results['images_count'] / validation_results['buildings_count']
        print(f"    Average per building: {avg:.1f}")

    # NOTE(review): the three buildings CSVs are listed unconditionally; their
    # existence is not re-checked here.
    print(f"\nüìÑ Files:")
    print(f"  ‚úì buildings_names.csv")
    print(f"  ‚úì buildings_names_metadata.csv")
    print(f"  ‚úì buildings_info.csv (stub)")
    if validation_results['images_count'] > 0:
        print(f"  ‚úì images/image_manifest.csv")

    print(f"\n‚úÖ Schema Validation:")
    if validation_results['valid']:
        print("  All schemas valid ‚úì")
    else:
        print("  Issues found:")
        for issue in validation_results['issues']:
            print(f"    - {issue}")

    if pages:
        # Compare against None so a legitimate 0.0 latitude/longitude still counts.
        has_coords = sum(
            1 for p in pages
            if p.get('lat') is not None and p.get('lon') is not None
        )
        has_qid = sum(1 for p in pages if p.get('qid'))
        print(f"\nüó∫Ô∏è  Coordinates: {has_coords}/{len(pages)} ({100*has_coords/len(pages):.0f}%)")
        print(f"üîó Wikidata QIDs: {has_qid}/{len(pages)} ({100*has_qid/len(pages):.0f}%)")

    print("\n" + "="*60)
    print("NEXT STEPS:")
    print("="*60)
    print("1. Review generated CSVs for accuracy")
    print("2. Use /data_manual/ as drop-in replacement for /data/")
    print("3. Test with your ML pipeline")
    print("="*60)

# Validate the generated files and print the report. `found_pages` may be absent
# if the search cell was not run in this kernel session; fall back to an empty list.
if not OUTPUT_DIR.exists():
    print(f"‚ö† {OUTPUT_DIR} doesn't exist. Run previous cells first.")
else:
    validation = validate_csv_schemas(OUTPUT_DIR)
    generate_summary_report(globals().get('found_pages', []), validation, OUTPUT_DIR)


MANUAL CURATION SUMMARY

üìç Output: /Users/hughv/Documents/Harvard/AC215/ac215_HistoriCam/data_manual

üèõÔ∏è  Buildings/Statues: 18

üì∏ Images: 242
    Average per building: 13.4

üìÑ Files:
  ‚úì buildings_names.csv
  ‚úì buildings_names_metadata.csv
  ‚úì buildings_info.csv (stub)
  ‚úì images/image_manifest.csv

‚úÖ Schema Validation:
  All schemas valid ‚úì

NEXT STEPS:
1. Review generated CSVs for accuracy
2. Use /data_manual/ as drop-in replacement for /data/
3. Test with your ML pipeline
