In [1]:
import csv
from pathlib import Path
import requests
from concurrent.futures import ThreadPoolExecutor, as_completed
from urllib.parse import urlencode
from threading import Lock

# Query-string parameters appended to every photo URL by download_unsplash_lite().
# Callers can override or extend them through that function's `url_params` argument.
# NOTE(review): "w" is presumably the requested image width in pixels and "auto"
# the image service's automatic format selection -- confirm against the Unsplash docs.
DEFAULT_URL_PARAMS = dict(w="1600", auto="format")

def _find_photos_file(dataset_dir: Path) -> Path:
    candidates = list(dataset_dir.glob("photos.*"))
    if not candidates:
        raise FileNotFoundError("No photos file found (expected photos.*)")
    return candidates[0]

def download_unsplash_lite(
    dataset_dir: str,
    output_dir: str,
    url_params: dict[str, str] | None = None,
    max_workers: int = 8,
    chunk_size: int = 500,
) -> None:
    """Download every photo listed in the dataset's photos file.

    The photos file (``photos.*`` inside ``dataset_dir``) is read as a TAB- or
    comma-delimited table with a header row. For each row that has both a
    ``photo_id`` and a ``photo_image_url``, the image is fetched and stored as
    ``<output_dir>/<photo_id>.jpg``. Rows are buffered into chunks of
    ``chunk_size`` and each chunk is downloaded by a thread pool.

    Images whose destination file already exists are skipped, so an
    interrupted run can be resumed. Each download is streamed into a
    ``.jpg.part`` file and renamed to its final name only after it completed;
    a failed download therefore never leaves a truncated ``.jpg`` behind that
    a later run would mistake for a finished image.

    Per-image failures are printed and counted but do not stop the run.
    Progress and a final summary are printed to stdout.

    Args:
        dataset_dir: Directory containing the ``photos.*`` file.
        output_dir: Directory that receives the images; created if missing.
        url_params: Extra query parameters for the image URLs. They are merged
            over ``DEFAULT_URL_PARAMS`` and values are converted to ``str``.
        max_workers: Number of download threads used per chunk.
        chunk_size: Number of rows collected before a chunk is downloaded.

    Raises:
        FileNotFoundError: If no photos file is found in ``dataset_dir``.
        ValueError: If the header row is missing or lacks the ``photo_id`` /
            ``photo_image_url`` columns.
        Exception: Any other error raised while reading or processing the
            file is printed and re-raised unchanged.
    """
    dataset_dir = Path(dataset_dir).expanduser().resolve()
    output_dir = Path(output_dir).expanduser().resolve()
    output_dir.mkdir(parents=True, exist_ok=True)

    photos_file = _find_photos_file(dataset_dir)

    params = DEFAULT_URL_PARAMS.copy()
    if url_params:
        params.update({k: str(v) for k, v in url_params.items()})
    # The query string is identical for every photo, so encode it once.
    query = urlencode(params)

    print(f"[INFO] Reading: {photos_file.name}")
    print(f"[INFO] Processing in chunks of {chunk_size}")

    # Counters shared by all chunks; guarded by a lock while being updated.
    stats = {"ok": 0, "skipped": 0, "errors": 0, "processed": 0}
    stats_lock = Lock()

    def fetch(photo_id, base_url):
        """Download one photo; return ``(status, photo_id, error_message)``."""
        dest = output_dir / f"{photo_id}.jpg"
        if dest.exists():
            return "skipped", photo_id, None

        # Stream into a side file first: if the transfer dies half-way the
        # final name never exists, so the next run retries instead of skipping.
        partial = output_dir / f"{photo_id}.jpg.part"
        # Append to an existing query string rather than adding a second "?".
        separator = "&" if "?" in base_url else "?"
        url = f"{base_url}{separator}{query}"
        try:
            with requests.get(url, stream=True, timeout=20) as r:
                r.raise_for_status()
                with open(partial, "wb") as f:
                    for chunk in r.iter_content(1 << 14):
                        if chunk:
                            f.write(chunk)
            partial.replace(dest)
            return "ok", photo_id, None
        except Exception as e:
            # Best-effort cleanup; a failure here must not mask the real error.
            try:
                partial.unlink(missing_ok=True)
            except OSError:
                pass
            return "error", photo_id, str(e)

    def process_chunk(rows):
        """Download one chunk of ``(photo_id, url)`` pairs and update ``stats``."""
        if not rows:
            return

        with ThreadPoolExecutor(max_workers=max_workers) as ex:
            futures = [ex.submit(fetch, photo_id, url) for photo_id, url in rows]
            for fut in as_completed(futures):
                status, photo_id, error = fut.result()
                with stats_lock:
                    stats["processed"] += 1
                    if status == "ok":
                        stats["ok"] += 1
                    elif status == "skipped":
                        stats["skipped"] += 1
                    else:
                        stats["errors"] += 1
                        print(f"[ERROR] {photo_id}: {error}")

                    # Progress update every 100 images
                    if stats["processed"] % 100 == 0:
                        print(f"[PROGRESS] Processed {stats['processed']} images "
                              f"(OK: {stats['ok']}, Skipped: {stats['skipped']}, Errors: {stats['errors']})")

    # Stream the table row by row so the whole file is never held in memory.
    try:
        with open(photos_file, 'r', encoding='utf-8', newline='') as f:
            # Detect the delimiter from the header line, then rewind so the
            # DictReader sees the header as well.
            first_line = f.readline()
            f.seek(0)
            delimiter = '\t' if '\t' in first_line else ','

            delim_name = 'TAB' if delimiter == '\t' else 'COMMA'
            print(f"[INFO] Detected delimiter: {delim_name}")

            reader = csv.DictReader(f, delimiter=delimiter)

            # Verify required columns exist
            if not reader.fieldnames:
                raise ValueError("Could not read CSV headers")

            if 'photo_id' not in reader.fieldnames or 'photo_image_url' not in reader.fieldnames:
                raise ValueError(f"CSV must contain 'photo_id' and 'photo_image_url' columns. Found: {reader.fieldnames}")

            print("[INFO] Starting download...")

            chunk = []
            for row in reader:
                # Skip rows with missing data
                if not row.get('photo_id') or not row.get('photo_image_url'):
                    continue

                chunk.append((row['photo_id'], row['photo_image_url']))

                if len(chunk) >= chunk_size:
                    process_chunk(chunk)
                    chunk = []

            # Process remaining rows
            if chunk:
                process_chunk(chunk)

    except Exception as e:
        print(f"[ERROR] Failed to read CSV: {e}")
        raise

    print("\n" + "="*50)
    print("Done.")
    print(f"  Downloaded: {stats['ok']}")
    print(f"  Skipped:    {stats['skipped']}")
    print(f"  Errors:     {stats['errors']}")
    print(f"  Total:      {stats['processed']}")
    print(f"  Output:     {output_dir}")
    print("="*50)

In [None]:
# Usage: download every photo listed in the dataset directory into
# ./unsplash_raw_images (photos already present there are skipped).
download_unsplash_lite(
    dataset_dir="./unsplash-research-dataset-lite-latest",
    output_dir="./unsplash_raw_images",
    chunk_size=500,  # Process 500 rows at a time
)

[INFO] Reading: photos.csv000
[INFO] Processing in chunks of 500
[INFO] Detected delimiter: TAB
[INFO] Starting download...
[PROGRESS] Processed 100 images (OK: 100, Skipped: 0, Errors: 0)
[PROGRESS] Processed 200 images (OK: 200, Skipped: 0, Errors: 0)
[PROGRESS] Processed 300 images (OK: 300, Skipped: 0, Errors: 0)
[PROGRESS] Processed 400 images (OK: 400, Skipped: 0, Errors: 0)
[PROGRESS] Processed 500 images (OK: 500, Skipped: 0, Errors: 0)
[PROGRESS] Processed 600 images (OK: 600, Skipped: 0, Errors: 0)
[PROGRESS] Processed 700 images (OK: 700, Skipped: 0, Errors: 0)
[PROGRESS] Processed 800 images (OK: 800, Skipped: 0, Errors: 0)
[PROGRESS] Processed 900 images (OK: 900, Skipped: 0, Errors: 0)
[PROGRESS] Processed 1000 images (OK: 1000, Skipped: 0, Errors: 0)
[PROGRESS] Processed 1100 images (OK: 1100, Skipped: 0, Errors: 0)
[ERROR] rsJtMXn3p_c: HTTPSConnectionPool(host='images.unsplash.com-grass-sun.jpg', port=443): Max retries exceeded with url: /?w=1600&auto=format (Caused by 