# Fetch SVGs to Google Drive (Colab)

This notebook mounts Drive, lists SVG files on Wikimedia Commons, and downloads them directly into Drive with polite rate limiting.


In [None]:
# 1) Mount Google Drive and make sure the download folder exists
import os
from google.colab import drive  # type: ignore

# Folder inside the mounted Drive that receives the SVG files and the
# resume-state file.
DRIVE_DIR = '/content/drive/MyDrive/WikipediaSVG'  # change if needed

# Mount first: DRIVE_DIR lives under the mount point.
drive.mount('/content/drive', force_remount=True)
os.makedirs(DRIVE_DIR, exist_ok=True)
print('Drive dir:', DRIVE_DIR)


In [None]:
# 2) Lightweight Commons SVG lister (generator=allpages ns=6) with resume state in Drive
import time, random, requests, os, json
from typing import Dict, Iterator, Tuple

# MediaWiki API endpoint queried by list_svg_pages().
COMMONS_API = 'https://commons.wikimedia.org/w/api.php'
# Sent with every API request and every file download to identify this client.
HEADERS = {'User-Agent': 'SvgBot/0.1 (https://github.com/JacobAsmuth; jacobasmuth@gmail.com)'}
# JSON file (stored next to the downloads) holding the listing resume position;
# read by load_state() and written by save_state().
STATE_PATH = os.path.join(DRIVE_DIR, 'fetch_state.json')


def load_state() -> Tuple[int, Dict[str, str]]:
  """Read the persisted listing position from STATE_PATH.

  Returns:
    A ``(prefix_index, cont)`` tuple: the index of the title prefix to resume
    at and the API continuation parameters saved for that prefix. Falls back
    to ``(0, {})`` when the state file is missing, unreadable, or malformed.
  """
  if not os.path.exists(STATE_PATH):
    return 0, {}

  try:
    with open(STATE_PATH, 'r') as f:
      s = json.load(f)
  except (OSError, ValueError) as e:
    # json.JSONDecodeError and UnicodeDecodeError are both ValueError subclasses.
    print(f'Warning: could not read state file {STATE_PATH}: {e}; starting from the beginning')
    return 0, {}

  if not isinstance(s, dict):
    print(f'Warning: state file {STATE_PATH} is malformed; starting from the beginning')
    return 0, {}

  try:
    prefix_index = int(s.get('prefix_index', 0))
  except (TypeError, ValueError):
    prefix_index = -1
  if prefix_index < 0:
    # A negative index would silently address prefixes from the end of the
    # list; the continuation belongs to an unknown prefix, so drop it as well.
    print(f'Warning: state file {STATE_PATH} has an invalid prefix_index; starting from the beginning')
    return 0, {}

  # A missing or null continuation just means "start of this prefix"; keep the
  # prefix position instead of discarding the whole state.
  raw_cont = s.get('cont')
  cont = dict(raw_cont) if isinstance(raw_cont, dict) else {}
  return prefix_index, cont


def save_state(prefix_index: int, cont: Dict[str, str]) -> None:
  """Persist the listing position to STATE_PATH (best effort).

  Args:
    prefix_index: Index of the title prefix currently being listed.
    cont: API continuation parameters for that prefix (empty when none).

  Failures are reported with a warning and never raised, so a state-file
  problem does not interrupt the listing itself.
  """
  # Serialise before opening the file: opening with 'w' truncates it, so a
  # serialisation error must not be allowed to wipe the previous state.
  try:
    payload = json.dumps({'prefix_index': int(prefix_index), 'cont': dict(cont)})
  except (TypeError, ValueError) as e:
    print(f'Warning: resume state not saved (cannot serialise): {e}')
    return

  try:
    with open(STATE_PATH, 'w') as f:
      f.write(payload)
  except OSError as e:
    print(f'Warning: could not write state file {STATE_PATH}: {e}')


def reset_state() -> None:
  """Delete the resume-state file and report whether there was one to delete."""
  try:
    os.remove(STATE_PATH)
  except FileNotFoundError:
    # Nothing was persisted, so there is nothing to reset.
    print('State already clear')
  else:
    print('State reset')


def list_svg_pages(limit: int, resume: bool = True, max_retries: int = 5) -> Iterator[Dict[str, str]]:
  """Yield SVG files from Wikimedia Commons, one title prefix at a time.

  The listing walks the prefixes ``0-9A-Z`` in order, paging through the
  ``allpages`` generator (namespace 6) for each one and persisting the
  position with save_state() after every fetched batch.

  Args:
    limit: Maximum number of items to yield.
    resume: When True, start from the position returned by load_state();
      otherwise start from the first prefix.
    max_retries: Number of consecutive failed requests tolerated before the
      listing gives up. The saved state is left untouched in that case, so a
      later run retries the same batch.

  Yields:
    Dicts with the keys ``'title'`` and ``'url'``.

  Note:
    The state is written as soon as a batch is fetched, before its pages are
    yielded, so a run that stops part-way through a batch resumes at the next
    batch.
  """
  prefixes = '0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ'
  per_page = 50
  svg_suffixes = ('.svg', '.svgz')
  got = 0
  start_idx, persisted_cont = load_state() if resume else (0, {})

  for i in range(start_idx, len(prefixes)):
    if got >= limit:
      break
    # NOTE(review): the MediaWiki docs describe allpages from/to as page titles
    # within the given namespace; confirm that the 'File:' prefix is intended
    # here and does not need to be dropped.
    start = f'File:{prefixes[i]}'
    end = f'File:{prefixes[i+1]}' if i+1 < len(prefixes) else None

    # Request parameters that stay the same for every page of this prefix.
    base_params = {
      'action': 'query', 'format': 'json', 'generator': 'allpages',
      'gapnamespace': '6', 'gaplimit': str(per_page), 'prop': 'imageinfo',
      'iiprop': 'url|mime|size|timestamp', 'origin': '*', 'gapfrom': start,
    }
    if end is not None:
      base_params['gapto'] = end

    # The persisted continuation only applies to the first prefix visited.
    cont: Dict[str, str] = dict(persisted_cont) if persisted_cont else {}
    persisted_cont = {}
    failures = 0

    while got < limit:
      try:
        r = requests.get(COMMONS_API, params={**base_params, **cont}, headers=HEADERS, timeout=60)
        r.raise_for_status()
        data = r.json()
        if not isinstance(data, dict):
          raise ValueError('unexpected non-object API response')
        if 'error' in data:
          # An error payload has neither pages nor a continuation; without this
          # check it would be mistaken for the end of the prefix.
          raise ValueError(f"API error: {data['error']}")
      except (requests.RequestException, ValueError) as e:
        failures += 1
        print(f"Error: {e}")
        if failures > max_retries:
          print(f'Giving up after {failures} consecutive failed requests')
          return
        # Exponential backoff with jitter instead of retrying immediately.
        time.sleep(min(60.0, 2.0 ** failures) + random.random())
        continue

      failures = 0
      pages = (data.get('query') or {}).get('pages') or {}
      cont = data.get('continue') or {}
      save_state(i, cont)

      for page in pages.values():
        try:
          title = page.get('title', '')
          if not title.lower().endswith(svg_suffixes):
            continue
          infos = page.get('imageinfo') or []
          if not infos:
            continue
          url = infos[0].get('url')
          mime = infos[0].get('mime', '')
          if not url or (mime != 'image/svg+xml' and not url.lower().endswith(svg_suffixes)):
            continue
        except (AttributeError, TypeError, KeyError, IndexError) as e:
          # A malformed page entry should not stop the rest of the batch.
          print(f"Error: {e}")
          continue
        got += 1
        yield {'title': title, 'url': url}
        if got >= limit:
          break

      if not cont:
        # No continuation: this prefix is exhausted, move on to the next one.
        save_state(i+1, {})
        break
      # Polite pause between consecutive API pages.
      time.sleep(0.3 + random.random()*0.2)


In [None]:
# 3) Download to Drive (streamed)
import pathlib
import requests

# Single requests session shared by every download_to_drive() call.
SESSION = requests.Session()

def download_to_drive(url: str, drive_dir: str, filename: str) -> bool:
  """Stream ``url`` into ``drive_dir/filename`` unless that file already exists.

  The body is written to a temporary ``<filename>.part`` file and renamed into
  place only after the download has completed, so an interrupted transfer
  never leaves a truncated file that later runs would skip as "already there".

  Args:
    url: Address of the file to download.
    drive_dir: Destination directory.
    filename: Name of the file inside ``drive_dir``.

  Returns:
    True when the file was downloaded, False when it already existed.

  Raises:
    requests.RequestException: On connection problems or a non-2xx response.
    OSError: When the file cannot be written or renamed.
  """
  p = pathlib.Path(drive_dir) / filename
  if p.exists():
    return False

  tmp = p.with_name(p.name + '.part')
  completed = False
  try:
    with SESSION.get(url, stream=True, timeout=120, headers=HEADERS) as r:
      r.raise_for_status()
      with open(tmp, 'wb') as f:
        for chunk in r.iter_content(chunk_size=1024*64):
          if chunk:
            f.write(chunk)
    tmp.replace(p)
    completed = True
  finally:
    if not completed:
      # Also runs on interruption; remove whatever was written so far.
      try:
        tmp.unlink()
      except OSError:
        pass
  return True

limit = 200  # adjust as needed
resume = True  # set False to start over from the beginning
saved = 0
failed = 0
for item in list_svg_pages(limit, resume=resume):
  url = item['url']
  # The last URL path segment is used as the local file name.
  fname = url.split('/')[-1]
  try:
    ok = download_to_drive(url, DRIVE_DIR, fname)
  except (requests.RequestException, OSError) as e:
    # One bad file (HTTP error, timeout, disk error) must not abort the run.
    failed += 1
    print(f'Failed to download {url}: {e}')
    continue
  if ok:
    saved += 1
    if saved % 10 == 0:
      print(f'Saved {saved}/{limit}...')
print('Done. Saved', saved)
if failed:
  print('Failed downloads:', failed)
print('Resume state saved at:', STATE_PATH)
