# Fetch SVGs to Google Drive (Colab)

This notebook mounts Google Drive, lists SVG files on Wikimedia Commons through the MediaWiki API (pausing briefly between paginated API requests to stay polite), and streams each file directly into a Drive folder.


In [None]:
# 1) Mount Drive
import os

from google.colab import drive  # type: ignore

# Folder inside the mounted Drive that receives the downloads; change if needed.
DRIVE_DIR = '/content/drive/MyDrive/WikipediaSVG'

# Mount (or re-mount, because of force_remount=True) Drive under /content/drive,
# then make sure the target folder exists before anything is written to it.
drive.mount('/content/drive', force_remount=True)
os.makedirs(DRIVE_DIR, exist_ok=True)
print('Drive dir:', DRIVE_DIR)


In [None]:
# 2) Lightweight Commons SVG lister (generator=allpages ns=6)
import time, random, requests, os
from typing import Dict, Iterator

# MediaWiki API endpoint of Wikimedia Commons; every listing query goes here.
COMMONS_API = 'https://commons.wikimedia.org/w/api.php'
# Sent with each HTTP request so the servers can identify this client and
# reach its operator (project URL and contact e-mail).
HEADERS = {'User-Agent': 'SvgBot/0.1 (https://github.com/JacobAsmuth; jacobasmuth@gmail.com)'}

def list_svg_pages(limit: int) -> Iterator[Dict[str, str]]:
  """Yield up to `limit` SVG files found on Wikimedia Commons.

  The File namespace (ns=6) is walked in title ranges, one range per leading
  character in `prefixes`, using ``generator=allpages`` combined with
  ``prop=imageinfo`` so each response already carries the file URL and MIME
  type. Continuation tokens returned by the API are merged into the next
  request until the range is exhausted or `limit` is reached.

  A short randomized pause (0.3-0.5 s) precedes every request except the
  first one, including the first request of each new range.

  Args:
    limit: Maximum number of items to yield. Values <= 0 yield nothing and
      perform no request.

  Yields:
    Dicts with the keys ``'title'`` (page title as returned by the API) and
    ``'url'`` (file URL taken from the first imageinfo entry).

  Raises:
    requests.HTTPError: If the API answers with an HTTP error status.
    RuntimeError: If the JSON response contains an ``error`` object.
  """
  prefixes = '0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ'
  per_page = 50
  got = 0
  first_request = True
  for i, sc in enumerate(prefixes):
    if got >= limit:
      break
    # gapfrom/gapto are matched against titles *inside* gapnamespace, so the
    # range bounds are given without the "File:" prefix. The last range has
    # no upper bound and runs to the end of the namespace.
    end = prefixes[i+1] if i+1 < len(prefixes) else None
    params = {
      'action': 'query', 'format': 'json', 'generator': 'allpages',
      'gapnamespace': '6', 'gaplimit': str(per_page), 'prop': 'imageinfo',
      'iiprop': 'url|mime|size|timestamp', 'origin': '*', 'gapfrom': sc,
    }
    if end is not None:
      params['gapto'] = end
    cont: Dict[str, str] = {}
    while got < limit:
      # Pause before every request but the very first, so switching to the
      # next range is rate limited too and no sleep is wasted after the end.
      if not first_request:
        time.sleep(0.3 + random.random()*0.2)
      first_request = False
      r = requests.get(COMMONS_API, params={**params, **cont}, headers=HEADERS, timeout=60)
      # Fail loudly instead of mistaking an error page for "no results".
      r.raise_for_status()
      data = r.json()
      if 'error' in data:
        raise RuntimeError(f"Commons API error: {data['error']}")
      pages = (data.get('query') or {}).get('pages') or {}
      for item in _svg_items(pages.values()):
        got += 1
        yield item
        if got >= limit:
          break
      # An absent/empty 'continue' object means this range is exhausted.
      cont = data.get('continue') or {}
      if not cont:
        break


def _svg_items(pages):
  """Yield ``{'title', 'url'}`` for each page record that looks like an SVG.

  A record is kept when its title ends in .svg/.svgz, it has at least one
  imageinfo entry with a URL, and either the MIME type is image/svg+xml or
  the URL itself ends in .svg/.svgz.
  """
  svg_suffixes = ('.svg', '.svgz')
  for page in pages:
    title = page.get('title', '')
    if not title.lower().endswith(svg_suffixes):
      continue
    infos = page.get('imageinfo') or []
    if not infos:
      continue
    url = infos[0].get('url')
    if not url:
      continue
    mime = infos[0].get('mime', '')
    if mime != 'image/svg+xml' and not url.lower().endswith(svg_suffixes):
      continue
    yield {'title': title, 'url': url}


In [None]:
# 3) Download to Drive (streamed)
import hashlib
import pathlib
import requests

# One shared requests session for all file downloads performed by
# download_to_drive, instead of a fresh connection setup per call.
SESSION = requests.Session()

def download_to_drive(url: str, drive_dir: str, filename: str) -> bool:
  """Stream `url` into `drive_dir/filename` unless that file already exists.

  The body is written to a sibling ``<filename>.part`` file first and only
  renamed to the final name once the transfer has completed. An interrupted
  or failed download therefore never leaves a truncated file under the final
  name, which later runs would otherwise skip as "already downloaded".

  Args:
    url: Address of the file to fetch.
    drive_dir: Existing directory that receives the file.
    filename: Name of the file inside `drive_dir`.

  Returns:
    True if the file was downloaded and saved, False if the target already
    existed (no request is made in that case).

  Raises:
    requests.HTTPError: If the server answers with an HTTP error status.
    Other exceptions from the request or from file writes propagate after
    the partial file has been removed.
  """
  p = pathlib.Path(drive_dir) / filename
  if p.exists():
    return False
  tmp = p.with_name(p.name + '.part')
  done = False
  try:
    with SESSION.get(url, stream=True, timeout=120, headers=HEADERS) as r:
      r.raise_for_status()
      with open(tmp, 'wb') as f:
        for chunk in r.iter_content(chunk_size=1024*64):
          if chunk:
            f.write(chunk)
    # Publish under the final name only after the whole body is on disk.
    tmp.replace(p)
    done = True
  finally:
    # Runs on errors and on a manual interrupt alike: drop the leftover.
    if not done and tmp.exists():
      tmp.unlink()
  return True

limit = 200  # adjust as needed
saved = 0
failed = 0
for item in list_svg_pages(limit):
  url = item['url']
  # Save under the last path segment of the URL, exactly as it appears there.
  fname = url.split('/')[-1]
  try:
    ok = download_to_drive(url, DRIVE_DIR, fname)
  except requests.RequestException as exc:
    # A single bad file (HTTP error, timeout, dropped connection) must not
    # abort the whole batch; report it and move on to the next item.
    failed += 1
    print(f'Failed {fname}: {exc}')
    continue
  # ok is False when the file already existed; those are not counted.
  if ok:
    saved += 1
    if saved % 10 == 0:
      print(f'Saved {saved}/{limit}...')
print('Done. Saved', saved)
if failed:
  print('Failed', failed)
