In [3]:
import html
import os
import re
import time
from typing import List, Dict, Optional, Tuple

import requests

# The Google Books API key is read from the environment so that the secret is
# not stored in the source. When GOOGLE_BOOKS_API_KEY is unset, API_KEY is
# None; `requests` leaves None-valued query parameters out of the URL, so
# requests are then sent without a key.
# NOTE(review): any key that was previously committed in this file should be
# treated as leaked and rotated.
API_KEY = os.environ.get("GOOGLE_BOOKS_API_KEY")
# Term that the driver code passes as the search query.
SEARCH_TERM = "culture"

# Overall cap on collected snippets (passed as total_max_snippets).
MAX_SNIPPETS = 100000
# Number of results requested per API page.
PAGE_SIZE = 40
# Pause, in seconds, between consecutive page requests.
SLEEP_SECONDS = 0.1

# Endpoint queried for volume search results.
BASE_URL = "https://www.googleapis.com/books/v1/volumes"


def clean_snippet(raw: Optional[str]) -> Optional[str]:
    """Turn a raw text snippet into plain, trimmed text.

    HTML entities are decoded, ``<b>``/``</b>`` highlight tags are dropped,
    runs of whitespace collapse to a single space, and any dots, ellipsis
    characters or whitespace at either end are trimmed away.

    Returns None when the input is empty/None or nothing is left afterwards.
    """
    if not raw:
        return None

    decoded = html.unescape(raw)
    without_bold = re.sub(r"</?b>", "", decoded)
    trimmed = re.sub(r"\s+", " ", without_bold).strip()

    # Trim the leading edge first, then the trailing edge.
    for edge_pattern in (r"^[.…\s]+", r"[.…\s]+$"):
        trimmed = re.sub(edge_pattern, "", trimmed)

    trimmed = trimmed.strip()
    return trimmed if trimmed else None


def first_sentences(text: Optional[str], max_sentences: int = 2) -> Optional[str]:
    """Return the leading sentences of ``text`` joined by single spaces.

    A sentence boundary is any whitespace that directly follows ``.``, ``!``
    or ``?``. At most ``max_sentences`` pieces are kept. Returns None when
    ``text`` is empty/None or contains no non-blank piece.
    """
    if not text:
        return None

    sentences = []
    for chunk in re.split(r"(?<=[.!?])\s+", text):
        chunk = chunk.strip()
        if chunk:
            sentences.append(chunk)

    if sentences:
        return " ".join(sentences[:max_sentences])
    return None


def extract_year(published_date: Optional[str]) -> Optional[int]:
    """
    Pull the first standalone 4-digit number out of a publishedDate string.

    Handles values such as:
      - "2003"
      - "2003-05"
      - "2003-05-12"
      - loosely formatted ones like "2003?"

    Returns None for empty/None input or when no 4-digit group is found.
    """
    if not published_date:
        return None

    found = re.search(r"\b(\d{4})\b", published_date)
    if found is None:
        return None

    digits = found.group(1)
    try:
        year = int(digits)
    except ValueError:
        year = None
    return year


def build_year_buckets(
    year_start: int, year_end: int, year_chunk: int
) -> List[Tuple[int, int]]:
    """
    Build inclusive year buckets.

    Example:
      year_start=1900, year_end=2000, year_chunk=10
      -> [(1900, 1909), (1910, 1919), ..., (1990, 1999), (2000, 2000)]

    Args:
        year_start: First year covered (inclusive).
        year_end: Last year covered (inclusive).
        year_chunk: Number of years per bucket; must be at least 1.

    Returns:
        List of (start, end) tuples in ascending order. The last bucket is
        shortened so it never extends past year_end. The list is empty when
        year_start is greater than year_end.

    Raises:
        ValueError: If year_chunk is smaller than 1. A non-positive chunk
            would never advance the start year, so the loop would not end.
    """
    if year_chunk < 1:
        raise ValueError(f"year_chunk must be >= 1, got {year_chunk}")

    return [
        (start, min(start + year_chunk - 1, year_end))
        for start in range(year_start, year_end + 1, year_chunk)
    ]


def find_bucket_for_year(
    year: int, buckets: List[Tuple[int, int]]
) -> Optional[int]:
    """
    Locate the position of the first (start, end) bucket containing ``year``.

    Both bounds are inclusive. Returns None when no bucket matches.
    """
    matching_indices = (
        idx for idx, (lo, hi) in enumerate(buckets) if lo <= year <= hi
    )
    return next(matching_indices, None)


def fetch_snippets_for_bucket(
    query: str,
    bucket: Tuple[int, int],
    max_snippets: int = 500,
    page_size: int = 40,
    lang_restrict: Optional[str] = None,
    filter_param: str = "partial",
    seen_ids: Optional[set] = None,
    max_pages: int = 50,
    max_rate_limit_retries: int = 5,
) -> Tuple[List[Dict], set]:
    """
    Fetch snippets for a single year bucket.

    Adds the year range to the query to help narrow results from the API.
    Only keeps results whose publishedDate falls within the bucket range.

    Args:
        query: Search term; it is wrapped in double quotes in the request.
        bucket: Inclusive (start_year, end_year) range to keep.
        max_snippets: Stop once this many snippets have been collected.
        page_size: Number of results requested per API page.
        lang_restrict: Optional language code sent as ``langRestrict``.
        filter_param: Value sent as ``filter``; omitted when empty.
        seen_ids: Volume ids already collected; these are skipped, and the
            set is updated in place with newly collected ids. A new set is
            created when None.
        max_pages: Maximum number of result pages to request.
        max_rate_limit_retries: How many consecutive HTTP 429 responses to
            retry before giving up on this bucket.

    Returns:
        (list of snippet dicts, updated seen_ids set)
    """
    bucket_start, bucket_end = bucket
    collected: List[Dict] = []

    if seen_ids is None:
        seen_ids = set()

    start_index = 0
    pages_fetched = 0
    consecutive_empty = 0
    rate_limit_hits = 0

    # Add the year (or year range) to the query text as a hint; the code
    # still filters on publishedDate below, so the hint only narrows results.
    if bucket_start == bucket_end:
        year_query = f'"{query}" {bucket_start}'
    else:
        year_query = f'"{query}" {bucket_start}-{bucket_end}'

    while len(collected) < max_snippets and pages_fetched < max_pages:
        params = {
            "q": year_query,
            "maxResults": page_size,
            "startIndex": start_index,
            "projection": "lite",
            "printType": "books",
        }

        # Only send a key when one is configured, so an empty or missing key
        # is never put into the request.
        if API_KEY:
            params["key"] = API_KEY

        if lang_restrict:
            params["langRestrict"] = lang_restrict

        if filter_param:
            params["filter"] = filter_param

        try:
            resp = requests.get(BASE_URL, params=params, timeout=10)
            if resp.status_code == 429:
                # Retrying does not advance start_index or pages_fetched, so
                # the retries need their own bound to guarantee termination.
                rate_limit_hits += 1
                if rate_limit_hits > max_rate_limit_retries:
                    print(
                        f"  Still rate limited after {max_rate_limit_retries} "
                        "retries, giving up on this bucket."
                    )
                    break
                print("  Hit rate limit, sleeping longer...")
                # Wait a little longer after each consecutive 429.
                time.sleep(5 * rate_limit_hits)
                continue
            resp.raise_for_status()
            # Decoding happens inside the try so that a non-JSON body ends
            # this bucket instead of aborting the whole run.
            data = resp.json()
        except (requests.RequestException, ValueError) as e:
            print(f"  Request failed at startIndex={start_index}: {e}")
            break

        rate_limit_hits = 0
        # "items" may be missing (or null) on pages with no results.
        items = data.get("items") or []

        if not items:
            # Tolerate one empty page; two in a row ends the bucket.
            consecutive_empty += 1
            if consecutive_empty >= 2:
                break
            start_index += page_size
            pages_fetched += 1
            time.sleep(SLEEP_SECONDS)
            continue

        consecutive_empty = 0

        for item in items:
            if len(collected) >= max_snippets:
                break

            vol_id = item.get("id")
            if vol_id in seen_ids:
                continue

            vol_info = item.get("volumeInfo", {})
            search_info = item.get("searchInfo", {})

            # Filter by year - must be within bucket range
            pub_date = vol_info.get("publishedDate")
            year = extract_year(pub_date)

            if year is None:
                continue
            if not (bucket_start <= year <= bucket_end):
                continue

            raw_snippet = search_info.get("textSnippet")
            cleaned = clean_snippet(raw_snippet)
            snippet = first_sentences(cleaned, max_sentences=2)

            if not snippet:
                continue

            entry = {
                "id": vol_id,
                "title": vol_info.get("title"),
                "authors": vol_info.get("authors", []),
                "infoLink": vol_info.get("infoLink") or vol_info.get("previewLink"),
                "snippet": snippet,
                "publishedDate": pub_date,
                "year": year,
                "year_bucket": bucket,
            }

            collected.append(entry)
            seen_ids.add(vol_id)

        start_index += page_size
        pages_fetched += 1
        time.sleep(SLEEP_SECONDS)

    return collected, seen_ids


def fetch_snippets_by_buckets(
    query: str,
    year_start: int,
    year_end: int,
    year_chunk: int = 1,
    max_snippets_per_bucket: int = 100,
    total_max_snippets: int = 100000,
    page_size: int = 40,
    lang_restrict: Optional[str] = None,
    filter_param: str = "partial",
    max_pages_per_bucket: int = 50,
) -> List[Dict]:
    """
    Fetch snippets by looping over year buckets directly.

    Each bucket gets its own API queries with year hints in the search query.
    Volume ids are shared across buckets so the same volume is collected at
    most once.

    Args:
        query: Search term
        year_start: Start year (inclusive)
        year_end: End year (inclusive)
        year_chunk: Size of each year bucket (e.g., 1 for single years, 10 for decades)
        max_snippets_per_bucket: Max snippets to collect per bucket
        total_max_snippets: Overall maximum snippets to collect
        page_size: Results per API page (max 40)
        lang_restrict: Language code (e.g., "en")
        filter_param: Google Books filter (partial, full, free-ebooks, etc.)
        max_pages_per_bucket: Max API pages to fetch per bucket

    Returns:
        List of snippet dictionaries; empty when year_start is after year_end.

    Raises:
        ValueError: If year_chunk is smaller than 1.
    """
    # Validate up front: a non-positive chunk cannot produce a finite list of
    # buckets, and an empty range would make the progress message below fail
    # on buckets[0].
    if year_chunk < 1:
        raise ValueError(f"year_chunk must be >= 1, got {year_chunk}")
    if year_start > year_end:
        print(
            f"No year buckets to process: year_start={year_start} "
            f"is after year_end={year_end}."
        )
        return []

    # Build year buckets
    buckets = build_year_buckets(year_start, year_end, year_chunk)
    print(f"Processing {len(buckets)} year buckets: {buckets[0]} to {buckets[-1]}")

    all_collected: List[Dict] = []
    seen_ids: set = set()

    for bucket_idx, bucket in enumerate(buckets):
        if len(all_collected) >= total_max_snippets:
            print(f"\nReached total max snippets ({total_max_snippets}), stopping.")
            break

        bucket_start, bucket_end = bucket
        # Never ask a bucket for more than what is left of the overall budget.
        remaining = total_max_snippets - len(all_collected)
        bucket_max = min(max_snippets_per_bucket, remaining)

        print(f"\n=== Bucket {bucket_idx + 1}/{len(buckets)}: {bucket_start}-{bucket_end} ===")

        bucket_snippets, seen_ids = fetch_snippets_for_bucket(
            query=query,
            bucket=bucket,
            max_snippets=bucket_max,
            page_size=page_size,
            lang_restrict=lang_restrict,
            filter_param=filter_param,
            seen_ids=seen_ids,
            max_pages=max_pages_per_bucket,
        )

        all_collected.extend(bucket_snippets)
        print(f"  Collected {len(bucket_snippets)} snippets for this bucket. Total: {len(all_collected)}")

    return all_collected


# Run the crawl: one bucket per calendar year from 1900 through 2025.
snippets = fetch_snippets_by_buckets(
    SEARCH_TERM,
    1900,
    2025,
    year_chunk=1,  # single-year buckets
    max_snippets_per_bucket=100000,  # per-year cap
    total_max_snippets=MAX_SNIPPETS,
    page_size=PAGE_SIZE,
    lang_restrict="en",
    filter_param="partial",
    max_pages_per_bucket=1000,  # page cap for each bucket
)

# Report the total and preview the first ten results.
print(f"\nCollected {len(snippets)} snippets total.")
for i, s in enumerate(snippets[:10], start=1):
    authors = "Unknown author"
    if s["authors"]:
        authors = ", ".join(s["authors"])
    print(f"\n[{i}] {s['title']} | {authors}")
    print(f"Year: {s['year']}, bucket: {s['year_bucket']}")
    print(s["snippet"])

Processing 126 year buckets: (1900, 1900) to (2025, 2025)

=== Bucket 1/126: 1900-1900 ===
  Collected 0 snippets for this bucket. Total: 0

=== Bucket 2/126: 1901-1901 ===
  Collected 0 snippets for this bucket. Total: 0

=== Bucket 3/126: 1902-1902 ===
  Collected 0 snippets for this bucket. Total: 0

=== Bucket 4/126: 1903-1903 ===
  Collected 0 snippets for this bucket. Total: 0

=== Bucket 5/126: 1904-1904 ===
  Collected 0 snippets for this bucket. Total: 0

=== Bucket 6/126: 1905-1905 ===
  Collected 0 snippets for this bucket. Total: 0

=== Bucket 7/126: 1906-1906 ===
  Collected 0 snippets for this bucket. Total: 0

=== Bucket 8/126: 1907-1907 ===
  Collected 0 snippets for this bucket. Total: 0

=== Bucket 9/126: 1908-1908 ===
  Collected 0 snippets for this bucket. Total: 0

=== Bucket 10/126: 1909-1909 ===
  Collected 3 snippets for this bucket. Total: 3

=== Bucket 11/126: 1910-1910 ===
  Collected 0 snippets for this bucket. Total: 3

=== Bucket 12/126: 1911-1911 ===
  Co

KeyboardInterrupt: 