# 00 - Download and Clean Gutenberg Books

This notebook downloads public-domain books from Project Gutenberg, removes boilerplate headers/footers, and saves cleaned text files.

## Outputs
- `./data/raw/{title_slug}.txt` (an existing legacy `./data/raw/{book_id}.txt` file is still accepted as a cache)
- `./data/metadata.csv`

## Notes
- Tries multiple Gutenberg URL patterns.
- Retries transient HTTP failures.
- Requires at least 15 successful books (target 20, i.e. every entry in `BOOK_SPECS`).


In [6]:
# Install required packages if missing
import importlib
import subprocess
import sys

# (import name, pip distribution name) pairs; the two differ for some packages,
# so they are kept separate even though they coincide for everything listed here.
REQUIRED_PACKAGES = [
    ("requests", "requests"),
    ("numpy", "numpy"),
    ("pandas", "pandas"),
]

for module_name, pip_name in REQUIRED_PACKAGES:
    try:
        importlib.import_module(module_name)
    except ImportError:
        print(f"Installing {pip_name} ...")
        # Install into the interpreter that runs this kernel, not whatever `pip` is on PATH.
        subprocess.check_call([sys.executable, "-m", "pip", "install", pip_name])
        # Packages installed while the process is running may be invisible to the
        # import system until its finder caches are refreshed; without this the
        # imports in the next cell can still fail in the same kernel session.
        importlib.invalidate_caches()

print("Dependency check complete.")


Dependency check complete.


In [7]:
from pathlib import Path
import json
import random
import re
import time
import unicodedata

import numpy as np
import pandas as pd
import requests

# Seed both the stdlib and NumPy global RNGs so any randomised step is repeatable.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# Every path is derived from the resolved current working directory, so the
# kernel has to be started from the project root for files to land in the right place.
PROJECT_ROOT = Path(".").resolve()
DATA_DIR = PROJECT_ROOT / "data"
RAW_DIR = DATA_DIR / "raw"
PROCESSED_DIR = DATA_DIR / "processed"
OUTPUTS_DIR = PROJECT_ROOT / "outputs"
METADATA_PATH = DATA_DIR / "metadata.csv"
BOOK_CATALOG_PATH = DATA_DIR / "book_catalog.json"

# Create the output folders up front; exist_ok=True keeps re-running this cell harmless.
RAW_DIR.mkdir(parents=True, exist_ok=True)
PROCESSED_DIR.mkdir(parents=True, exist_ok=True)
OUTPUTS_DIR.mkdir(parents=True, exist_ok=True)

# Minimum number of successfully obtained books (enforcement is not visible in this cell).
MIN_SUCCESS = 15
# A cached file with fewer whitespace-separated words than this is not accepted as a cache hit.
MIN_WORDS = 1200
# When True, existing files in RAW_DIR are ignored and every book is fetched again.
FORCE_REDOWNLOAD = False
# NOTE(review): presumably forwarded to download_with_retries as `retries` / `timeout`
# (seconds) -- confirm at the call site, which is outside this cell.
MAX_RETRIES = 3
REQUEST_TIMEOUT = 25
# Delay before the first retry; download_with_retries doubles it on each further attempt.
BACKOFF_BASE_SECONDS = 1.2
# Sent as the User-Agent header on the shared requests session.
USER_AGENT = "story-trajectory-analysis/1.0 (+https://github.com/)"

# Hand-curated catalog of the Project Gutenberg books this notebook fetches.
#
# Each entry is a flat dict. The download loop reads `pg_id`, `title`, `author`,
# `ebook_page_url` and `plain_text_utf8_url` directly and copies the whole dict
# into the per-book metadata row, so every key here becomes a metadata field.
#
# - `pg_id` is the Project Gutenberg ebook number used to build the download URLs.
# - `genre_secondary`, `short_tags` and `citations` are lists; all other values are scalars.
# - `recognizability_rank`, `genre_clarity_rank` and `twist_peak_rank` each use every
#   value from 1 to 20 exactly once across the 20 entries.
#   NOTE(review): 1 presumably means "strongest" (the entry with twist_peak_rank 1 is
#   described as one of the most famous twists) -- confirm before using as a score.
BOOK_SPECS = [
    {
        "pg_id": 1342,
        "title": "Pride and Prejudice",
        "author": "Jane Austen",
        "first_publication_year": 1813,
        "origin_country": "United Kingdom",
        "original_language": "English",
        "format": "novel",
        "genre_primary": "Romance",
        "genre_secondary": ["Comedy of Manners", "Realism"],
        "short_tags": ["love", "class", "marriage", "england", "sisters", "regency", "darcy"],
        "plain_text_utf8_url": "https://www.gutenberg.org/ebooks/1342.txt.utf-8",
        "ebook_page_url": "https://www.gutenberg.org/ebooks/1342",
        "notes": "First published anonymously.",
        "recognizability_rank": 7,
        "genre_clarity_rank": 8,
        "twist_peak_rank": 12,
        "twist_peak_reason": "Features a strong turning point with Mr. Darcy's letter revealing Wickham's true nature, but remains largely a steady social comedy.",
        "citations": [
            "https://en.wikipedia.org/wiki/Pride_and_Prejudice",
            "https://www.britannica.com/topic/Pride-and-Prejudice",
        ],
    },
    {
        "pg_id": 1260,
        "title": "Jane Eyre",
        "author": "Charlotte Bronte",
        "first_publication_year": 1847,
        "origin_country": "United Kingdom",
        "original_language": "English",
        "format": "novel",
        "genre_primary": "Gothic",
        "genre_secondary": ["Romance", "Coming-of-Age"],
        "short_tags": ["orphan", "governess", "secrets", "madness", "fire", "rochester"],
        "plain_text_utf8_url": "https://www.gutenberg.org/ebooks/1260.txt.utf-8",
        "ebook_page_url": "https://www.gutenberg.org/ebooks/1260",
        "notes": "Originally published under the pen name Currer Bell.",
        "recognizability_rank": 12,
        "genre_clarity_rank": 19,
        "twist_peak_rank": 4,
        "twist_peak_reason": "Contains a massive mid-story twist regarding Bertha Mason, the 'madwoman in the attic', drastically shifting the plot trajectory.",
        "citations": [
            "https://en.wikipedia.org/wiki/Jane_Eyre",
            "https://www.britannica.com/topic/Jane-Eyre",
        ],
    },
    {
        "pg_id": 1513,
        "title": "Romeo and Juliet",
        "author": "William Shakespeare",
        "first_publication_year": 1597,
        "origin_country": "United Kingdom",
        "original_language": "English",
        "format": "play",
        "genre_primary": "Tragedy",
        "genre_secondary": ["Romance"],
        "short_tags": ["star-crossed", "feuds", "verona", "poison", "lovers", "balcony"],
        "plain_text_utf8_url": "https://www.gutenberg.org/ebooks/1513.txt.utf-8",
        "ebook_page_url": "https://www.gutenberg.org/ebooks/1513",
        "notes": "1597 is the year of the First Quarto publication; likely first performed between 1591 and 1595.",
        "recognizability_rank": 1,
        "genre_clarity_rank": 4,
        "twist_peak_rank": 5,
        "twist_peak_reason": "The dramatic irony of Juliet's faked death leading to Romeo's actual suicide forms one of literature's most intense peaks.",
        "citations": [
            "https://en.wikipedia.org/wiki/Romeo_and_Juliet",
            "https://www.britannica.com/topic/Romeo-and-Juliet",
        ],
    },
    {
        "pg_id": 768,
        "title": "Wuthering Heights",
        "author": "Emily Bronte",
        "first_publication_year": 1847,
        "origin_country": "United Kingdom",
        "original_language": "English",
        "format": "novel",
        "genre_primary": "Gothic",
        "genre_secondary": ["Tragedy", "Romance"],
        "short_tags": ["moors", "ghosts", "revenge", "obsession", "dark", "heathcliff"],
        "plain_text_utf8_url": "https://www.gutenberg.org/ebooks/768.txt.utf-8",
        "ebook_page_url": "https://www.gutenberg.org/ebooks/768",
        "notes": "Originally published under the pen name Ellis Bell.",
        "recognizability_rank": 14,
        "genre_clarity_rank": 17,
        "twist_peak_rank": 6,
        "twist_peak_reason": "Driven by vicious cycles of revenge, sudden cruelty, and supernatural elements that create jagged, emotionally volatile turning points.",
        "citations": [
            "https://en.wikipedia.org/wiki/Wuthering_Heights",
            "https://www.britannica.com/topic/Wuthering-Heights",
        ],
    },
    {
        "pg_id": 1661,
        "title": "The Adventures of Sherlock Holmes",
        "author": "Arthur Conan Doyle",
        "first_publication_year": 1892,
        "origin_country": "United Kingdom",
        "original_language": "English",
        "format": "short-story collection",
        "genre_primary": "Mystery",
        "genre_secondary": ["Crime", "Detective"],
        "short_tags": ["london", "deduction", "watson", "murder", "clues", "sleuth", "baker-street"],
        "plain_text_utf8_url": "https://www.gutenberg.org/ebooks/1661.txt.utf-8",
        "ebook_page_url": "https://www.gutenberg.org/ebooks/1661",
        "notes": "Stories were originally serialized in The Strand Magazine between 1891 and 1892.",
        "recognizability_rank": 3,
        "genre_clarity_rank": 2,
        "twist_peak_rank": 7,
        "twist_peak_reason": "Each short story inherently hinges on a clever reveal or deductive twist at the climax.",
        "citations": [
            "https://en.wikipedia.org/wiki/The_Adventures_of_Sherlock_Holmes",
            "https://www.britannica.com/topic/The-Adventures-of-Sherlock-Holmes",
        ],
    },
    {
        "pg_id": 345,
        "title": "Dracula",
        "author": "Bram Stoker",
        "first_publication_year": 1897,
        "origin_country": "Ireland",
        "original_language": "English",
        "format": "novel",
        "genre_primary": "Horror",
        "genre_secondary": ["Gothic", "Epistolary"],
        "short_tags": ["vampire", "transylvania", "blood", "van-helsing", "night", "undead"],
        "plain_text_utf8_url": "https://www.gutenberg.org/ebooks/345.txt.utf-8",
        "ebook_page_url": "https://www.gutenberg.org/ebooks/345",
        "notes": "Stoker was Irish, but the book was published in the UK and set partly there and partly in Transylvania.",
        "recognizability_rank": 5,
        "genre_clarity_rank": 1,
        "twist_peak_rank": 8,
        "twist_peak_reason": "Features highly escalating dramatic peaks, including characters turning into vampires, betrayals, and the intense final chase.",
        "citations": [
            "https://en.wikipedia.org/wiki/Dracula",
            "https://www.britannica.com/topic/Dracula-novel",
        ],
    },
    {
        "pg_id": 84,
        "title": "Frankenstein; Or, The Modern Prometheus",
        "author": "Mary Wollstonecraft Shelley",
        "first_publication_year": 1818,
        "origin_country": "United Kingdom",
        "original_language": "English",
        "format": "novel",
        "genre_primary": "Sci-Fi",
        "genre_secondary": ["Gothic", "Horror"],
        "short_tags": ["monster", "science", "creation", "tragic", "hubris", "scientist"],
        "plain_text_utf8_url": "https://www.gutenberg.org/ebooks/84.txt.utf-8",
        "ebook_page_url": "https://www.gutenberg.org/ebooks/84",
        "notes": "First published anonymously in 1818. Revised 1831 edition is also very common.",
        "recognizability_rank": 6,
        "genre_clarity_rank": 15,
        "twist_peak_rank": 11,
        "twist_peak_reason": "Escalates with shocking murders of Victor's loved ones by the monster, serving as intense emotional and narrative peaks.",
        "citations": [
            "https://en.wikipedia.org/wiki/Frankenstein",
            "https://www.britannica.com/topic/Frankenstein",
        ],
    },
    {
        "pg_id": 43,
        "title": "The Strange Case of Dr. Jekyll and Mr. Hyde",
        "author": "Robert Louis Stevenson",
        "first_publication_year": 1886,
        "origin_country": "United Kingdom",
        "original_language": "English",
        "format": "novella",
        "genre_primary": "Horror",
        "genre_secondary": ["Sci-Fi", "Psychological Thriller"],
        "short_tags": ["duality", "science", "murder", "potion", "london", "alter-ego"],
        "plain_text_utf8_url": "https://www.gutenberg.org/ebooks/43.txt.utf-8",
        "ebook_page_url": "https://www.gutenberg.org/ebooks/43",
        "notes": "Stevenson was Scottish; published in the UK.",
        "recognizability_rank": 17,
        "genre_clarity_rank": 16,
        "twist_peak_rank": 1,
        "twist_peak_reason": "Contains one of the most famous plot twists in literary history: Jekyll and Hyde are the same person.",
        "citations": [
            "https://en.wikipedia.org/wiki/Strange_Case_of_Dr_Jekyll_and_Mr_Hyde",
            "https://www.britannica.com/topic/The-Strange-Case-of-Dr-Jekyll-and-Mr-Hyde",
        ],
    },
    {
        "pg_id": 175,
        "title": "The Phantom of the Opera",
        "author": "Gaston Leroux",
        "first_publication_year": 1910,
        "origin_country": "France",
        "original_language": "French",
        "format": "novel",
        "genre_primary": "Gothic",
        "genre_secondary": ["Romance", "Mystery"],
        "short_tags": ["paris", "opera", "ghost", "music", "obsession", "unrequited-love"],
        "plain_text_utf8_url": "https://www.gutenberg.org/ebooks/175.txt.utf-8",
        "ebook_page_url": "https://www.gutenberg.org/ebooks/175",
        "notes": "Serialized from 1909 to 1910, published as a volume in 1910.",
        "recognizability_rank": 13,
        "genre_clarity_rank": 18,
        "twist_peak_rank": 3,
        "twist_peak_reason": "Culminates in a highly theatrical unmasking and a suspenseful, explosive climax in the subterranean lake.",
        "citations": [
            "https://en.wikipedia.org/wiki/The_Phantom_of_the_Opera",
            "https://www.britannica.com/topic/The-Phantom-of-the-Opera-by-Leroux",
        ],
    },
    {
        "pg_id": 11,
        "title": "Alice's Adventures in Wonderland",
        "author": "Lewis Carroll",
        "first_publication_year": 1865,
        "origin_country": "United Kingdom",
        "original_language": "English",
        "format": "novel",
        "genre_primary": "Fantasy",
        "genre_secondary": ["Children's Fiction", "Absurdist"],
        "short_tags": ["rabbit-hole", "tea-party", "dream", "nonsense", "queen-of-hearts", "magic"],
        "plain_text_utf8_url": "https://www.gutenberg.org/ebooks/11.txt.utf-8",
        "ebook_page_url": "https://www.gutenberg.org/ebooks/11",
        "notes": "Lewis Carroll is the pen name of Charles Lutwidge Dodgson.",
        "recognizability_rank": 2,
        "genre_clarity_rank": 9,
        "twist_peak_rank": 19,
        "twist_peak_reason": "Picaresque and episodic structure; the 'it was all a dream' ending functions as a reveal, but the narrative is otherwise dreamily meandering.",
        "citations": [
            "https://en.wikipedia.org/wiki/Alice%27s_Adventures_in_Wonderland",
            "https://www.britannica.com/topic/Alices-Adventures-in-Wonderland",
        ],
    },
    {
        "pg_id": 16,
        "title": "Peter Pan",
        "author": "J. M. Barrie",
        "first_publication_year": 1911,
        "origin_country": "United Kingdom",
        "original_language": "English",
        "format": "novel",
        "genre_primary": "Fantasy",
        "genre_secondary": ["Children's Fiction", "Adventure"],
        "short_tags": ["neverland", "pirates", "fairies", "childhood", "flying", "hook"],
        "plain_text_utf8_url": "https://www.gutenberg.org/ebooks/16.txt.utf-8",
        "ebook_page_url": "https://www.gutenberg.org/ebooks/16",
        "notes": "Play first performed in 1904; novelized as 'Peter and Wendy' in 1911.",
        "recognizability_rank": 4,
        "genre_clarity_rank": 10,
        "twist_peak_rank": 17,
        "twist_peak_reason": "Contains an action-packed final clash with Captain Hook, but primarily functions as an imaginative, whimsical childhood adventure.",
        "citations": [
            "https://en.wikipedia.org/wiki/Peter_and_Wendy",
            "https://www.britannica.com/topic/Peter-Pan-play",
        ],
    },
    {
        "pg_id": 55,
        "title": "The Wonderful Wizard of Oz",
        "author": "L. Frank Baum",
        "first_publication_year": 1900,
        "origin_country": "United States",
        "original_language": "English",
        "format": "novel",
        "genre_primary": "Fantasy",
        "genre_secondary": ["Children's Fiction", "Adventure"],
        "short_tags": ["tornado", "witches", "yellow-brick-road", "magic", "friends", "emerald-city"],
        "plain_text_utf8_url": "https://www.gutenberg.org/ebooks/55.txt.utf-8",
        "ebook_page_url": "https://www.gutenberg.org/ebooks/55",
        "notes": "First published in Chicago.",
        "recognizability_rank": 8,
        "genre_clarity_rank": 7,
        "twist_peak_rank": 16,
        "twist_peak_reason": "Features a solid late-narrative twist that the terrifying Wizard is actually just an ordinary man from Omaha.",
        "citations": [
            "https://en.wikipedia.org/wiki/The_Wonderful_Wizard_of_Oz",
            "https://www.britannica.com/topic/The-Wonderful-Wizard-of-Oz",
        ],
    },
    {
        "pg_id": 113,
        "title": "The Secret Garden",
        "author": "Frances Hodgson Burnett",
        "first_publication_year": 1911,
        "origin_country": "United States",
        "original_language": "English",
        "format": "novel",
        "genre_primary": "Children's Fiction",
        "genre_secondary": ["Coming-of-Age", "Realism"],
        "short_tags": ["orphans", "yorkshire", "healing", "nature", "garden", "friendship"],
        "plain_text_utf8_url": "https://www.gutenberg.org/ebooks/113.txt.utf-8",
        "ebook_page_url": "https://www.gutenberg.org/ebooks/113",
        "notes": "Serialized starting in 1910, published as a book in 1911. Burnett was British-American.",
        "recognizability_rank": 15,
        "genre_clarity_rank": 14,
        "twist_peak_rank": 18,
        "twist_peak_reason": "A steady, emotional slice-of-life healing narrative with very few sharp betrayals or twists, culminating in Colin's recovery.",
        "citations": [
            "https://en.wikipedia.org/wiki/The_Secret_Garden",
            "https://www.britannica.com/topic/The-Secret-Garden",
        ],
    },
    {
        "pg_id": 120,
        "title": "Treasure Island",
        "author": "Robert Louis Stevenson",
        "first_publication_year": 1883,
        "origin_country": "United Kingdom",
        "original_language": "English",
        "format": "novel",
        "genre_primary": "Adventure",
        "genre_secondary": ["Coming-of-Age", "Pirate Fiction"],
        "short_tags": ["pirates", "gold", "map", "mutiny", "sea", "long-john-silver"],
        "plain_text_utf8_url": "https://www.gutenberg.org/ebooks/120.txt.utf-8",
        "ebook_page_url": "https://www.gutenberg.org/ebooks/120",
        "notes": "Serialized 1881-1882 before book publication.",
        "recognizability_rank": 10,
        "genre_clarity_rank": 5,
        "twist_peak_rank": 9,
        "twist_peak_reason": "Driven by Long John Silver's shocking mutiny and continuous shifts of loyalty, providing strong narrative peaks.",
        "citations": [
            "https://en.wikipedia.org/wiki/Treasure_Island",
            "https://www.britannica.com/topic/Treasure-Island",
        ],
    },
    {
        "pg_id": 521,
        "title": "Robinson Crusoe",
        "author": "Daniel Defoe",
        "first_publication_year": 1719,
        "origin_country": "United Kingdom",
        "original_language": "English",
        "format": "novel",
        "genre_primary": "Adventure",
        "genre_secondary": ["Survival", "Historical Fiction"],
        "short_tags": ["island", "castaway", "survival", "shipwreck", "friday", "isolation"],
        "plain_text_utf8_url": "https://www.gutenberg.org/ebooks/521.txt.utf-8",
        "ebook_page_url": "https://www.gutenberg.org/ebooks/521",
        "notes": "Originally published presenting Crusoe as the author.",
        "recognizability_rank": 20,
        "genre_clarity_rank": 13,
        "twist_peak_rank": 20,
        "twist_peak_reason": "Almost entirely a slow, methodical survival journal with relatively low emotional spiking until Friday's arrival late in the text.",
        "citations": [
            "https://en.wikipedia.org/wiki/Robinson_Crusoe",
            "https://www.britannica.com/topic/Robinson-Crusoe-novel",
        ],
    },
    {
        "pg_id": 1184,
        "title": "The Count of Monte Cristo",
        "author": "Alexandre Dumas",
        "first_publication_year": 1844,
        "origin_country": "France",
        "original_language": "French",
        "format": "novel",
        "genre_primary": "Adventure",
        "genre_secondary": ["Historical Fiction", "Revenge"],
        "short_tags": ["prison", "treasure", "revenge", "betrayal", "disguise", "paris", "marseille"],
        "plain_text_utf8_url": "https://www.gutenberg.org/ebooks/1184.txt.utf-8",
        "ebook_page_url": "https://www.gutenberg.org/ebooks/1184",
        "notes": "Serialized starting in 1844.",
        "recognizability_rank": 18,
        "genre_clarity_rank": 20,
        "twist_peak_rank": 2,
        "twist_peak_reason": "Packed with shocking reveals, sudden betrayals, prison escapes, and a meticulously crafted, trap-springing vengeance arc.",
        "citations": [
            "https://en.wikipedia.org/wiki/The_Count_of_Monte_Cristo",
            "https://www.britannica.com/topic/The-Count-of-Monte-Cristo",
        ],
    },
    {
        "pg_id": 1257,
        "title": "The Three Musketeers",
        "author": "Alexandre Dumas",
        "first_publication_year": 1844,
        "origin_country": "France",
        "original_language": "French",
        "format": "novel",
        "genre_primary": "Adventure",
        "genre_secondary": ["Historical Fiction", "Swashbuckler"],
        "short_tags": ["swords", "friendship", "kings", "guards", "plot", "honor", "france"],
        "plain_text_utf8_url": "https://www.gutenberg.org/ebooks/1257.txt.utf-8",
        "ebook_page_url": "https://www.gutenberg.org/ebooks/1257",
        "notes": "Serialized from March to July 1844.",
        "recognizability_rank": 19,
        "genre_clarity_rank": 12,
        "twist_peak_rank": 15,
        "twist_peak_reason": "Contains strong court intrigue and sudden duels, culminating in the dark, climactic execution of Milady de Winter.",
        "citations": [
            "https://en.wikipedia.org/wiki/The_Three_Musketeers",
            "https://www.britannica.com/topic/The-Three-Musketeers",
        ],
    },
    {
        "pg_id": 103,
        "title": "Around the World in Eighty Days",
        "author": "Jules Verne",
        "first_publication_year": 1872,
        "origin_country": "France",
        "original_language": "French",
        "format": "novel",
        "genre_primary": "Adventure",
        "genre_secondary": ["Sci-Fi", "Travelogue"],
        "short_tags": ["travel", "bet", "train", "ship", "race", "time", "circumnavigation"],
        "plain_text_utf8_url": "https://www.gutenberg.org/ebooks/103.txt.utf-8",
        "ebook_page_url": "https://www.gutenberg.org/ebooks/103",
        "notes": "First published in French in 1872.",
        "recognizability_rank": 11,
        "genre_clarity_rank": 11,
        "twist_peak_rank": 10,
        "twist_peak_reason": "Culminates in a legendary, unexpected twist ending regarding time zones and the International Date Line.",
        "citations": [
            "https://en.wikipedia.org/wiki/Around_the_World_in_Eighty_Days",
            "https://www.britannica.com/topic/Around-the-World-in-Eighty-Days",
        ],
    },
    {
        "pg_id": 35,
        "title": "The Time Machine",
        "author": "H. G. Wells",
        "first_publication_year": 1895,
        "origin_country": "United Kingdom",
        "original_language": "English",
        "format": "novella",
        "genre_primary": "Sci-Fi",
        "genre_secondary": ["Dystopian", "Adventure"],
        "short_tags": ["future", "time-travel", "eloi", "morlocks", "evolution", "invention"],
        "plain_text_utf8_url": "https://www.gutenberg.org/ebooks/35.txt.utf-8",
        "ebook_page_url": "https://www.gutenberg.org/ebooks/35",
        "notes": "Serialized early 1895; book published later the same year.",
        "recognizability_rank": 9,
        "genre_clarity_rank": 6,
        "twist_peak_rank": 13,
        "twist_peak_reason": "The revelation of the predatory Morlocks feeding on the Eloi provides a horrifying shift in the middle of the narrative.",
        "citations": [
            "https://en.wikipedia.org/wiki/The_Time_Machine",
            "https://www.britannica.com/topic/The-Time-Machine",
        ],
    },
    {
        "pg_id": 36,
        "title": "The War of the Worlds",
        "author": "H. G. Wells",
        "first_publication_year": 1898,
        "origin_country": "United Kingdom",
        "original_language": "English",
        "format": "novel",
        "genre_primary": "Sci-Fi",
        "genre_secondary": ["Invasion", "Horror"],
        "short_tags": ["aliens", "martians", "tripods", "invasion", "survival", "destruction", "bacteria"],
        "plain_text_utf8_url": "https://www.gutenberg.org/ebooks/36.txt.utf-8",
        "ebook_page_url": "https://www.gutenberg.org/ebooks/36",
        "notes": "Serialized in 1897, published as a book in 1898.",
        "recognizability_rank": 16,
        "genre_clarity_rank": 3,
        "twist_peak_rank": 14,
        "twist_peak_reason": "The sudden defeat of the seemingly invincible Martians by microscopic Earth bacteria offers a sharp, deus-ex-machina-like resolution.",
        "citations": [
            "https://en.wikipedia.org/wiki/The_War_of_the_Worlds",
            "https://www.britannica.com/topic/The-War-of-the-Worlds",
        ],
    },
]

# Gutenberg ids of every catalogued book, in catalog order.
CANDIDATE_IDS = [int(spec["pg_id"]) for spec in BOOK_SPECS]
# The goal is to obtain every catalogued book; MIN_SUCCESS is the lower bound.
TARGET_SUCCESS = len(BOOK_SPECS)

# Echo the effective configuration so the run is self-describing.
print(f"Project root: {PROJECT_ROOT}")
print(f"Target successes: {TARGET_SUCCESS}, Minimum required: {MIN_SUCCESS}")
print(f"Books requested: {len(BOOK_SPECS)}")


Project root: /Users/kongfha/Desktop/Time_Series_Mining/story-trajectory-analysis
Target successes: 20, Minimum required: 15
Books requested: 20


In [8]:
def build_gutenberg_urls(book_id: int, primary_txt_url: str = None):
    """List the plain-text download URLs to try for one book, best guess first.

    Args:
        book_id: Project Gutenberg ebook number substituted into each URL pattern.
        primary_txt_url: Optional preferred URL; when truthy it is tried first.

    Returns:
        A list of unique URLs in priority order. A preferred URL that equals one
        of the built-in patterns appears only once, at its first position.
    """
    candidates = [primary_txt_url] if primary_txt_url else []
    candidates += [
        f"https://www.gutenberg.org/ebooks/{book_id}.txt.utf-8",
        f"https://www.gutenberg.org/cache/epub/{book_id}/pg{book_id}.txt",
        f"https://www.gutenberg.org/cache/epub/{book_id}/pg{book_id}.txt.utf-8",
        f"https://www.gutenberg.org/files/{book_id}/{book_id}-0.txt",
        f"https://www.gutenberg.org/files/{book_id}/{book_id}.txt",
    ]
    # dict keys are unique and keep insertion order, so this drops repeats
    # while leaving the first occurrence of each URL where it was.
    return list(dict.fromkeys(candidates))


def decode_response_content(content: bytes):
    """Decode a downloaded payload to text, preferring UTF-8.

    The "utf-8-sig" codec is tried first: it decodes ordinary UTF-8 and also
    removes a leading byte-order mark, so the returned text never starts with
    "\\ufeff". Bytes that are not valid UTF-8 are decoded as Latin-1 instead.

    Args:
        content: Raw response body.

    Returns:
        The decoded string. This function does not raise for any byte input.
    """
    try:
        return content.decode("utf-8-sig")
    except UnicodeDecodeError:
        # Latin-1 assigns a character to every possible byte value, so this
        # decode cannot fail and no further fallback is needed.
        return content.decode("latin-1")


def looks_like_html(text: str):
    """Report whether the first 500 characters of ``text`` contain an HTML opener.

    The check is case-insensitive and looks for "<html" or "<!doctype html";
    anything beyond the first 500 characters is not inspected.
    """
    prefix = text[:500].lower()
    return any(marker in prefix for marker in ("<html", "<!doctype html"))


def download_with_retries(url: str, session: requests.Session, retries: int = 3, timeout: int = 25):
    """Fetch ``url`` as text, retrying only failures that could be transient.

    Args:
        url: Address to request.
        session: Session used to issue the GET request.
        retries: Maximum number of attempts.
        timeout: Per-request timeout handed to ``session.get``.

    Returns:
        The decoded response body, or ``None`` when
        * the body looks like an HTML page rather than plain text,
        * the server answers with a 4xx status other than 408/429 (the URL
          itself is wrong, so repeating the request cannot help), or
        * every attempt failed.

    Between attempts the function sleeps ``BACKOFF_BASE_SECONDS``, doubling the
    delay each time.
    """
    # 4xx means "this request is wrong", except these two, which ask the
    # client to try again later (Request Timeout, Too Many Requests).
    retryable_client_errors = (408, 429)

    for attempt in range(1, retries + 1):
        try:
            resp = session.get(url, timeout=timeout)
            status = resp.status_code
            if status == 200 and resp.content:
                txt = decode_response_content(resp.content)
                # An HTML body is not the plain-text book; give up on this URL.
                if looks_like_html(txt):
                    return None
                return txt
            if 400 <= status < 500 and status not in retryable_client_errors:
                # Permanent client error (e.g. 404): skip the backoff entirely
                # so the caller can move on to its next candidate URL.
                return None
        except requests.RequestException:
            # Deliberate best effort: connection problems and timeouts are
            # treated as transient and handled by the retry below.
            pass

        if attempt < retries:
            wait_s = BACKOFF_BASE_SECONDS * (2 ** (attempt - 1))
            time.sleep(wait_s)
    return None


# Case-insensitive patterns that identify the line where the book body begins.
# clean_gutenberg_text tests them line by line (via _find_marker_index) and
# discards that line and everything before it.
_START_PATTERNS = [
    # e.g. "START OF THE PROJECT GUTENBERG EBOOK ..." / "START OF THIS PROJECT GUTENBERG EBOOK ..."
    re.compile(r"start\s+of\s+(?:this|the)\s+project\s+gutenberg\s+ebook", re.IGNORECASE),
    # Looser form: any line containing "*** START OF", whatever follows.
    re.compile(r"\*\*\*\s*start\s+of", re.IGNORECASE),
]
# Counterparts for the line where the book body ends; clean_gutenberg_text
# discards that line and everything after it.
_END_PATTERNS = [
    re.compile(r"end\s+of\s+(?:this|the)\s+project\s+gutenberg\s+ebook", re.IGNORECASE),
    re.compile(r"\*\*\*\s*end\s+of", re.IGNORECASE),
]

# Filler words left out of file-name slugs unless the title consists of nothing else.
_TITLE_STOPWORDS = {
    "a", "an", "the",
    "and", "or",
    "of", "to", "in", "on", "for", "by", "with",
}


def abbreviate_title(title: str, max_words: int = 4):
    """Turn a book title into a short ASCII slug of lowercase words joined by "_".

    Accents are folded to plain ASCII, every run of characters outside
    ``[a-z0-9]`` becomes a word break, stopwords are dropped (unless that would
    leave nothing), and only the first ``max_words`` words are kept. The slug
    is capped at 60 characters.

    Args:
        title: Human-readable title.
        max_words: Maximum number of words in the slug.

    Returns:
        The slug, or "book" when no usable characters remain.
    """
    folded = unicodedata.normalize("NFKD", title).encode("ascii", errors="ignore").decode("ascii")
    words = re.sub(r"[^a-z0-9]+", " ", folded.lower()).split()
    if not words:
        return "book"

    content_words = [word for word in words if word not in _TITLE_STOPWORDS]
    # Fall back to the unfiltered words for titles made up only of stopwords.
    kept = (content_words or words)[:max_words]
    # Trim again after the length cap in case the cut landed on a separator.
    slug = "_".join(kept).strip("_")[:60].rstrip("_")
    return slug if slug else "book"


def _find_marker_index(lines, patterns):
    for i, line in enumerate(lines):
        for pat in patterns:
            if pat.search(line):
                return i
    return None


def clean_gutenberg_text(raw_text: str):
    """Cut the text down to what lies between the start and end marker lines.

    BOM and NUL characters are removed and all line endings become "\\n" before
    the marker lines are located with ``_START_PATTERNS`` / ``_END_PATTERNS``.

    * Both markers found, end after start: keep the lines strictly between them.
    * Start marker usable alone: keep everything after it.
    * Only an end marker: keep everything before it.
    * No marker: keep all lines.

    Returns:
        ``(cleaned, used_markers)`` where ``cleaned`` has runs of three or more
        newlines collapsed to a single blank line and is stripped at both ends,
        and ``used_markers`` is True when at least one marker was found.
    """
    normalized = raw_text.replace("\ufeff", "").replace("\x00", "")
    normalized = normalized.replace("\r\n", "\n").replace("\r", "\n")
    all_lines = normalized.split("\n")

    first = _find_marker_index(all_lines, _START_PATTERNS)
    last = _find_marker_index(all_lines, _END_PATTERNS)
    has_start = first is not None
    has_end = last is not None

    if has_start and has_end and last > first:
        body = all_lines[first + 1:last]
    elif has_start:
        # Covers a missing end marker and an end marker at or before the start.
        body = all_lines[first + 1:]
    elif has_end:
        body = all_lines[:last]
    else:
        body = all_lines

    cleaned = re.sub(r"\n{3,}", "\n\n", "\n".join(body)).strip()
    return cleaned, (has_start or has_end)


def extract_title(cleaned_text: str, book_id: int):
    """Guess a title from the first 80 lines of ``cleaned_text``.

    A line beginning with "Title:" wins if it carries a value. Otherwise the
    first line that contains a letter and is not a recognised metadata or
    credit line is used. Results are capped at 180 characters.

    Args:
        cleaned_text: Book text to inspect.
        book_id: Used to build the fallback name.

    Returns:
        The detected title, or ``"Book_<book_id>"`` when nothing suitable is found.
    """
    ignored_prefixes = (
        "author:", "illustrator:", "translator:", "release date:",
        "produced by", "ebook #", "project gutenberg",
    )
    for raw_line in cleaned_text.splitlines()[:80]:
        stripped = raw_line.strip()
        lowered = stripped.lower()
        if lowered.startswith("title:"):
            value = stripped.partition(":")[2].strip()
            if value:
                return value[:180]
            # An empty "Title:" line is skipped; keep scanning.
        elif (
            stripped
            and not lowered.startswith(ignored_prefixes)
            and re.search(r"[a-zA-Z]", stripped)
        ):
            return stripped[:180]
    return f"Book_{book_id}"


def word_count(text: str):
    """Return the number of whitespace-delimited tokens in ``text``."""
    # Count matches lazily instead of materializing the full token list.
    return sum(1 for _ in re.finditer(r"\S+", text))


In [9]:
# One HTTP session (with the project User-Agent) is reused for every request.
session = requests.Session()
session.headers.update({"User-Agent": USER_AGENT})

rows = []                    # one metadata record per usable book
failures = []                # specs for which no candidate URL yielded enough text
used_raw_filenames = set()   # filenames already claimed during this pass


def _metadata_row(book, book_id, n_words, n_chars, source_url, status,
                  raw_filename, raw_path, processed_dir):
    """Return a copy of the book spec extended with the per-file bookkeeping fields."""
    return {
        **book,
        "id": book_id,
        "length": int(n_words),
        "char_length": int(n_chars),
        "source_url": source_url,
        "status": status,
        "raw_filename": raw_filename,
        "raw_path": str(raw_path),
        "processed_dir": processed_dir,
        "processed_path": str(PROCESSED_DIR / processed_dir),
    }


for book in BOOK_SPECS:
    # Stop once the requested number of books has been collected.
    if len(rows) >= TARGET_SUCCESS:
        break

    book_id = int(book["pg_id"])
    expected_title = book["title"]
    # Not used further down; the lookups still raise KeyError for incomplete specs.
    expected_author = book["author"]
    ebook_page_url = book["ebook_page_url"]
    preferred_plaintext_url = book["plain_text_utf8_url"]

    # Derive the output filename from the title; append the id on a slug clash.
    base_slug = abbreviate_title(expected_title)
    raw_filename = f"{base_slug}.txt"
    if raw_filename in used_raw_filenames:
        raw_filename = f"{base_slug}_{book_id}.txt"
    used_raw_filenames.add(raw_filename)

    processed_dir = Path(raw_filename).stem

    raw_path = RAW_DIR / raw_filename
    legacy_raw_path = RAW_DIR / f"{book_id}.txt"
    # Prefer the slug-named file; fall back to the older "<id>.txt" naming.
    existing_path = next(
        (candidate for candidate in (raw_path, legacy_raw_path) if candidate.exists()),
        None,
    )

    # --- Reuse a file already on disk when it is long enough ---
    if not FORCE_REDOWNLOAD and existing_path is not None:
        cached_text = existing_path.read_text(encoding="utf-8", errors="ignore")
        cached_words = word_count(cached_text)
        if cached_words >= MIN_WORDS:
            if existing_path != raw_path:
                # Copy the legacy-named file to the slug-based name.
                raw_path.write_text(cached_text, encoding="utf-8")

            row = _metadata_row(
                book, book_id, cached_words, len(cached_text),
                "cached", "cached",
                raw_filename, raw_path, processed_dir,
            )
            rows.append(row)
            print(f"[cached] {book_id} | {expected_title} -> {raw_filename} ({cached_words} words)")
            continue

    # --- Otherwise try each candidate URL until one yields enough text ---
    selected_url = None
    cleaned_text = None
    marker_used = False

    for url in build_gutenberg_urls(book_id=book_id, primary_txt_url=preferred_plaintext_url):
        raw_text = download_with_retries(url, session=session, retries=MAX_RETRIES, timeout=REQUEST_TIMEOUT)
        if raw_text is None:
            continue

        candidate_cleaned, used_markers = clean_gutenberg_text(raw_text)
        n_words = word_count(candidate_cleaned)
        if n_words >= MIN_WORDS:
            selected_url, cleaned_text, marker_used = url, candidate_cleaned, used_markers
            break

    if cleaned_text is None:
        failures.append(book)
        print(f"[failed] {book_id} | {expected_title}")
        continue

    raw_path.write_text(cleaned_text, encoding="utf-8")

    row = _metadata_row(
        book, book_id, word_count(cleaned_text), len(cleaned_text),
        selected_url,
        "downloaded_marker" if marker_used else "downloaded_no_marker",
        raw_filename, raw_path, processed_dir,
    )
    rows.append(row)
    print(f"[ok] {book_id} | {expected_title} -> {raw_filename} from {selected_url}")

# --- Summary of the pass ---
success_count = len(rows)
print(f"\nCompleted download pass: {success_count} successes, {len(failures)} failures")
if failures:
    print("Failed IDs:", [int(failed["pg_id"]) for failed in failures])
    print("Failed titles:", [failed["title"] for failed in failures])


[ok] 1342 | Pride and Prejudice -> pride_prejudice.txt from https://www.gutenberg.org/ebooks/1342.txt.utf-8
[ok] 1260 | Jane Eyre -> jane_eyre.txt from https://www.gutenberg.org/ebooks/1260.txt.utf-8
[ok] 1513 | Romeo and Juliet -> romeo_juliet.txt from https://www.gutenberg.org/ebooks/1513.txt.utf-8
[ok] 768 | Wuthering Heights -> wuthering_heights.txt from https://www.gutenberg.org/ebooks/768.txt.utf-8
[ok] 1661 | The Adventures of Sherlock Holmes -> adventures_sherlock_holmes.txt from https://www.gutenberg.org/ebooks/1661.txt.utf-8
[ok] 345 | Dracula -> dracula.txt from https://www.gutenberg.org/ebooks/345.txt.utf-8
[ok] 84 | Frankenstein; Or, The Modern Prometheus -> frankenstein_modern_prometheus.txt from https://www.gutenberg.org/ebooks/84.txt.utf-8
[ok] 43 | The Strange Case of Dr. Jekyll and Mr. Hyde -> strange_case_dr_jekyll.txt from https://www.gutenberg.org/ebooks/43.txt.utf-8
[ok] 175 | The Phantom of the Opera -> phantom_opera.txt from https://www.gutenberg.org/ebooks/175.

In [10]:
# Refuse to write metadata when fewer books than the required minimum are available.
if len(rows) < MIN_SUCCESS:
    raise RuntimeError(
        f"Only {len(rows)} books were downloaded/available. Minimum required is {MIN_SUCCESS}."
    )


def _list_to_json(value):
    """Serialize list cells to a JSON string; return every other value unchanged."""
    if isinstance(value, list):
        return json.dumps(value, ensure_ascii=False)
    return value


metadata_df = pd.DataFrame(rows)

# List-valued fields are stored as JSON strings in the CSV.
for col in ["genre_secondary", "short_tags", "citations"]:
    if col in metadata_df.columns:
        metadata_df[col] = metadata_df[col].apply(_list_to_json)

# Preferred column order for the CSV; names absent from the frame are skipped.
ordered_cols = [
    "id", "pg_id", "title", "author",
    "first_publication_year", "origin_country", "original_language",
    "format", "genre_primary", "genre_secondary", "short_tags",
    "recognizability_rank", "genre_clarity_rank",
    "twist_peak_rank", "twist_peak_reason",
    "notes",
    "ebook_page_url", "plain_text_utf8_url",
    "raw_filename", "raw_path",
    "processed_dir", "processed_path",
    "length", "char_length",
    "source_url", "status",
    "citations",
]

present_cols = [name for name in ordered_cols if name in metadata_df.columns]
metadata_df = (
    metadata_df[present_cols]
    .sort_values("id")
    .reset_index(drop=True)
)

# Persist the metadata table and the full catalog of book specs.
metadata_df.to_csv(METADATA_PATH, index=False)
BOOK_CATALOG_PATH.write_text(
    json.dumps(BOOK_SPECS, indent=2, ensure_ascii=False),
    encoding="utf-8",
)

print(f"Saved metadata: {METADATA_PATH}")
print(f"Saved catalog JSON: {BOOK_CATALOG_PATH}")
print(f"Rows: {len(metadata_df)}")
if len(rows) < TARGET_SUCCESS:
    print(f"Warning: requested {TARGET_SUCCESS} books, but only {len(rows)} succeeded.")
display(metadata_df.head(20))


Saved metadata: /Users/kongfha/Desktop/Time_Series_Mining/story-trajectory-analysis/data/metadata.csv
Saved catalog JSON: /Users/kongfha/Desktop/Time_Series_Mining/story-trajectory-analysis/data/book_catalog.json
Rows: 20


Unnamed: 0,id,pg_id,title,author,first_publication_year,origin_country,original_language,format,genre_primary,genre_secondary,...,plain_text_utf8_url,raw_filename,raw_path,processed_dir,processed_path,length,char_length,source_url,status,citations
0,11,11,Alice's Adventures in Wonderland,Lewis Carroll,1865,United Kingdom,English,novel,Fantasy,"[""Children's Fiction"", ""Absurdist""]",...,https://www.gutenberg.org/ebooks/11.txt.utf-8,alice_s_adventures_wonderland.txt,/Users/kongfha/Desktop/Time_Series_Mining/stor...,alice_s_adventures_wonderland,/Users/kongfha/Desktop/Time_Series_Mining/stor...,26525,144529,https://www.gutenberg.org/ebooks/11.txt.utf-8,downloaded_marker,"[""https://en.wikipedia.org/wiki/Alice%27s_Adve..."
1,16,16,Peter Pan,J. M. Barrie,1911,United Kingdom,English,novel,Fantasy,"[""Children's Fiction"", ""Adventure""]",...,https://www.gutenberg.org/ebooks/16.txt.utf-8,peter_pan.txt,/Users/kongfha/Desktop/Time_Series_Mining/stor...,peter_pan,/Users/kongfha/Desktop/Time_Series_Mining/stor...,47268,255514,https://www.gutenberg.org/ebooks/16.txt.utf-8,downloaded_marker,"[""https://en.wikipedia.org/wiki/Peter_and_Wend..."
2,35,35,The Time Machine,H. G. Wells,1895,United Kingdom,English,novella,Sci-Fi,"[""Dystopian"", ""Adventure""]",...,https://www.gutenberg.org/ebooks/35.txt.utf-8,time_machine.txt,/Users/kongfha/Desktop/Time_Series_Mining/stor...,time_machine,/Users/kongfha/Desktop/Time_Series_Mining/stor...,32453,179621,https://www.gutenberg.org/ebooks/35.txt.utf-8,downloaded_marker,"[""https://en.wikipedia.org/wiki/The_Time_Machi..."
3,36,36,The War of the Worlds,H. G. Wells,1898,United Kingdom,English,novel,Sci-Fi,"[""Invasion"", ""Horror""]",...,https://www.gutenberg.org/ebooks/36.txt.utf-8,war_worlds.txt,/Users/kongfha/Desktop/Time_Series_Mining/stor...,war_worlds,/Users/kongfha/Desktop/Time_Series_Mining/stor...,60076,337303,https://www.gutenberg.org/ebooks/36.txt.utf-8,downloaded_marker,"[""https://en.wikipedia.org/wiki/The_War_of_the..."
4,43,43,The Strange Case of Dr. Jekyll and Mr. Hyde,Robert Louis Stevenson,1886,United Kingdom,English,novella,Horror,"[""Sci-Fi"", ""Psychological Thriller""]",...,https://www.gutenberg.org/ebooks/43.txt.utf-8,strange_case_dr_jekyll.txt,/Users/kongfha/Desktop/Time_Series_Mining/stor...,strange_case_dr_jekyll,/Users/kongfha/Desktop/Time_Series_Mining/stor...,25629,138775,https://www.gutenberg.org/ebooks/43.txt.utf-8,downloaded_marker,"[""https://en.wikipedia.org/wiki/Strange_Case_o..."
5,55,55,The Wonderful Wizard of Oz,L. Frank Baum,1900,United States,English,novel,Fantasy,"[""Children's Fiction"", ""Adventure""]",...,https://www.gutenberg.org/ebooks/55.txt.utf-8,wonderful_wizard_oz.txt,/Users/kongfha/Desktop/Time_Series_Mining/stor...,wonderful_wizard_oz,/Users/kongfha/Desktop/Time_Series_Mining/stor...,39649,207707,https://www.gutenberg.org/ebooks/55.txt.utf-8,downloaded_marker,"[""https://en.wikipedia.org/wiki/The_Wonderful_..."
6,84,84,"Frankenstein; Or, The Modern Prometheus",Mary Wollstonecraft Shelley,1818,United Kingdom,English,novel,Sci-Fi,"[""Gothic"", ""Horror""]",...,https://www.gutenberg.org/ebooks/84.txt.utf-8,frankenstein_modern_prometheus.txt,/Users/kongfha/Desktop/Time_Series_Mining/stor...,frankenstein_modern_prometheus,/Users/kongfha/Desktop/Time_Series_Mining/stor...,75042,419194,https://www.gutenberg.org/ebooks/84.txt.utf-8,downloaded_marker,"[""https://en.wikipedia.org/wiki/Frankenstein"",..."
7,103,103,Around the World in Eighty Days,Jules Verne,1872,France,French,novel,Adventure,"[""Sci-Fi"", ""Travelogue""]",...,https://www.gutenberg.org/ebooks/103.txt.utf-8,around_world_eighty_days.txt,/Users/kongfha/Desktop/Time_Series_Mining/stor...,around_world_eighty_days,/Users/kongfha/Desktop/Time_Series_Mining/stor...,63334,369030,https://www.gutenberg.org/ebooks/103.txt.utf-8,downloaded_marker,"[""https://en.wikipedia.org/wiki/Around_the_Wor..."
8,113,113,The Secret Garden,Frances Hodgson Burnett,1911,United States,English,novel,Children's Fiction,"[""Coming-of-Age"", ""Realism""]",...,https://www.gutenberg.org/ebooks/113.txt.utf-8,secret_garden.txt,/Users/kongfha/Desktop/Time_Series_Mining/stor...,secret_garden,/Users/kongfha/Desktop/Time_Series_Mining/stor...,80632,428538,https://www.gutenberg.org/ebooks/113.txt.utf-8,downloaded_marker,"[""https://en.wikipedia.org/wiki/The_Secret_Gar..."
9,120,120,Treasure Island,Robert Louis Stevenson,1883,United Kingdom,English,novel,Adventure,"[""Coming-of-Age"", ""Pirate Fiction""]",...,https://www.gutenberg.org/ebooks/120.txt.utf-8,treasure_island.txt,/Users/kongfha/Desktop/Time_Series_Mining/stor...,treasure_island,/Users/kongfha/Desktop/Time_Series_Mining/stor...,68637,364356,https://www.gutenberg.org/ebooks/120.txt.utf-8,downloaded_marker,"[""https://en.wikipedia.org/wiki/Treasure_Islan..."
