In [35]:
# Cell 1 – Imports & notebook niceties
# %load_ext autoreload
# %autoreload 2            # reload local modules when you edit them
# %config InlineBackend.figure_format = "retina"

import sys, json, time, random, itertools, pathlib, logging, textwrap, re, typing as T
from dataclasses import dataclass

import requests
import difflib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

logging.basicConfig(level=logging.INFO, format="%(levelname)s │ %(message)s")
logger = logging.getLogger(__name__)


In [36]:
# Cell 2 – All user-tunable knobs in one place
@dataclass
class Config:
    # --- INPUT -------------------------------------------------------------
    fetch_pages: list[str] | None = None         # list of Wikipedia page titles
    fetch_start: str | None = None               # "YYYY-MM-DD"
    fetch_end: str | None = None
    fetch_revs_per_page: int | None = 30         # None ⇒ pull ALL
    fetch_delay: float = 0.1                     # seconds between API hits

    file_paths: list[str] | None = None          # alternative: local JSONL(s)
    text_field: str = "added_text"
    timestamp_field: str = "timestamp"

    # --- SAMPLING ----------------------------------------------------------
    sample_size_revisions: int | None = 10_000   # None = keep everything
    sample_seed: int = 42

    # --- BASELINE WINDOW ---------------------------------------------------
    baseline_start: str = "2020-01"              # inclusive YYYY-MM
    baseline_end:   str = "2022-11"

    # --- SPIKE THRESHOLDS --------------------------------------------------
    gap_min: float = 0.5                         # per-million tokens
    ratio_min: float = 10.0

    # --- MISC --------------------------------------------------------------
    verbose: bool = True


In [37]:
# Cell 3 – Fetch ≤ N revisions/page via MediaWiki API, yield dicts

S = requests.Session()
API = "https://en.wikipedia.org/w/api.php"

def iso(date_str: str | None) -> str | None:
    """Turn 'YYYY-MM-DD' → 'YYYY-MM-DDT00:00:00Z' (API needs full timestamp)."""
    return None if date_str is None else f"{date_str}T00:00:00Z"

def fetch_revisions_api(cfg: Config):
    assert cfg.fetch_pages, "No pages specified"
    params_base = {
        "action": "query",
        "prop": "revisions",
        "rvprop": "ids|timestamp|comment|content",
        "rvslots": "main",
        "format": "json",
        "formatversion": "2",
    }

    for title in cfg.fetch_pages:
        params = params_base | {
            "titles": title,
            "rvlimit": cfg.fetch_revs_per_page or "max",
            "rvstart": iso(cfg.fetch_end),
            "rvend":   iso(cfg.fetch_start),
            "rvdir": "newer",
        }
        logger.info(f"Fetching {title!r}")
        data = S.get(API, params=params, timeout=30).json()
        if "error" in data:
            logger.warning(f"API error for {title!r}: {data['error']['code']}")
            continue
        revs = data["query"]["pages"][0].get("revisions", [])
        for r in revs:
            yield {
                "page": title,
                "rev_id": r["revid"],
                "timestamp": r["timestamp"],
                "wikitext": r["slots"]["main"]["content"],
            }
        time.sleep(cfg.fetch_delay)


In [38]:
# Cell 4 – Given successive revisions of *one* page, emit only the inserted words
# Cell 4  (edit)
TOKEN_RX = re.compile(r"[A-Za-z]{3,}")   # accept 3-letter words

def extract_added_text(revisions: list[dict]) -> list[dict]:
    """
    revisions must be in chronological order.
    Returns list of dicts with 'timestamp' and 'added_text'.
    """
    out = []
    prev_lines = []
    for rev in revisions:
        curr_lines = rev["wikitext"].splitlines()
        diff = difflib.ndiff(prev_lines, curr_lines)
        added_words = []
        for line in diff:
            if line.startswith("+ "):
                added_words.extend(TOKEN_RX.findall(line[2:].lower()))
        if added_words:
            out.append({
                "timestamp": rev["timestamp"],
                "added_text": " ".join(added_words)
            })
        prev_lines = curr_lines
    return out


In [39]:
# Cell 5 – Unifies fetch + diff + JSONL into one generator of tiny dicts
def load_revisions(cfg: Config):
    if cfg.fetch_pages:
        # Group API revisions by page, then diff locally
        buf: list[dict] = []
        for raw_rev in fetch_revisions_api(cfg):
            if buf and raw_rev["page"] != buf[0]["page"]:
                yield from extract_added_text(buf)
                buf = []
            buf.append(raw_rev)
        if buf:
            yield from extract_added_text(buf)
    else:
        assert cfg.file_paths, "Provide file_paths or fetch_pages"
        for path in cfg.file_paths:
            with open(path) as f:
                for line in f:
                    obj = json.loads(line)
                    yield {
                        "timestamp": obj[cfg.timestamp_field],
                        "added_text": obj[cfg.text_field],
                    }


In [40]:
# Cell 6 – Keeps ≤ sample_size random revisions
def sample_revs(stream, cfg: Config):
    if cfg.sample_size_revisions is None:
        yield from stream
        return

    random.seed(cfg.sample_seed)
    sample, n_seen = [], 0
    k = cfg.sample_size_revisions
    for item in stream:
        n_seen += 1
        if len(sample) < k:
            sample.append(item)
        else:
            j = random.randrange(n_seen)
            if j < k:
                sample[j] = item
    yield from sample


In [41]:
# Cell 7
def build_counts(rev_iter, cfg: Config) -> pd.DataFrame:
    counts: dict[tuple[str, str], int] = {}
    totals: dict[str, int] = {}

    for rev in rev_iter:
        month = rev["timestamp"][:7]              # YYYY-MM
        tokens = TOKEN_RX.findall(rev["added_text"])
        totals[month] = totals.get(month, 0) + len(tokens)
        for w in tokens:
            key = (w, month)
            counts[key] = counts.get(key, 0) + 1

    rows = []
    for (w, m), c in counts.items():
        rows.append({"word": w,
                     "month": m,
                     "count": c,
                     "tokens_total": totals[m]})
    df = pd.DataFrame(rows)
    df["freq_obs"] = 1_000_000 * df["count"] / df["tokens_total"]
    return df


In [42]:
# Cell 8
from sklearn.linear_model import TheilSenRegressor

def fit_baseline(df: pd.DataFrame, cfg: Config) -> pd.DataFrame:
    df = df.copy()
    df["month_int"] = df["month"].str.replace("-", "").astype(int)

    base = df[(df["month"] >= cfg.baseline_start) & (df["month"] <= cfg.baseline_end)].copy()
    if base.empty:
        raise ValueError("No rows fall inside baseline window – widen fetch_start/end!")

    preds = []
    for w, group in base.groupby("word"):
        if len(group) < 2:
            continue
        X = group[["month_int"]]
        y = group["freq_obs"]
        model = TheilSenRegressor().fit(X, y)
        later = df[df["word"] == w]
        later["freq_exp"] = model.predict(later[["month_int"]])
        preds.append(later)

    return pd.concat(preds, ignore_index=True)


In [43]:
# Cell 9
def detect_spikes(df: pd.DataFrame, cfg: Config) -> pd.DataFrame:
    post = df[df["month"] > cfg.baseline_end].copy()
    post["gap"]   = post["freq_obs"] - post["freq_exp"]
    post["ratio"] = post["freq_obs"] / post["freq_exp"].replace(0, np.nan)
    spikes = (
        post[(post["gap"] >= cfg.gap_min) & (post["ratio"] >= cfg.ratio_min)]
        .sort_values(["gap", "ratio"], ascending=False)
        .reset_index(drop=True)
    )
    return spikes[["word", "month", "gap", "ratio"]]


In [44]:
# Cell 10
def run_pipeline(cfg: Config):
    stream   = load_revisions(cfg)
    stream   = sample_revs(stream, cfg)
    counts   = build_counts(stream, cfg)
    baseline = fit_baseline(counts, cfg)
    spikes   = detect_spikes(baseline, cfg)
    return baseline, spikes


In [49]:
# Cell 11 – Run on **tiny** live sample

cfg = Config(
    fetch_pages=["ChatGPT", "OpenAI"],
    fetch_start="2022-12-15",
    fetch_end="2024-01-07",
    fetch_revs_per_page=50,         # ≤ 50 per page
    sample_size_revisions=None,     # keep all fetched
    baseline_start="2022-11",       # short baseline, just for demo
    baseline_end="2022-12",
    gap_min=0.2, ratio_min=3        # lenient thresholds for tiny data
)

baseline_df, spike_df = run_pipeline(cfg)

display(baseline_df.head())
display(spike_df.head(20))


INFO Fetching 'ChatGPT'
INFO Fetching 'OpenAI'


KeyError: 'count'

In [53]:
from itertools import islice, tee
# Clone the generator so we don't consume the original
raw_stream, dbg_stream = tee(load_revisions(cfg))

print("First 3 raw items from load_revisions:")
for x in islice(dbg_stream, 3):
    print(x)

# Re-assign the untouched copy back into the pipeline run
def run_pipeline_debug(cfg):
    stream = raw_stream                        # ← use the clone
    stream = sample_revs(stream, cfg)
    counts = build_counts(stream, cfg)
    print("Counts shape:", counts.shape)
    return counts



INFO Fetching 'ChatGPT'


First 3 raw items from load_revisions:


INFO Fetching 'OpenAI'


In [54]:

counts_df = run_pipeline_debug(cfg)
counts_df

KeyError: 'count'