# Fetch article titles from root categories and their subcategories

In [46]:
import re
from typing import Dict, List, Set, Tuple

import pandas as pd
import spacy

import fetch_revisions
import fetch_titles
import parallel


# Shared spaCy pipeline, loaded once for the whole notebook;
# compute_readability() calls it to split text into sentences.
nlp = spacy.load("en_core_web_sm")


In [14]:
# English Wikipedia API endpoint. It is not referenced again in the visible
# notebook cells — presumably the fetch_* modules define their own; verify.
API_URL = "https://en.wikipedia.org/w/api.php"
# Map of supercategories to lists of root categories (without 'Category:' prefix).
# Only "Politics" -> ["Politics"] is active. Everything else is commented out,
# and the disabled text sits INSIDE the "Politics" list literal: to re-enable a
# group, close that list first (the commented "# ]," line) before uncommenting.
ROOT_CATEGORIES: Dict[str, List[str]] = {
    "Politics": [
        "Politics"  #, "Political history", "Elections", "Political parties"
        # ],
        # "Science & Medicine": [
        #     "Science", "Medicine", "Biology", "Physics", "Chemistry"
        # ],
        # "History": [
        #     "History", "Military history", "History by country"
        # ],
        # "Technology": [
        #     "Technology", "Computing", "Engineering"
        # ],
        # "Popular Culture": [
        #     "Popular culture", "Music", "Television", "Film", "Video games"
    ]
}
# Forwarded to fetch_titles.collect_titles() in the next cell.
MAX_DEPTH = 1  # Subcategory traversal depth
# Written by fetch_titles.save_to_csv() and read back later with pd.read_csv().
OUTPUT_CSV = "category_titles_by_group.csv"

In [15]:
# Collect one record per (category, article) hit for every configured root
# category, then persist the records to OUTPUT_CSV.
records = fetch_titles.collect_titles(ROOT_CATEGORIES, MAX_DEPTH)
fetch_titles.save_to_csv(records, OUTPUT_CSV)
# NOTE(review): len(records) is reported as "articles", but a later cell finds
# fewer unique titles in the same CSV, so this count appears to include
# articles listed under more than one category — confirm.
print(f"Fetched {len(records)} articles across {len(ROOT_CATEGORIES)} groups (depth={MAX_DEPTH}).")


Supercategories:   0%|          | 0/1 [00:00<?, ?group/s]
Politics:   0%|          | 0/1 [00:00<?, ?cat/s][A
Politics: 100%|██████████| 1/1 [00:18<00:00, 18.25s/cat][A
Supercategories: 100%|██████████| 1/1 [00:18<00:00, 18.26s/group]

Fetched 1643 articles across 1 groups (depth=1).





In [57]:
# load from all_articles_by_category.csv if available
# (the file is looked up one directory above the working directory)
try:
    articles_df = pd.read_csv("../all_articles_by_category.csv")
    # Collecting the titles into a set removes duplicates
    articles = set(articles_df["title"].tolist())
    print(f"Loaded {len(articles)} articles from CSV.")
except FileNotFoundError:
    # NOTE(review): nothing is assigned on this path, so `articles` and
    # `articles_df` keep whatever an earlier cell left behind (or remain
    # undefined), despite what the message says — confirm this is intended.
    print("CSV file not found. Using previously fetched articles.")

Loaded 27725 articles from CSV.


In [19]:
# check the output CSV: read back the file written by fetch_titles.save_to_csv()
# and show its first rows as a sanity check
print("reading the output CSV file:", OUTPUT_CSV)
titles_df = pd.read_csv(OUTPUT_CSV)
# Last expression of the cell, so the notebook renders it as a table
titles_df.head()

reading the output CSV file: category_titles_by_group.csv


Unnamed: 0,supercategory,category,title,pageid
0,Politics,Animal rights and politics,Anarchism and animal rights,4398733
1,Politics,Animal rights and politics,Green anarchism,98514
2,Politics,Clothing in politics,Abacost,6011479
3,Politics,Clothing in politics,Act respecting the laicity of the State,60358521
4,Politics,Clothing in politics,Armband,1446987


# Fetch revisions for articles in the CSV file

In [62]:
# Configuration for the revision-snapshot stage
# Article list to process; currently the full titles CSV rather than the
# mini sample named in the trailing comment.
ARTICLES_CSV = OUTPUT_CSV  #"mini_articles_by_category.csv"
# Pickle file that receives the combined revision DataFrame
OUTPUT_REVS_PICKLE = "SMOL_revision_snapshots.pkl"
# NOTE(review): CHECKPOINT_DIR, START_TS, END_TS and FREQ are not passed to
# the fetch call in the visible cells; presumably fetch_revisions has its own
# equivalents — verify the two stay in sync.
CHECKPOINT_DIR = "checkpoints/revisions"
# Snapshot window bounds, as ISO-8601 strings with a trailing 'Z'
START_TS = "2022-01-01T00:00:00Z"
END_TS = "2024-01-31T23:59:59Z"
FREQ = "1ME"  # monthly snapshots

In [23]:
import fetch_revisions

# Load the article list. ARTICLES_CSV currently points at OUTPUT_CSV (the full
# titles file), not the mini-articles sample.
articles_df = pd.read_csv(ARTICLES_CSV)
# The same title can appear under several categories; fetch each one only once
titles = articles_df['title'].unique().tolist()

# Process in parallel with progress
# fetch_revision_snapshots is handed to the batch runner together with the
# title list; the result is expected to be a list of DataFrames because it is
# passed straight to pd.concat below.
rev_dfs = fetch_revisions.process_batch_with_progress(
    fetch_revisions.fetch_revision_snapshots,
    titles,
    desc="Fetching revision snapshots",
    use_threads=True,
    cpu_intensive=False,
    max_workers=8,
    batch_size=10,
    carry_forward=True
)

# Concatenate all results
# (pd.concat raises ValueError if rev_dfs is empty)
all_revs = pd.concat(rev_dfs, ignore_index=True)
print(f"Fetched {len(all_revs)} revision snapshots for {len(titles)} articles.")


# add the supercategory and category columns back to all_revs DataFrame on pageid
def add_categories_to_revisions(revs_df: pd.DataFrame, articles_df: pd.DataFrame) -> pd.DataFrame:
    """Add ``supercategory`` and ``category`` columns to ``revs_df`` by pageid.

    Args:
        revs_df: Revision rows; must contain a ``pageid`` column. The two
            category columns are written onto this frame in place.
        articles_df: Article rows with ``pageid``, ``supercategory`` and
            ``category`` columns. A pageid may appear more than once (an
            article listed under several categories); the first row wins.

    Returns:
        The same ``revs_df`` object, with both columns filled. Rows whose
        pageid is absent from ``articles_df`` get ``None`` in both columns.
    """
    # DataFrame.to_dict(orient='index') raises ValueError on a non-unique
    # index, so collapse to one row per pageid before building the lookup.
    category_map = (
        articles_df
        .drop_duplicates(subset="pageid")
        .set_index("pageid")[["supercategory", "category"]]
        .to_dict(orient="index")
    )

    # Map the categories to the revisions DataFrame
    no_match: dict = {}
    for column in ("supercategory", "category"):
        revs_df[column] = revs_df["pageid"].map(
            lambda pageid, column=column: category_map.get(pageid, no_match).get(column)
        )

    return revs_df


# Re-attach the category labels from the article list to every revision row
all_revs = add_categories_to_revisions(all_revs, articles_df)

# Save final DataFrame
all_revs.to_pickle(OUTPUT_REVS_PICKLE)
print(f"Saved combined snapshots to {OUTPUT_REVS_PICKLE}")

Fetching revision snapshots:   0%|          | 0/1548 [00:00<?, ?it/s]

Processing batch 1/155 (10 items) with 8 workers
Processing batch 2/155 (10 items) with 8 workers
Processing batch 3/155 (10 items) with 8 workers
Processing batch 4/155 (10 items) with 8 workers
Processing batch 5/155 (10 items) with 8 workers
Processing batch 6/155 (10 items) with 8 workers
Processing batch 7/155 (10 items) with 8 workers
Processing batch 8/155 (10 items) with 8 workers
Processing batch 9/155 (10 items) with 8 workers
Processing batch 10/155 (10 items) with 8 workers
Processing batch 11/155 (10 items) with 8 workers
Processing batch 12/155 (10 items) with 8 workers
Processing batch 13/155 (10 items) with 8 workers
Processing batch 14/155 (10 items) with 8 workers
Processing batch 15/155 (10 items) with 8 workers
Processing batch 16/155 (10 items) with 8 workers
Processing batch 17/155 (10 items) with 8 workers
Processing batch 18/155 (10 items) with 8 workers
Processing batch 19/155 (10 items) with 8 workers
Processing batch 20/155 (10 items) with 8 workers
Processin

In [None]:
# add the supercategory and category columns back to all_revs DataFrame on pageid
def add_categories_to_revisions(revs_df: pd.DataFrame,
                                articles_df: pd.DataFrame) -> pd.DataFrame:
    """
    Attach ``supercategory`` and ``category`` to every revision row.

    Any category columns already present on ``revs_df`` are discarded
    first, so the result carries exactly one copy of each. Articles are
    reduced to their first row per ``pageid`` before the left join, which
    keeps the number of revision rows unchanged. ``revs_df`` itself is not
    modified; a new DataFrame is returned.
    """
    label_cols = ["supercategory", "category"]

    # Revisions without any stale label columns
    stripped = revs_df.drop(columns=label_cols, errors="ignore")

    # One (pageid, supercategory, category) row per article
    lookup = articles_df[["pageid", *label_cols]].drop_duplicates(subset=["pageid"])

    # Left join keeps every revision, matched or not
    return pd.merge(stripped, lookup, on="pageid", how="left")


# Attach category labels (one clean set of columns) to every revision row
all_revs = add_categories_to_revisions(all_revs, articles_df)

# Save final DataFrame
all_revs.to_pickle(OUTPUT_REVS_PICKLE)
print(f"Saved combined snapshots to {OUTPUT_REVS_PICKLE}")

# load the all_revs DataFrame if needed
# NOTE(review): the path is hard-coded; it matches OUTPUT_REVS_PICKLE today but
# will diverge if that constant changes. Only unpickle files you produced.
revisions = pd.read_pickle("SMOL_revision_snapshots.pkl")
revisions.head()


In [None]:
from datetime import datetime

# Summary statistics: how many revision snapshots each article has.
# Group once by pageid and reuse the per-article counts for both moments.
revs_per_article = revisions.groupby('pageid').size()
avg_revisions = revs_per_article.mean()
stdev_revisions = revs_per_article.std()
print(f"Average revisions per article: {avg_revisions:.2f}")
print(f"Standard deviation of revisions per article: {stdev_revisions:.2f}")

# Overall size and cardinality of the dataset
print(f"Total number of revisions: {len(revisions)}")
print(f"distinct articles: {revisions['pageid'].nunique()}")
print(f"distinct categories: {revisions['category'].nunique()}")
print(f"distinct supercategories: {revisions['supercategory'].nunique()}")

# Calendar months covered by [START_TS, END_TS], both end months included.
# The trailing 'Z' is sliced off before parsing.
start_date = datetime.fromisoformat(START_TS[:-1])
end_date = datetime.fromisoformat(END_TS[:-1])
num_months = 12 * (end_date.year - start_date.year) + (end_date.month - start_date.month) + 1
print(f"Number of months in interval: {num_months}")

In [55]:
def clean_text(wikitext: str) -> str:
    """Reduce Wikipedia wikitext to lowercase plain text.

    Steps, in order: drop ``<ref>`` citations (plain, with attributes, and
    self-closing), drop ``{{templates}}`` (multi-line and nested), replace
    ``[[target|label]]`` links with their label, strip ``''`` emphasis
    markers, replace every run of characters other than letters, digits,
    spaces and ``. , ! ? - ' "`` with a space, lowercase, and collapse
    whitespace.

    Args:
        wikitext: Raw wikitext of one revision.

    Returns:
        The cleaned text, or ``""`` when ``wikitext`` is not a string.
    """
    if not isinstance(wikitext, str):
        return ""

    # Self-closing refs go first; otherwise the paired pattern below would
    # treat '<ref name="x" />' as an opening tag and swallow body text up to
    # the next '</ref>'. '(?:\s[^>]*)?' keeps '<references>' from matching.
    text = re.sub(r"<ref(?:\s[^>]*)?/>", "", wikitext, flags=re.IGNORECASE)
    text = re.sub(
        r"<ref(?:\s[^>]*)?>.*?</ref\s*>", "", text,
        flags=re.DOTALL | re.IGNORECASE,
    )

    # Templates: peel innermost '{{...}}' repeatedly so nested and multi-line
    # templates (infoboxes, cite templates) disappear completely.
    previous = None
    while previous != text:
        previous = text
        text = re.sub(r"\{\{[^{}]*\}\}", "", text)
    # Fallback for templates that contain stray single braces.
    text = re.sub(r"\{\{.*?\}\}", "", text, flags=re.DOTALL)

    text = re.sub(r"\[\[([^|\]]*\|)?([^\]]+)\]\]", r"\2", text)  # keep link text
    text = re.sub(r"''+", "", text)  # remove italic/bold
    # Remove everything except alphanumerics, spaces and basic punctuation.
    # (No trailing space in the pattern: disallowed characters are dropped
    # wherever they occur, not only when followed by a space.)
    text = re.sub(r"[^A-Za-z0-9 \.\,\!\?\-\'\"]+", " ", text)
    # Lowercase and collapse whitespace
    text = text.lower()
    text = re.sub(r"\s+", " ", text).strip()
    return text


In [None]:

# Clean text in parallel: apply clean_text to the "content" column and store
# the result in a new "plain_text" column.
all_revs = parallel.process_dataframe_parallel(
    all_revs,
    clean_text,
    column="content",
    new_column="plain_text",
    # NOTE(review): clean_text is pure regex work (CPU-bound, not I/O-bound);
    # confirm threads are the right choice for process_dataframe_parallel here.
    use_threads=True,
    cpu_intensive=False
)




In [None]:
# Display sample: first 11 rows, title next to its cleaned text
all_revs[["title", "plain_text"]].head(11)

# spaCy parsing of cleaned text

In [None]:
import spaCyParser

# Run spaCyParser.parse_with_spacy over the "plain_text" column and store each
# result in a new "parsed" column. A later cell calls .get(...) on these
# values, so each one is presumably a dict — verify against spaCyParser.
result_spacy = parallel.process_dataframe_parallel(
    all_revs,
    spaCyParser.parse_with_spacy,
    column="plain_text",
    new_column="parsed",
    use_threads=True,
    cpu_intensive=False

)
result_spacy.head()

In [None]:
# save results to a new CSV file
# NOTE(review): CSV stores the "parsed" column as text, so the parsed objects
# will not round-trip when the file is read back — confirm this is acceptable.
result_spacy.to_csv("parsed_revisions.csv", index=False)

# Delta word frequency

In [None]:
import pandas as pd
from lexical_spike import load_trigger_set, compute_baseline_q, add_lexical_spike_delta

# 1. Load your data
# NOTE(review): this reads a different file from the "parsed_revisions.csv"
# written by the previous cell — confirm it is the intended input.
all_revs = pd.read_csv("after_spacy_parsed100percat_with_categories_june23.csv")

# 2. Load trigger words
trigger_set = load_trigger_set("combined_chatgpt_words.csv")

# 3. Compute baseline q
# (cutoff_date is handed to compute_baseline_q; how it is applied is defined
# in lexical_spike, not here)
q = compute_baseline_q(all_revs, trigger_set, cutoff_date="2022-11-01")
print(f"Baseline q: {q:.6f}")

# 4. Add p_t and delta
result_lexical_spike = add_lexical_spike_delta(all_revs, q, trigger_set)

# 5. Save or inspect
result_lexical_spike.to_csv("lexical_spikes.csv", index=False)


# Perplexity and burstiness

In [69]:
import pandas as pd
from gpt2_perplexity_burstiness import add_perplexity_and_burstiness_to_df

# load your revisions DataFrame however you like
# (this rebinds all_revs, discarding any columns added to it in earlier cells)
all_revs = pd.read_csv("after_spacy_parsed100percat_with_categories_june23.csv")

# this will add .perplexity and .burstiness columns in place
# NOTE(review): the recorded run of this cell ended in an ImportError for
# GPT2LMHeadModel from transformers, so this step has not been verified here.
all_revs = add_perplexity_and_burstiness_to_df(
    all_revs,
    text_col="plain_text",
    batch_size=8
)

# inspect
all_revs.head()


ImportError: cannot import name 'GPT2LMHeadModel' from 'transformers' (C:\Users\david\anaconda3\envs\wiki\Lib\site-packages\transformers\__init__.py)

In [None]:
# Extract attributes from parsed dictionaries
# Each "parsed" value is queried with .get(), falling back to {} / 0 when a
# key is missing.
# NOTE(review): all_revs was re-read from CSV in an earlier cell while
# result_spacy comes from the in-memory parse; the assignments below align on
# the DataFrame index, so confirm both frames share the same rows and order.
all_revs["upos_props"] = result_spacy["parsed"].apply(lambda x: x.get("upos_props", {}))
all_revs["mean_dep_depth"] = result_spacy["parsed"].apply(lambda x: x.get("mean_dep_depth", 0))
all_revs["clause_ratio"] = result_spacy["parsed"].apply(lambda x: x.get("clause_ratio", 0))
all_revs["voice_ratio"] = result_spacy["parsed"].apply(lambda x: x.get("voice_ratio", 0))


In [None]:
from textstat import textstat


def compute_readability(text: str):
    """Return ``(fre, fog, chars_per_sent, sents_per_para)`` for ``text``.

    ``fre`` and ``fog`` come from textstat (Flesch reading ease and Gunning
    fog). Sentences are taken from the module-level spaCy pipeline ``nlp``.
    The whole text is treated as a single paragraph, so ``sents_per_para``
    is simply the sentence count. Non-string or blank input, and any error
    raised while computing, yields four zeros (errors are printed).
    """
    zeros = (0.0, 0.0, 0.0, 0.0)

    if not (isinstance(text, str) and text.strip()):
        return zeros

    try:
        reading_ease = textstat.flesch_reading_ease(text)
        fog_index = textstat.gunning_fog(text)

        # Sentence segmentation drives both remaining metrics
        sents = list(nlp(text).sents)
        n_sents = len(sents)
        total_chars = sum(len(span.text) for span in sents)
        mean_chars = total_chars / (n_sents or 1)  # guard against zero sentences

        # Flattened text: one paragraph holding every sentence
        return reading_ease, fog_index, mean_chars, n_sents
    except Exception as e:
        print(f"Error computing readability: {str(e)}")
        return zeros


In [None]:
# Compute readability metrics in parallel: compute_readability returns a
# 4-tuple, and four new column names are supplied to receive it.
all_revs = parallel.process_dataframe_parallel(
    all_revs,
    compute_readability,
    column="plain_text",
    new_column=["fre", "fog", "chars_per_sent", "sents_per_para"],
    # NOTE(review): flagged cpu_intensive=True yet still use_threads=True;
    # confirm how process_dataframe_parallel resolves that combination.
    use_threads=True,
    cpu_intensive=True
)


In [None]:
def compute_vocab_diversity(text: str, window_size: int = 250):
    """Return ``(nTTR, word_density)`` for ``text``.

    Only the first ``window_size`` whitespace-separated tokens are counted.
    ``nTTR`` is ``unique / sqrt(2 * total)`` over that window.
    ``word_density`` is ``100 * unique / (lines * average line length)``,
    where lines are split on ``"\\n"``. Non-string or blank input gives
    ``(0.0, 0.0)``.
    """
    if not (isinstance(text, str) and text.strip()):
        return 0.0, 0.0

    window = text.split()[:window_size]
    n_types = len(set(window))
    n_tokens = len(window) or 1  # never divide by zero

    # Type/token ratio normalised by sqrt(2 * tokens)
    norm_ttr = n_types / (2 * n_tokens) ** 0.5

    # Splitting on "\n" always yields count("\n") + 1 pieces
    rows = text.split("\n")
    n_rows = len(rows)
    mean_row_len = sum(map(len, rows)) / n_rows
    density = 100 * n_types / (n_rows * (mean_row_len or 1))

    return norm_ttr, density


In [None]:
# Compute vocabulary diversity in parallel: compute_vocab_diversity returns a
# 2-tuple, and two new column names are supplied to receive it. window_size is
# left at the function's default of 250 tokens.
all_revs = parallel.process_dataframe_parallel(
    all_revs,
    compute_vocab_diversity,
    column="plain_text",
    new_column=["nTTR", "word_density"],
    use_threads=True  # This is lightweight
)


In [None]:
def compute_citation_delta(wikitext: str):
    """Compute citation delta"""
    if not isinstance(wikitext, str) or not wikitext.strip():
        return 0.0

    # Count <ref> tags in raw wikitext
    added = len(re.findall(r"<ref[^>]*>", wikitext))
    removed = 0  # For prototype, assume no diff stored; set removed = 0
    tokens_changed = len(wikitext.split()) or 1
    return (added - removed) / tokens_changed


# Compute citation delta from the raw wikitext column ("content"), not from
# the cleaned "plain_text", since cleaning strips the <ref> tags being counted.
all_revs["citation_delta"] = all_revs["content"].apply(compute_citation_delta)

In [None]:
# save everything to a single file
# Written twice: a pickle that keeps Python objects intact, and a CSV without
# the index column for inspection in other tools.
all_revs.to_pickle("FINAL.pkl")
all_revs.to_csv("FINAL.csv", index=False)