In [2]:
# Import required libraries
import requests
import pandas as pd
import spacy
from textstat import textstat
import re
from transformers import GPT2TokenizerFast, GPT2LMHeadModel
import torch
import pickle
import os
import time
from tqdm.auto import tqdm
from dateutil import parser
import numpy as np

# Import our parallel processing utilities
from parallel import (
    process_batch_with_progress,
    process_dataframe_parallel,
    gpu_batch_process,
    resilient_api_call,
    CheckpointManager
)


In [3]:
# Load spaCy's small English pipeline once and keep it in a module-level handle.
# NOTE(review): presumably requires the "en_core_web_sm" model package to be installed — confirm in the environment setup.
nlp = spacy.load("en_core_web_sm")


In [4]:
# Report the PyTorch build and the CUDA devices visible to it.
print(torch.__version__)
print(torch.version.cuda)  # None means a CPU-only PyTorch build
print(torch.cuda.is_available())
print(torch.cuda.device_count())
# Querying device 0 raises when no CUDA device is usable, so only ask for its
# name when CUDA is available; the model-loading cell handles the CPU case too.
if torch.cuda.is_available():
    print(torch.cuda.get_device_name(0))
else:
    print("No CUDA device detected")

2.7.1+cu118
11.8
True
1
NVIDIA GeForce RTX 2070 with Max-Q Design


In [5]:
# Load the pretrained GPT-2 tokenizer and language model, with the model in eval mode.
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2").eval()  # eval() returns the module itself

# Move the model to the GPU when one is usable; otherwise it stays on the CPU.
if not torch.cuda.is_available():
    print('cuda not available!')
else:
    print("cuda available!")
    model.to("cuda")


cuda available!


## Fetch article titles

In [6]:
def get_category_members(category, cmtype="page", namespace=0, limit=500):
    """
    Fetch every member of a Wikipedia category, following API continuation.

    Requests go through ``resilient_api_call`` (up to 5 retries per page of
    results). A single HTTP session is shared by all requests of one call and
    is closed before returning.

    Args:
        category: Category name without the 'Category:' prefix
        cmtype: 'page', 'subcat', or 'file'
        namespace: Namespace index (0 for articles)
        limit: Number of results per request (max 500 for users)

    Returns:
        List of dicts with 'pageid' and 'title'. If a request ultimately
        fails, the error is printed and the members collected so far are
        returned (possibly an empty list) -- no exception is raised.
    """
    params = {
        "action": "query",
        "format": "json",
        "list": "categorymembers",
        "cmtitle": f"Category:{category}",
        "cmtype": cmtype,
        "cmlimit": limit,
        "cmnamespace": namespace,
    }
    members = []

    try:
        # One session for the whole pagination loop: connections are reused
        # across pages and released when the block exits, instead of creating
        # (and never closing) a new session for every request.
        with requests.Session() as session:

            def api_call(**query):
                res = session.get("https://en.wikipedia.org/w/api.php", params=query, timeout=30)
                res.raise_for_status()
                return res.json()

            while True:
                # Use our resilient API call function with retries
                data = resilient_api_call(api_call, max_retries=5, **params)
                members.extend(data.get("query", {}).get("categorymembers", []))

                if "continue" not in data:
                    break
                # Feed the continuation tokens into the next request
                params.update(data["continue"])
    except Exception as e:
        # Best effort: report the failure and keep whatever was collected
        print(f"Error fetching category members for {category}: {str(e)}")

    return members


In [7]:
def fetch_articles_from_categories(categories, include_subcats=False, max_subcat_depth=1):
    """
    Collect the titles of all pages found in the given categories.

    When ``include_subcats`` is true, subcategories are followed as well, at
    most ``max_subcat_depth`` levels below each starting category. Every
    category name is visited at most once per call.

    :param categories: List of category names (strings without prefix)
    :param include_subcats: Whether to traverse into subcategories
    :param max_subcat_depth: Maximum depth for subcategory traversal
    :returns: Set of page titles
    """
    titles = set()
    visited = set()

    def _walk(name, remaining_depth):
        # Stop on exhausted depth or on a category that was already handled
        if remaining_depth < 0 or name in visited:
            return
        visited.add(name)

        # Pages directly inside this category
        titles.update(
            page["title"] for page in get_category_members(name, cmtype="page")
        )

        # Descend only when requested and while depth budget remains
        if not (include_subcats and remaining_depth > 0):
            return
        for sub in get_category_members(name, cmtype="subcat", namespace=14):
            _walk(sub["title"].replace("Category:", ""), remaining_depth - 1)

    for top in tqdm(categories, desc="Traversing categories"):
        _walk(top, max_subcat_depth)

    return titles


## Declare strata and their root categories

In [8]:
# Sampling strata mapped to the Wikipedia root categories that feed each one.
# Keys are stratum labels (written to the "stratum" column of the article CSV);
# values are category names without the "Category:" prefix, as expected by
# fetch_articles_from_categories.
root_categories = {
    "Politics": [
        "Politics", "Political history", "Elections", "Political parties"
    ],
    "Science & Medicine": [
        "Science", "Medicine", "Biology", "Physics", "Chemistry"
    ],
    "History": [
        "History", "Military history", "History by country"
    ],
    "Technology": [
        "Technology", "Computing", "Engineering"
    ],
    "Popular Culture": [
        "Popular culture", "Music", "Television", "Film", "Video games"
    ]
}


## Traverse root categories and fetch articles

In [9]:
# Output CSV: one row per (stratum, root, title) combination
file_name_articles_list = "articles_by_category_1_recurse.csv"

all_records = []
for stratum, roots in root_categories.items():
    for root in roots:
        # Each root is traversed on its own (one subcategory level deep), so a
        # title reachable from several roots yields one record per root.
        articles = fetch_articles_from_categories([root], include_subcats=True, max_subcat_depth=1)
        all_records.extend(
            {"stratum": stratum, "root": root, "title": title} for title in articles
        )

pd.DataFrame(all_records).to_csv(file_name_articles_list, index=False)


Traversing categories:   0%|          | 0/1 [00:00<?, ?it/s]

Traversing categories:   0%|          | 0/1 [00:00<?, ?it/s]

Traversing categories:   0%|          | 0/1 [00:00<?, ?it/s]

Traversing categories:   0%|          | 0/1 [00:00<?, ?it/s]

Traversing categories:   0%|          | 0/1 [00:00<?, ?it/s]

Traversing categories:   0%|          | 0/1 [00:00<?, ?it/s]

Traversing categories:   0%|          | 0/1 [00:00<?, ?it/s]

Traversing categories:   0%|          | 0/1 [00:00<?, ?it/s]

Traversing categories:   0%|          | 0/1 [00:00<?, ?it/s]

Traversing categories:   0%|          | 0/1 [00:00<?, ?it/s]

Traversing categories:   0%|          | 0/1 [00:00<?, ?it/s]

Traversing categories:   0%|          | 0/1 [00:00<?, ?it/s]

Traversing categories:   0%|          | 0/1 [00:00<?, ?it/s]

Traversing categories:   0%|          | 0/1 [00:00<?, ?it/s]

Traversing categories:   0%|          | 0/1 [00:00<?, ?it/s]

Traversing categories:   0%|          | 0/1 [00:00<?, ?it/s]

Traversing categories:   0%|          | 0/1 [00:00<?, ?it/s]

Traversing categories:   0%|          | 0/1 [00:00<?, ?it/s]

Traversing categories:   0%|          | 0/1 [00:00<?, ?it/s]

Traversing categories:   0%|          | 0/1 [00:00<?, ?it/s]

In [10]:
# Sanity check: number of (stratum, root, title) records collected by the traversal.
# A title found under several roots is recorded once per root, so this is a
# record count, not a count of unique articles.
len(all_records)

28713

In [12]:
# Reload the (stratum, root, title) table from the CSV named by file_name_articles_list.
# NOTE(review): `article` holds a whole DataFrame of articles, not a single
# article — consider a clearer name once downstream usages are confirmed.
article = pd.read_csv(file_name_articles_list)

In [15]:
def is_bot_username(username: str) -> bool:
    """
    Heuristically flag a username as a bot account.

    A name counts as a bot when, ignoring case and surrounding whitespace, it
    ends with "bot" (e.g. "ClueBot", "AnomieBOT"). Names that merely contain
    "bot" elsewhere (e.g. "ClueBot NG") are not matched.

    Args:
        username: Account name to test. Missing or non-string values (such as
            None for a hidden user) are treated as "not a bot" instead of
            raising.

    Returns:
        True if the name ends with "bot", False otherwise.
    """
    if not isinstance(username, str):
        return False
    return username.strip().lower().endswith("bot")

In [None]:
def fetch_revision_snapshots(
        title: str,
        start_ts: str,
        end_ts: str,
        freq: str = "7D",
        bot_test_fn: callable = None,
        carry_forward: bool = True,
        max_retries: int = 5,
        backoff_factor: float = 1.0,
        checkpoint_path: str = None,
) -> pd.DataFrame:
    """
    Fetches all revisions (including full content) for `title` between start_ts and end_ts,
    then returns snapshots at given frequency with content included.

    For every sample time on the `freq` grid, the snapshot is the latest fetched
    revision whose timestamp is not after that sample time. Sample times that
    precede the first fetched revision produce no row.

    Checkpointing: if `checkpoint_path` exists it is loaded and returned
    as-is, without contacting the API and without checking that it matches the
    other arguments. A new checkpoint is written only when the revision history
    was fetched completely and the result is non-empty, so a partially failed
    fetch is never cached as if it were complete.

    Args:
        title: Wikipedia page title
        start_ts: ISO8601 timestamp string, e.g. "2020-01-01T00:00:00Z"
        end_ts: ISO8601 timestamp string, e.g. "2020-12-31T23:59:59Z"
        freq: pandas offset alias (e.g. "7D")
        bot_test_fn: callable to flag bots (user->bool)
        carry_forward: reuse last snapshot if no newer revision; when False,
            sample times with no revision newer than the previous sample time
            are skipped
        max_retries: number of retries on network failure
        backoff_factor: forwarded to resilient_api_call as `initial_backoff`
        checkpoint_path: path to pickle intermediate results

    Returns:
        DataFrame with columns ['page_title','snapshot_ts','rev_id','timestamp',
        'user','is_bot','content']. Empty (with those columns) when nothing
        could be fetched. On a mid-fetch failure the error is printed and the
        snapshots are built from the revisions collected so far.
    """
    cols = ["page_title", "snapshot_ts", "rev_id", "timestamp", "user", "is_bot", "content"]

    # Check if we have a checkpoint
    if checkpoint_path and os.path.exists(checkpoint_path):
        try:
            # SECURITY: unpickling can execute arbitrary code; only point
            # checkpoint_path at files this notebook wrote itself.
            return pd.read_pickle(checkpoint_path)
        except Exception as e:
            print(f"Error loading checkpoint: {str(e)}")

    # Phase 1: Fetch metadata + content in one pass
    # rvstart is the later bound and rvend the earlier one, as in the original request.
    params = {
        "action": "query",
        "format": "json",
        "prop": "revisions",
        "rvprop": "ids|timestamp|user|content",
        "rvstart": end_ts,
        "rvend": start_ts,
        "rvlimit": "max",
        "titles": title,
        "redirects": 1,
        "rvslots": "main",
    }

    all_revs = []
    fetch_complete = False  # True only once the API reports no further pages

    try:
        # One session for the whole pagination loop; closed when the block
        # exits instead of leaking a new session per request.
        with requests.Session() as session:

            def api_call(**query):
                resp = session.get("https://en.wikipedia.org/w/api.php", params=query, timeout=30)
                resp.raise_for_status()
                return resp.json()

            while True:
                # Use our resilient API call function
                data = resilient_api_call(
                    api_call,
                    max_retries=max_retries,
                    initial_backoff=backoff_factor,
                    **params
                )

                pages = data.get("query", {}).get("pages", {})
                for page in pages.values():
                    for rev in page.get("revisions", []) or []:
                        ts_str = rev.get("timestamp")
                        if not ts_str:
                            continue
                        ts = parser.isoparse(ts_str)
                        user = rev.get("user", "")
                        # content slot
                        content = rev.get("slots", {}).get("main", {}).get("*", "")
                        all_revs.append({
                            "rev_id": rev.get("revid"),
                            "timestamp": ts,
                            "user": user,
                            "is_bot": bot_test_fn(user) if bot_test_fn else False,
                            "content": content,
                        })

                if "continue" in data:
                    params.update(data["continue"])
                else:
                    fetch_complete = True
                    break
    except Exception as e:
        # If we have some revisions, continue with what we have
        print(f"Error fetching revisions for {title}: {str(e)}")

    # If no revisions, return empty DataFrame
    if not all_revs:
        return pd.DataFrame(columns=cols)

    # Build DataFrame and sort
    df = pd.DataFrame(all_revs)
    df["timestamp"] = pd.to_datetime(df["timestamp"])
    df = df.sort_values("timestamp").reset_index(drop=True)

    # Phase 2: Snapshot selection
    timestamps = df["timestamp"]
    sample_times = pd.date_range(
        start=pd.to_datetime(start_ts),
        end=pd.to_datetime(end_ts),
        freq=freq,
        tz=timestamps.dt.tz
    )

    snaps = []
    last_snap_time = None

    for snap_t in sample_times:
        # Index of the latest revision at or before the sample time
        pos = timestamps.searchsorted(snap_t, side='right') - 1
        if pos < 0:
            continue
        rev_time = timestamps.iloc[pos]
        # skip if no carry_forward and no new revision
        if not carry_forward and last_snap_time is not None and rev_time <= last_snap_time:
            last_snap_time = snap_t
            continue
        row = df.iloc[pos].to_dict()
        row["snapshot_ts"] = snap_t
        row["page_title"] = title
        snaps.append(row)
        last_snap_time = snap_t

    df_snap = pd.DataFrame(snaps)
    result_df = df_snap[cols] if not df_snap.empty else pd.DataFrame(columns=cols)

    # Save checkpoint if path provided. Partial fetches are not cached: a
    # checkpoint is returned verbatim on later calls, so caching an incomplete
    # history would make the gap permanent.
    if checkpoint_path and fetch_complete and not result_df.empty:
        try:
            # dirname is "" for a bare filename, and os.makedirs("") raises,
            # which previously prevented such checkpoints from being written.
            checkpoint_dir = os.path.dirname(checkpoint_path)
            if checkpoint_dir:
                os.makedirs(checkpoint_dir, exist_ok=True)
            result_df.to_pickle(checkpoint_path)
        except Exception as e:
            print(f"Error saving checkpoint: {str(e)}")

    return result_df


In [None]:
# Modify parallel processing configuration
from parallel import process_batch_with_progress

# Root categories to fetch in this run (names without the "Category:" prefix)
categories = [
    "Politics",
    "History"
]

# Make sure the checkpoint directory exists (no error if it already does)
os.makedirs("checkpoints", exist_ok=True)

# Conservative settings, forwarded as keyword arguments to
# process_batch_with_progress.
# NOTE(review): the exact meaning of each key is defined by the project's
# `parallel` module; the inline notes reflect the author's intent — confirm there.
PARALLEL_CONFIG = {
    'max_workers': 4,  # Reduce number of concurrent workers
    'use_threads': True,  # Use threads instead of processes for API calls
    'cpu_intensive': False,
    'timeout': 300,  # 5 minutes timeout
    'chunk_size': 5  # Process in smaller chunks
}


# Per-category worker: failures are reported and mapped to an empty result
def fetch_with_retries(category, **kwargs):
    """
    Fetch all article titles for one category, two subcategory levels deep.

    Any exception raised by the traversal is printed and swallowed so that one
    failing category does not abort the whole batch. Extra keyword arguments
    are accepted and ignored.

    Returns:
        Set of page titles, or an empty set if the traversal raised.
    """
    try:
        return fetch_articles_from_categories(
            [category], include_subcats=True, max_subcat_depth=2
        )
    except Exception as e:
        print(f"Error processing category {category}: {str(e)}")
    return set()


# Run the per-category worker over every category through the parallel helper
articles = set()
results = process_batch_with_progress(
    fetch_with_retries,
    categories,
    desc="Processing categories",
    **PARALLEL_CONFIG
)

# Merge the per-category title sets, skipping empty or missing results
for result in filter(None, results):
    articles.update(result)

print(f"Fetched {len(articles)} articles from categories and subcategories.")

# Persist the merged title set; a failed save is reported, not raised.
# NOTE(review): the file name says "NO_RECURSE" although the worker traverses
# subcategories — confirm the intended name.
try:
    with open("FINAL_NO_RECURSE_article_names_list.pkl", "wb") as f:
        pickle.dump(articles, f)
except Exception as e:
    print(f"Error saving articles list: {str(e)}")
else:
    print("Successfully saved articles list")


In [21]:
# # Load the articles from the saved file if needed
# try:
#     with open("article_names_list.pkl", "rb") as f:
#         articles = pickle.load(f)
#     print(f"Loaded {len(articles)} articles from file.")
#     print(articles.pop())  # Print one article to verify
# except FileNotFoundError:
#     print("Article list file not found.")
#
#



Loaded 29306 articles from file.
Pregnancy school


In [18]:
# Prefer the article list stored in all_articles_by_category.csv when it exists;
# otherwise keep whatever `articles` already holds from earlier cells.
try:
    articles_df = pd.read_csv("all_articles_by_category.csv")
except FileNotFoundError:
    print("CSV file not found. Using previously fetched articles.")
else:
    # Collapse to unique titles (the CSV can list a title under several roots)
    articles = set(articles_df["title"].tolist())
    print(f"Loaded {len(articles)} articles from CSV.")

    print(articles_df.head())


Loaded 27725 articles from CSV.
    stratum      root                                  title
0  Politics  Politics                   Modernization theory
1  Politics  Politics  Timeline of incidents involving QAnon
2  Politics  Politics                 Crises of the Republic
3  Politics  Politics          Governance of protected areas
4  Politics  Politics                    Political ReviewNet
