In [2]:
import requests
import pandas as pd
import spacy
from textstat import textstat
import re
from transformers import GPT2TokenizerFast, GPT2LMHeadModel
import torch




In [3]:
# Load the spaCy pipeline named "en_core_web_sm" once and keep it in `nlp` for reuse.
# NOTE(review): assumes the model package is already installed in this environment — confirm.
nlp = spacy.load("en_core_web_sm")


In [4]:
# Environment check: report the PyTorch build and whether a CUDA GPU is usable.
print(torch.__version__)
print(torch.version.cuda)  # Should not be None on a CUDA-enabled build
print(torch.cuda.is_available())
print(torch.cuda.device_count())
# get_device_name(0) raises when no CUDA device is usable, so only query it
# when one is present; this keeps "Run All" working on CPU-only machines.
if torch.cuda.is_available():
    print(torch.cuda.get_device_name(0))
else:
    print("no CUDA device detected")

2.7.1+cu118
11.8
True
1
NVIDIA GeForce RTX 2070 with Max-Q Design


In [5]:
# Load the pretrained "gpt2" tokenizer and language model; eval() switches the
# model to inference mode and returns the same module, so it can be chained.
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2").eval()

# Move the model to the GPU only when one is usable; otherwise leave it on CPU.
if not torch.cuda.is_available():
    print('cuda not available!')
else:
    print("cuda available!")
    model.to("cuda")


cuda available!


In [6]:
# Page titles whose revision histories are fetched below; each entry is passed
# as the `titles` parameter of one revisions query.
sample_pages = [
    "Python (programming language)",
    "Machine learning",
    "Artificial intelligence"
]


In [7]:
# Revision time window as ISO 8601 UTC timestamps (2023-01-01 through 2023-03-31).
# START_TIMESTAMP is the older bound and END_TIMESTAMP the newer bound of the fetch.
START_TIMESTAMP = "2023-01-01T00:00:00Z"
END_TIMESTAMP   = "2023-03-31T23:59:59Z"


In [8]:
# Column schema for the revisions table: one row per (page, revision).
columns = [
    "page_title", "rev_id", "timestamp", "user", "is_bot", "content"
]
# Start from an empty frame with that schema; `tiny_revs` is reassigned later
# in the notebook once revisions have been fetched.
tiny_revs = pd.DataFrame(columns=columns)


In [9]:
def is_bot_username(username: str) -> bool:
    """
    Heuristically decide whether a username belongs to a bot.

    A name counts as a bot when it ends with "bot", compared
    case-insensitively (e.g. "SineBot", "InternetArchiveBot").

    This is a naming heuristic only: it misses bots whose names do not end
    in "bot" (e.g. "ClueBot NG") and flags human accounts that happen to
    end in "bot".

    Parameters
    ----------
    username : str
        The account name. ``None``, empty, or non-string values are
        tolerated and treated as "not a bot" instead of raising.

    Returns
    -------
    bool
        True when the name ends with "bot", otherwise False.
    """
    # A revision may carry no usable user name; never crash on that.
    if not isinstance(username, str) or not username:
        return False
    return username.lower().endswith("bot")


In [10]:
def fetch_revisions_for_page(title, start_ts, end_ts, timeout=30):
    """
    Fetch all revisions of one English Wikipedia page within a time window.

    Calls the MediaWiki `action=query&prop=revisions` endpoint and keeps
    following the API's `continue` tokens until every revision between
    start_ts and end_ts has been collected.

    Parameters
    ----------
    title : str
        Page title; redirects are resolved by the API (`redirects=1`).
    start_ts : str
        Older bound of the window (sent as `rvend`).
    end_ts : str
        Newer bound of the window (sent as `rvstart`); the request starts
        at the newer bound and enumerates towards the older one.
    timeout : float, optional
        Seconds to wait for each HTTP request before giving up.

    Returns
    -------
    list of dict
        One dict per revision with keys: rev_id, timestamp, user, is_bot,
        content. `user` and `content` are empty strings when the API omits
        them (e.g. hidden user or suppressed text). The list is empty when
        the page has no revisions in the window.

    Raises
    ------
    requests.HTTPError
        If the server answers with an HTTP error status.
    RuntimeError
        If the API response carries an `error` payload.
    """
    URL = "https://en.wikipedia.org/w/api.php"
    revisions = []
    params = {
        "action": "query",
        "format": "json",
        "prop": "revisions",
        "rvprop": "ids|timestamp|user|comment|content",
        "rvstart": end_ts,
        "rvend": start_ts,
        "rvlimit": "max",
        "titles": title,
        "redirects": 1,
        "rvslots": "main",
    }
    # NOTE(review): Wikimedia asks API clients to send a descriptive
    # User-Agent with contact details; consider setting one on the session.
    with requests.Session() as S:  # context manager closes pooled connections
        while True:
            http_response = S.get(URL, params=params, timeout=timeout)
            http_response.raise_for_status()
            response = http_response.json()

            # Surface API-level failures clearly instead of a KeyError on "query".
            if "error" in response:
                error = response["error"]
                detail = error.get("info", error) if isinstance(error, dict) else error
                raise RuntimeError(f"MediaWiki API error for {title!r}: {detail}")

            pages = response.get("query", {}).get("pages", {})
            if not pages:
                break
            # Only one title is requested, so the mapping holds a single page.
            page_id = next(iter(pages))
            if "revisions" not in pages[page_id]:
                break
            for rev in pages[page_id]["revisions"]:
                content = rev.get("slots", {}).get("main", {}).get("*", "")
                # The "user" key may be absent (e.g. hidden user); fall back to "".
                user = rev.get("user", "")
                revisions.append({
                    "rev_id": rev["revid"],
                    "timestamp": rev["timestamp"],
                    "user": user,
                    "is_bot": is_bot_username(user),
                    "content": content
                })
            if "continue" in response:
                # Merge the continuation tokens into the next request.
                params.update(response["continue"])
            else:
                break
    return revisions


In [12]:

# Collect one flat record per (page, revision) across all sample pages.
rows = []
for pg in sample_pages:
    revs = fetch_revisions_for_page(pg, START_TIMESTAMP, END_TIMESTAMP)
    # Each revision dict already carries rev_id/timestamp/user/is_bot/content;
    # prepend the page title so rows from different pages stay distinguishable.
    rows.extend({"page_title": pg, **r} for r in revs)

# Build the frame once from the full record list. Passing `columns` pins the
# schema and column order, so the frame keeps its columns even when no
# revisions were returned.
tiny_revs = pd.DataFrame(rows, columns=columns)


In [15]:
# Preview the first rows of the revisions frame as a quick sanity check.
tiny_revs.head()


Unnamed: 0,page_title,rev_id,timestamp,user,is_bot,content
0,Python (programming language),1145862507,2023-03-21T11:50:34Z,Comp.arch,False,{{Lead too short|date=March 2023}}\n{{pp|small...
1,Python (programming language),1144434497,2023-03-13T19:11:54Z,Thumperward,False,{{Lead too short|date=March 2023}}\n{{pp|small...
2,Python (programming language),1144432784,2023-03-13T19:00:30Z,Thumperward,False,{{pp|small=yes}}\n{{Short description|General-...
3,Python (programming language),1140647220,2023-02-21T02:12:18Z,Tdmurlock,False,{{pp|small=yes}}\n{{Short description|General-...
4,Python (programming language),1138397948,2023-02-09T13:16:31Z,AirshipJungleman29,False,{{pp|small=yes}}\n{{Short description|General-...


In [14]:
# Write the revisions frame to "tiny_revisions.pkl" (path is relative to the
# current working directory), presumably so it can be reloaded without
# re-querying the API. Only unpickle files from trusted sources.
tiny_revs.to_pickle("tiny_revisions.pkl")
