In [None]:
import requests
import time
import json
import os

def get_article_revisions(title, older_than=None):
    """
    Fetch one page of revision history for an English Wikipedia article.

    Args:
        title: Article title exactly as it should appear in the request URL
            (it is interpolated into the path unchanged).
        older_than: Optional revision id. When truthy it is sent as the
            ``older_than`` query parameter, which callers use to page
            backwards through the history.

    Returns:
        The list stored under the ``"revisions"`` key of the JSON response,
        or an empty list if that key is missing.

    Raises:
        Exception: If none of the ``retries`` attempts produced a 200 response.
    """
    url = f'https://api.wikimedia.org/core/v1/wikipedia/en/page/{title}/history'
    parameters = {}
    if older_than:
        parameters['older_than'] = older_than
    headers = {
        'User-Agent': 'WikiSandbox/ManagingBigData/d.galati@student.utwente.nl'  # Replace with your project and email
    }
    retries = 5
    for attempt in range(retries):
        backoff = 2 ** attempt  # Exponential backoff: 1, 2, 4, 8, 16 seconds
        try:
            # A timeout keeps a stalled connection from blocking the whole run.
            response = requests.get(url, headers=headers, params=parameters, timeout=30)
        except requests.exceptions.RequestException as error:
            # Connection problems and timeouts are treated like any other
            # failed attempt instead of aborting the retry loop.
            print(f"Error fetching revisions for {title}: {error}. Retrying...")
            time.sleep(backoff)
            continue
        if response.status_code == 200:
            data = response.json()
            return data.get("revisions", [])
        elif response.status_code == 429:  # Rate-limiting error
            # Retry-After may be absent or given as an HTTP date rather than a
            # number of seconds; fall back to the backoff delay in that case.
            try:
                retry_after = int(response.headers.get("Retry-After", ""))
            except ValueError:
                retry_after = backoff
            # Never wait less than the backoff delay, so repeated 429s slow
            # the client down even when the server keeps suggesting 1 second.
            retry_after = max(retry_after, backoff)
            print(f"Rate limited. Retrying after {retry_after} seconds...")
            time.sleep(retry_after)
        else:
            print(f"Error fetching revisions for {title}: {response.status_code}. Retrying...")
            time.sleep(backoff)
    raise Exception(f"Failed to fetch revisions for {title} after {retries} retries.")

def loop_through_revisions(title, from_date=None, olderThanId=None):
    """
    Collect revisions for an article page by page until a cutoff timestamp.

    Pages are requested through ``get_article_revisions``; after each page the
    id of its last revision is used as the ``older_than`` cursor for the next
    request.

    Args:
        title: Article title passed through to ``get_article_revisions``.
        from_date: Optional cutoff timestamp string. Paging stops once the
            last revision of a page has a timestamp less than or equal to it.
            Timestamps are compared as strings, so it must use the same format
            as the API's ``timestamp`` field. ``None`` means no cutoff: paging
            continues until the API returns an empty page.
        olderThanId: Optional revision id to start paging from.

    Returns:
        All revisions gathered, in the order the API returned them. The page
        that crosses the cutoff is kept in full, so the tail of the list may
        contain revisions older than ``from_date``.
    """
    revisions = []
    while True:
        new_revisions = get_article_revisions(title, older_than=olderThanId)
        if not new_revisions:
            break
        revisions.extend(new_revisions)
        # Presumably pages are ordered newest first, making the last entry the
        # oldest revision seen so far -- confirm against the API documentation.
        last_revision = new_revisions[-1]
        # Skip the comparison when no cutoff was supplied; comparing a string
        # timestamp with None would raise TypeError.
        if from_date is not None and last_revision["timestamp"] <= from_date:
            break
        olderThanId = last_revision["id"]
    print(f"Gathered {len(revisions)} revisions for article: {title}")
    return revisions

def get_revision_count(title):
    """
    Fetch the edit count for an English Wikipedia article.

    Args:
        title: Article title exactly as it should appear in the request URL
            (it is interpolated into the path unchanged).

    Returns:
        The decoded JSON body of the ``history/counts/edits`` response.

    Raises:
        Exception: If none of the ``retries`` attempts produced a 200 response.
    """
    url = f'https://api.wikimedia.org/core/v1/wikipedia/en/page/{title}/history/counts/edits'
    headers = {
        # Same identifying User-Agent (project plus contact address) as
        # get_article_revisions; Wikimedia asks clients to include contact
        # details and may throttle requests that omit them.
        'User-Agent': 'WikiSandbox/ManagingBigData/d.galati@student.utwente.nl'  # Replace with your project and email
    }
    retries = 5
    for attempt in range(retries):
        backoff = 2 ** attempt  # Exponential backoff: 1, 2, 4, 8, 16 seconds
        try:
            # A timeout keeps a stalled connection from blocking the whole run.
            response = requests.get(url, headers=headers, timeout=30)
        except requests.exceptions.RequestException as error:
            # Connection problems and timeouts are treated like any other
            # failed attempt instead of aborting the retry loop.
            print(f"Error fetching revision count for {title}: {error}. Retrying...")
            time.sleep(backoff)
            continue
        if response.status_code == 200:
            return response.json()
        elif response.status_code == 429:  # Rate-limiting error
            # Retry-After may be absent or given as an HTTP date rather than a
            # number of seconds; fall back to the backoff delay in that case.
            try:
                retry_after = int(response.headers.get("Retry-After", ""))
            except ValueError:
                retry_after = backoff
            # Never wait less than the backoff delay, so repeated 429s slow
            # the client down even when the server keeps suggesting 1 second.
            retry_after = max(retry_after, backoff)
            print(f"Rate limited. Retrying after {retry_after} seconds...")
            time.sleep(retry_after)
        else:
            print(f"Error fetching revision count for {title}: {response.status_code}. Retrying...")
            time.sleep(backoff)
    raise Exception(f"Failed to fetch revision count for {title} after {retries} retries.")

def save_revisions_to_hdfs(revisions, filename):
    """
    Append revisions to a local file in JSON Lines format.

    Each revision is serialised as one JSON document per line, a layout that
    can later be loaded into HDFS-based tooling line by line.

    Args:
        revisions: Iterable of JSON-serialisable revision objects.
        filename: Path of the output file. Missing parent directories are
            created. The file is opened in append mode, so existing content
            is kept and writing the same revisions twice duplicates them.
    """
    # os.path.dirname() returns "" for a bare file name, and os.makedirs("")
    # raises FileNotFoundError, so only create a directory when one is named.
    directory = os.path.dirname(filename)
    if directory:
        os.makedirs(directory, exist_ok=True)
    with open(filename, "a", encoding="utf-8") as f:
        # ensure_ascii=False keeps non-ASCII text readable in the UTF-8 file.
        f.writelines(
            json.dumps(revision, ensure_ascii=False) + "\n"
            for revision in revisions
        )

def get_us_election_titles():
    """
    Build the Wikipedia article titles tracked for each US election cycle.

    Returns:
        A dict keyed by election year ("2016", "2020", "2024"). Each value is
        a list holding the titles of that year's presidential election
        article, Democratic primaries article and Republican primaries
        article, in that order.
    """
    # Every cycle uses the same three article names, differing only by year.
    article_templates = (
        "{year}_United_States_presidential_election",
        "{year}_Democratic_Party_presidential_primaries",
        "{year}_Republican_Party_presidential_primaries",
    )
    return {
        year: [template.format(year=year) for template in article_templates]
        for year in ("2016", "2020", "2024")
    }

# Cutoff timestamp (1 January 2016, UTC): paging through an article's history
# stops once a revision at or before this instant has been fetched.
revisions_from = "2016-01-01T00:00:00Z"
output_dir = "election_revisions"

election_titles = get_us_election_titles()

for year in ("2016", "2020", "2024"):
    for title in election_titles[year]:
        print(f"Fetching revisions for article: {title}")

        # Report the article's edit count before downloading its history.
        revs = get_revision_count(title)
        print(revs)

        # Page back through the history until the cutoff timestamp is reached.
        print(f"Fetching article revisions... after {revisions_from}")
        revisions = loop_through_revisions(title, from_date=revisions_from)

        # Append the gathered revisions to a per-article JSON Lines file.
        output_file = os.path.join(output_dir, f"{title}_revisions.jsonl")
        save_revisions_to_hdfs(revisions, output_file)
        print(f"Revisions saved to: {output_file}")


Fetching revisions for article: 2016_United_States_presidential_election
Rate limited. Retrying after 1 seconds...
Rate limited. Retrying after 1 seconds...
Rate limited. Retrying after 1 seconds...
Rate limited. Retrying after 1 seconds...
Rate limited. Retrying after 1 seconds...


Exception: Failed to fetch revision count for 2016_United_States_presidential_election after 5 retries.