In [63]:
import requests
import time
import json
import os


In [64]:
def get_us_election_titles():
    """
    Builds the Wikipedia article titles for the 2016, 2020 and 2024 US
    presidential election cycles.

    Returns:
        dict: Maps each year (as a string) to a list of three titles, in this
        order: the general election, the Democratic primaries, and the
        Republican primaries.
    """
    # Every cycle uses the same three article-name patterns, prefixed by year.
    article_suffixes = (
        "United_States_presidential_election",
        "Democratic_Party_presidential_primaries",
        "Republican_Party_presidential_primaries",
    )
    cycle_years = ("2016", "2020", "2024")

    return {
        year: [f"{year}_{suffix}" for suffix in article_suffixes]
        for year in cycle_years
    }


In [65]:
def fetch_articles_in_batches(titles_batch):
    """
    Fetches the current wikitext of up to 50 articles with one logical query.

    The MediaWiki API may split a large multi-page content response and signal
    this with a top-level "continue" element. This function keeps requesting
    until the query is complete and merges the pages, so callers receive the
    same response shape as a single call: data["query"]["pages"][page_id].

    Args:
        titles_batch (list): Article titles to fetch (the API accepts at most
            50 titles per request for regular clients).

    Returns:
        dict: The decoded JSON of the first response, with the page entries of
        any follow-up responses merged in and the "continue" element removed.

    Raises:
        Exception: If any response has a non-200 HTTP status.
        requests.exceptions.RequestException: On connection errors or timeouts.
    """
    url = "https://en.wikipedia.org/w/api.php"
    params = {
        "action": "query",
        "prop": "revisions",
        "rvslots": "*",
        "rvprop": "content",
        "format": "json",
        "titles": "|".join(titles_batch)  # Batch up to 50 titles
    }
    headers = {"User-Agent": "WikiSandbox/ManagingBigData"}

    merged = None
    continuation = {}
    while True:
        # A timeout keeps a stalled connection from hanging the notebook cell.
        response = requests.get(
            url,
            headers=headers,
            params={**params, **continuation},
            timeout=30,
        )
        if response.status_code != 200:
            raise Exception(f"Error fetching articles: {response.status_code}")
        data = response.json()

        if merged is None:
            merged = data
        else:
            # Follow-up responses carry the revisions that did not fit earlier;
            # update() only adds keys, so content already received is kept.
            merged_pages = merged.setdefault("query", {}).setdefault("pages", {})
            for page_id, page_info in data.get("query", {}).get("pages", {}).items():
                merged_pages.setdefault(page_id, {}).update(page_info)

        if "continue" not in data:
            break
        continuation = data["continue"]

    # The merged result is complete, so it should not advertise a continuation.
    merged.pop("continue", None)
    return merged

In [66]:
def fetch_article(title):
    """
    Fetches the current wikitext of a single article by title.

    Args:
        title (str): Title of the article to fetch.

    Returns:
        dict: The decoded JSON response; page entries are found under
        data["query"]["pages"], keyed by page id.

    Raises:
        Exception: If the response has a non-200 HTTP status.
        requests.exceptions.RequestException: On connection errors or timeouts.
    """
    url = "https://en.wikipedia.org/w/api.php"
    params = {
        "action": "query",
        "prop": "revisions",
        "rvslots": "*",
        "rvprop": "content",
        "format": "json",
        "titles": title
    }
    headers = {"User-Agent": "WikiSandbox/ManagingBigData"}
    # Without a timeout, requests waits indefinitely on a stalled connection.
    response = requests.get(url, headers=headers, params=params, timeout=30)
    if response.status_code != 200:
        raise Exception(f"Error fetching article {title}: {response.status_code}")
    return response.json()

In [67]:
def save_to_hdfs(data, file_path):
    """
    Appends records to a local JSON Lines file (one JSON object per line),
    a layout suited to later HDFS ingestion.

    Args:
        data (iterable): JSON-serializable records to write.
        file_path (str): Destination file. Missing parent directories are
            created; a bare file name writes into the current directory.

    Note:
        The file is opened in append mode, so calling this repeatedly with the
        same records produces duplicate lines.
    """
    parent_dir = os.path.dirname(file_path)
    # dirname is "" for a bare file name, and os.makedirs("") raises.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    with open(file_path, "a", encoding="utf-8") as f:
        for article in data:
            # Serialize fully before writing so that a record which cannot be
            # encoded never leaves a partial line in the file.
            f.write(json.dumps(article, ensure_ascii=False) + "\n")

In [68]:
def _pages_to_records(data):
    """
    Converts a MediaWiki query response into {"title", "text"} records.

    Pages that carry no "revisions" entry get the placeholder text
    "No content available".

    Raises:
        Exception: If the response has no "query" section; the API's own error
            description is included when the response provides one.
    """
    if "query" not in data:
        detail = data.get("error", {}).get("info", "response has no 'query' section")
        raise Exception(f"Unexpected API response: {detail}")

    records = []
    for page_info in data["query"]["pages"].values():
        if "revisions" in page_info:
            text = page_info["revisions"][0]["slots"]["main"]["*"]
        else:
            text = "No content available"
        records.append({"title": page_info["title"], "text": text})
    return records


def fetch_and_save_articles(titles, output_file, batch_mode=True):
    """
    Fetches full text for a list of articles and appends them to a JSON Lines
    file, one {"title", "text"} record per line.

    A failed request is reported and skipped; it does not stop the run. Progress
    and the final count are printed. The function waits one second between
    consecutive requests.

    Args:
        titles (list): List of article titles to fetch.
        output_file (str): Path to the output file (records are appended).
        batch_mode (bool): If True, fetch up to 50 titles per request;
            otherwise fetch one title per request.
    """
    total_saved = 0  # Number of records actually written
    batch_size = 50  # Max 50 titles per batch

    if batch_mode:
        batches = [titles[i:i + batch_size] for i in range(0, len(titles), batch_size)]
        for idx, batch in enumerate(batches, start=1):
            print(f"Fetching batch {idx}/{len(batches)}...")
            try:
                records = _pages_to_records(fetch_articles_in_batches(batch))
                save_to_hdfs(records, output_file)
                total_saved += len(records)
            except Exception as e:
                print(f"Failed to fetch batch {idx}: {e}")
            if idx < len(batches):
                time.sleep(1)  # Pause between requests; none needed after the last
    else:
        for idx, title in enumerate(titles, start=1):
            print(f"Fetching article {idx}/{len(titles)}: {title}...")
            try:
                records = _pages_to_records(fetch_article(title))
                save_to_hdfs(records, output_file)
                # Count what was written, the same way batch mode does.
                total_saved += len(records)
            except Exception as e:
                print(f"Failed to fetch article {title}: {e}")
            if idx < len(titles):
                time.sleep(1)  # Pause between requests; none needed after the last

    print(f"Total articles saved: {total_saved}")

In [82]:
import requests

def fetch_category_articles(category, include_subcategories=False, write_to_file=False):
    """
    Lists the titles of the pages in a Wikipedia category.

    Args:
        category (str): Category name without the "Category:" prefix.
        include_subcategories (bool): If True, also include the pages of the
            category's direct subcategories (one level deep, no recursion).
        write_to_file (bool): If True, write the titles, one per line, to
            "<category>_articles.txt" in the current directory.

    Returns:
        list: Page titles in the order first encountered. A page that belongs
        to several of the visited categories is listed once.

    Raises:
        Exception: If a response has a non-200 HTTP status.
        requests.exceptions.RequestException: On connection errors or timeouts.
    """
    url = "https://en.wikipedia.org/w/api.php"
    headers = {"User-Agent": "WikiSandbox/ManagingBigData"}

    def fetch_members(category_title, member_type):
        """Returns the titles of all members of one type ("page" or "subcat")."""
        # Passing a params dict lets requests URL-encode the category name and
        # the continuation token, which can contain characters such as "&" or "|".
        params = {
            "action": "query",
            "list": "categorymembers",
            "cmtitle": category_title,
            "cmtype": member_type,
            "cmlimit": "max",
            "format": "json",
        }
        members = []
        continuation = {}
        while True:
            response = requests.get(
                url,
                headers=headers,
                params={**params, **continuation},
                timeout=30,
            )
            if response.status_code != 200:
                raise Exception(
                    f"Error fetching members of {category_title}: {response.status_code}"
                )
            payload = response.json()
            members.extend(member["title"] for member in payload["query"]["categorymembers"])
            if "continue" not in payload:
                return members
            # Send back everything the API asked for to get the next chunk.
            continuation = payload["continue"]

    root_title = f"Category:{category}"
    articles = fetch_members(root_title, "page")
    if include_subcategories:
        # Subcategory titles come back already prefixed with "Category:".
        for subcategory_title in fetch_members(root_title, "subcat"):
            articles.extend(fetch_members(subcategory_title, "page"))

    # Drop repeats while keeping first-seen order, so a page filed under
    # several categories is not fetched and stored more than once downstream.
    articles = list(dict.fromkeys(articles))

    if write_to_file:
        # Titles can contain non-ASCII characters; do not rely on the platform default.
        with open(f"{category}_articles.txt", "w", encoding="utf-8") as f:
            f.writelines(article + "\n" for article in articles)

    return articles


In [85]:
# Collect the titles of every page in the "Elections in the United States"
# category and in its direct subcategories. With write_to_file=True the titles
# are also written to "Elections in the United States_articles.txt".

election_titles = fetch_category_articles("Elections in the United States", include_subcategories=True,write_to_file=True)

 

# Fetch the text of each collected title in batches and append the records to
# all_elections_data/all_elections_data.jsonl. The file is opened in append
# mode, so re-running this cell adds the same articles again; delete the file
# first for a clean run.

fetch_and_save_articles(election_titles, "all_elections_data/all_elections_data.jsonl", batch_mode=True)

Fetching batch 1/21...
Fetching batch 2/21...
Fetching batch 3/21...
Fetching batch 4/21...
Fetching batch 5/21...
Fetching batch 6/21...
Fetching batch 7/21...
Fetching batch 8/21...
Fetching batch 9/21...
Fetching batch 10/21...
Fetching batch 11/21...
Fetching batch 12/21...
Fetching batch 13/21...
Fetching batch 14/21...
Fetching batch 15/21...
Fetching batch 16/21...
Fetching batch 17/21...
Fetching batch 18/21...
Fetching batch 19/21...
Fetching batch 20/21...
Fetching batch 21/21...
Total articles saved: 1024
