Bazham Khanatayev \
Data 512 HW_2 \
10.15.2023 \
The purpose of this notebook is to take the articles listed in the us_cities_by_state_SEPT.2023.csv file and get the article page revision ID's. We are using the MediaWiki API to grab this information. The notebook outputs a file called article_data.csv that includes the revision ID's.

In [None]:
import csv
import requests
import tqdm
from itertools import islice

In [None]:
# Define the MediaWiki API endpoint URL
MEDIAWIKIPEDIA_API_ENDPOINT = "https://en.wikipedia.org/w/api.php"

In [None]:
# Create dictionaries to store the article data and error logs
article_data = {}
error_log = []

We need to test the API before we commit to parsing the entire file. I started with just 10 rows.

The following is to test with just 10 rows and without the title

In [None]:
# Number of rows to process
N_ROWS = 10

In [None]:
# Open the US cities by state CSV file
with open("us_cities_by_state_SEPT.2023.csv", "r") as f:
    reader = csv.reader(f)
    next(reader)  # Skip header if the CSV has one

    progress_bar = tqdm.tqdm(total=N_ROWS)

    # Iterate over the rows in the CSV file
    # islice is used to work with the iterator since it does not have traditional indices
    for i, row in islice(enumerate(reader), N_ROWS):
        # Extract just the title from the Wikipedia URL
        wikipedia_article_title = row[1]

        # Make a page info request to the MediaWiki API
        response = requests.get(MEDIAWIKIPEDIA_API_ENDPOINT, params={
            "action": "query",
            "format": "json",
            "titles": wikipedia_article_title,
            "prop": "info|revisions",
            "inprop": "url",
            "rvprop": "ids",
            "rvlimit": 1  # Only get the latest revision
        })

        # Parse the JSON response
        json_response = response.json()

        # Extract the page info data
        page_data = next(iter(json_response.get("query", {}).get("pages", {}).values()), {})
        full_url = page_data.get("fullurl", "")
        revision_id = page_data.get("revisions", [{}])[0].get("revid", None) if "revisions" in page_data else None
        
        # log the rows where no revision_id was returned
        if revision_id:
            article_data[full_url] = {
                "url": full_url,
                "revision_id": revision_id
            }
        else:
            error_log.append(full_url)

        # Update the progress bar
        progress_bar.update(1)

    # Close the progress bar
    progress_bar.close()

In [2]:
print("Article Data:", article_data)
print("\nErrors:", error_log)

100%|██████████| 10/10 [00:02<00:00,  4.54it/s]

Article Data: {'https://en.wikipedia.org/wiki/Abbeville,_Alabama': {'url': 'https://en.wikipedia.org/wiki/Abbeville,_Alabama', 'revision_id': 1171163550}, 'https://en.wikipedia.org/wiki/Adamsville,_Alabama': {'url': 'https://en.wikipedia.org/wiki/Adamsville,_Alabama', 'revision_id': 1177621427}, 'https://en.wikipedia.org/wiki/Addison,_Alabama': {'url': 'https://en.wikipedia.org/wiki/Addison,_Alabama', 'revision_id': 1168359898}, 'https://en.wikipedia.org/wiki/Akron,_Alabama': {'url': 'https://en.wikipedia.org/wiki/Akron,_Alabama', 'revision_id': 1165909508}, 'https://en.wikipedia.org/wiki/Alabaster,_Alabama': {'url': 'https://en.wikipedia.org/wiki/Alabaster,_Alabama', 'revision_id': 1179139816}, 'https://en.wikipedia.org/wiki/Albertville,_Alabama': {'url': 'https://en.wikipedia.org/wiki/Albertville,_Alabama', 'revision_id': 1179198677}, 'https://en.wikipedia.org/wiki/Alexander_City,_Alabama': {'url': 'https://en.wikipedia.org/wiki/Alexander_City,_Alabama', 'revision_id': 1179140073}, '




Add the title as well and test with 10.

In [None]:
# Create dictionaries to store the article data and error logs
article_data = {}
error_log = []

In [None]:
# Open the US cities by state CSV file
with open("us_cities_by_state_SEPT.2023.csv", "r") as f:
    reader = csv.reader(f)
    next(reader)  # Skip header if the CSV has one

    progress_bar = tqdm.tqdm(total=N_ROWS)

    # Iterate over the rows in the CSV file
    # islice is used to work with the iterator since it does not have traditional indices
    for i, row in islice(enumerate(reader), N_ROWS):
        # Extract just the title from the Wikipedia URL
        wikipedia_article_title = row[1]

        # Make a page info request to the MediaWiki API
        response = requests.get(MEDIAWIKIPEDIA_API_ENDPOINT, params={
            "action": "query",
            "format": "json",
            "titles": wikipedia_article_title,
            "prop": "info|revisions",
            "inprop": "url",
            "rvprop": "ids",
            "rvlimit": 1  # Only get the latest revision
        })

        # Parse the JSON response
        json_response = response.json()

        # Extract the page info data
        page_data = next(iter(json_response.get("query", {}).get("pages", {}).values()), {})
        full_url = page_data.get("fullurl", "")
        revision_id = page_data.get("revisions", [{}])[0].get("revid", None) if "revisions" in page_data else None
        
        
        # log the rows where no revision_id was returned
        if revision_id:
            article_data[full_url] = {
                "url": full_url,
                "revision_id": revision_id,
                "title": wikipedia_article_title
            }
        else:
            error_log.append(full_url)

        # Update the progress bar
        progress_bar.update(1)

    # Close the progress bar
    progress_bar.close()

In [3]:
# Print the article data
print(article_data)

100%|██████████| 10/10 [00:02<00:00,  4.79it/s]

{'https://en.wikipedia.org/wiki/Abbeville,_Alabama': {'url': 'https://en.wikipedia.org/wiki/Abbeville,_Alabama', 'revision_id': 1171163550, 'title': 'Abbeville, Alabama'}, 'https://en.wikipedia.org/wiki/Adamsville,_Alabama': {'url': 'https://en.wikipedia.org/wiki/Adamsville,_Alabama', 'revision_id': 1177621427, 'title': 'Adamsville, Alabama'}, 'https://en.wikipedia.org/wiki/Addison,_Alabama': {'url': 'https://en.wikipedia.org/wiki/Addison,_Alabama', 'revision_id': 1168359898, 'title': 'Addison, Alabama'}, 'https://en.wikipedia.org/wiki/Akron,_Alabama': {'url': 'https://en.wikipedia.org/wiki/Akron,_Alabama', 'revision_id': 1165909508, 'title': 'Akron, Alabama'}, 'https://en.wikipedia.org/wiki/Alabaster,_Alabama': {'url': 'https://en.wikipedia.org/wiki/Alabaster,_Alabama', 'revision_id': 1179139816, 'title': 'Alabaster, Alabama'}, 'https://en.wikipedia.org/wiki/Albertville,_Alabama': {'url': 'https://en.wikipedia.org/wiki/Albertville,_Alabama', 'revision_id': 1179198677, 'title': 'Albert




Now we can go through the entire file

In [None]:
# Create dictionaries to store the article data and error logs
article_data = {}
error_log = []

In [None]:
# Number of rows to process (set this to a high number if you want to process all rows)
N_ROWS = 22160

In [None]:
# Open the US cities by state CSV file
with open("us_cities_by_state_SEPT.2023.csv", "r") as f:
    reader = csv.reader(f)
    next(reader)  # Skip header if the CSV has one

    progress_bar = tqdm.tqdm(total=N_ROWS)

    # Iterate over the rows in the CSV file
    # islice is used to work with the iterator since it does not have traditional indices
    for i, row in islice(enumerate(reader), N_ROWS):
        # Extract just the title from the Wikipedia URL
        wikipedia_article_title = row[1]

        # Make a page info request to the MediaWiki API
        response = requests.get(MEDIAWIKIPEDIA_API_ENDPOINT, params={
            "action": "query",
            "format": "json",
            "titles": wikipedia_article_title,
            "prop": "info|revisions",
            "inprop": "url",
            "rvprop": "ids",
            "rvlimit": 1  # Only get the latest revision
        })

        # Parse the JSON response
        json_response = response.json()

        # Extract the page info data
        page_data = next(iter(json_response.get("query", {}).get("pages", {}).values()), {})
        full_url = page_data.get("fullurl", "")
        revision_id = page_data.get("revisions", [{}])[0].get("revid", None) if "revisions" in page_data else None
        
        # log the rows where no revision_id was returned
        if revision_id:
            article_data[full_url] = {
                "url": full_url,
                "revision_id": revision_id,
                "title": wikipedia_article_title
            }
        else:
            error_log.append(full_url)

        # Update the progress bar
        progress_bar.update(1)

    # Close the progress bar
    progress_bar.close()

Save the outputs.

In [4]:
# Open the output CSV file
with open("article_data.csv", "w", newline='') as f:
    writer = csv.writer(f)

    # Write the header row
    writer.writerow(["Article Title", "URL", "Revision ID"])

    # Write the article data to the CSV file
    for article_url, article_data in article_data.items():
        writer.writerow([article_data["title"], article_url, article_data["revision_id"]])

100%|█████████▉| 22157/22160 [1:22:27<00:00,  4.48it/s]
