In [18]:
# Import necessary Python modules
import json
import time
import requests
import pandas as pd
from tqdm import tqdm


In [19]:
# Original Code Attribution
# This code example was developed by Dr. David W. McDonald for use in DATA 512, a course in the UW MS Data Science degree program.
# This code is provided under the Creative Commons CC-BY license. Revision 1.1 - August 14, 2023
# Source: https://colab.research.google.com/drive/15UoE16s-IccCTOXREjU3xDIz07tlpyrl#scrollTo=2i0WSJn4TXqu&printMode=true

# Configuration for requests against the English Wikipedia API

# Endpoint that receives every page-info query
API_ENWIKIPEDIA_ENDPOINT = "https://en.wikipedia.org/w/api.php"

# Throttling: pause between requests is a 1/100 interval minus the latency
# we assume each request already costs (presumably targets <= 100 requests
# per second -- confirm against the API usage policy).
API_LATENCY_ASSUMED = 0.002
API_THROTTLE_WAIT = (1.0/100.0) - API_LATENCY_ASSUMED

# HTTP headers sent with each request; the User-Agent identifies the caller
REQUEST_HEADERS = dict([
    ("User-Agent", "<uwnetid@uw.edu>, University of Washington, MSDS DATA 512 - AUTUMN 2023"),
])

# Extra properties asked for through the `inprop` parameter
PAGEINFO_EXTENDED_PROPERTIES = "talkid|url|watched|watchers"

# Query-parameter template; "titles" starts empty and is filled in per request.
# Key order is kept as action, format, titles, prop, inprop.
PAGEINFO_PARAMS_TEMPLATE = dict(
    action="query",
    format="json",
    titles="",
    prop="info",
    inprop=PAGEINFO_EXTENDED_PROPERTIES,
)



In [20]:
# Function to request page info for a single article
def request_pageinfo_per_article(article_title=None, endpoint_url=API_ENWIKIPEDIA_ENDPOINT,
                                 request_template=PAGEINFO_PARAMS_TEMPLATE, headers=REQUEST_HEADERS):
    """Request page info for a single article and return the parsed JSON.

    Parameters
    ----------
    article_title : str, optional
        Title to query. If omitted, the 'titles' value already present in
        `request_template` is used.
    endpoint_url : str
        URL the GET request is sent to.
    request_template : dict
        Query parameters for the request. It is copied before use, so the
        caller's dict (including the shared module-level default) is never
        modified.
    headers : dict
        HTTP headers sent with the request.

    Returns
    -------
    The decoded JSON response, or None if the request or the JSON decoding
    raised an exception (the exception is printed).

    Raises
    ------
    Exception
        If neither `article_title` nor the template supplies a title.
    """
    # Work on a copy: writing the title into the shared default template would
    # leak it into later calls, so a call without a title would silently
    # re-query the previous article instead of raising.
    params = dict(request_template)
    if article_title:
        params['titles'] = article_title
    if not params.get('titles'):
        raise Exception("Must supply an article title to make a pageinfo request.")

    try:
        # Pause before each request to throttle the request rate
        if API_THROTTLE_WAIT > 0.0:
            time.sleep(API_THROTTLE_WAIT)
        response = requests.get(endpoint_url, headers=headers, params=params)
        json_response = response.json()
    except Exception as e:
        # Best effort: report the problem and signal failure with None
        print(e)
        json_response = None
    return json_response



In [21]:
# Read the CSV of city articles into a DataFrame
cities_csv_path = 'us_cities_by_state_SEPT.20232.csv'
df = pd.read_csv(cities_csv_path)



In [22]:
# Display the loaded DataFrame to inspect its columns and rows
df

Unnamed: 0,state,page_title,url
0,Alabama,"Abbeville, Alabama","https://en.wikipedia.org/wiki/Abbeville,_Alabama"
1,Alabama,"Adamsville, Alabama","https://en.wikipedia.org/wiki/Adamsville,_Alabama"
2,Alabama,"Addison, Alabama","https://en.wikipedia.org/wiki/Addison,_Alabama"
3,Alabama,"Akron, Alabama","https://en.wikipedia.org/wiki/Akron,_Alabama"
4,Alabama,"Alabaster, Alabama","https://en.wikipedia.org/wiki/Alabaster,_Alabama"
5,Alabama,"Albertville, Alabama","https://en.wikipedia.org/wiki/Albertville,_Ala..."
6,Alabama,"Alexander City, Alabama","https://en.wikipedia.org/wiki/Alexander_City,_..."


In [23]:
# Request and store Page Info for Wikipedia Articles in a CSV file
data = []
failed_titles = []  # titles for which no parsed response came back
for title in tqdm(df['page_title'].tolist()):
    info = request_pageinfo_per_article(title)
    # The request helper returns None when the request or JSON decoding fails;
    # skip that title instead of crashing on `'query' in None` and losing
    # everything collected so far.
    if not info:
        failed_titles.append(title)
        continue
    if 'query' in info and 'pages' in info['query']:
        # Only the page records are needed, not the page-id keys
        for value in info['query']['pages'].values():
            if 'lastrevid' in value and 'title' in value:
                data.append({'Title': value['title'], 'Last_Revision_ID': value['lastrevid']})

# Report skipped titles so missing rows in the CSV are not silent
if failed_titles:
    print(f"No response for {len(failed_titles)} article(s): {failed_titles}")

# Convert the data into a DataFrame and store it in a CSV file.
# Explicit columns keep the header row even when no records were collected.
result_df = pd.DataFrame(data, columns=['Title', 'Last_Revision_ID'])
result_df.to_csv('wiki_page_info.csv', index=False)

100%|█████████████████████████████████████████████| 7/7 [00:01<00:00,  4.16it/s]
