In [7]:
import json, time, urllib.parse
import pandas as pd
from tqdm import tqdm
import requests

Reading CSV

In [8]:
df = pd.read_csv("thank_the_academy.AUG.2023.csv")

API Endpoints

In [9]:
# API endpoints for mobile web, mobile app, and desktop data
base_api_url = "https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/en.wikipedia"
mobile_web_suffix = "/mobile-web/all-agents/"
mobile_app_suffix = "/mobile-app/all-agents/"
desktop_suffix = "/desktop/all-agents/"
granularity = "/monthly/2015070100/2023123100"  # Updated date range


Headers

In [10]:
# User-Agent header
headers = {
    'User-Agent': '<uwnetid@uw.edu>, University of Washington, MSDS DATA 512 - AUTUMN 2023',
}

Response Dictionaries

In [11]:
# dictionaries to store data
all_monthly_mobile_access = {}
all_monthly_desktop_access = {}
all_monthly_cumulative = {}


Data Aquisition: Iterating rows, Calling APIs, and combining results

In [12]:
# Loop through each row
for index, row in tqdm(df.iterrows()):
    article_name = row["name"]
    article_url = row["url"]

    mobile_web_api_endpoint = f"{base_api_url}{mobile_web_suffix}{article_name}{granularity}"
    mobile_app_api_endpoint = f"{base_api_url}{mobile_app_suffix}{article_name}{granularity}"
    desktop_api_endpoint = f"{base_api_url}{desktop_suffix}{article_name}{granularity}"

    mobile_web_response = requests.get(mobile_web_api_endpoint, headers=headers)
    mobile_app_response = requests.get(mobile_app_api_endpoint, headers=headers)
    desktop_response = requests.get(desktop_api_endpoint, headers=headers)

    # Check if the requests were successful
    if (mobile_web_response.status_code == 200 and
        mobile_app_response.status_code == 200 and
        desktop_response.status_code == 200):
        
        # Parse mobile web data
        mobile_web_data = mobile_web_response.json()['items']
        monthly_mobile_web_access = {}
        for item in mobile_web_data:
            month = item['timestamp'][:6]  
            pageviews = item['views']
            monthly_mobile_web_access[month] = pageviews

        # Parse mobile app data
        mobile_app_data = mobile_app_response.json()['items']
        monthly_mobile_app_access = {}
        for item in mobile_app_data:
            month = item['timestamp'][:6] 
            pageviews = item['views']
            monthly_mobile_app_access[month] = pageviews

        # Combine mobile web and mobile app data for total mobile access
        monthly_mobile_access = {}
        for month in monthly_mobile_web_access.keys():
            web_count = monthly_mobile_web_access.get(month, 0)
            app_count = monthly_mobile_app_access.get(month, 0)
            total_mobile_count = web_count + app_count
            monthly_mobile_access[month] = total_mobile_count

        # Parse desktop data
        desktop_data = desktop_response.json()['items']
        monthly_desktop_access = {}
        for item in desktop_data:
            month = item['timestamp'][:6] 
            pageviews = item['views']
            monthly_desktop_access[month] = pageviews

        # Store data in dictionaries
        all_monthly_mobile_access[article_name] = monthly_mobile_access
        all_monthly_desktop_access[article_name] = monthly_desktop_access

        # Calculate cumulative data
        monthly_cumulative = {}
        for month in monthly_mobile_access.keys():
            total_mobile = monthly_mobile_access.get(month, 0)
            total_desktop = monthly_desktop_access.get(month, 0)
            total_cumulative = total_mobile + total_desktop
            monthly_cumulative[month] = total_cumulative

        all_monthly_cumulative[article_name] = monthly_cumulative
    else:
        print(f"Failed to retrieve data for '{article_name}'. Mobile web status code: {mobile_web_response.status_code}, Mobile app status code: {mobile_app_response.status_code}, Desktop status code: {desktop_response.status_code}")



583it [16:39,  1.37s/it]

Failed to retrieve data for 'Victor/Victoria'. Mobile web status code: 404, Mobile app status code: 404, Desktop status code: 404


654it [18:46,  1.35s/it]

Failed to retrieve data for 'Who Are the DeBolts? And Where Did They Get Nineteen Kids?'. Mobile web status code: 404, Mobile app status code: 404, Desktop status code: 404


746it [21:27,  1.46s/it]

Failed to retrieve data for 'Is It Always Right to Be Right?'. Mobile web status code: 404, Mobile app status code: 404, Desktop status code: 404


754it [21:39,  1.36s/it]

Failed to retrieve data for 'They Shoot Horses, Don't They? (film)'. Mobile web status code: 404, Mobile app status code: 404, Desktop status code: 404


795it [22:49,  1.45s/it]

Failed to retrieve data for 'Who's Afraid of Virginia Woolf? (film)'. Mobile web status code: 404, Mobile app status code: 404, Desktop status code: 404


1331it [38:43,  1.30s/it]

Failed to retrieve data for 'What Ever Happened to Baby Jane? (1962 film)'. Mobile web status code: 404, Mobile app status code: 404, Desktop status code: 404


1338it [38:54,  1.28s/it]

Failed to retrieve data for 'Why Korea?'. Mobile web status code: 404, Mobile app status code: 404, Desktop status code: 404


1359it [39:32,  1.75s/it]


Saving Data

In [13]:
# Save all data to JSON files
with open("academy_monthly_mobile_201507-202312.json", "w") as mobile_file:
    json.dump(all_monthly_mobile_access, mobile_file)

with open("academy_monthly_desktop_201507-202312.json", "w") as desktop_file:
    json.dump(all_monthly_desktop_access, desktop_file)

with open("academy_monthly_cumulative_201507-202312.json", "w") as cumulative_file:
    json.dump(all_monthly_cumulative, cumulative_file)

print("All data saved successfully.")

All data saved successfully.
