In [11]:
import json, time, urllib.parse
import pandas as pd
from tqdm import tqdm
import requests
import os

# Reading CSV
Reading csv file to get all the articles from the csv file given. Sorting them according to the name of the article.

In [12]:
df = pd.read_csv(os.path.join("..","data","thank_the_academy.AUG.2023.csv"))

df.sort_values(by=['name'])


Unnamed: 0,name,url
137,12 Years a Slave (film),https://en.wikipedia.org/wiki/12_Years_a_Slave...
53,1917 (2019 film),https://en.wikipedia.org/wiki/1917_(2019_film)
148,20 Feet from Stardom,https://en.wikipedia.org/wiki/20_Feet_from_Sta...
819,"20,000 Leagues Under the Sea (1954 film)","https://en.wikipedia.org/wiki/20,000_Leagues_U..."
766,2001: A Space Odyssey (film),https://en.wikipedia.org/wiki/2001:_A_Space_Od...
...,...,...
520,Young at Heart (1987 film),https://en.wikipedia.org/wiki/Young_at_Heart_(...
751,Z (1969 film),https://en.wikipedia.org/wiki/Z_(1969_film)
156,Zero Dark Thirty,https://en.wikipedia.org/wiki/Zero_Dark_Thirty
104,Zootopia,https://en.wikipedia.org/wiki/Zootopia


# API Endpoints
This section stores all the URLs which we'll be calling. This also define few of the parmeters which will remain constant throughout the program.

In [13]:
# API endpoints for mobile web, mobile app, and desktop data
base_api_url = "https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/en.wikipedia"
mobile_web_suffix = "/mobile-web/user/"
mobile_app_suffix = "/mobile-app/user/"
desktop_suffix = "/desktop/user/"
granularity = "/monthly/2015070100/2023093000"  # Updated date range


# Headers
Using headers as stated in documentation.

In [14]:
# User-Agent header
headers = {
    'User-Agent': '<uwnetid@uw.edu>, University of Washington, MSDS DATA 512 - AUTUMN 2023',
}

# Response Dictionaries
All responses will be stored in this and then change to json files.

In [15]:
# dictionaries to store data
all_monthly_mobile_access = {}
all_monthly_desktop_access = {}
all_monthly_cumulative = {}


# Data Aquisition: Iterating rows, Calling APIs, and combining results
In this block, we're calling the apis with the parameters mentioned above, and then on the bases of the logic provided we;re collecting data.

In [16]:
# Loop through each row
for index, row in tqdm(df.iterrows()):
    article_name = row["name"]
    article_url = row["url"]

    mobile_web_api_endpoint = f"{base_api_url}{mobile_web_suffix}{article_name}{granularity}"
    mobile_app_api_endpoint = f"{base_api_url}{mobile_app_suffix}{article_name}{granularity}"
    desktop_api_endpoint = f"{base_api_url}{desktop_suffix}{article_name}{granularity}"

    mobile_web_response = requests.get(mobile_web_api_endpoint, headers=headers)
    mobile_app_response = requests.get(mobile_app_api_endpoint, headers=headers)
    desktop_response = requests.get(desktop_api_endpoint, headers=headers)

    # Check if the requests were successful
    if (mobile_web_response.status_code == 200 and
        mobile_app_response.status_code == 200 and
        desktop_response.status_code == 200):
        
        # Parse mobile web data
        mobile_web_data = mobile_web_response.json()['items']
        monthly_mobile_web_access = {}
        for item in mobile_web_data:
            month = item['timestamp'][:6]  
            pageviews = item['views']
            monthly_mobile_web_access[month] = pageviews

        # Parse mobile app data
        mobile_app_data = mobile_app_response.json()['items']
        monthly_mobile_app_access = {}
        for item in mobile_app_data:
            month = item['timestamp'][:6] 
            pageviews = item['views']
            monthly_mobile_app_access[month] = pageviews

        # Combine mobile web and mobile app data for total mobile access
        monthly_mobile_access = {}
        for month in monthly_mobile_web_access.keys():
            web_count = monthly_mobile_web_access.get(month, 0)
            app_count = monthly_mobile_app_access.get(month, 0)
            total_mobile_count = web_count + app_count
            monthly_mobile_access[month] = total_mobile_count

        # Parse desktop data
        desktop_data = desktop_response.json()['items']
        monthly_desktop_access = {}
        for item in desktop_data:
            month = item['timestamp'][:6] 
            pageviews = item['views']
            monthly_desktop_access[month] = pageviews

        # Store data in dictionaries
        all_monthly_mobile_access[article_name] = monthly_mobile_access
        all_monthly_desktop_access[article_name] = monthly_desktop_access

        # Calculate cumulative data
        monthly_cumulative = {}
        for month in monthly_mobile_access.keys():
            total_mobile = monthly_mobile_access.get(month, 0)
            total_desktop = monthly_desktop_access.get(month, 0)
            total_cumulative = total_mobile + total_desktop
            monthly_cumulative[month] = total_cumulative

        all_monthly_cumulative[article_name] = monthly_cumulative
    else:
        print(f"Failed to retrieve data for '{article_name}'. Mobile web status code: {mobile_web_response.status_code}, Mobile app status code: {mobile_app_response.status_code}, Desktop status code: {desktop_response.status_code}")



583it [16:44,  1.50s/it]

Failed to retrieve data for 'Victor/Victoria'. Mobile web status code: 404, Mobile app status code: 404, Desktop status code: 404


654it [18:52,  1.44s/it]

Failed to retrieve data for 'Who Are the DeBolts? And Where Did They Get Nineteen Kids?'. Mobile web status code: 404, Mobile app status code: 404, Desktop status code: 404


746it [21:33,  1.21s/it]

Failed to retrieve data for 'Is It Always Right to Be Right?'. Mobile web status code: 404, Mobile app status code: 404, Desktop status code: 404


754it [21:46,  1.39s/it]

Failed to retrieve data for 'They Shoot Horses, Don't They? (film)'. Mobile web status code: 404, Mobile app status code: 404, Desktop status code: 404


795it [22:58,  1.31s/it]

Failed to retrieve data for 'Who's Afraid of Virginia Woolf? (film)'. Mobile web status code: 404, Mobile app status code: 404, Desktop status code: 404


1331it [38:41,  1.45s/it]

Failed to retrieve data for 'What Ever Happened to Baby Jane? (1962 film)'. Mobile web status code: 404, Mobile app status code: 404, Desktop status code: 404


1338it [38:52,  1.28s/it]

Failed to retrieve data for 'Why Korea?'. Mobile web status code: 404, Mobile app status code: 404, Desktop status code: 404


1359it [39:29,  1.74s/it]


Saving Data

In [18]:
# Save all data to JSON files
with open("../data/academy_monthly_mobile_201507-202312.json", "w") as mobile_file:
    json.dump(all_monthly_mobile_access, mobile_file)

with open("../data/academy_monthly_desktop_201507-202312.json", "w") as desktop_file:
    json.dump(all_monthly_desktop_access, desktop_file)

with open("../data/academy_monthly_cumulative_201507-202312.json", "w") as cumulative_file:
    json.dump(all_monthly_cumulative, cumulative_file)

print("All data saved successfully.")

All data saved successfully.
