# The goal of this project is to analyse monthly trends in Wikipedia article views, split by mobile access (app and web) and desktop access.

Here we import all relevant libraries needed for our analysis

In [1]:
# 
# These are standard python modules
import json, time, urllib.parse
#
# The 'requests' module is not a standard Python module. You will need to install this with pip/pip3 if you do not already have it
import requests
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

I import the dataset of articles provided to us and extract the article names, which are later passed to the API to retrieve views on a monthly basis.
The extracted names are flattened into a one-dimensional array to match the input format required by the API calls. The dataset is available at https://docs.google.com/spreadsheets/d/1A1h_7KAo7KXaVxdScJmIVPTvjb3IuY9oZhNV4ZHxrxw/edit#gid=1229854301

In [2]:
# Load the article list and pull out the 'name' column, which is used later
# as the list of article titles for the API requests.
df = pd.read_csv('thank_the_academy.AUG.2023.csv')
# reshape() returns a new array rather than modifying in place, so the
# flattened result has to be assigned for the call to have any effect.
names = df['name'].values.reshape(-1)
print('')




The following method for calling the API was developed by Dr. David W. McDonald (the following four cells are taken from his example). This code is provided under the [Creative Commons](https://creativecommons.org) [CC-BY license](https://creativecommons.org/licenses/by/4.0/). Revision 1.2 - August 14, 2023

The API documentation, [pageviews/per-article](https://wikimedia.org/api/rest_v1/#/Pageviews%20data), covers additional details that may be helpful when trying to use or understand this example.

In [3]:
#########
#
#    CONSTANTS
#

# The REST API 'pageviews' URL - this is the common URL/endpoint for all 'pageviews' API requests
API_REQUEST_PAGEVIEWS_ENDPOINT = 'https://wikimedia.org/api/rest_v1/metrics/pageviews/'

# This is a parameterized string that specifies what kind of pageviews request we are going to make
# In this case it will be a 'per-article' based request. The string is a format string so that we can
# replace each parameter with an appropriate value before making the request
API_REQUEST_PER_ARTICLE_PARAMS = 'per-article/{project}/{access}/{agent}/{article}/{granularity}/{start}/{end}'

# The Pageviews API asks that we not exceed 100 requests per second, we add a small delay to each request
API_LATENCY_ASSUMED = 0.002       # Assuming roughly 2ms latency on the API and network
API_THROTTLE_WAIT = (1.0/100.0)-API_LATENCY_ASSUMED

# When making a request to the Wikimedia API they ask that you include your email address which will allow them
# to contact you if something happens - such as - your code exceeding rate limits - or some other error
# NOTE(review): the address below looks like a placeholder - replace it with a real contact before running.
REQUEST_HEADERS = {
    'User-Agent': '<uwnetid@uw.edu>, University of Washington, MSDS DATA 512 - AUTUMN 2023',
}

# The article titles to request: the names extracted from the dataset loaded earlier in the notebook
ARTICLE_TITLES = names

# This template is used to map parameter values into the API_REQUEST_PER_ARTICLE_PARAMS portion of an API request.
# The dictionary has a field/key for each of the required parameters. Only the article name and the access type
# vary between requests, so the remaining fields stay constant.
ARTICLE_PAGEVIEWS_PARAMS_TEMPLATE = {
    "project":     "en.wikipedia.org",
    "access":      "",             # set per request: 'desktop', 'mobile-web' or 'mobile-app'
    "agent":       "user",
    "article":     "",             # this value will be set/changed before each request
    "granularity": "monthly",
    "start":       "2015010100",   # matches the <start201501> used in the output file names
    "end":         "2023093000"    # last day of September 2023, matching <end202309> in the output file names
}


The API request will be made using one procedure. The idea is to make this reusable. The procedure is parameterized, but relies on the constants above for the important parameters. The underlying assumption is that this will be used to request data for a set of article pages. Therefore the parameter most likely to change is the article_title.

Note that this is slightly modified to automate introducing device_type as the function parameter. 

In [4]:
#########
#
#    PROCEDURES/FUNCTIONS
#

def request_pageviews_per_article(article_title = None, device_type = None,
                                  endpoint_url = API_REQUEST_PAGEVIEWS_ENDPOINT,
                                  endpoint_params = API_REQUEST_PER_ARTICLE_PARAMS,
                                  request_template = ARTICLE_PAGEVIEWS_PARAMS_TEMPLATE,
                                  headers = REQUEST_HEADERS):
    """Request per-article pageview data from the Wikimedia Pageviews API.

    Parameters:
        article_title: title of the article; when omitted, the 'article'
            entry of ``request_template`` is used.
        device_type: access type placed in the URL (the notebook uses
            'desktop', 'mobile-web' and 'mobile-app'); when omitted, the
            'access' entry of ``request_template`` is used.
        endpoint_url: base URL of the pageviews endpoint.
        endpoint_params: format string for the per-article part of the URL.
        request_template: dict providing a value for every field of
            ``endpoint_params``. It is not modified by this function.
        headers: HTTP headers sent with the request.

    Returns:
        The decoded JSON response, or None when the request or the JSON
        decoding raised an exception (the exception is printed).

    Raises:
        Exception: if no article title or no access type is available.
    """
    # Work on a copy: the default template is a shared module-level dict, and
    # writing the encoded title / access type back into it would leak state
    # from one call into the next (e.g. an already-encoded title being
    # encoded a second time).
    params = dict(request_template)

    # article title can be as a parameter to the call or in the request_template
    if article_title:
        params['article'] = article_title

    if not params['article']:
        raise Exception("Must supply an article title to make a pageviews request.")

    if device_type:
        params['access'] = device_type

    if not params['access']:
        raise Exception("Must supply a device type to make a pageviews request.")

    # Titles are supposed to have spaces replaced with "_" and be URL encoded.
    # safe='' also encodes '/', which would otherwise be read as a path separator.
    params['article'] = urllib.parse.quote(params['article'].replace(' ', '_'), safe='')

    # now, create a request URL by combining the endpoint_url with the parameters for the request
    request_url = endpoint_url + endpoint_params.format(**params)

    # make the request
    try:
        # we'll wait first, to make sure we don't exceed the limit in the situation where an exception
        # occurs during the request processing - throttling is always a good practice with a free
        # data source like Wikipedia - or other community sources
        if API_THROTTLE_WAIT > 0.0:
            time.sleep(API_THROTTLE_WAIT)
        response = requests.get(request_url, headers=headers)
        json_response = response.json()
    except Exception as e:
        # Best effort: report the problem and let the caller decide what to do with None
        print(e)
        json_response = None
    return json_response


In [None]:
# Sanity check: request mobile-app pageviews for one article (the second title in the list).
sample_title = ARTICLE_TITLES[1]
print("Getting pageview data for: ", sample_title)
views = request_pageviews_per_article(device_type='mobile-app', article_title=sample_title)

Printing sample output for one article 

In [None]:
# Show every monthly record of the sample response as indented JSON.
for monthly_record in views['items']:
    formatted = json.dumps(monthly_record, indent=4)
    print(formatted)

The following code extracts views for all articles only with desktop access

In [None]:
# Collect the monthly desktop pageview records of every article into one flat list.
merged_desktop = []
for title in ARTICLE_TITLES:
    views = request_pageviews_per_article(article_title=title, device_type='desktop')
    # The request helper returns None when the call fails, and a response may
    # come back without an 'items' list; skip such articles instead of
    # aborting the whole collection run.
    if not views or 'items' not in views:
        print("No desktop pageview data returned for: ", title)
        continue
    merged_desktop.extend(views['items'])

Here I print the json file for all articles views with desktop access

In [None]:
# Print each desktop record as indented JSON.
for desktop_record in merged_desktop:
    formatted = json.dumps(desktop_record, indent=4)
    print(formatted)

In the below cell the same steps are repeated as performed above first for mobile-web and then for mobile-app. Mobile-web is basically getting views for articles month wise for users who accessed through browser using mobile. Mobile-app
is getting views for articles month wise for users who accessed through mobile application. 


In [None]:
# Collect the monthly mobile-web pageview records of every article into one flat list.
merged_mobile_web = []
for title in ARTICLE_TITLES:
    views = request_pageviews_per_article(article_title=title, device_type='mobile-web')
    # The request helper returns None when the call fails, and a response may
    # come back without an 'items' list; skip such articles instead of
    # aborting the whole collection run.
    if not views or 'items' not in views:
        print("No mobile-web pageview data returned for: ", title)
        continue
    merged_mobile_web.extend(views['items'])

In [None]:
# Print each mobile-web record as indented JSON.
for web_record in merged_mobile_web:
    formatted = json.dumps(web_record, indent=4)
    print(formatted)

In [None]:
# Collect the monthly mobile-app pageview records of every article into one flat list.
merged_mobile_app = []
for title in ARTICLE_TITLES:
    views = request_pageviews_per_article(article_title=title, device_type='mobile-app')
    # The request helper returns None when the call fails, and a response may
    # come back without an 'items' list; skip such articles instead of
    # aborting the whole collection run.
    if not views or 'items' not in views:
        print("No mobile-app pageview data returned for: ", title)
        continue
    merged_mobile_app.extend(views['items'])

In [None]:
# Print each mobile-app record as indented JSON.
for app_record in merged_mobile_app:
    formatted = json.dumps(app_record, indent=4)
    print(formatted)

In the below cell I have combined the views contained in the json file for different categories of mobile. The main thought process behind the following algorithm is to look for matches with same article name and same timestamp. Then sum their views to return total mobile views. 

In [None]:
# Index the mobile-app records by (article, timestamp) for constant-time matching.
list2_dict = {(item['article'], item['timestamp']): item for item in merged_mobile_app}

# Combined mobile records: one entry per (article, month) present in BOTH sources.
merged_list = []

for web_item in merged_mobile_web:
    key = (web_item['article'], web_item['timestamp'])
    app_item = list2_dict.get(key)
    if app_item is None:
        # Months that have mobile-web data but no mobile-app data are left out.
        continue

    # The *_web fields come from the mobile-web record and the *_app fields
    # from the mobile-app record, so each label matches its data source.
    merged_list.append({
        "project": web_item["project"],
        "article": web_item["article"],
        "granularity": web_item["granularity"],
        "timestamp": web_item["timestamp"],
        "access_app": app_item["access"],
        "access_web": web_item["access"],
        "agent": web_item["agent"],
        "views_app": app_item["views"],
        "views_web": web_item["views"],
        "total_views": web_item["views"] + app_item["views"],  # total mobile views
    })

# Now, merged_list contains the combined data with the total views
for item in merged_list:
    print(json.dumps(item, indent=4))

In the below cell I have combined the views for mobile from the output of the above cell with the views of the desktop file we had. The core algorithm is the same as above. However, this outputs the combined views from desktop and mobile. 

In [None]:
# Index the desktop records by (article, timestamp) for constant-time matching.
list3_dict = {(item['article'], item['timestamp']): item for item in merged_desktop}

# Combined mobile + desktop records, one per entry of merged_list.
final_merged_list = []

for mobile_entry in merged_list:
    key = (mobile_entry['article'], mobile_entry['timestamp'])

    # Copy the mobile entry before extending it. Updating the dict in place
    # would also change merged_list, so the mobile-only data would pick up
    # desktop fields and a total that includes desktop views, and re-running
    # this cell would add the desktop views a second time.
    merged_entry = dict(mobile_entry)

    desktop_item = list3_dict.get(key)
    if desktop_item is not None:
        merged_entry["access_desktop"] = desktop_item["access"]
        merged_entry["views_desktop"] = desktop_item["views"]
        merged_entry["total_views"] += desktop_item["views"]  # mobile + desktop
    else:
        # No desktop record for this month: keep the mobile totals unchanged
        merged_entry["access_desktop"] = None
        merged_entry["views_desktop"] = None

    final_merged_list.append(merged_entry)

# Now, final_merged_list contains the combined data with mobile-app, mobile-web, and desktop data
for item in final_merged_list:
    print(json.dumps(item, indent=4))

The views from desktop are saved into a json file showing the start and end year and month. 

In [None]:
# Output file for the desktop-only records.
# NOTE(review): the '<' and '>' are kept literally in the file name; they are
# not valid file name characters on Windows - confirm this naming is intended.
file_path = "academy_monthly_desktop_<start201501>-<end202309>.json"

# Write the list as indented JSON; the file is closed when the with-block ends.
with open(file_path, "w") as json_file:
    json.dump(obj=merged_desktop, fp=json_file, indent=4)

The views from mobile are saved into a json file showing the start and end year and month. Note that this is the combined output(mobile-app + mobile-web). 

In [None]:
# Output file for the combined mobile (app + web) records.
# NOTE(review): the '<' and '>' are kept literally in the file name; they are
# not valid file name characters on Windows - confirm this naming is intended.
file_path = "academy_monthly_mobile_<start201501>-<end202309>.json"

# Write the list as indented JSON; the file is closed when the with-block ends.
with open(file_path, "w") as json_file:
    json.dump(obj=merged_list, fp=json_file, indent=4)

The combined views from desktop and mobile are saved into a json file showing the start and end year and month. 

In [None]:
# Output file for the combined mobile + desktop records.
# NOTE(review): the '<' and '>' are kept literally in the file name; they are
# not valid file name characters on Windows - confirm this naming is intended.
file_path = "academy_monthly_cummulative_<start201501>-<end202309>.json"

# Write the list as indented JSON; the file is closed when the with-block ends.
with open(file_path, "w") as json_file:
    json.dump(obj=final_merged_list, fp=json_file, indent=4)

The json data with the combined views is converted into a dataframe for further EDA and visualisation

In [None]:
# One row per merged record; the dict keys of the records become the columns.
# Note that this rebinds `df`, which previously held the article dataset.
df = pd.DataFrame(final_merged_list)

I have extracted only mobile views and created a new column named views_mobile as it will be needed in future analysis

In [None]:
# Total mobile views per row: mobile-app views plus mobile-web views.
df['views_mobile'] = df['views_app'] + df['views_web']

In [None]:
# Preview the first rows to check the columns of the combined dataframe.
df.head()

Over here I have grouped by article to find monthly average total views for each article. 

In [None]:
# Monthly average views per article, computed separately for each access type.
# Each result uses its own views column, so the desktop and mobile rankings differ.
result_desktop = df.groupby('article')['views_desktop'].mean()
result_mobile = df.groupby('article')['views_mobile'].mean()

To analyze the article with the highest and lowest views I have first of all sorted the articles by their monthly average views and then selected the first and the last name as they are the ones with the lowest and highest views. 

In [None]:
# Sort ascending by monthly average, so the first name has the lowest average
# and the last name has the highest.
df_sorted_desktop = result_desktop.sort_values()
names_desktop = list(df_sorted_desktop.index)
min_views_desktop_name, max_views_desktop_name = names_desktop[0], names_desktop[-1]

# Same selection for the mobile averages.
df_sorted_mobile = result_mobile.sort_values()
names_mobile = list(df_sorted_mobile.index)
min_views_mobile_name, max_views_mobile_name = names_mobile[0], names_mobile[-1]

To visualize the trends I have created smaller dataframes with only views and timestamps where the article name is the same as required by us

In [None]:
# Views and timestamps of the articles with the highest / lowest desktop average.
desktop_columns = ['views_desktop', 'timestamp']
max_array_desktop = df.loc[df['article'] == max_views_desktop_name, desktop_columns]
min_array_desktop = df.loc[df['article'] == min_views_desktop_name, desktop_columns]

# Views and timestamps of the articles with the highest / lowest mobile average.
mobile_columns = ['views_mobile', 'timestamp']
max_array_mobile = df.loc[df['article'] == max_views_mobile_name, mobile_columns]
min_array_mobile = df.loc[df['article'] == min_views_mobile_name, mobile_columns]

Plotted a line graph and labelled it for proper understanding

In [None]:
# Monthly views of the articles with the highest and lowest average views,
# shown separately for desktop and mobile access.
plt.figure(figsize=(10, 6))

series_to_plot = [
    (max_array_desktop, 'views_desktop', f'max_views_desktop ({max_views_desktop_name})'),
    (min_array_desktop, 'views_desktop', f'min_views_desktop ({min_views_desktop_name})'),
    (max_array_mobile, 'views_mobile', f'max_views_mobile ({max_views_mobile_name})'),
    (min_array_mobile, 'views_mobile', f'min_views_mobile ({min_views_mobile_name})'),
]
for frame, views_column, series_label in series_to_plot:
    # Parse the YYYYMMDDHH strings into dates. Plotting the raw strings puts
    # them on a categorical axis, whose order depends on the order in which
    # values first appear rather than on time.
    dates = pd.to_datetime(frame['timestamp'], format='%Y%m%d%H')
    plt.plot(dates, frame[views_column], label=series_label)

plt.xticks(rotation=90)
# Show the view counts as plain numbers instead of scientific notation
plt.ticklabel_format(style='plain', axis='y')

# Add labels and a title
plt.xlabel('timestamp')
plt.ylabel('monthly views')
plt.title('Monthly views of the articles with the highest and lowest average views')
plt.legend()
plt.show()

Created a dataframe from json file of only mobile views

In [None]:
# Dataframe of the combined mobile records, one row per record in merged_list.
df_mobile = pd.DataFrame(merged_list)

In the following cells I have extracted the number of times each article is appearing in our dataset. Basically this tells us the number of months for which we have data for each article. I have picked up the lowest 10 articles in this case and did my analysis

In [None]:
# Number of rows per article, i.e. how many monthly records each article has.
mobile_month_counts = df_mobile['article'].value_counts()

In [None]:
# Show the entries at positions 90-100 of the counts.
# NOTE(review): presumably meant to show the articles with the fewest months;
# that only holds if there are about 101 articles - confirm the dataset size.
mobile_month_counts[90:101]

In [None]:
# Article titles in the order of the month counts.
mobile_article_names = list(mobile_month_counts.index)

Same steps are repeated for desktop views only

In [None]:
# Dataframe of the desktop records, one row per record in merged_desktop.
df_desktop = pd.DataFrame(merged_desktop)

In [None]:
# Number of rows per article, i.e. how many monthly records each article has.
desktop_month_counts = df_desktop['article'].value_counts()

In [None]:
# Show the entries at positions 90-100 of the counts.
# NOTE(review): presumably meant to show the articles with the fewest months;
# that only holds if there are about 101 articles - confirm the dataset size.
desktop_month_counts[90:101]

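Since `value_counts()` sorts the counts in descending order, the last ten entries are the articles with the fewest months of data (assuming the list holds 101 articles).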

Movies with lowest months of data available are extracted and their graphs are plotted

In [None]:
# Keep only the article titles at positions 91-100 of each count series.
# After this cell both names refer to plain lists of titles, not to the counts.
# NOTE(review): the slice is hard-coded; it selects the articles with the
# fewest months only if the counts hold about 101 articles - confirm.
desktop_month_counts = desktop_month_counts.index[91:101].tolist()
mobile_month_counts = mobile_month_counts.index[91:101].tolist()

In [None]:
# Monthly desktop and mobile views of the selected articles (those with the
# fewest months of available data).
import matplotlib.dates as mdates

plt.figure(figsize=(12, 10))
ax = plt.gca()
# Show the view counts as plain numbers instead of scientific notation
plt.ticklabel_format(style='plain', axis='y')

# zip() stops at the shorter list, so fewer than ten selected articles does
# not raise an IndexError.
for desktop_article, mobile_article in zip(desktop_month_counts, mobile_month_counts):
    temp = df.loc[df['article'] == desktop_article, ['views_desktop', 'timestamp']]
    temp2 = df.loc[df['article'] == mobile_article, ['views_mobile', 'timestamp']]
    sorted_temp = temp.sort_values(by='timestamp')
    sorted_temp2 = temp2.sort_values(by='timestamp')

    # Parse the YYYYMMDDHH strings into dates. These articles start in
    # different months, and a categorical string axis would order the months
    # by first appearance rather than chronologically.
    plt.plot(pd.to_datetime(sorted_temp['timestamp'], format='%Y%m%d%H'),
             sorted_temp['views_desktop'],
             label=f'{desktop_article} (desktop)')
    plt.plot(pd.to_datetime(sorted_temp2['timestamp'], format='%Y%m%d%H'),
             sorted_temp2['views_mobile'],
             label=f'{mobile_article} (mobile)')

# One tick every second month, labelled as year-month and rotated for readability.
# The tick settings are applied after plotting so they act on the final axis.
ax.xaxis.set_major_locator(mdates.MonthLocator(interval=2))
ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))
ax.tick_params(axis='x', labelrotation=90)

plt.xlabel('timestamp')
plt.ylabel('monthly views')
plt.title('Monthly views of the articles with the fewest months of data')
# The legend entries come from the label= given to each plotted line
plt.legend()
plt.show()