In [109]:
# 
# These are standard python modules
import json, time, urllib.parse
import pandas as pd
import numpy as np
#
# The 'requests' module is not a standard Python module. You will need to install this with pip/pip3 if you do not already have it
import requests

The example relies on some constants that help make the code a bit more readable.

In [3]:
#########
#
#    CONSTANTS
#
# Reading relevant files into dataframe
politician_df = pd.read_csv('/Users/kirsteenng/Desktop/UW/DATA 512/data-512-homework_2/politicians_by_country_SEPT_2022.csv')
population_df = pd.read_csv('/Users/kirsteenng/Desktop/UW/DATA 512/data-512-homework_2/population_by_country_2022.csv')

# The basic English Wikipedia API endpoint
API_ENWIKIPEDIA_ENDPOINT = "https://en.wikipedia.org/w/api.php"

# We'll assume that there needs to be some throttling for these requests - we should always be nice to a free data resource
API_LATENCY_ASSUMED = 0.002       # Assuming roughly 2ms latency on the API and network
API_THROTTLE_WAIT = (1.0/100.0)-API_LATENCY_ASSUMED

# When making automated requests we should include something that is unique to the person making the request
# This should include an email - your UW email would be good to put in there
REQUEST_HEADERS = {
    'User-Agent': '<uwnetid@uw.edu>, University of Washington, MSDS DATA 512 - AUTUMN 2022',
}

# This is just a list of English Wikipedia article titles that we can use for example requests
ARTICLE_TITLES = politician_df['name']

# This is a string of additional page properties that can be returned see the Info documentation for
# what can be included. If you don't want any this can simply be the empty string
PAGEINFO_EXTENDED_PROPERTIES = "talkid|url|watched|watchers"
#PAGEINFO_EXTENDED_PROPERTIES = ""

# This template lists the basic parameters for making this
PAGEINFO_PARAMS_TEMPLATE = {
    "action": "query",
    "format": "json",
    "titles": "",           # to simplify this should be a single page title at a time
    "prop": "info",
    "inprop": PAGEINFO_EXTENDED_PROPERTIES
}

# The current ORES API endpoint
API_ORES_SCORE_ENDPOINT = "https://ores.wikimedia.org/v3"
# A template for mapping to the URL
API_ORES_SCORE_PARAMS = "/scores/{context}/{revid}/{model}"

# This template lists the basic parameters for making an ORES request
ORES_PARAMS_TEMPLATE = {
    "context": "enwiki",        # which WMF project for the specified revid
    "revid" : "",               # the revision to be scored - this will probably change each call
    "model": "articlequality"   # the AI/ML scoring model to apply to the reviewion
}


The API request will be made using one procedure. The idea is to make this reusable. The procedure is parameterized, but relies on the constants above for the important parameters. The underlying assumption is that this will be used to request data for a set of article pages. Therefore the parameter most likely to change is the article_title.

In [4]:
#########
#
#    PROCEDURES/FUNCTIONS
#

def request_pageinfo_per_article(article_title = None, 
                                 endpoint_url = API_ENWIKIPEDIA_ENDPOINT, 
                                 request_template = PAGEINFO_PARAMS_TEMPLATE,
                                 headers = REQUEST_HEADERS):
    # Make sure we have an article title
    if not article_title: return None
    
    request_template['titles'] = article_title
        
    # make the request
    try:
        # we'll wait first, to make sure we don't exceed the limit in the situation where an exception
        # occurs during the request processing - throttling is always a good practice with a free
        # data source like Wikipedia - or any other community sources
        if API_THROTTLE_WAIT > 0.0:
            time.sleep(API_THROTTLE_WAIT)
        response = requests.get(endpoint_url, headers=headers, params=request_template)
        print(type(response))
        json_response = response.json()
    except Exception as e:
        print(e)
        json_response = None
    return json_response

#########
#
#    PROCEDURES/FUNCTIONS
#

def request_ores_score_per_article(article_revid = None, 
                                   endpoint_url = API_ORES_SCORE_ENDPOINT, 
                                   endpoint_params = API_ORES_SCORE_PARAMS, 
                                   request_template = ORES_PARAMS_TEMPLATE,
                                   headers = REQUEST_HEADERS,
                                   features=False):
    # Make sure we have an article revision id
    if not article_revid: return None
    
    # set the revision id into the template
    request_template['revid'] = article_revid
    
    # now, create a request URL by combining the endpoint_url with the parameters for the request
    request_url = endpoint_url+endpoint_params.format(**request_template)
    
    # the features used by the ML model can sometimes be returned as well as scores
    if features:
        request_url = request_url+"?features=true"
    
    # make the request
    try:
        # we'll wait first, to make sure we don't exceed the limit in the situation where an exception
        # occurs during the request processing - throttling is always a good practice with a free
        # data source like ORES - or other community sources
        if API_THROTTLE_WAIT > 0.0:
            time.sleep(API_THROTTLE_WAIT)
        response = requests.get(request_url, headers=headers)
        json_response = response.json()
    except Exception as e:
        print(e)
        json_response = None
    return json_response

### Consideration and data cleaning
Consideration 1: We first need to check for duplicates in both the politician and population file

In [88]:
print(len(population_df))
print(len(population_df.drop_duplicates()))

233
233


The above results show that there are not duplicates in population_df.

In [93]:
print('Length of political_df: ', len(politician_df))
print('Duplicated names: ', len(politician_df) - len(politician_df.drop_duplicates('name')))
print('Duplicated rows: ',  len(politician_df) - len(politician_df.drop_duplicates()))

Length of political_df:  7584
Duplicated names:  50
Duplicated rows:  50
Duplicated rows:  2


The above result shows that there are 50 duplicated names but there are only 2 completely identical rows. We can further investigate the duplicated names. 

In [92]:
politician_df[politician_df.duplicated(subset = 'name')]

Unnamed: 0,name,url,country
1566,Rudi Kolak,https://en.wikipedia.org/wiki/Rudi_Kolak,Croatia
1654,Count Wenzel Chotek of Chotkow and Wognin,https://en.wikipedia.org/wiki/Count_Wenzel_Cho...,Czechia
1669,Eduard Hedvicek,https://en.wikipedia.org/wiki/Eduard_Hedvicek,Czechia
1676,Konstantin Jireček,https://en.wikipedia.org/wiki/Konstantin_Jireček,Czechia
1680,Maximilian Ulrich von Kaunitz,https://en.wikipedia.org/wiki/Maximilian_Ulric...,Czechia
1711,"Leopold, Count von Thun und Hohenstein","https://en.wikipedia.org/wiki/Leopold,_Count_v...",Czechia
1914,Ibrahim Harun,https://en.wikipedia.org/wiki/Ibrahim_Harun,Ethiopia
2513,José Alejandro de Aycinena,https://en.wikipedia.org/wiki/José_Alejandro_d...,Guatemala
2659,José Francisco Barrundia,https://en.wikipedia.org/wiki/José_Francisco_B...,Honduras
3419,Luca Rovinalti,https://en.wikipedia.org/wiki/Luca_Rovinalti,Italy


These rows will not be dropped because I am interested to see if the country of origin impacts the rating of the article. 

Consideration 2: We will check if population_df and politician_df have the same countries.

In [114]:
diff_country = pd.DataFrame(set(population_df['Geography'].unique()) ^ set(politician_df['country'].unique()))


In [128]:
diff_country_updated = diff_country[~ diff_country[0].apply(lambda x: x.isupper())]
diff_country_updated.to_csv('wp_countries-no_match.txt'

Now we move on to collect the ratings for these articles. Before that, we will need to retrieve their revision ID first.

In [None]:

# return a title last revid panda dataframe
revid = []
length = len(ARTICLE_TITLES)
for i in range(0,length):
    info = request_pageinfo_per_article(ARTICLE_TITLES[i])
    try:
        info_dict = pd.DataFrame.from_dict(info['query']['pages']).loc['lastrevid'].values[0]
        revid.append(info_dict)
    except Exception as e:
        print(e)
        revid.append(0)

In [None]:
politician_df['revision_id'] = revid
politician_df.head()

In [None]:
# Return the rating
score_list = []
no_rating = []

for i in range(0,len(politician_df)):
    curr_revid = politician_df['revision_id'][i]
    score = request_ores_score_per_article(curr_revid)
    try:
        info_dict = score['enwiki']['scores'][str(curr_revid)]['articlequality']['score']['prediction']
        score_list.append(info_dict)
    except Exception as e:
        print(e)
        score_list.append('N/A')
        no_rating.append(politician_df['name'][i])


In [None]:
politician_df['article_quality'] = score_list
politician_df.head()

### Step 3: Combining the Datasets
Now that we have the rating prediction, we can merge it with the population dataset.

In [10]:
merged = pd.merge(politician_df, population_df, left_on= 'country', right_on = 'Geography', how = 'inner')
merged.head()

Unnamed: 0,name,url,country,Geography,Population (millions),Region
0,Shahjahan Noori,https://en.wikipedia.org/wiki/Shahjahan_Noori,Afghanistan,Afghanistan,41.1,SOUTH ASIA
1,Abdul Ghafar Lakanwal,https://en.wikipedia.org/wiki/Abdul_Ghafar_Lak...,Afghanistan,Afghanistan,41.1,SOUTH ASIA
2,Majah Ha Adrif,https://en.wikipedia.org/wiki/Majah_Ha_Adrif,Afghanistan,Afghanistan,41.1,SOUTH ASIA
3,Haroon al-Afghani,https://en.wikipedia.org/wiki/Haroon_al-Afghani,Afghanistan,Afghanistan,41.1,SOUTH ASIA
4,Tayyab Agha,https://en.wikipedia.org/wiki/Tayyab_Agha,Afghanistan,Afghanistan,41.1,SOUTH ASIA


In [36]:
len(politician_df)

7584

In [13]:
merged.to_csv('merged_politician_population.csv')

In [26]:
merged.columns = ['article_title','url_link','country','geography','population','region','revision_id','article_quality']
merged.head()

Unnamed: 0,article_title,url_link,country,geography,population,region,revision_id,article_quality
0,Shahjahan Noori,https://en.wikipedia.org/wiki/Shahjahan_Noori,Afghanistan,Afghanistan,41.1,SOUTH ASIA,1099689043,GA
1,Abdul Ghafar Lakanwal,https://en.wikipedia.org/wiki/Abdul_Ghafar_Lak...,Afghanistan,Afghanistan,41.1,SOUTH ASIA,943562276,Start
2,Majah Ha Adrif,https://en.wikipedia.org/wiki/Majah_Ha_Adrif,Afghanistan,Afghanistan,41.1,SOUTH ASIA,852404094,Start
3,Haroon al-Afghani,https://en.wikipedia.org/wiki/Haroon_al-Afghani,Afghanistan,Afghanistan,41.1,SOUTH ASIA,1095102390,B
4,Tayyab Agha,https://en.wikipedia.org/wiki/Tayyab_Agha,Afghanistan,Afghanistan,41.1,SOUTH ASIA,1104998382,Start


In [28]:
# Rearranging columns
copy = merged[['country','region','population','article_title','revision_id','article_quality']]
copy.to_csv('wp_politicians_by_country.csv')

In [153]:
#TODO: total articles per capita
group_by_total_articles = copy.groupby(['country','population']).agg('count').reset_index()


In [154]:
group_by_total_articles['article per capita'] = group_by_total_articles['article_title']/group_by_total_articles['population']
group_by_total_articles = group_by_total_articles.sort_values(by = 'article per capita',ascending= True)

### 5.1 Top 10 countries by coverage: The 10 countries with the highest total articles per capita (in descending order) .

In [155]:
top_10_coverage = group_by_total_articles.iloc[-10:].sort_values(by = 'article per capita',ascending= False)
top_10_coverage

Unnamed: 0,country,population,region,article_title,revision_id,article_quality,article per capita
115,Nauru,0.0,2,2,2,2,inf
95,Liechtenstein,0.0,2,2,2,2,inf
172,Tuvalu,0.0,11,11,11,11,inf
125,Palau,0.0,1,1,1,1,inf
139,San Marino,0.0,2,2,2,2,inf
108,Monaco,0.0,13,13,13,13,inf
5,Antigua and Barbuda,0.1,17,17,17,17,170.0
54,Federated States of Micronesia,0.1,13,13,13,13,130.0
3,Andorra,0.1,10,10,10,10,100.0
13,Barbados,0.3,28,28,28,28,93.333333


We can see here that there are countries that have zero population which makes the article per capita going towards infinity. For the sake of this analysis, we will remove the countries with 
zero population so that the analysis makes more sense.

In [158]:
group_by_total_articles = group_by_total_articles[group_by_total_articles['population'] > 0.0]
top_10_coverage = group_by_total_articles.iloc[-10:].sort_values(by = 'article per capita', ascending = False)
top_10_coverage

Unnamed: 0,country,population,region,article_title,revision_id,article_quality,article per capita
5,Antigua and Barbuda,0.1,17,17,17,17,170.0
54,Federated States of Micronesia,0.1,13,13,13,13,130.0
3,Andorra,0.1,10,10,10,10,100.0
13,Barbados,0.3,28,28,28,28,93.333333
104,Marshall Islands,0.1,9,9,9,9,90.0
110,Montenegro,0.6,36,36,36,36,60.0
143,Seychelles,0.1,6,6,6,6,60.0
97,Luxembourg,0.7,37,37,37,37,52.857143
18,Bhutan,0.8,41,41,41,41,51.25
64,Grenada,0.1,5,5,5,5,50.0


### 5.2 Bottom 10 countries by coverage: The 10 countries with the lowest total articles per capita (in ascending order) .

In [159]:
bottom_10_coverage = group_by_total_articles.iloc[:10]
bottom_10_coverage

Unnamed: 0,country,population,region,article_title,revision_id,article_quality,article per capita
32,China,1436.6,2,2,2,2,0.001392
106,Mexico,127.5,1,1,1,1,0.007843
140,Saudi Arabia,36.7,3,3,3,3,0.081744
134,Romania,19.0,2,2,2,2,0.105263
73,India,1417.2,179,179,179,179,0.126305
153,Sri Lanka,22.4,3,3,3,3,0.133929
48,Egypt,103.5,14,14,14,14,0.135266
53,Ethiopia,123.4,24,24,24,24,0.194489
161,Taiwan,23.2,5,5,5,5,0.215517
180,Vietnam,99.4,27,27,27,27,0.27163


### Studying countries with high quality articles. 
First lets label the articles based on the predicted ranking. GA or FA is considered High quality and label as 1 and others are Not High and labelled as 0.

In [186]:
quality_added = copy
quality_added = quality_added[quality_added['population'] > 0]
quality_added['High quality'] = 0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  quality_added['High quality'] = 0


In [187]:
quality_added.loc[copy['article_quality'] == 'GA', 'High quality'] = 1
quality_added.loc[copy['article_quality'] == 'FA', 'High quality'] = 1

### 5.3 Top 10 countries by high quality: The 10 countries with the highest high quality articles per capita (in descending order) .

In [188]:
group_by_quality = quality_added.groupby(['country','population'])['High quality'].agg('sum').reset_index()
group_by_quality.sort_values(by = 'High quality', ascending = True)
group_by_quality['hq article per capita'] = group_by_quality['High quality']/group_by_quality['population']

In [197]:
group_by_quality.sort_values(by = 'hq article per capita', ascending = True)
top_10_quality = group_by_quality[-11:].sort_values(by = 'hq article per capita', ascending = False)
top_10_quality

Unnamed: 0,country,population,High quality,hq article per capita
169,United Arab Emirates,9.4,4,0.425532
170,Uruguay,3.6,1,0.277778
168,Ukraine,41.0,5,0.121951
175,Yemen,33.7,2,0.059347
167,Uganda,47.2,1,0.021186
174,Vietnam,99.4,2,0.020121
171,Uzbekistan,35.6,0,0.0
172,Vanuatu,0.3,0,0.0
173,Venezuela,28.3,0,0.0
176,Zambia,20.0,0,0.0


### 5.4 Bottom 10 countries by high quality: The 10 countries with the lowest high quality articles per capita (in ascending order).

In [190]:
bottom_10_quality = group_by_quality[:10].sort_values(by = 'hq article per capita', ascending = True)
bottom_10_quality

Unnamed: 0,country,population,High quality,hq article per capita
2,Algeria,44.9,0,0.0
4,Angola,35.6,0,0.0
5,Antigua and Barbuda,0.1,0,0.0
6,Argentina,46.2,0,0.0
8,Austria,9.0,0,0.0
9,Azerbaijan,10.2,1,0.098039
0,Afghanistan,41.1,6,0.145985
7,Armenia,3.0,1,0.333333
1,Albania,2.8,6,2.142857
3,Andorra,0.1,2,20.0


### 5.5 Geographic regions by total coverage: A rank ordered list of geographic regions (in descending order) by total articles per capita.

In [194]:
geography = copy[copy['population'] > 0]
group_by_geo = geography.groupby(['region']).agg({'population':'sum',
                                                                'article_title': 'count'})
group_by_geo['total articles per capita'] = group_by_geo['article_title']/group_by_geo['population']
group_by_geo.sort_values(by = 'total articles per capita', ascending = False)

Unnamed: 0_level_0,population,article_title,total articles per capita
region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
OCEANIA,110.1,72,0.653951
NORTHERN EUROPE,1327.4,260,0.195872
CARIBBEAN,1239.5,201,0.162162
CENTRAL AMERICA,1728.3,193,0.11167
CENTRAL ASIA,1736.0,103,0.059332
SOUTHERN EUROPE,19179.3,872,0.045466
WESTERN ASIA,15552.3,685,0.044045
EASTERN AFRICA,18891.8,646,0.034195
NORTHERN AFRICA,7639.9,227,0.029712
MIDDLE AFRICA,7919.0,203,0.025635


### 5.6 Geographic regions by high quality coverage: Rank ordered list of geographic regions (in descending order) by high quality articles per capita.


In [195]:
group_by_geo_hq = geography.groupby(['region']).agg({'population':'sum',
                                                                'High quality': 'sum'})
group_by_geo_hq['hq articles per capita'] = group_by_geo_hq['High quality']/group_by_geo_hq['population']
group_by_geo_hq.sort_values(by = 'hq articles per capita', ascending = False)

Unnamed: 0_level_0,population,High quality,hq articles per capita
region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
OCEANIA,110.1,1,0.009083
CARIBBEAN,1239.5,8,0.006454
NORTHERN EUROPE,1327.4,8,0.006027
CENTRAL AMERICA,1728.3,10,0.005786
SOUTHERN EUROPE,19179.3,46,0.002398
WESTERN ASIA,15552.3,28,0.0018
CENTRAL ASIA,1736.0,3,0.001728
EASTERN EUROPE,37029.5,39,0.001053
EASTERN AFRICA,18891.8,15,0.000794
NORTHERN AFRICA,7639.9,6,0.000785
