In [1]:
# import libraries needed
import json
import requests

## Step 1: Getting the Article and Population Data

In this step, two data set are downloaded.

The Wikipedia politicians by country dataset is downloaded to page_data.csv
The population data is available in CSV format as WPDS_2020_data.csv. This dataset is drawn from the world population data sheet published by the Population Reference Bureau.

## Step 2: Cleaning the Data

Clean up the two data set we got in Step 1
- page_data.csv dataset contains some page names that start with the string "Template:". which should be removed
- WPDS_2020_data.csv contains some rows that provide cumulative regional population counts, 
rather than country-level counts. These rows are distinguished by having ALL CAPS values in the 'geography' field 
(e.g. AFRICA, OCEANIA), we will separate the country and sub-region level data into different table, while add a new column in country level data for sub-region name



In [2]:
import pandas as pd


In [3]:
page_data = pd.read_csv('page_data.csv')
page_data

Unnamed: 0,page,country,rev_id
0,Template:ZambiaProvincialMinisters,Zambia,235107991
1,Bir I of Kanem,Chad,355319463
2,Template:Zimbabwe-politician-stub,Zimbabwe,391862046
3,Template:Uganda-politician-stub,Uganda,391862070
4,Template:Namibia-politician-stub,Namibia,391862409
...,...,...,...
47192,Yahya Jammeh,Gambia,807482007
47193,Lucius Fairchild,United States,807483006
47194,Fahd of Saudi Arabia,Saudi Arabia,807483153
47195,Francis Fessenden,United States,807483270


In [4]:
page_data_clean = page_data.loc[(page_data['page'].apply(lambda r: (r.startswith('Template:') == False)))]
page_data_clean

Unnamed: 0,page,country,rev_id
1,Bir I of Kanem,Chad,355319463
10,Information Minister of the Palestinian Nation...,Palestinian Territory,393276188
12,Yos Por,Cambodia,393822005
23,Julius Gregr,Czech Republic,395521877
24,Edvard Gregr,Czech Republic,395526568
...,...,...,...
47192,Yahya Jammeh,Gambia,807482007
47193,Lucius Fairchild,United States,807483006
47194,Fahd of Saudi Arabia,Saudi Arabia,807483153
47195,Francis Fessenden,United States,807483270


In [5]:
wpds_data = pd.read_csv('WPDS_2020_data.csv')
wpds_data

Unnamed: 0,FIPS,Name,Type,TimeFrame,Data (M),Population
0,WORLD,WORLD,World,2019,7772.850,7772850000
1,AFRICA,AFRICA,Sub-Region,2019,1337.918,1337918000
2,NORTHERN AFRICA,NORTHERN AFRICA,Sub-Region,2019,244.344,244344000
3,DZ,Algeria,Country,2019,44.357,44357000
4,EG,Egypt,Country,2019,100.803,100803000
...,...,...,...,...,...,...
229,WS,Samoa,Country,2019,0.200,200000
230,SB,Solomon Islands,Country,2019,0.715,715000
231,TO,Tonga,Country,2019,0.099,99000
232,TV,Tuvalu,Country,2019,0.010,10000


In [6]:
wpds_data_clean = wpds_data[['Name', 'Type', 'Population']]
wpds_data_clean

Unnamed: 0,Name,Type,Population
0,WORLD,World,7772850000
1,AFRICA,Sub-Region,1337918000
2,NORTHERN AFRICA,Sub-Region,244344000
3,Algeria,Country,44357000
4,Egypt,Country,100803000
...,...,...,...
229,Samoa,Country,200000
230,Solomon Islands,Country,715000
231,Tonga,Country,99000
232,Tuvalu,Country,10000


In [7]:
wpds_data_clean_country = wpds_data_clean.loc[(wpds_data_clean['Type'].apply(lambda r: r =='Country'))]
wpds_data_clean_country

Unnamed: 0,Name,Type,Population
3,Algeria,Country,44357000
4,Egypt,Country,100803000
5,Libya,Country,6891000
6,Morocco,Country,35952000
7,Sudan,Country,43849000
...,...,...,...
229,Samoa,Country,200000
230,Solomon Islands,Country,715000
231,Tonga,Country,99000
232,Tuvalu,Country,10000


In [8]:
# Process WPDS data to get Sub-region to country map list
def get_country_region_map():
    country_region_map = {}
    region_name = None
    for index, row in wpds_data_clean.iterrows():
        t = row['Type']
        name = row['Name']
       
        # skip world 
        if (name == 'WORLD'):
            continue
        if (t == 'Sub-Region' and name.isupper()):
            if region_name is None or region_name != name:
                region_name = name
                continue
            
        country_region_map[name] = region_name
    
    return country_region_map


country_region_map = get_country_region_map()
country_region_map
        

{'Algeria': 'NORTHERN AFRICA',
 'Egypt': 'NORTHERN AFRICA',
 'Libya': 'NORTHERN AFRICA',
 'Morocco': 'NORTHERN AFRICA',
 'Sudan': 'NORTHERN AFRICA',
 'Tunisia': 'NORTHERN AFRICA',
 'Western Sahara': 'NORTHERN AFRICA',
 'Benin': 'WESTERN AFRICA',
 'Burkina Faso': 'WESTERN AFRICA',
 'Cape Verde': 'WESTERN AFRICA',
 "Cote d'Ivoire": 'WESTERN AFRICA',
 'Gambia': 'WESTERN AFRICA',
 'Ghana': 'WESTERN AFRICA',
 'Guinea': 'WESTERN AFRICA',
 'Guinea-Bissau': 'WESTERN AFRICA',
 'Liberia': 'WESTERN AFRICA',
 'Mali': 'WESTERN AFRICA',
 'Mauritania': 'WESTERN AFRICA',
 'Niger': 'WESTERN AFRICA',
 'Nigeria': 'WESTERN AFRICA',
 'Senegal': 'WESTERN AFRICA',
 'Sierra Leone': 'WESTERN AFRICA',
 'Togo': 'WESTERN AFRICA',
 'Burundi': 'EASTERN AFRICA',
 'Comoros': 'EASTERN AFRICA',
 'Djibouti': 'EASTERN AFRICA',
 'Eritrea': 'EASTERN AFRICA',
 'Ethiopia': 'EASTERN AFRICA',
 'Kenya': 'EASTERN AFRICA',
 'Madagascar': 'EASTERN AFRICA',
 'Malawi': 'EASTERN AFRICA',
 'Mauritius': 'EASTERN AFRICA',
 'Mayotte': 'E

In [9]:
wpds_data_clean_country_with_region = wpds_data_clean_country.copy()
wpds_data_clean_country_with_region['sub_region'] = wpds_data_clean_country_with_region['Name'].map(country_region_map)
wpds_data_clean_country_with_region

Unnamed: 0,Name,Type,Population,sub_region
3,Algeria,Country,44357000,NORTHERN AFRICA
4,Egypt,Country,100803000,NORTHERN AFRICA
5,Libya,Country,6891000,NORTHERN AFRICA
6,Morocco,Country,35952000,NORTHERN AFRICA
7,Sudan,Country,43849000,NORTHERN AFRICA
...,...,...,...,...
229,Samoa,Country,200000,OCEANIA
230,Solomon Islands,Country,715000,OCEANIA
231,Tonga,Country,99000,OCEANIA
232,Tuvalu,Country,10000,OCEANIA


## Step 3: Getting Article Quality Predictions

In this step, we will try to get the predicted quality category for each article in the Wikipedia dataset.
To support easy repro and avoid install the ORES client, we will use the API call to get the page quality prediction results

Pages that cannot get prediction result are saved in page_data_with_no_quality.csv file



In [10]:
# Use the batch API to get the page quality prediction to speed up
api_endpoint = "https://ores.wikimedia.org/v3/scores/enwiki?models=articlequality&revids={rev_ids}"


In [11]:
headers = {
    'User-Agent': 'https://github.com/IvyLinMS',
    'From': 'ivylin@uw.edu'
}

In [12]:
def api_call(rev_ids):
    call = requests.get(api_endpoint.format(rev_ids = rev_ids), headers=headers)
    response = call.json()
    return response



def get_page_quality_prediction():
    page_quality_results = {}
    
    # split list into even size chunks for speedy process
    def chunks(lst, n):
        for i in range(0, len(lst), n):
            yield lst[i:i + n]
    # API allow max 50 in a batch
    batch_size = 50
    for rev_id_chunks in chunks(page_data_clean['rev_id'], batch_size):
        rev_ids = '|'.join(str(rev_id) for rev_id in rev_id_chunks)
        print('.', end =' ')
        result = api_call(rev_ids)
        for key in result['enwiki']['scores']:
            if 'score' in result['enwiki']['scores'][key]['articlequality']:
                page_quality_results[key] = result['enwiki']['scores'][key]['articlequality']['score']['prediction']
            else:
                page_quality_results[key] = 'N/A'
    return page_quality_results
    


In [13]:
page_quality_results = get_page_quality_prediction()



. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 

In [14]:
page_data_with_quality = page_data_clean.copy()
page_data_with_quality['article_quality_est'] = page_data_with_quality['rev_id'].astype(str).map(page_quality_results)
page_data_with_quality

Unnamed: 0,page,country,rev_id,article_quality_est
1,Bir I of Kanem,Chad,355319463,Stub
10,Information Minister of the Palestinian Nation...,Palestinian Territory,393276188,Stub
12,Yos Por,Cambodia,393822005,Stub
23,Julius Gregr,Czech Republic,395521877,Stub
24,Edvard Gregr,Czech Republic,395526568,Stub
...,...,...,...,...
47192,Yahya Jammeh,Gambia,807482007,GA
47193,Lucius Fairchild,United States,807483006,C
47194,Fahd of Saudi Arabia,Saudi Arabia,807483153,GA
47195,Francis Fessenden,United States,807483270,C


In [15]:
page_data_with_quality_cleaned = page_data_with_quality.loc[page_data_with_quality['article_quality_est'].apply(lambda r: (r != 'N/A'))]
page_data_with_quality_cleaned

Unnamed: 0,page,country,rev_id,article_quality_est
1,Bir I of Kanem,Chad,355319463,Stub
10,Information Minister of the Palestinian Nation...,Palestinian Territory,393276188,Stub
12,Yos Por,Cambodia,393822005,Stub
23,Julius Gregr,Czech Republic,395521877,Stub
24,Edvard Gregr,Czech Republic,395526568,Stub
...,...,...,...,...
47191,Hal Bidlack,United States,807481636,C
47192,Yahya Jammeh,Gambia,807482007,GA
47193,Lucius Fairchild,United States,807483006,C
47194,Fahd of Saudi Arabia,Saudi Arabia,807483153,GA


In [16]:
# Pages that cannot get prediction result are saved in page_data_with_no_quality.csv file
page_data_with_no_quality = page_data_with_quality.loc[page_data_with_quality['article_quality_est'].apply(lambda r: (r == 'N/A'))]
page_data_with_no_quality.to_csv('page_data_with_no_quality.csv', index=False) 
page_data_with_no_quality

Unnamed: 0,page,country,rev_id,article_quality_est
126,List of politicians in Poland,Poland,516633096,
222,Tingtingru,Vanuatu,550682925,
330,Daud Arsala,Afghanistan,627547024,
359,Book:Two Political Biographies,India,636911471,
514,Dilaver Bey,Turkey,669987106,
...,...,...,...,...
46782,John Rose (Trotskyist),United Kingdom,807336308,
46862,Jalal Movaghar,Iran,807367030,
46863,Mohsen Movaghar,Iran,807367166,
47182,King Gutierrez,Philippines,807479587,


## Step 4: Combining the Datasets

In this step, we will merge the wikipedia data and population data together use the contry name as key,
 - Rows that do not have matching data, and output them to a CSV file called: wp_wpds_countries-no_match.csv
 - remaining data into a single CSV file called: wp_wpds_politicians_by_country.csv with columns country, article_name, revision_id, article_quality_est, population
 
 





In [17]:

merged_data = page_data_with_quality_cleaned.merge(wpds_data_clean_country_with_region,how='outer',left_on=['country'],right_on=['Name'])
merged_data

Unnamed: 0,page,country,rev_id,article_quality_est,Name,Type,Population,sub_region
0,Bir I of Kanem,Chad,355319463.0,Stub,Chad,Country,16877000.0,MIDDLE AFRICA
1,Abdullah II of Kanem,Chad,498683267.0,Stub,Chad,Country,16877000.0,MIDDLE AFRICA
2,Salmama II of Kanem,Chad,565745353.0,Stub,Chad,Country,16877000.0,MIDDLE AFRICA
3,Kuri I of Kanem,Chad,565745365.0,Stub,Chad,Country,16877000.0,MIDDLE AFRICA
4,Mohammed I of Kanem,Chad,565745375.0,Stub,Chad,Country,16877000.0,MIDDLE AFRICA
...,...,...,...,...,...,...,...,...
46446,,,,,French Polynesia,Country,280000.0,OCEANIA
46447,,,,,Guam,Country,175000.0,OCEANIA
46448,,,,,New Caledonia,Country,295000.0,OCEANIA
46449,,,,,Palau,Country,18000.0,OCEANIA


In [18]:
# get the unmatched data
unmatched_data = merged_data.loc[merged_data['country'].isna() | merged_data['Name'].isna()]
unmatched_data.to_csv('wp_wpds_countries-no_match.csv', index=False) 
unmatched_data

Unnamed: 0,page,country,rev_id,article_quality_est,Name,Type,Population,sub_region
488,Julius Gregr,Czech Republic,395521877.0,Stub,,,,
489,Edvard Gregr,Czech Republic,395526568.0,Stub,,,,
490,Miroslav Poche,Czech Republic,672862914.0,Stub,,,,
491,Vojtěch Mynář,Czech Republic,673008587.0,Stub,,,,
492,Jan Malypetr,Czech Republic,704424304.0,Stub,,,,
...,...,...,...,...,...,...,...,...
46446,,,,,French Polynesia,Country,280000.0,OCEANIA
46447,,,,,Guam,Country,175000.0,OCEANIA
46448,,,,,New Caledonia,Country,295000.0,OCEANIA
46449,,,,,Palau,Country,18000.0,OCEANIA


In [19]:
matched_data = merged_data.loc[(merged_data['country'].notna()) & (merged_data['Name'].notna())]
matched_data

Unnamed: 0,page,country,rev_id,article_quality_est,Name,Type,Population,sub_region
0,Bir I of Kanem,Chad,355319463.0,Stub,Chad,Country,16877000.0,MIDDLE AFRICA
1,Abdullah II of Kanem,Chad,498683267.0,Stub,Chad,Country,16877000.0,MIDDLE AFRICA
2,Salmama II of Kanem,Chad,565745353.0,Stub,Chad,Country,16877000.0,MIDDLE AFRICA
3,Kuri I of Kanem,Chad,565745365.0,Stub,Chad,Country,16877000.0,MIDDLE AFRICA
4,Mohammed I of Kanem,Chad,565745375.0,Stub,Chad,Country,16877000.0,MIDDLE AFRICA
...,...,...,...,...,...,...,...,...
46414,Rita Sinon,Seychelles,800323154.0,Stub,Seychelles,Country,98000.0,EASTERN AFRICA
46415,Sylvette Frichot,Seychelles,800323798.0,Stub,Seychelles,Country,98000.0,EASTERN AFRICA
46416,May De Silva,Seychelles,800969960.0,Start,Seychelles,Country,98000.0,EASTERN AFRICA
46417,Vincent Meriton,Seychelles,802051093.0,Stub,Seychelles,Country,98000.0,EASTERN AFRICA


In [20]:
matched_data_renamed = matched_data.rename(columns={'page': 'article_name', 'rev_id': 'revision_id', 'Population': 'population'})
matched_data_renamed

Unnamed: 0,article_name,country,revision_id,article_quality_est,Name,Type,population,sub_region
0,Bir I of Kanem,Chad,355319463.0,Stub,Chad,Country,16877000.0,MIDDLE AFRICA
1,Abdullah II of Kanem,Chad,498683267.0,Stub,Chad,Country,16877000.0,MIDDLE AFRICA
2,Salmama II of Kanem,Chad,565745353.0,Stub,Chad,Country,16877000.0,MIDDLE AFRICA
3,Kuri I of Kanem,Chad,565745365.0,Stub,Chad,Country,16877000.0,MIDDLE AFRICA
4,Mohammed I of Kanem,Chad,565745375.0,Stub,Chad,Country,16877000.0,MIDDLE AFRICA
...,...,...,...,...,...,...,...,...
46414,Rita Sinon,Seychelles,800323154.0,Stub,Seychelles,Country,98000.0,EASTERN AFRICA
46415,Sylvette Frichot,Seychelles,800323798.0,Stub,Seychelles,Country,98000.0,EASTERN AFRICA
46416,May De Silva,Seychelles,800969960.0,Start,Seychelles,Country,98000.0,EASTERN AFRICA
46417,Vincent Meriton,Seychelles,802051093.0,Stub,Seychelles,Country,98000.0,EASTERN AFRICA


In [21]:
# generate the data in required format
politicians_by_country_data = pd.concat( [matched_data_renamed['country'], 
                         matched_data_renamed['article_name'],
                         matched_data_renamed['revision_id'],
                         matched_data_renamed['article_quality_est'],
                         matched_data_renamed['population'],
                        ], axis=1)
politicians_by_country_data

Unnamed: 0,country,article_name,revision_id,article_quality_est,population
0,Chad,Bir I of Kanem,355319463.0,Stub,16877000.0
1,Chad,Abdullah II of Kanem,498683267.0,Stub,16877000.0
2,Chad,Salmama II of Kanem,565745353.0,Stub,16877000.0
3,Chad,Kuri I of Kanem,565745365.0,Stub,16877000.0
4,Chad,Mohammed I of Kanem,565745375.0,Stub,16877000.0
...,...,...,...,...,...
46414,Seychelles,Rita Sinon,800323154.0,Stub,98000.0
46415,Seychelles,Sylvette Frichot,800323798.0,Stub,98000.0
46416,Seychelles,May De Silva,800969960.0,Start,98000.0
46417,Seychelles,Vincent Meriton,802051093.0,Stub,98000.0


In [22]:
politicians_by_country_data.to_csv('wp_wpds_politicians_by_country.csv', index=False) 

## Step 5: Analysis

In this step, we will get the country and sub region level article per population and high quality article percetage 

In [23]:
# get aggregate data per country
country_article_count_temp = matched_data_renamed.groupby('country').agg({'article_name':'count'})
country_article_count = country_article_count_temp.rename(columns={'article_name': 'article_count'})
country_article_count

Unnamed: 0_level_0,article_count
country,Unnamed: 1_level_1
Afghanistan,319
Albania,456
Algeria,116
Andorra,34
Angola,106
...,...
Venezuela,130
Vietnam,187
Yemen,116
Zambia,25


In [24]:
country_population = matched_data_renamed.groupby('country').agg({'population':'mean'})
country_population

Unnamed: 0_level_0,population
country,Unnamed: 1_level_1
Afghanistan,38928000.0
Albania,2838000.0
Algeria,44357000.0
Andorra,82000.0
Angola,32522000.0
...,...
Venezuela,28645000.0
Vietnam,96209000.0
Yemen,29826000.0
Zambia,18384000.0


In [25]:
high_quality_article_data = matched_data_renamed.loc[(matched_data_renamed['article_quality_est'].apply(lambda r: r == 'FA' or r =='GA'))]
high_quality_article_data

Unnamed: 0,article_name,country,revision_id,article_quality_est,Name,Type,population,sub_region
82,Hissène Habré,Chad,803166806.0,GA,Chad,Country,16877000.0,MIDDLE AFRICA
199,Abdullah Rimawi,Palestinian Territory,788953220.0,GA,Palestinian Territory,Country,5008000.0,WESTERN ASIA
204,Khalida Jarrar,Palestinian Territory,791881528.0,GA,Palestinian Territory,Country,5008000.0,WESTERN ASIA
218,Ahmed Yassin,Palestinian Territory,797122322.0,GA,Palestinian Territory,Country,5008000.0,WESTERN ASIA
225,Marwan Barghouti,Palestinian Territory,798913975.0,GA,Palestinian Territory,Country,5008000.0,WESTERN ASIA
...,...,...,...,...,...,...,...,...
46021,Mohammad bin Salman,Saudi Arabia,807463170.0,GA,Saudi Arabia,Country,35041000.0,WESTERN ASIA
46022,Fahd of Saudi Arabia,Saudi Arabia,807483153.0,GA,Saudi Arabia,Country,35041000.0,WESTERN ASIA
46050,Jack Warner (football executive),Trinidad and Tobago,805253461.0,GA,Trinidad and Tobago,Country,1369000.0,CARIBBEAN
46121,Eugenia Charles,Dominica,802175384.0,GA,Dominica,Country,72000.0,CARIBBEAN


In [26]:
country_high_quality_article_count_temp = high_quality_article_data.groupby('country').agg({'article_name':'count'})
country_high_quality_article_count = country_high_quality_article_count_temp.rename(columns={'article_name': 'high_quality_article_count'})
country_high_quality_article_count

Unnamed: 0_level_0,high_quality_article_count
country,Unnamed: 1_level_1
Afghanistan,13
Albania,3
Algeria,2
Argentina,16
Armenia,5
...,...
Vanuatu,3
Venezuela,3
Vietnam,13
Yemen,3


In [27]:
# join these data
country_merged_aggregate_data_intermediate = country_article_count.merge(country_population,how='left',left_on=['country'],right_on=['country'])
country_merged_aggregate_data_intermediate


Unnamed: 0_level_0,article_count,population
country,Unnamed: 1_level_1,Unnamed: 2_level_1
Afghanistan,319,38928000.0
Albania,456,2838000.0
Algeria,116,44357000.0
Andorra,34,82000.0
Angola,106,32522000.0
...,...,...
Venezuela,130,28645000.0
Vietnam,187,96209000.0
Yemen,116,29826000.0
Zambia,25,18384000.0


In [28]:
country_merged_aggregate_data =  country_merged_aggregate_data_intermediate.merge(country_high_quality_article_count,how='left',left_on=['country'],right_on=['country'])
country_merged_aggregate_data

Unnamed: 0_level_0,article_count,population,high_quality_article_count
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Afghanistan,319,38928000.0,13.0
Albania,456,2838000.0,3.0
Algeria,116,44357000.0,2.0
Andorra,34,82000.0,
Angola,106,32522000.0,
...,...,...,...
Venezuela,130,28645000.0,3.0
Vietnam,187,96209000.0,13.0
Yemen,116,29826000.0,3.0
Zambia,25,18384000.0,


In [29]:
country_merged_aggregate_data.fillna(0, inplace=True)

In [30]:
# Calculate the percentage
country_merged_aggregate_data['articles_per_population'] = country_merged_aggregate_data['article_count'].astype('int64') / country_merged_aggregate_data['population'].astype('int64') * 100
country_merged_aggregate_data['high_quality_article_percentage'] = country_merged_aggregate_data['high_quality_article_count'].astype('int64') / country_merged_aggregate_data['article_count'].astype('int64') * 100

country_merged_aggregate_data

Unnamed: 0_level_0,article_count,population,high_quality_article_count,articles_per_population,high_quality_article_percentage
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Afghanistan,319,38928000.0,13.0,0.000819,4.075235
Albania,456,2838000.0,3.0,0.016068,0.657895
Algeria,116,44357000.0,2.0,0.000262,1.724138
Andorra,34,82000.0,0.0,0.041463,0.000000
Angola,106,32522000.0,0.0,0.000326,0.000000
...,...,...,...,...,...
Venezuela,130,28645000.0,3.0,0.000454,2.307692
Vietnam,187,96209000.0,13.0,0.000194,6.951872
Yemen,116,29826000.0,3.0,0.000389,2.586207
Zambia,25,18384000.0,0.0,0.000136,0.000000


In [31]:
# region level data
region_article_count_temp = matched_data_renamed.groupby('sub_region').agg({'article_name':'count'})
region_article_count = region_article_count_temp.rename(columns={'article_name': 'article_count'})
region_article_count

Unnamed: 0_level_0,article_count
sub_region,Unnamed: 1_level_1
CARIBBEAN,695
CENTRAL AMERICA,1543
CENTRAL ASIA,245
EAST ASIA,2473
EASTERN AFRICA,2502
EASTERN EUROPE,3732
MIDDLE AFRICA,665
NORTHERN AFRICA,899
NORTHERN AMERICA,1901
NORTHERN EUROPE,3763


In [32]:
region_high_quality_article_count_temp = high_quality_article_data.groupby('sub_region').agg({'article_name':'count'})
region_high_quality_article_count = region_high_quality_article_count_temp.rename(columns={'article_name': 'high_quality_article_count'})
region_high_quality_article_count

Unnamed: 0_level_0,high_quality_article_count
sub_region,Unnamed: 1_level_1
CARIBBEAN,13
CENTRAL AMERICA,23
CENTRAL ASIA,7
EAST ASIA,76
EASTERN AFRICA,35
EASTERN EUROPE,118
MIDDLE AFRICA,16
NORTHERN AFRICA,19
NORTHERN AMERICA,104
NORTHERN EUROPE,102


In [33]:
region_population_temp= wpds_data_clean.loc[(wpds_data_clean['Type'].apply(lambda r: r == 'Sub-Region'))]
region_population = region_population_temp.rename(columns={'Population': 'population', 'Name': 'sub_region'})
region_population

Unnamed: 0,sub_region,Type,population
1,AFRICA,Sub-Region,1337918000
2,NORTHERN AFRICA,Sub-Region,244344000
10,WESTERN AFRICA,Sub-Region,401115000
27,EASTERN AFRICA,Sub-Region,444970000
48,MIDDLE AFRICA,Sub-Region,179757000
58,SOUTHERN AFRICA,Sub-Region,67732000
64,NORTHERN AMERICA,Sub-Region,368193000
67,LATIN AMERICA AND THE CARIBBEAN,Sub-Region,651036000
68,CENTRAL AMERICA,Sub-Region,178611000
77,CARIBBEAN,Sub-Region,43233000


In [34]:
# join region level data
region_merged_aggregate_data_intermediate = region_article_count.merge(region_population,how='left',left_on=['sub_region'],right_on=['sub_region'])
region_merged_aggregate_data_intermediate

Unnamed: 0,sub_region,article_count,Type,population
0,CARIBBEAN,695,Sub-Region,43233000
1,CENTRAL AMERICA,1543,Sub-Region,178611000
2,CENTRAL ASIA,245,Sub-Region,74961000
3,EAST ASIA,2473,Sub-Region,1641063000
4,EASTERN AFRICA,2502,Sub-Region,444970000
5,EASTERN EUROPE,3732,Sub-Region,291902000
6,MIDDLE AFRICA,665,Sub-Region,179757000
7,NORTHERN AFRICA,899,Sub-Region,244344000
8,NORTHERN AMERICA,1901,Sub-Region,368193000
9,NORTHERN EUROPE,3763,Sub-Region,105990000


In [35]:
region_merged_aggregate_data = region_merged_aggregate_data_intermediate.merge(region_high_quality_article_count,how='left',left_on=['sub_region'],right_on=['sub_region'])
region_merged_aggregate_data

Unnamed: 0,sub_region,article_count,Type,population,high_quality_article_count
0,CARIBBEAN,695,Sub-Region,43233000,13
1,CENTRAL AMERICA,1543,Sub-Region,178611000,23
2,CENTRAL ASIA,245,Sub-Region,74961000,7
3,EAST ASIA,2473,Sub-Region,1641063000,76
4,EASTERN AFRICA,2502,Sub-Region,444970000,35
5,EASTERN EUROPE,3732,Sub-Region,291902000,118
6,MIDDLE AFRICA,665,Sub-Region,179757000,16
7,NORTHERN AFRICA,899,Sub-Region,244344000,19
8,NORTHERN AMERICA,1901,Sub-Region,368193000,104
9,NORTHERN EUROPE,3763,Sub-Region,105990000,102


In [36]:
# Calculate the percentage for region level data
region_merged_aggregate_data['articles_per_population'] = region_merged_aggregate_data['article_count'].astype('int64')  / region_merged_aggregate_data['population'].astype('int64') * 100
region_merged_aggregate_data['high_quality_article_percentage'] = region_merged_aggregate_data['high_quality_article_count'].astype('int64')/ region_merged_aggregate_data['article_count'].astype('int64') * 100

region_merged_aggregate_data

Unnamed: 0,sub_region,article_count,Type,population,high_quality_article_count,articles_per_population,high_quality_article_percentage
0,CARIBBEAN,695,Sub-Region,43233000,13,0.001608,1.870504
1,CENTRAL AMERICA,1543,Sub-Region,178611000,23,0.000864,1.490603
2,CENTRAL ASIA,245,Sub-Region,74961000,7,0.000327,2.857143
3,EAST ASIA,2473,Sub-Region,1641063000,76,0.000151,3.07319
4,EASTERN AFRICA,2502,Sub-Region,444970000,35,0.000562,1.398881
5,EASTERN EUROPE,3732,Sub-Region,291902000,118,0.001279,3.161844
6,MIDDLE AFRICA,665,Sub-Region,179757000,16,0.00037,2.406015
7,NORTHERN AFRICA,899,Sub-Region,244344000,19,0.000368,2.113459
8,NORTHERN AMERICA,1901,Sub-Region,368193000,104,0.000516,5.470805
9,NORTHERN EUROPE,3763,Sub-Region,105990000,102,0.00355,2.710603


## Step 6: Results

+ Top 10 countries by coverage: 10 highest-ranked countries in terms of number of politician articles as a proportion of country population
+ Bottom 10 countries by coverage: 10 lowest-ranked countries in terms of number of politician articles as a proportion of country population
+ Top 10 countries by relative quality: 10 highest-ranked countries in terms of the relative proportion of politician articles that are of GA and FA-quality
+ Bottom 10 countries by relative quality: 10 lowest-ranked countries in terms of the relative proportion of politician articles that are of GA and FA-quality
+ Geographic regions by coverage: Ranking of geographic regions (in descending order) in terms of the total count of politician articles from countries in each region as a proportion of total regional population
+ Geographic regions by coverage: Ranking of geographic regions (in descending order) in terms of the relative proportion of politician articles from countries in each region that are of GA and FA-quality


In [37]:
# Top 10 countries by coverage: 10 highest-ranked countries in terms of number of politician articles as a proportion of country population
country_merged_aggregate_data.sort_values('articles_per_population', ascending=False).head(10)


Unnamed: 0_level_0,article_count,population,high_quality_article_count,articles_per_population,high_quality_article_percentage
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Tuvalu,54,10000.0,4.0,0.54,7.407407
Nauru,52,11000.0,0.0,0.472727,0.0
San Marino,81,34000.0,0.0,0.238235,0.0
Monaco,40,38000.0,0.0,0.105263,0.0
Liechtenstein,28,39000.0,0.0,0.071795,0.0
Marshall Islands,37,57000.0,0.0,0.064912,0.0
Tonga,63,99000.0,0.0,0.063636,0.0
Iceland,201,368000.0,2.0,0.05462,0.995025
Andorra,34,82000.0,0.0,0.041463,0.0
Federated States of Micronesia,36,106000.0,0.0,0.033962,0.0


In [38]:
# Bottom 10 countries by coverage: 10 lowest-ranked countries in terms of number of politician articles as a proportion of country population

country_merged_aggregate_data.sort_values('articles_per_population', ascending=True).head(10)


Unnamed: 0_level_0,article_count,population,high_quality_article_count,articles_per_population,high_quality_article_percentage
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
India,968,1400100000.0,13.0,6.9e-05,1.342975
Indonesia,209,271739000.0,9.0,7.7e-05,4.30622
China,1129,1402385000.0,40.0,8.1e-05,3.542958
Uzbekistan,28,34174000.0,3.0,8.2e-05,10.714286
Ethiopia,101,114916000.0,2.0,8.8e-05,1.980198
Zambia,25,18384000.0,0.0,0.000136,0.0
"Korea, North",36,25779000.0,8.0,0.00014,22.222222
Thailand,112,66534000.0,3.0,0.000168,2.678571
Mozambique,58,31166000.0,0.0,0.000186,0.0
Bangladesh,317,169809000.0,3.0,0.000187,0.946372


In [39]:
# Top 10 countries by relative quality: 10 highest-ranked countries in terms of the relative proportion of politician articles that are of GA and FA-quality
country_merged_aggregate_data.sort_values('high_quality_article_percentage', ascending=False).head(10)



Unnamed: 0_level_0,article_count,population,high_quality_article_count,articles_per_population,high_quality_article_percentage
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
"Korea, North",36,25779000.0,8.0,0.00014,22.222222
Saudi Arabia,117,35041000.0,15.0,0.000334,12.820513
Romania,343,19241000.0,42.0,0.001783,12.244898
Central African Republic,66,4830000.0,8.0,0.001366,12.121212
Uzbekistan,28,34174000.0,3.0,8.2e-05,10.714286
Mauritania,48,4650000.0,5.0,0.001032,10.416667
Guatemala,83,18066000.0,7.0,0.000459,8.433735
Dominica,12,72000.0,1.0,0.016667,8.333333
Syria,128,19398000.0,10.0,0.00066,7.8125
Benin,91,12209000.0,7.0,0.000745,7.692308


In [40]:
# Bottom 10 countries by relative quality: 10 lowest-ranked countries in terms of the relative proportion of politician articles that are of GA and FA-quality
country_merged_aggregate_data.sort_values('high_quality_article_percentage', ascending=True).head(10)


Unnamed: 0_level_0,article_count,population,high_quality_article_count,articles_per_population,high_quality_article_percentage
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Solomon Islands,97,715000.0,0.0,0.013566,0.0
Tonga,63,99000.0,0.0,0.063636,0.0
Nauru,52,11000.0,0.0,0.472727,0.0
Namibia,162,2541000.0,0.0,0.006375,0.0
Djibouti,37,988000.0,0.0,0.003745,0.0
Mozambique,58,31166000.0,0.0,0.000186,0.0
Monaco,40,38000.0,0.0,0.105263,0.0
Eritrea,16,3546000.0,0.0,0.000451,0.0
Estonia,148,1331000.0,0.0,0.011119,0.0
Moldova,421,3535000.0,0.0,0.011909,0.0


In [41]:
# Geographic regions by coverage: Ranking of geographic regions (in descending order) in terms of the total count of politician articles from countries in each region as a proportion of total regional population

region_merged_aggregate_data.sort_values('articles_per_population', ascending=False)



Unnamed: 0,sub_region,article_count,Type,population,high_quality_article_count,articles_per_population,high_quality_article_percentage
10,OCEANIA,3126,Sub-Region,43155000,63,0.007244,2.015355
9,NORTHERN EUROPE,3763,Sub-Region,105990000,102,0.00355,2.710603
15,SOUTHERN EUROPE,3710,Sub-Region,153251000,74,0.002421,1.994609
18,WESTERN EUROPE,4560,Sub-Region,195479000,56,0.002333,1.22807
0,CARIBBEAN,695,Sub-Region,43233000,13,0.001608,1.870504
5,EASTERN EUROPE,3732,Sub-Region,291902000,118,0.001279,3.161844
14,SOUTHERN AFRICA,634,Sub-Region,67732000,9,0.000936,1.419558
17,WESTERN ASIA,2563,Sub-Region,280927000,89,0.000912,3.472493
1,CENTRAL AMERICA,1543,Sub-Region,178611000,23,0.000864,1.490603
11,SOUTH AMERICA,3032,Sub-Region,429191000,40,0.000706,1.319261


In [42]:
# Geographic regions by coverage: Ranking of geographic regions (in descending order) in terms of the relative proportion of politician articles from countries in each region that are of GA and FA-quality
region_merged_aggregate_data.sort_values('high_quality_article_percentage', ascending=False)

Unnamed: 0,sub_region,article_count,Type,population,high_quality_article_count,articles_per_population,high_quality_article_percentage
8,NORTHERN AMERICA,1901,Sub-Region,368193000,104,0.000516,5.470805
13,SOUTHEAST ASIA,2020,Sub-Region,661845000,73,0.000305,3.613861
17,WESTERN ASIA,2563,Sub-Region,280927000,89,0.000912,3.472493
5,EASTERN EUROPE,3732,Sub-Region,291902000,118,0.001279,3.161844
3,EAST ASIA,2473,Sub-Region,1641063000,76,0.000151,3.07319
2,CENTRAL ASIA,245,Sub-Region,74961000,7,0.000327,2.857143
9,NORTHERN EUROPE,3763,Sub-Region,105990000,102,0.00355,2.710603
6,MIDDLE AFRICA,665,Sub-Region,179757000,16,0.00037,2.406015
7,NORTHERN AFRICA,899,Sub-Region,244344000,19,0.000368,2.113459
10,OCEANIA,3126,Sub-Region,43155000,63,0.007244,2.015355
