In [1]:
import os
import warnings
import zipfile

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from newsapi import NewsApiClient

# Show every column when a wide DataFrame is rendered.
pd.options.display.max_columns = None
# Silence all warnings for the rest of the notebook.
# NOTE(review): this also hides pandas warnings such as SettingWithCopyWarning.
warnings.filterwarnings(action="ignore")

# Newsapi

In [2]:
# SECURITY: the API key must not be stored in the notebook. It is read from the
# NEWSAPI_KEY environment variable instead. The key that was previously committed
# here should be treated as leaked and revoked.
_newsapi_key = os.environ.get('NEWSAPI_KEY')
if not _newsapi_key:
    raise RuntimeError(
        "NEWSAPI_KEY is not set: export your NewsAPI key as an environment "
        "variable before running this notebook."
    )

newsapi = NewsApiClient(api_key=_newsapi_key)

In [3]:
# Top headlines for the "bitcoin" query, restricted to US business news in English.
headlines = newsapi.get_top_headlines(
    country='us',
    language='en',
    category='business',
    q='bitcoin',
)

In [4]:
# Peek at the title of the first article in the response.
headlines['articles'][0]['title']

'Stocks edge higher as inflation, jobs data looms: Stock market news today | August 28, 2023 - Yahoo Finance'

# Gender Gap Indexes - PDF scraping

In [5]:
import tabula as tb
import re

In [6]:
# Location (relative to the notebook) of the 2023 Gender Gap Index PDF that is scraped below.
file = "raw_data/ggi_2023.pdf"

In [7]:
# Raw per-country data for each dimension: keep the first table that tabula
# detects on page 17 (-> data1) and on page 18 (-> data2), then take a look.
data1, data2 = (tb.read_pdf(file, pages=page)[0] for page in ('17', '18'))

data1

Unnamed: 0,Rank,Country,Score (0–1),Rank.1,Country.1,Score (0–1).1,Rank.2,Country.2,Score (0–1).2,Rank.3,Country.3,Score (0–1).3
0,1,Liberia,0.895,74,Austria,0.692,1,Argentina,1.000,74,Vanuatu,0.991
1,2,Jamaica,0.894,75,Israel,0.688,1,Belgium,1.000,75,Belarus,0.991
2,3,"Moldova, Republic of",0.863,76,Paraguay,0.685,1,Botswana,1.000,76,Portugal,0.991
3,4,Barbados,0.860,77,Netherlands,0.684,1,Canada,1.000,77,Zimbabwe,0.991
4,5,Lao PDR,0.851,78,Sierra Leone,0.684,1,Colombia,1.000,78,Australia,0.991
...,...,...,...,...,...,...,...,...,...,...,...,...
68,69,Serbia,0.697,142,India,0.367,69,El Salvador,0.993,142,Angola,0.738
69,70,Cameroon,0.694,143,Pakistan,0.362,70,Suriname,0.993,143,Guinea,0.710
70,71,Kyrgyzstan,0.694,144,Iran (Islamic Republic of),0.344,71,Mauritius,0.993,144,"Congo, Dem. Rep. of the",0.683
71,72,Romania,0.693,145,Algeria,0.317,72,Singapore,0.993,145,Chad,0.637


In [8]:
# Pretty clean. Each page holds two dimensions, and each dimension is printed as two
# side-by-side (rank, country, score) halves. We relabel the twelve columns and stack
# the halves so that we end up with one dataframe per dimension.

_dimension_columns = ['rank', 'country', 'score']

data1.columns = _dimension_columns * 4
data2.columns = _dimension_columns * 4


def _stack_halves(page, start):
    """Stack the two 3-column halves of one dimension that begin at column `start`.

    Also normalises the footnoted name 'Croatia*' to 'Croatia'.
    """
    left_half = page.iloc[:, start:start + 3]
    right_half = page.iloc[:, start + 3:start + 6]
    stacked = pd.concat([left_half, right_half], axis=0, ignore_index=True)
    return stacked.replace('Croatia*', 'Croatia')


economic_part_and_opp = _stack_halves(data1, 0)
educational_attainment = _stack_halves(data1, 6)
health_survival = _stack_halves(data2, 0)
political_empowerment = _stack_halves(data2, 6)

health_survival

Unnamed: 0,rank,country,score
0,1,Belarus,0.980
1,1,Belize,0.980
2,1,Botswana,0.980
3,1,Brazil,0.980
4,1,Cabo Verde,0.980
...,...,...,...
141,142,India,0.950
142,143,Qatar,0.947
143,144,Viet Nam,0.946
144,145,China,0.937


In [9]:
# Now we go for the global indexes of individual countries.
# These are several tables in the same pages, so we would have to get them one by one with area definitions...
# Instead, we will get the numbers from the other tables as the global score is just the average of the
# scores of the other dimensions.

# One country -> score lookup per dimension, built once, so that scoring a country costs
# four index lookups instead of four full-table scans. drop_duplicates keeps the first
# row per country, i.e. the same row a `.iloc[0]` on the filtered table would select.
_score_lookups = [
    table.drop_duplicates(subset='country').set_index('country')['score']
    for table in (economic_part_and_opp, educational_attainment,
                  health_survival, political_empowerment)
]


def av_score(country):
    """Return the unweighted mean of the four dimension scores of `country`.

    The scores are added in the order economic, educational, health, political
    and the total is divided by 4.

    Raises:
        KeyError: if `country` is absent from any of the four dimension tables.
    """
    try:
        return sum(lookup.at[country] for lookup in _score_lookups) / 4
    except KeyError:
        raise KeyError(f"{country!r} is missing from at least one dimension table") from None


average_gap = economic_part_and_opp[['country']].copy()

# Placeholder so that 'rank' sits before 'score' in the column order
# (country, rank, score); the real values are filled in after sorting.
average_gap['rank'] = 0

average_gap['score'] = average_gap['country'].apply(av_score)

# Best score first, then number the rows 1..n to obtain the overall rank.
average_gap = average_gap.sort_values(by = 'score', ascending = False)
average_gap['rank'] = list(range(1, len(average_gap) + 1))

average_gap

Unnamed: 0,country,rank,score
13,Iceland,1,0.91225
10,Norway,2,0.87875
19,Finland,3,0.86325
41,New Zealand,4,0.85575
14,Sweden,5,0.81525
...,...,...,...
142,Pakistan,142,0.57500
143,Iran (Islamic Republic of),143,0.57475
144,Algeria,144,0.57275
125,Chad,145,0.57050


In [10]:
# Got it! Now let's join all dataframes into a single one. First we sort them alphabetically
# (descending) and reset the index so that row i refers to the same country in every table.

def _by_country_desc(table):
    """Return `table` sorted by country name (descending) with a fresh 0..n-1 index."""
    return table.sort_values(by = 'country', ascending = False).reset_index(drop = True)


average_gap = _by_country_desc(average_gap)
economic_part_and_opp = _by_country_desc(economic_part_and_opp)
health_survival = _by_country_desc(health_survival)
educational_attainment = _by_country_desc(educational_attainment)
political_empowerment = _by_country_desc(political_empowerment)

# The tables are glued side by side purely by row position, so make sure every table
# lists exactly the same countries in the same order before trusting the result.
for _label, _table in [('economic_part_and_opp', economic_part_and_opp),
                       ('political_empowerment', political_empowerment),
                       ('educational_attainment', educational_attainment),
                       ('health_survival', health_survival)]:
    if not _table['country'].equals(average_gap['country']):
        raise ValueError(
            f"The country column of {_label} does not line up with average_gap; "
            "joining by position would mix up the scores of different countries."
        )

# Alignment is verified, so only the country column of average_gap is kept.
countries_gap = pd.concat(
    [average_gap]
    + [table.drop(columns = 'country')
       for table in (economic_part_and_opp, political_empowerment,
                     educational_attainment, health_survival)],
    axis = 1,
)

# We update the column names (each dimension table contributes its rank and score):
countries_gap.columns = ['country', 'rank_av', 'score_av',
                         'rank_ec', 'score_ec',
                         'rank_pol', 'score_pol',
                         'rank_ed', 'score_ed',
                         'rank_h', 'score_h']

# We create a country id column (placeholder, filled in after sorting):
countries_gap['country_id'] = 0

# We reorder columns and sort by score_av:
countries_gap = countries_gap.reindex(columns = ['country_id', 'country',
                                                 'rank_av', 'rank_ec', 'rank_pol', 'rank_ed', 'rank_h',
                                                 'score_av', 'score_ec', 'score_pol', 'score_ed', 'score_h'])

countries_ggi = countries_gap.sort_values(by = 'score_av', ascending = False)

# The country id is the position in the score_av ordering (1 = best overall score).
countries_ggi['country_id'] = list(range(1, len(countries_ggi) + 1))

countries_ggi

Unnamed: 0,country_id,country,rank_av,rank_ec,rank_pol,rank_ed,rank_h,score_av,score_ec,score_pol,score_ed,score_h
85,1,Iceland,1,14,1,79,128,0.91225,0.796,0.901,0.991,0.961
41,2,Norway,2,11,2,84,127,0.87875,0.800,0.765,0.989,0.961
96,3,Finland,3,20,4,1,71,0.86325,0.783,0.700,1.000,0.970
46,4,New Zealand,4,42,3,1,101,0.85575,0.732,0.725,1.000,0.966
18,5,Sweden,5,15,11,1,118,0.81525,0.795,0.503,1.000,0.963
...,...,...,...,...,...,...,...,...,...,...,...,...
39,142,Pakistan,142,143,95,138,132,0.57500,0.362,0.152,0.825,0.961
82,143,Iran (Islamic Republic of),143,144,143,112,116,0.57475,0.344,0.031,0.960,0.964
143,144,Algeria,144,145,135,116,137,0.57275,0.317,0.065,0.951,0.958
116,145,Chad,145,126,105,145,72,0.57050,0.538,0.137,0.637,0.970


In [11]:
# Now global data by region: keep the first table tabula detects on page 20.
# As the output below shows, the extraction is messy (garbled header, empty
# trailing columns), so it is cleaned up in the next cell.

data3 = tb.read_pdf(file, pages = '20')[0]

data3

Unnamed: 0.1,"Eurasia and Central Asia\rEast Asia and the Pacific\rEurope\rLatin America and the Caribbean\rMiddle East and North Africa\rNorth America\rSouthern Asia\rSub-Saharan Africa\rGlobal average\rSource\rWorld Economic Forum, Globa","Subindexes\rOverall IndexEconomic ParticipationEducationalHealthPolitical\rand OpportunityAttainmentand SurvivalEmpowerment\r69.0%68.8%98.9%97.4%10.9%\r68.8%71.0%95.5%94.9%14.0%\r76.3%69.7%99.6%97.0%39.1%\r74.3%65.2%99.2%97.6%35.0%\r62.6%44.0%95.9%96.4%\r14.0%\r75.0%77.6%99.5%96.9%\r26.1%\r63.4%37.2%96.0%95.3%25.1%\r68.2%67.2%86.0%97.2%22.6%\r68.4%60.1%95.2%96.0%22.1%\rParity\r0%50%100%\rNote\rGender Gap Index, 2023.\rPopulation-weighted averages for the 146 economies featured in the Global Gender Gap Index\r2023. The percentages are indicative of the gender gap that has been closed.",Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8
0,,69.0%,68.8%,98.9%,97.4%,10.9%,,,,,
1,,68.8%,71.0%,95.5%,94.9%,14.0%,,,,,
2,,76.3%,69.7%,99.6%,97.0%,39.1%,,,,,
3,,74.3%,65.2%,99.2%,97.6%,35.0%,,,,,
4,,62.6%,44.0%,95.9%,96.4%,14.0%,,,,,
5,,75.0%,77.6%,99.5%,96.9%,26.1%,,,,,
6,,63.4%,37.2%,96.0%,95.3%,25.1%,,,,,
7,,68.2%,67.2%,86.0%,97.2%,22.6%,,,,,
8,68.4%,,,,,,,,,,
9,68.4%,60.1%,95.2%,96.0%,22.1%,,,,,,


In [12]:
# It's dirtier. Let's perform some transformations to clean it up.
# NOTE: this cell rebinds `data3` to the cleaned table, so it has to run exactly once
# right after the cell that reads page 20; re-running it on the cleaned frame fails.

# Replace the garbled header with positional labels 0..n-1.
data3.columns = range(data3.shape[1])

# Drop the trailing columns 6-10 (empty in the extraction) and rows 8 and 10.
data3 = data3.drop(columns = range(6,11), index = [8,10])

# Shift row 9 one column to the right so it lines up with the other rows
# (presumably the global-average row, which was extracted starting in column 0).
# NOTE(review): row 9 is dropped a few lines below, so this shift does not affect the result.
data3.loc[9,:] = [0, *[i for i in data3.loc[9,range(0,5)]]]

data3 = data3.drop(columns = 0)
data3 = data3.drop(index = 9)

data3.columns = ['overall_index','economic_part_and_opp','educational_attainment',
                 'health_survival','political_empowerment']

# Region labels, in the order of the eight remaining rows.
data3['region'] = ['eurasia_central_asia','east_asia_pacific','europe',
               'latinamerica_caribbean','middle_east_north_africa',
               'north_america','southern_asia','sub_saharan_africa']

regions_ggi = data3.reset_index(drop = True)
regions_ggi = regions_ggi.reindex(columns = ['region',*data3.columns[:-1]])

# Change of units to match the other table:

def to_float(x):
    """Convert a percentage string such as '69.0%' into a fraction (0.69)."""
    return float(x.replace('%','').strip())/100

# Assign by column label: this replaces each column, so it ends up as float64.
# Writing through `.iloc[:, i] = ...` sets the values in place and leaves the
# columns with object dtype.
for column in regions_ggi.columns[1:]:
    regions_ggi[column] = regions_ggi[column].apply(to_float)


# Sort by overall_index:

regions_ggi = regions_ggi.sort_values(by = 'overall_index', ascending = False).reset_index(drop = True)


# Create a region id (1 = highest overall_index):

regions_ggi['region_id'] = list(range(1, len(regions_ggi) + 1))

# Reorder columns so that region_id comes first:

regions_ggi = regions_ggi.reindex(columns = ['region_id',*regions_ggi.columns[:-1]])

regions_ggi

Unnamed: 0,region_id,region,overall_index,economic_part_and_opp,educational_attainment,health_survival,political_empowerment
0,1,europe,0.763,0.697,0.996,0.97,0.391
1,2,north_america,0.75,0.776,0.995,0.969,0.261
2,3,latinamerica_caribbean,0.743,0.652,0.992,0.976,0.35
3,4,eurasia_central_asia,0.69,0.688,0.989,0.974,0.109
4,5,east_asia_pacific,0.688,0.71,0.955,0.949,0.14
5,6,sub_saharan_africa,0.682,0.672,0.86,0.972,0.226
6,7,southern_asia,0.634,0.372,0.96,0.953,0.251
7,8,middle_east_north_africa,0.626,0.44,0.959,0.964,0.14


In [13]:
# `data3` now refers to the cleaned regional table (percent strings plus the region label),
# not to the raw extraction shown earlier.
data3

Unnamed: 0,overall_index,economic_part_and_opp,educational_attainment,health_survival,political_empowerment,region
0,69.0%,68.8%,98.9%,97.4%,10.9%,eurasia_central_asia
1,68.8%,71.0%,95.5%,94.9%,14.0%,east_asia_pacific
2,76.3%,69.7%,99.6%,97.0%,39.1%,europe
3,74.3%,65.2%,99.2%,97.6%,35.0%,latinamerica_caribbean
4,62.6%,44.0%,95.9%,96.4%,14.0%,middle_east_north_africa
5,75.0%,77.6%,99.5%,96.9%,26.1%,north_america
6,63.4%,37.2%,96.0%,95.3%,25.1%,southern_asia
7,68.2%,67.2%,86.0%,97.2%,22.6%,sub_saharan_africa


In [14]:
# Now we need an intermediate table that assigns each individual country to its region.
# To build it, we scrape some more tables.

In [15]:
# Eurasia and Central Asia: crop the region's table out of page 25.
# `area` is [top, left, bottom, right]; relative_area=True makes these percentages of the page.
data4 = tb.read_pdf(file, pages='25', relative_area=True, area=[40, 20, 63, 36])[0]

# Second column, skipping the first two rows, renumbered from 0.
country_column = data4.iloc[:, 1]
eurasia_central_asia = country_column.iloc[2:].reset_index(drop=True)

eurasia_central_asia.head()

0    Moldova, Republic of
1                 Belarus
2                 Armenia
3              Kazakhstan
4                 Ukraine
Name: The Global Ge, dtype: object

In [16]:
# Europe: crop the region's table out of page 25.
# `area` is [top, left, bottom, right]; relative_area=True makes these percentages of the page.
data5 = tb.read_pdf(file, pages='25', relative_area=True, area=[40, 58, 95, 75])[0]

# Second column, skipping the first row, renumbered from 0.
country_column = data5.iloc[:, 1]
europe = country_column.iloc[1:].reset_index(drop=True)

europe.head()

0    Iceland
1     Norway
2    Finland
3     Sweden
4    Germany
Name: Unnamed: 1, dtype: object

In [17]:
# East Asia and the Pacific: crop the region's table out of page 25.
# `area` is [top, left, bottom, right]; relative_area=True makes these percentages of the page.
data6 = tb.read_pdf(file, pages='25', relative_area=True, area=[65, 20, 100, 40])[0]

# Second column, skipping the first row, renumbered from 0.
country_column = data6.iloc[:, 1]
east_asia_pacific = country_column.iloc[1:].reset_index(drop=True)

# The last entry is overwritten by hand
# (presumably it was not extracted correctly -- TODO confirm against the PDF).
east_asia_pacific.iloc[-1] = 'Japan'

east_asia_pacific.head()

0    New Zealand
1    Philippines
2      Australia
3      Singapore
4        Lao PDR
Name: Unnamed: 1, dtype: object

In [18]:
# Latin America and the Caribbean: crop the region's table out of page 26.
# `area` is [top, left, bottom, right]; relative_area=True makes these percentages of the page.
data7 = tb.read_pdf(file, pages='26', relative_area=True, area=[0, 20, 40, 36])[0]

# Second column, skipping the first two rows, renumbered from 0.
country_column = data7.iloc[:, 1]
latinamerica_caribbean = country_column.iloc[2:].reset_index(drop=True)

latinamerica_caribbean.head()

0     Nicaragua
1    Costa Rica
2       Jamaica
3         Chile
4      Barbados
Name: The Global Ge, dtype: object

In [19]:
# Southern Asia: crop the region's table out of page 26.
# `area` is [top, left, bottom, right]; relative_area=True makes these percentages of the page.
data8 = tb.read_pdf(file, pages='26', relative_area=True, area=[0, 56, 25, 75])[0]

# Here the names are in the FIRST column; skip the first two rows and renumber from 0.
country_column = data8.iloc[:, 0]
southern_asia = country_column.iloc[2:].reset_index(drop=True)

southern_asia.head()

0    Bangladesh
1        Bhutan
2     Sri Lanka
3         Nepal
4      Maldives
Name: region, 2023, dtype: object

In [20]:
# Middle East and North Africa: this crop of page 26 spans the full page height, so it
# also contains the other regions printed in the same strip (see the Series name in the output).
# `area` is [top, left, bottom, right]; relative_area=True makes these percentages of the page.
data9 = tb.read_pdf(file, pages='26', relative_area=True, area=[0, 20, 100, 36])[0]

# Rows 21-33 of the second column are the ones belonging to this region.
country_column = data9.iloc[:, 1]
middle_east_north_africa = country_column.iloc[21:34].reset_index(drop=True)

# The last entry is overwritten by hand
# (presumably it was not extracted correctly -- TODO confirm against the PDF).
middle_east_north_africa.iloc[-1] = 'Algeria'

middle_east_north_africa.head()

0    United Arab Emirates
1                  Israel
2                 Bahrain
3                  Kuwait
4                  Jordan
Name: Latin America an\rCountry\rNicaragua\rCosta Rica\rJamaica\rChile\rBarbados\rMexico\rPeru\rArgentina\rColombia\rEcuador\rSuriname\rHonduras\rBolivia\rBrazil\rPanama\rUruguay\rEl Salvador\rDominican Republic\rBelize\rParaguay\rGuatemala\rMiddle East and\rCountry\rUnited Arab Emirates\rIsrael\rBahrain\rKuwait\rJordan\rTunisia\rSaudi Arabia\rLebanon\rQatar\rEgypt\rMorocco\rOman\rAlgeria\rNorth America\rCountry\rCanada\rUnited States of Amer\rGender Gap Index, 20, dtype: object

In [21]:
# North America has only two members, so they are typed in by hand instead of scraped.
# name=0 mirrors the label a single-column DataFrame would give this Series.
north_america = pd.Series(['Canada', 'United States of America'], name=0)

north_america

0                      Canada
1    United States of America
Name: 0, dtype: object

In [22]:
# Sub-Saharan Africa: crop the region's table out of page 26.
# `area` is [top, left, bottom, right]; relative_area=True makes these percentages of the page.
data10 = tb.read_pdf(file, pages='26', relative_area=True, area=[20, 56, 100, 75])[0]

# Rows 4-39 of the second column are the ones belonging to this region.
country_column = data10.iloc[:, 1]
sub_saharan_africa = country_column.iloc[4:40].reset_index(drop=True)

sub_saharan_africa.head()

0         Namibia
1          Rwanda
2    South Africa
3      Mozambique
4         Burundi
Name: Pakistan, dtype: object

In [23]:
# We create the mixed table (country_id <-> region_id).

# .copy() gives an independent frame, so adding a column below does not write
# into a slice of countries_ggi.
countries_regions_ggi = countries_ggi[['country_id','country']].copy()

# Region ids are kept as strings because 'unknown' shares the column with them.
# The numbers follow the region_id values shown for regions_ggi
# (1 = europe ... 8 = middle_east_north_africa).
_region_members = (
    ('1', europe),
    ('2', north_america),
    ('3', latinamerica_caribbean),
    ('4', eurasia_central_asia),
    ('5', east_asia_pacific),
    ('6', sub_saharan_africa),
    ('7', southern_asia),
    ('8', middle_east_north_africa),
)

# Country -> region id lookup, built once. setdefault keeps the first region that
# lists a country, i.e. the same priority as checking the regions one after another.
_region_by_country = {}
for _region_id, _members in _region_members:
    for _member in _members:
        _region_by_country.setdefault(_member, _region_id)


def assign_region(x):
    """Return the region id ('1'..'8') of country name `x`, or 'unknown' if no region lists it."""
    return _region_by_country.get(x, 'unknown')


countries_regions_ggi['region_id'] = countries_regions_ggi['country'].map(assign_region)

# Two countries are assigned by hand through their country_id
# (presumably their scraped names do not match any region list -- TODO confirm).
# NOTE(review): country_id is the position in the score_av ordering, so these ids
# silently point at other countries if the scores or the sort ever change.
countries_regions_ggi.loc[countries_regions_ggi.country_id == 55, 'region_id'] = '3'
countries_regions_ggi.loc[countries_regions_ggi.country_id == 48, 'region_id'] = '6'


countries_regions_ggi = countries_regions_ggi.drop(columns = ['country'])\
                       .sort_values(by = 'region_id', ascending = True).reset_index(drop = True)

countries_regions_ggi

Unnamed: 0,country_id,region_id
0,1,1
1,29,1
2,32,1
3,38,1
4,40,1
...,...,...
141,120,8
142,71,8
143,144,8
144,133,8


In [25]:
# Done. Lastly, we replace some long official country names with their short form.
# This runs after the region assignment above, which matched on the names as scraped.
_short_names = {
    'Bolivia (Plurinational State of)': 'Bolivia',
    'Iran (Islamic Republic of)': 'Iran',
    'Tanzania, United Republic of': 'Tanzania',
    'Korea, Republic of': 'South Korea',
}

for _official, _short in _short_names.items():
    countries_ggi.loc[countries_ggi['country'] == _official, 'country'] = _short

In [28]:
# And we save the three clean csv of gender gap index data.
# The output folder is created first so the notebook also runs on a fresh checkout
# where clean_data/ does not exist yet.
os.makedirs('clean_data', exist_ok = True)

countries_ggi.to_csv('clean_data/countries_ggi.csv', index = False)
regions_ggi.to_csv('clean_data/regions_ggi.csv', index = False)
countries_regions_ggi.to_csv('clean_data/countries_regions_ggi.csv', index = False)