In [1]:
# Querying the world bank indicators  https://data.worldbank.org/indicator
# https://pypi.org/project/world-bank-data/



In [2]:
# pip install world_bank_data --upgrade

In [3]:
import pandas as pd
import world_bank_data as wb
# Display every row of a DataFrame instead of pandas' truncated preview.
pd.set_option('display.max_rows', None)

In [4]:
# List the available World Bank topics; the topic ids are what wb.get_indicators(topic=...) takes.
wb.get_topics()

Unnamed: 0_level_0,value,sourceNote
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Agriculture & Rural Development,For the 70 percent of the world's poor who liv...
2,Aid Effectiveness,Aid effectiveness is the impact that aid has i...
3,Economy & Growth,Economic growth is central to economic develop...
4,Education,Education is one of the most powerful instrume...
5,Energy & Mining,The world economy needs ever-increasing amount...
6,Environment,Natural and man-made environmental resources –...
7,Financial Sector,An economy's financial markets are critical to...
8,Health,Improving health is central to the Millennium ...
9,Infrastructure,Infrastructure helps determine the success of ...
10,Social Protection & Labor,The supply of labor available in an economy in...


In [5]:
#wb.get_sources()

In [6]:
# Country table from the World Bank API, plus the subset whose income level is "High income".
countries_all = wb.get_countries()
high_inc_countries = countries_all[countries_all['incomeLevel'] == 'High income']

# Three-letter country codes passed to indicator_to_dict as the "oecd" group.
# NOTE(review): SGP, MLT, ROU and HRV do not look like OECD members -- confirm
# whether this list is intentionally broader than the OECD.
oecd = [
    'AUS', 'AUT', 'BEL', 'CAN', 'CZE', 'DNK', 'FIN', 'FRA', 'DEU',
    'HUN', 'ISL', 'IRL', 'ITA', 'JPN', 'KOR', 'LUX', 'MEX', 'NLD',
    'NZL', 'NOR', 'POL', 'PRT', 'SVK', 'ESP', 'SWE', 'CHE', 'GBR',
    'CHL', 'COL', 'EST', 'ISR', 'LVA', 'SGP', 'SVN', 'USA', 'MLT',
    'TUR', 'CRI', 'LTU', 'ROU', 'GRC', 'HRV',
]


In [7]:
# Input
# indicators - list of strings such as ['SH.STA.MMRT', '...']
# countries - list of relevant country codes
#
# Output 
# world_bank_dict - dictionary of dictionaries structured as follows: 
#
# {
#    "Indicator1": {
#        "Country1": {2021: 100, 2020: 90},
#        "Country2": {2021: 150, 2020: 140},
#    },
#    "Indicator2": {
#        "Country1": {2021: 200, 2020: 180},
#        "Country2": {2021: 250, 2020: 240},
#    }
# }


def indicator_to_dict(indicators, countries):
    """Download World Bank series and nest them by indicator and country.

    Parameters
    ----------
    indicators : iterable of str
        Indicator codes such as ``'SH.STA.MMRT'``.
    countries : iterable of str
        Country codes to keep from each downloaded series.

    Returns
    -------
    dict
        ``{indicator: {country: mapping}}`` where ``mapping`` is the result of
        ``.to_dict()`` on that country's slice of the series (expected to be
        year -> value).

    Notes
    -----
    * An indicator whose download raises ``ValueError`` is reported with
      ``print`` and left out of the result; the loop then moves on.
    * A country that is not present in a series is left out of that
      indicator's entry (best effort, no message).
    * Only the download itself is guarded, so an unrelated ``ValueError``
      raised while slicing the series is no longer misreported as a missing
      indicator.
    """
    world_bank_dict = {}

    for indicator in indicators:
        # Pull the full series for this indicator; some codes listed by the API
        # are not retrievable, hence the guard.
        try:
            indicator_series = wb.get_series(indicator, id_or_value='id', simplify_index=True)
        except ValueError as value_err:
            if "The indicator was not found" in str(value_err):
                print(f"Error: The indicator {indicator} was not found. It may have been deleted or archived.")
            else:
                print(f"Other ValueError occurred: {value_err}")
            continue

        # Keep only the requested countries.
        indicator_dict = {}
        for country in countries:
            try:
                indicator_dict[country] = indicator_series[country].to_dict()
            except KeyError:
                # The series has no entry for this country: skip it.
                continue

        # Store everything collected for this indicator under its code.
        world_bank_dict[indicator] = indicator_dict

    return world_bank_dict
    

In [8]:
def dictionary_to_df(dictionary):
    """Turn ``{indicator: {country: {year: value}}}`` into one row per country/year.

    The nested mapping is flattened into (Country, Year, Indicator, Value)
    records and then pivoted so that every indicator becomes its own column.

    Returns a DataFrame with ``Country`` and ``Year`` columns followed by one
    column per indicator.
    """
    # One record per (indicator, country, year) leaf of the nested mapping.
    records = [
        [country, year, indicator, value]
        for indicator, per_country in dictionary.items()
        for country, per_year in per_country.items()
        for year, value in per_year.items()
    ]

    long_df = pd.DataFrame(records, columns=["Country", "Year", "Indicator", "Value"])

    # Spread the indicators across columns, then bring Country/Year back as columns.
    pivoted = long_df.pivot(index=["Country", "Year"], columns="Indicator", values="Value")
    return pivoted.reset_index()

In [9]:
topics = [4, 8, 11, 14, 15, 17]
topicName = ["Education", 'Health', 'Poverty', 'ScienceAndTech', 'SocialDev', 'Gender']

# For each topic: fetch all of its indicator codes, download them for the oecd
# country list, and write the pivoted table to "<topic name>_WorldBankData.csv".
for t, name in zip(topics, topicName):
    indicators = wb.get_indicators(topic=t).index

    # Alternative country set: indicator_to_dict(indicators, high_inc_countries.index)
    data_dict = indicator_to_dict(indicators, oecd)
    df = dictionary_to_df(data_dict)

    filePath = f"{name}_WorldBankData.csv"
    df.to_csv(filePath, index=False)
    




Error: The indicator SE.TER.ENRL.FE.ZS was not found. It may have been deleted or archived.
Error: The indicator SE.XPD.MPRM.ZS was not found. It may have been deleted or archived.
Error: The indicator SE.XPD.MSEC.ZS was not found. It may have been deleted or archived.
Error: The indicator SE.XPD.MTER.ZS was not found. It may have been deleted or archived.
Error: The indicator SE.XPD.MTOT.ZS was not found. It may have been deleted or archived.
Error: The indicator SL.TLF.PRIM.FE.ZS was not found. It may have been deleted or archived.
Error: The indicator SL.TLF.PRIM.MA.ZS was not found. It may have been deleted or archived.
Error: The indicator SL.TLF.PRIM.ZS was not found. It may have been deleted or archived.
Error: The indicator SL.TLF.SECO.FE.ZS was not found. It may have been deleted or archived.
Error: The indicator SL.TLF.SECO.MA.ZS was not found. It may have been deleted or archived.
Error: The indicator SL.TLF.SECO.ZS was not found. It may have been deleted or archived.
Error:

## Maternal mortality ratio is indicator `SH.STA.MMRT` in the gender topic dataset

# Appendix
i.e., stuff that is no longer useful

In [10]:
# Hand-picked indicator codes
indicators = [
    "SH.STA.MMRT",  # Maternal mortality ratio (modeled estimate, per 100,000 live births)
    "SP.ADO.TFRT",  # Adolescent fertility rate (births per 1,000 women ages 15-19)
    # "SP.DYN.CONU",  # Contraceptive prevalence, any method (% of married women ages 15-49)
    # "SL.FAM.WORK.MA.ZS",  # Contributing family workers, male (% of male employment) (modeled ILO estimate)
]

# Alternatively: take every indicator of a given topic.
# When the whole cell runs, this assignment replaces the hand-picked list above.
indicators = wb.get_indicators(topic=17).index  # gender topic indicators

In [11]:
### this converts to wide format! ###

def wide_format(data_dict):
    """Build a wide table from ``{indicator: {country: {year: value}}}``.

    Each (indicator, country) pair becomes one row; the keys of the innermost
    mappings become the remaining columns.
    """
    # Flatten the two outer levels into (indicator, country) tuple keys.
    rows_by_key = {}
    for indicator, per_country in data_dict.items():
        for country, values in per_country.items():
            rows_by_key[(indicator, country)] = values

    frame = pd.DataFrame.from_dict(rows_by_key, orient='index')

    # Move the two index levels into columns and give them readable names.
    return frame.reset_index().rename(
        columns={'level_0': 'Indicator', 'level_1': 'Country'}
    )

In [12]:
### this converts to long format! ###

def long_format(wide_df):
    """Melt a wide indicator table into long form.

    ``Indicator`` and ``Country`` are kept as identifier columns; every other
    column is unpivoted into a ``Year`` / ``Value`` pair. The result is sorted
    by Indicator, Country and Year and carries a fresh 0..n-1 index.
    """
    key_columns = ["Indicator", "Country"]
    return (
        wide_df
        .melt(id_vars=key_columns, var_name="Year", value_name="Value")
        .sort_values(by=key_columns + ["Year"])
        .reset_index(drop=True)
    )
