In [1]:
# Query World Bank indicators (catalogue: https://data.worldbank.org/indicator)
# using the world_bank_data package: https://pypi.org/project/world-bank-data/



In [2]:
# pip install world_bank_data --upgrade

In [3]:
import pandas as pd
import world_bank_data as wb
# Lift pandas' row-display limit so DataFrames are rendered without row truncation.
pd.set_option('display.max_rows', None)

In [4]:
# List the World Bank topics (id, name, description); a topic id from this
# table is what wb.get_indicators(topic=...) takes further down.
wb.get_topics()

Unnamed: 0_level_0,value,sourceNote
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Agriculture & Rural Development,For the 70 percent of the world's poor who liv...
2,Aid Effectiveness,Aid effectiveness is the impact that aid has i...
3,Economy & Growth,Economic growth is central to economic develop...
4,Education,Education is one of the most powerful instrume...
5,Energy & Mining,The world economy needs ever-increasing amount...
6,Environment,Natural and man-made environmental resources –...
7,Financial Sector,An economy's financial markets are critical to...
8,Health,Improving health is central to the Millennium ...
9,Infrastructure,Infrastructure helps determine the success of ...
10,Social Protection & Labor,The supply of labor available in an economy in...


In [5]:
# List the World Bank data sources (e.g. WDI, WGI) together with their
# last-update dates and availability flags.
wb.get_sources()

Unnamed: 0_level_0,lastupdated,name,code,description,url,dataavailability,metadataavailability,concepts
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,2021-08-18,Doing Business,DBS,,,Y,Y,3
2,2023-09-19,World Development Indicators,WDI,,,Y,Y,3
3,2023-09-29,Worldwide Governance Indicators,WGI,,,Y,Y,3
5,2016-03-21,Subnational Malnutrition Database,SNM,,,Y,Y,3
6,2022-12-06,International Debt Statistics,IDS,,,Y,Y,4
11,2013-02-22,Africa Development Indicators,ADI,,,Y,Y,3
12,2020-12-20,Education Statistics,EDS,,,Y,Y,3
13,2022-03-25,Enterprise Surveys,ESY,,,Y,Y,3
14,2023-07-24,Gender Statistics,GDS,,,Y,Y,3
15,2023-07-21,Global Economic Monitor,GEM,,,Y,Y,3


In [6]:
# Fetch the World Bank country table, then keep only the rows whose
# incomeLevel column reads 'High income'.
countries_all = wb.get_countries()
high_inc_countries = countries_all[countries_all['incomeLevel'].eq('High income')]

In [18]:
# Indicators
#
# Hand-picked indicator codes. They live under their own name so that the
# topic-wide selection below no longer silently overwrites them, and every
# entry ends with a comma so that un-commenting a line cannot fuse two
# adjacent string literals into one bogus code.
manual_indicators = [
    'SH.STA.MMRT',  # Maternal mortality ratio (modeled estimate, per 100,000 live births)
    'SP.ADO.TFRT',  # Adolescent fertility rate (births per 1,000 women ages 15-19)
    # 'SP.DYN.CONU',  # Contraceptive prevalence, any method (% of married women ages 15-49)
    # 'SL.FAM.WORK.MA.ZS',  # Contributing family workers, male (% of male employment) (modeled ILO estimate)
]

####### Alternatively #########
# Active selection: every indicator listed under topic 17 (gender topic indicators).
# To run on the hand-picked codes instead, set `indicators = manual_indicators`.
indicators = wb.get_indicators(topic=17).index

In [23]:
def indicator_to_dict(indicators, countries):
    """Download World Bank series and nest them by indicator, country and year.

    Parameters
    ----------
    indicators : iterable of str
        Indicator codes such as ``['SH.STA.MMRT', '...']``.
    countries : iterable of str
        Relevant country codes; only these countries are kept.

    Returns
    -------
    dict
        Dictionary of dictionaries structured as follows::

            {
                "Indicator1": {
                    "Country1": {2021: 100, 2020: 90},
                    "Country2": {2021: 150, 2020: 140},
                },
                "Indicator2": {
                    "Country1": {2021: 200, 2020: 180},
                    "Country2": {2021: 250, 2020: 240},
                },
            }

        The innermost keys are whatever labels remain in the series index
        after selecting a country. An indicator whose download raises
        ``ValueError`` is reported with ``print`` and left out of the result;
        a country that is absent from a series is left out of that
        indicator's entry.
    """
    world_bank_dict = {}

    for indicator in indicators:
        # Only the download is guarded: an indicator that is not available
        # is reported and skipped, and the loop moves on to the next one.
        try:
            indicator_series = wb.get_series(indicator, id_or_value='id', simplify_index=True)
        except ValueError as value_err:
            if "The indicator was not found" in str(value_err):
                print(f"Error: The indicator {indicator} was not found. It may have been deleted or archived.")
            else:
                print(f"Other ValueError occurred: {value_err}")
            continue

        # Collect the per-country data for the countries of interest.
        indicator_dict = {}
        for country in countries:
            try:
                indicator_dict[country] = indicator_series[country].to_dict()
            except KeyError:
                # This series has no entry for the country: leave it out.
                continue

        # Store all data for this indicator with the indicator code as key.
        world_bank_dict[indicator] = indicator_dict

    return world_bank_dict
    

In [24]:
 # Build the nested {indicator: {country: {...}}} dictionary for the selected
 # indicators; the index of high_inc_countries is passed as the country list.
 data_dict = indicator_to_dict(indicators, high_inc_countries.index)

Error: The indicator SG.JOB.NOPN.EQ was not found. It may have been deleted or archived.
Error: The indicator SG.LAW.CHMR was not found. It may have been deleted or archived.
Error: The indicator SG.LAW.LEVE.PU was not found. It may have been deleted or archived.
Error: The indicator SG.MMR.LEVE.EP was not found. It may have been deleted or archived.
Error: The indicator SG.NOD.CONS was not found. It may have been deleted or archived.
Error: The indicator SL.EMP.INSV.FE.ZS was not found. It may have been deleted or archived.
Error: The indicator SL.MNF.WAGE.FM was not found. It may have been deleted or archived.
Error: The indicator SL.TLF.PART.TL.FE.ZS was not found. It may have been deleted or archived.
Error: The indicator SL.TLF.PRIM.FE.ZS was not found. It may have been deleted or archived.
Error: The indicator SL.TLF.PRIM.MA.ZS was not found. It may have been deleted or archived.
Error: The indicator SL.TLF.SECO.FE.ZS was not found. It may have been deleted or archived.
Error: Th

In [29]:
### this converts to wide format! ###

# Flatten the nested dictionary into one row per (indicator, country) pair,
# keyed by a two-level index, then turn those two index levels into ordinary
# 'Indicator' and 'Country' columns.
wide_df = (
    pd.DataFrame.from_dict(
        {
            (indicator_code, country_code): yearly_values
            for indicator_code, per_country in data_dict.items()
            for country_code, yearly_values in per_country.items()
        },
        orient='index',
    )
    .reset_index()
    .rename(columns={'level_0': 'Indicator', 'level_1': 'Country'})
)

In [33]:
# Preview the wide table: one row per (Indicator, Country), one column per year.
wide_df.head()

Unnamed: 0,Indicator,Country,1960,1961,1962,1963,1964,1965,1966,1967,...,2055,2060,2065,2070,2075,2080,2085,2090,2095,2100
0,IC.FRM.FEMM.ZS,ABW,,,,,,,,,...,,,,,,,,,,
1,IC.FRM.FEMM.ZS,AND,,,,,,,,,...,,,,,,,,,,
2,IC.FRM.FEMM.ZS,ARE,,,,,,,,,...,,,,,,,,,,
3,IC.FRM.FEMM.ZS,ASM,,,,,,,,,...,,,,,,,,,,
4,IC.FRM.FEMM.ZS,ATG,,,,,,,,,...,,,,,,,,,,


In [30]:
### this converts to long format! ###

# Unpivot the year columns into (Year, Value) pairs, order the rows by
# Indicator, Country and Year, and renumber the index from 0.
long_df = (
    wide_df
    .melt(id_vars=["Indicator", "Country"], var_name="Year", value_name="Value")
    .sort_values(by=["Indicator", "Country", "Year"])
    .reset_index(drop=True)
)


In [32]:
# Preview the long table: one row per (Indicator, Country, Year) with its Value.
long_df.head()

Unnamed: 0,Indicator,Country,Year,Value
0,IC.FRM.FEMM.ZS,ABW,1960,
1,IC.FRM.FEMM.ZS,ABW,1961,
2,IC.FRM.FEMM.ZS,ABW,1962,
3,IC.FRM.FEMM.ZS,ABW,1963,
4,IC.FRM.FEMM.ZS,ABW,1964,


In [12]:
# Sanity check: wide_df holds one row per indicator/country combination, and
# none of its countries should fall outside the requested high-income set.
# This cell previously read `data_df`, which no visible cell defines, so it
# failed on a fresh kernel; wide_df is the per-indicator/country table.
difference = sorted(set(wide_df["Country"]) - set(high_inc_countries.index))
print(difference)

[]


## Maternal mortality ratio is indicator `SH.STA.MMRT`

### Next steps 
1. Decide which indicators are interesting for modeling tasks.