This notebook uses the World Bank API to look up metadata for the indicators in our World Bank dataset. <br/><br/>
It goes through all of our World Bank indicator codes <br/><br/>
and creates a tab-separated file (saved with a .csv extension) with one row per indicator code and the following columns: <br/><br/>
indicator_code, indicator_name, definition_or_additional_info

In [1]:
import requests
import numpy as np
import pandas as pd

In [2]:
# Load the main World Bank data.
# (an earlier version read '../WorldBankDatasets/Cleaned/AllMerged_Threshold_85_n3_MMRT.csv' instead)
main_dataset_path = '../WorldBankDatasets/Cleaned/AllMerged_Threshold_85_n3_MMRTNE.csv'
merged_worldbank_data_1 = pd.read_csv(main_dataset_path)

In [3]:
# column labels of the main dataset; each one is later passed to the World Bank API as an indicator code
column_names_1 = merged_worldbank_data_1.columns

In [4]:
# number of columns, i.e. how many API lookups the loop over column_names_1 will make
len(column_names_1)

782

In [5]:
def get_worldbank_indicator_detailed_info(indicator_code, timeout=30):
    """Fetch the name and definition of one World Bank indicator.

    Queries the World Bank v2 indicator endpoint and pulls the ``name`` and
    ``sourceNote`` fields out of the first record in the response.

    Args:
        indicator_code: World Bank indicator code, e.g. "SH.MED.PHYS.ZS".
        timeout: Seconds to wait for the API before ``requests`` gives up,
            so a single stalled request cannot hang the whole run.

    Returns:
        A dict with the keys ``indicator_code``, ``indicator_name`` and
        ``definition_or_additional_info``. A field that cannot be read from
        the response is replaced by a placeholder string.

    Raises:
        requests.exceptions.RequestException: if the request itself fails
            or times out.
    """
    # the format to query the World bank. See the documentation for more info
    response = requests.get(
        f"http://api.worldbank.org/v2/indicator/{indicator_code}?format=json",
        timeout=timeout,
    )

    try:
        data = response.json()
    except ValueError:
        # The body was not valid JSON (for example an HTML error page).
        # Fall through so both fields get their "missing" placeholders.
        data = None

    # extract the useful definition information
    useful_data = {'indicator_code': indicator_code}

    # Only the errors that indexing into an unexpected payload can raise;
    # anything else (e.g. KeyboardInterrupt) is allowed to propagate.
    lookup_errors = (KeyError, IndexError, TypeError)

    # try to add an indicator name
    try:
        useful_data['indicator_name'] = data[1][0]['name']
    except lookup_errors:
        useful_data['indicator_name'] = 'indicator name missing'

    # try to add a definition
    try:
        useful_data['definition_or_additional_info'] = data[1][0]['sourceNote']
    except lookup_errors:
        useful_data['definition_or_additional_info'] = 'definition missing'

    return useful_data

In [6]:
# test = get_worldbank_indicator_detailed_info("SL.EMP.OWAC.FE.ZS")
# test = get_worldbank_indicator_detailed_info("SG.AGE.FUPN.EQ")
# test = get_worldbank_indicator_detailed_info("SH.MED.PHYS.ZS")
# test

In [7]:
# Query the World Bank API once per column of the main dataset and collect
# the returned info dicts (definition information and more) in column order.
# For a quick trial run, iterate over column_names_1[0:50] instead.
world_bank_indicator_info_list = [
    get_worldbank_indicator_detailed_info(indicator_column)
    for indicator_column in column_names_1
]

In [8]:
# one row per looked-up code; the columns come from the keys of the info dicts
world_bank_indicator_definition_info_df = pd.DataFrame(world_bank_indicator_info_list)

In [9]:
# uncomment to see what the mapping dataframe looks like with info that was data scraped from the World Bank API
# world_bank_indicator_definition_info_df

## Additional codes that, for some reason, were not picked up in the first pass
## Add more as needed, e.g. when analyses in other files turn up codes that are missing here

In [10]:
# world_bank_indicator_definition_info_df.query("indicator_code == 'SE.SEC.CUAT.PO.MA.ZS'")
# world_bank_indicator_definition_info_df.query("indicator_code == 'SH.HIV.1524.MA.ZS'")

In [11]:
# Extra indicator codes to look up in addition to the columns of the main dataset.
additional_codes = ["SE.SEC.CUAT.PO.FE.ZS","SE.SEC.CUAT.PO.ZS","SE.SEC.CUAT.PO.MA.ZS",
                    "SG.GEN.MNST.ZS","SH.PRV.SMOK.FE","SG.POP.MIGR.FE.ZS",
                    "SE.TER.CUAT.DO.FE.ZS", "SE.TER.CUAT.DO.ZS", "SE.TER.CUAT.DO.MA.ZS",
                    "SH.PRV.SMOK.FE","SE.TER.CUAT.MS.FE.ZS","SE.TER.CUAT.DO.FE.ZS","SL.EMP.MPYR.FE.ZS","SE.TER.CUAT.DO.ZS",
                    "SE.TER.CUAT.MS.MA.ZS","SE.PRM.NINT.FE.ZS","SE.PRM.NINT.MA.ZS","SH.MMR.WAGE.ZS"
                    ]

# remove any duplicates just in case (np.unique also returns the codes sorted)
additional_codes = np.unique(additional_codes)

# Codes that already have a row in the main dataframe. Built once, outside the
# loop, so each membership test below is a cheap set lookup.
codes_already_in_main_df = set(world_bank_indicator_definition_info_df['indicator_code'])

additional_codes_not_already_in_main_df = []
# preliminary check. See if the code isn't already in the world bank dataframe
for code in additional_codes:
    if code in codes_already_in_main_df:
        print(f"code {code} is already in the dataframe world_bank_indicator_definition_info_df")
    else:
        # NOTE: the previous `elif code not in <DataFrame>` tested the
        # dataframe's column labels rather than its indicator codes, so it
        # only behaved like this `else` by accident.
        additional_codes_not_already_in_main_df.append(code)

# fetch name/definition info for every code that is still missing
additional_codes_info = [
    get_worldbank_indicator_detailed_info(code)
    for code in additional_codes_not_already_in_main_df
]

additional_codes_info_df = pd.DataFrame(additional_codes_info)

code SL.EMP.MPYR.FE.ZS is already in the dataframe world_bank_indicator_definition_info_df


In [12]:
# test = get_worldbank_indicator_detailed_info("SE.SEC.CUAT.PO.FE.ZS")

In [13]:
# additional_codes_info_df

In [14]:
# Add the rows of additional_codes_info_df to world_bank_indicator_definition_info_df.
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0, which is
# why the old call
#   world_bank_indicator_definition_info_df.append(additional_codes_info_df, ignore_index = True)
# no longer works; pd.concat is the supported replacement.
frames_to_stack = [world_bank_indicator_definition_info_df, additional_codes_info_df]
world_bank_indicator_definition_info_df = pd.concat(frames_to_stack, ignore_index=True)

In [15]:
# world_bank_indicator_definition_info_df

# Finally write the organized, scraped data from the World Bank to a csv file

In [16]:
# quoting=1 (the value of csv.QUOTE_ALL) wraps every field in quotes when writing.
# This is because some of the text contains commas, and possibly tabs too, which
# could otherwise break the column layout. Note that the file is tab-separated
# even though its name ends in .csv.
output_path = '../WorldBankDatasets/Cleaned/World_Bank_Indicator_Definition_Info.csv'
world_bank_indicator_definition_info_df.to_csv(output_path, sep='\t', quoting=1, index=False)