In [None]:
!pip install --upgrade pip requests beautifulsoup4 lxml

In [None]:
import requests

# A listing of all World Bank indicators is at this URL.
# The timeout (seconds) stops the cell from hanging forever if the site
# never answers; requests has no timeout by default.
response = requests.get("https://data.worldbank.org/indicator?tab=all", timeout=30)

# Display the HTTP status so a failed fetch is visible before parsing.
response.status_code

In [None]:
from bs4 import BeautifulSoup

# Parse the fetched listing page with the lxml parser.
soup = BeautifulSoup(response.text, "lxml")

# Preview only the beginning of the pretty-printed document: printing the
# whole page floods the cell output and bloats the saved notebook.
print(soup.prettify()[:3000])

In [None]:
# Collect one (topic, indicator text, download link) tuple per indicator.
# Page layout used by the selectors below:
#   - all indicators live inside   div.overviewArea
#   - each topic is a              section.nav-item (topic = id of its <h3>)
#   - each indicator is a          ul li            (page link = href of its <a>)
indicators_list = []

for section in soup.select("div.overviewArea section.nav-item"):
    heading = section.select_one("h3")
    topic = heading.get("id") if heading is not None else None
    if topic is None:
        # Section without an identifiable topic heading: skip it rather than
        # crash on subscripting None.
        continue

    for li in section.select("ul li"):
        anchor = li.select_one("a")
        href = anchor.get("href") if anchor is not None else None
        if not href:
            # List item that is not an indicator link.
            continue

        indicator = li.text
        # Drop the query string from the page link and turn the remaining
        # path into the bulk CSV download URL.
        link = "http://api.worldbank.org/v2/en" + href.split("?")[0] + "?downloadformat=csv"
        indicators_list.append((topic, indicator, link))

In [None]:
import pandas as pd

# One row per scraped indicator: topic, display text and download link.
df = pd.DataFrame(
    indicators_list,
    columns=["indicator-area", "indicator-text", "indicator-link"],
)


def _code_from_link(link):
    """Return the last path segment of ``link`` with any query string removed."""
    last_segment = link.split("/")[-1]
    return last_segment.split("?")[0]


# The indicator code is the final path component of the download link.
df["indicator-code"] = df["indicator-link"].map(_code_from_link)

df

In [None]:
import os.path
import time

# Indicators are downloaded into "data-raw-worldbank" under the current
# working directory; the directory is created here if it does not exist.
cwd = os.getcwd()
data_path = os.path.join(cwd, "data-raw-worldbank")
os.makedirs(data_path, exist_ok=True)

for index, row in df.iterrows():

    # Skip indicators whose archive was already downloaded completely.
    data_file = os.path.join(data_path, row["indicator-code"] + ".zip")
    if os.path.exists(data_file):
        continue

    # Print the current step.
    print(index, row["indicator-link"], row["indicator-code"])

    # Stream into a temporary ".part" file and rename it only after the whole
    # body has been written, so an interrupted or failed download is never
    # mistaken for a finished archive by the existence check above.
    partial_file = data_file + ".part"
    try:
        # A separate name (not `response`) keeps the listing-page response
        # used by the parsing cell intact; the context manager releases the
        # streamed connection.
        with requests.get(row["indicator-link"], stream=True, timeout=30) as download:
            # Raise for 4xx/5xx status codes.
            download.raise_for_status()

            with open(partial_file, "wb") as handle:
                for block in download.iter_content(1024):
                    handle.write(block)

        os.replace(partial_file, data_file)

    except (requests.RequestException, OSError) as error:
        # Best effort: report the failure and move on to the next indicator.
        # KeyboardInterrupt is deliberately not caught so the loop can be stopped.
        print(f"  FAILED: {error}")
        if os.path.exists(partial_file):
            os.remove(partial_file)

    # Pause between requests to stay polite to the server.
    time.sleep(1)

In [None]:
import textwrap

# List every indicator whose text contains "pollution": a zero-padded row
# index and topic as the heading, then the indicator text and code, each
# wrapped to 70 columns with a five-space indent.
wrapper = textwrap.TextWrapper(initial_indent="     ", subsequent_indent="     ", width=70)

for index, row in df.iterrows():
    if "pollution" not in row["indicator-text"]:
        continue

    heading = F'{index:04d} {row["indicator-area"]}:'
    print(heading)
    print(wrapper.fill(row["indicator-text"]))
    print(wrapper.fill(row["indicator-code"]))