In [1]:
pip install beautifulsoup4

Note: you may need to restart the kernel to use updated packages.


In [2]:
import requests
from bs4 import BeautifulSoup

In [3]:
url = "https://data.worldbank.org/country"
response = requests.get(url)
response.status_code

200

In [4]:
soup = BeautifulSoup(response.text, 'html.parser')
print(soup.title.text)

Countries | Data


In [5]:
response.text[:10]

'<!doctype '

In [6]:
links = soup.find_all('a')
links

[<a class="logo" data-reactid="9" href="http://www.worldbank.org" target="_blank"><img alt="The World Bank" data-reactid="10" src="/assets/images/logo-wb-header-en.svg"/></a>,
 <a class="datalink" data-reactid="11" href="/">Data</a>,
 <a data-reactid="15" href="/">HOME</a>,
 <a data-reactid="17" href="https://data360.worldbank.org/en/economies" rel="noopener noreferrer" target="_blank">ECONOMIES</a>,
 <a data-reactid="19" href="https://data360.worldbank.org/en/search" rel="noopener noreferrer" target="_blank">DATA &amp; RESOURCES</a>,
 <a data-reactid="21" href="https://data360.worldbank.org/en/about" rel="noopener noreferrer" target="_blank">ABOUT</a>,
 <a class="country" data-reactid="43" href="/country"><span data-reactid="44">Country</span></a>,
 <a data-reactid="46" href="/indicator"><span data-reactid="47">Indicator</span></a>,
 <a class="nav-item datapage" data-reactid="49" href="/"></a>,
 <a class="nav-item" data-reactid="50" href="http://databank.worldbank.org/data/home.aspx" 

In [7]:
links = soup.find_all('a')
countries = []
for link in links:
    href = link.get('href')
    if href and href.startswith("/country/"):
        country_name = link.text.strip()
        full_url =  'https://data.worldbank.org/' + href
        countries.append((country_name,full_url))
        

In [8]:
print(len(countries))

217


In [9]:
#print out the first countries
print(countries[:5])

[('Afghanistan', 'https://data.worldbank.org//country/afghanistan?view=chart'), ('Albania', 'https://data.worldbank.org//country/albania?view=chart'), ('Algeria', 'https://data.worldbank.org//country/algeria?view=chart'), ('American Samoa', 'https://data.worldbank.org//country/american-samoa?view=chart'), ('Andorra', 'https://data.worldbank.org//country/andorra?view=chart')]


In [10]:
import csv

In [11]:
with open('worldbank_countries.csv', 'w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(['Country', 'URL'])
    writer.writerows(countries)
print('csv saved')


csv saved


In [12]:
import pandas as pd


In [13]:
data = pd.read_csv("worldbank_countries.csv")
data.head()
            

Unnamed: 0,Country,URL
0,Afghanistan,https://data.worldbank.org//country/afghanista...
1,Albania,https://data.worldbank.org//country/albania?vi...
2,Algeria,https://data.worldbank.org//country/algeria?vi...
3,American Samoa,https://data.worldbank.org//country/american-s...
4,Andorra,https://data.worldbank.org//country/andorra?vi...


In [26]:
url = "https://data.worldbank.org/country/nigeria?view=chart"
response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')

print(soup.title.text)

Nigeria | Data


In [20]:
metadata = soup.find_all("div", class_ = "metadata-item")
for item in metadata:
    print(item.text.strip())

In [25]:
# with open("raw_output.html", "w", encoding = "utf-8") as f:
#     f.write(soup.prettify())

<ul class="meta" data-reactid="1003"></ul>

In [42]:
url = "https://api.worldbank.org/v2/country/NGA?format=json"
response = requests.get(url)
data = response.json()
print(data)

[{'page': 1, 'pages': 1, 'per_page': '50', 'total': 1}, [{'id': 'NGA', 'iso2Code': 'NG', 'name': 'Nigeria', 'region': {'id': 'SSF', 'iso2code': 'ZG', 'value': 'Sub-Saharan Africa '}, 'adminregion': {'id': 'SSA', 'iso2code': 'ZF', 'value': 'Sub-Saharan Africa (excluding high income)'}, 'incomeLevel': {'id': 'LMC', 'iso2code': 'XN', 'value': 'Lower middle income'}, 'lendingType': {'id': 'IDB', 'iso2code': 'XH', 'value': 'Blend'}, 'capitalCity': 'Abuja', 'longitude': '7.48906', 'latitude': '9.05804'}]]


In [44]:
def get_country_metadata(country_code):
    url = f"https://api.worldbank.org/v2/country/{country_code}?format=json"
    response = requests.get(url)
    data = response.json()
    return data[1][0]


In [50]:
info = get_country_metadata("NGA")
info

{'id': 'NGA',
 'iso2Code': 'NG',
 'name': 'Nigeria',
 'region': {'id': 'SSF', 'iso2code': 'ZG', 'value': 'Sub-Saharan Africa '},
 'adminregion': {'id': 'SSA',
  'iso2code': 'ZF',
  'value': 'Sub-Saharan Africa (excluding high income)'},
 'incomeLevel': {'id': 'LMC',
  'iso2code': 'XN',
  'value': 'Lower middle income'},
 'lendingType': {'id': 'IDB', 'iso2code': 'XH', 'value': 'Blend'},
 'capitalCity': 'Abuja',
 'longitude': '7.48906',
 'latitude': '9.05804'}

In [None]:
print("Country:", info['name'])
print("Capital:", info["name"])
print("Region:", info["name"])
print("Income level:", info["name"])
print("Lending Type:", info["name"])