In [110]:
import pandas as pd
import numpy as np
import requests

import matplotlib.pyplot as plt
import seaborn as sns

## Exercise 1

**Write a Python function that can be used to query data from the Worldbank Indicator API. Your function should:**

• take the following input parameters: indicators, countries, and years.

• return a Pandas DataFrame of the queried data

• have a docstring that explains what the function does, what the input parameters are, and what
the output is

• minimize the number of API calls necessary to retrieve the data
Demonstrate that your function works by querying the following data (codes are provided in parentheses):

a) The total population (SP.POP.TOTL) of Germany (DE) and France (FR) between 2015 and 2020.

b) The total population (SP.POP.TOTL), GDP in current US$ (NY.GDP.MKTP.CD), and life ex-
pectancy in years at birth (SP.DYN.LE00.IN) of all countries (all) in 2012. Print the shape of the
resulting DataFrame and display its first 10 rows.

c) State how many API calls your function makes for a) and b) respectively.
Notes:

• To solve the exercises study the documentation of the basic call structures. Most of the information
you need is provided there.

• If needed, additional information about the API is available here. For instance, you will find links
to the list of available indicators and countries, and explanations on error codes.

• Note that by default the API r


In [122]:
call_count = 0 
def call_api(indicators,countries,years,) -> pd.DataFrame:
    """
    Call the World Bank API to get data for the specified indicators, countries, and years.
    
    Parameters:
    - indicators: List of indicator codes
    - countries: List of country codes
    - years: List of years
    
    Returns:
    - DataFrame with the requested data
    """
    global call_count
    if isinstance(indicators, list):
        indicators = ';'.join(indicators)
    if isinstance(countries, list):
        countries = ';'.join(countries)
    
    
    url = f"http://api.worldbank.org/v2/country/{countries}/indicator/{indicators}?date={years}&format=json&per_page=10000&source=2"
    print("url: ",url)

      # Make the API request
    response = requests.get(url)
    
    # Check for HTTP errors
    if response.status_code != 200:
        print(f"Error: Received status code {response.status_code}")
        print(f"Response text: {response.text}")
        return None
    call_count += 1

    try:
        data = response.json()
    except ValueError as e:
        print("Error decoding JSON:", e)
        print("Response text:", response.text)
        return None
    
    # Check if the response contains data
    if len(data) < 2 or data[1] is None:
        print("No data available in the response.")
        print("Response content:", data)  # Debugging: Print the full response
        return None
    
    # Convert the JSON data to a DataFrame
    df = pd.json_normalize(data[1])
    return df
    
df= call_api(
    indicators="SP.POP.TOTL",
    countries=["DE","FR"],
    years="2015:2020"
)

df


url:  http://api.worldbank.org/v2/country/DE;FR/indicator/SP.POP.TOTL?date=2015:2020&format=json&per_page=10000&source=2


Unnamed: 0,countryiso3code,date,value,unit,obs_status,decimal,indicator.id,indicator.value,country.id,country.value
0,DEU,2020,83160871,,,0,SP.POP.TOTL,"Population, total",DE,Germany
1,DEU,2019,83092962,,,0,SP.POP.TOTL,"Population, total",DE,Germany
2,DEU,2018,82905782,,,0,SP.POP.TOTL,"Population, total",DE,Germany
3,DEU,2017,82657002,,,0,SP.POP.TOTL,"Population, total",DE,Germany
4,DEU,2016,82348669,,,0,SP.POP.TOTL,"Population, total",DE,Germany
5,DEU,2015,81686611,,,0,SP.POP.TOTL,"Population, total",DE,Germany
6,FRA,2020,67601110,,,0,SP.POP.TOTL,"Population, total",FR,France
7,FRA,2019,67382061,,,0,SP.POP.TOTL,"Population, total",FR,France
8,FRA,2018,67158348,,,0,SP.POP.TOTL,"Population, total",FR,France
9,FRA,2017,66918020,,,0,SP.POP.TOTL,"Population, total",FR,France


### a)

#### The total population (SP.POP.TOTL) of Germany (DE) and France (FR) between 2015 and 2020.


In [112]:
df_POP_DE_FR = call_api(
    indicators="SP.POP.TOTL",
    countries=["DE","FR"],
    years="2015:2020"
)

url:  http://api.worldbank.org/v2/country/DE;FR/indicator/SP.POP.TOTL?date=2015:2020&format=json&per_page=100&source=2


In [113]:
df_POP_DE_FR

Unnamed: 0,countryiso3code,date,value,unit,obs_status,decimal,indicator.id,indicator.value,country.id,country.value
0,DEU,2020,83160871,,,0,SP.POP.TOTL,"Population, total",DE,Germany
1,DEU,2019,83092962,,,0,SP.POP.TOTL,"Population, total",DE,Germany
2,DEU,2018,82905782,,,0,SP.POP.TOTL,"Population, total",DE,Germany
3,DEU,2017,82657002,,,0,SP.POP.TOTL,"Population, total",DE,Germany
4,DEU,2016,82348669,,,0,SP.POP.TOTL,"Population, total",DE,Germany
5,DEU,2015,81686611,,,0,SP.POP.TOTL,"Population, total",DE,Germany
6,FRA,2020,67601110,,,0,SP.POP.TOTL,"Population, total",FR,France
7,FRA,2019,67382061,,,0,SP.POP.TOTL,"Population, total",FR,France
8,FRA,2018,67158348,,,0,SP.POP.TOTL,"Population, total",FR,France
9,FRA,2017,66918020,,,0,SP.POP.TOTL,"Population, total",FR,France


### b)

#### The total population (SP.POP.TOTL), GDP in current US$ (NY.GDP.MKTP.CD), and life expectancy in years at birth (SP.DYN.LE00.IN) of all countries (all) in 2012. Print the shape of the resulting DataFrame and display its first 10 rows


In [None]:
df_gdp_pop_2012 = call_api(
    indicators=["SP.POP.TOTL","NY.GDP.MKTP.CD","SP.DYN.LE00.IN"],
    countries='all',
    years="2012"
)

df_gdp_pop_2012

url:  http://api.worldbank.org/v2/country/all/indicator/SP.POP.TOTL;NY.GDP.MKTP.CD;SP.DYN.LE00.IN?date=2019:2025&format=json&per_page=10000&source=2


Unnamed: 0,countryiso3code,date,value,scale,unit,obs_status,decimal,indicator.id,indicator.value,country.id,country.value
0,AFE,2024,,,,,0,SP.POP.TOTL,"Population, total",ZH,Africa Eastern and Southern
1,AFE,2023,7.505038e+08,,,,0,SP.POP.TOTL,"Population, total",ZH,Africa Eastern and Southern
2,AFE,2022,7.318214e+08,,,,0,SP.POP.TOTL,"Population, total",ZH,Africa Eastern and Southern
3,AFE,2021,7.130909e+08,,,,0,SP.POP.TOTL,"Population, total",ZH,Africa Eastern and Southern
4,AFE,2020,6.944461e+08,,,,0,SP.POP.TOTL,"Population, total",ZH,Africa Eastern and Southern
...,...,...,...,...,...,...,...,...,...,...,...
4783,ZWE,2023,6.277500e+01,,,,0,SP.DYN.LE00.IN,"Life expectancy at birth, total (years)",ZW,Zimbabwe
4784,ZWE,2022,6.236000e+01,,,,0,SP.DYN.LE00.IN,"Life expectancy at birth, total (years)",ZW,Zimbabwe
4785,ZWE,2021,6.013500e+01,,,,0,SP.DYN.LE00.IN,"Life expectancy at birth, total (years)",ZW,Zimbabwe
4786,ZWE,2020,6.153000e+01,,,,0,SP.DYN.LE00.IN,"Life expectancy at birth, total (years)",ZW,Zimbabwe


### c)

#### State how many API calls your function makes for a) and b) respectively.


In [115]:
call_count

3

**Answer: for each call call_count variable counts number of calls in my case function is created fully adapted to type and number of parameters in url it's checking health of response additionally and provides solution based on function params**

- For this solution I researched API parameters such as sources which is set to two for fetching multiple indicator
- for seeing a lot of data I set per_page=10000 for fetching high amount of data


## Exercise 2

**The file medal_table_2012.csv contains information about the number of medals won by each country at the Olympic Games 2012. (It probably looks similar to the medal table that you calculated in the first part of the project. Small differences are possible, but the overall structure should be the same.)**

a) Preprocess both the medal table data and the Worldbank data retrieved in exercise 1 b) and
combine the two datasets suitably into one tidy dataset. The final dataset should be such that it
allows you to answer the following exercises. Explain your actions and decisions in a few sentences.

b) Create an alternative medal table for the 2012 Olympic Games by calculating the number of Gold,
Silver, and Bronze medals won per 10 million inhabitants. Display the 10 most successful countries
according to this alternative medal table.

Note: If there are missing values in the Worldbank data set (e.g. if no population data is available for
Germany), then you do NOT need to impute these values.


In [131]:
MEDAL_PATH = "../data/medal_table_2012.csv"

In [132]:
df = pd.read_csv(MEDAL_PATH)


In [133]:
df.head()

Unnamed: 0,rank,country_code,country,gold,silver,bronze
0,1,USA,United States,46,28,30
1,2,CHN,People's Republic of China,38,31,22
2,3,GBR,Great Britain,29,17,19
3,4,RUS,Russian Federation,20,20,27
4,5,KOR,Republic of Korea,13,9,8


In [134]:
len(df.country_code.unique())

85

In [135]:
df.country_code.unique()

array(['USA', 'CHN', 'GBR', 'RUS', 'KOR', 'GER', 'FRA', 'AUS', 'ITA',
       'HUN', 'JPN', 'IRI', 'NZL', 'UKR', 'CUB', 'ESP', 'JAM', 'CZE',
       'RSA', 'PRK', 'BRA', 'POL', 'ETH', 'KAZ', 'CRO', 'CAN', 'BLR',
       'KEN', 'DEN', 'ROU', 'AZE', 'SUI', 'NOR', 'LTU', 'TUN', 'SWE',
       'COL', 'MEX', 'GEO', 'IRL', 'ARG', 'SRB', 'SLO', 'TTO', 'TUR',
       'DOM', 'TPE', 'LAT', 'ALG', 'BRN', 'GRN', 'BAH', 'UGA', 'VEN',
       'EGY', 'IND', 'MGL', 'THA', 'BUL', 'FIN', 'INA', 'SVK', 'BEL',
       'ARM', 'EST', 'MAS', 'PUR', 'BOT', 'CYP', 'GAB', 'GUA', 'MNE',
       'POR', 'UZB', 'GRE', 'QAT', 'SGP', 'AFG', 'CMR', 'HKG', 'KSA',
       'KUW', 'MAR', 'TJK', 'VIE'], dtype=object)

In [149]:
df_gdp_pop_2012.head()

Unnamed: 0,countryiso3code,date,value,scale,unit,obs_status,decimal,indicator.id,indicator.value,country.id,country.value
0,AFE,2024,,,,,0,SP.POP.TOTL,"Population, total",ZH,Africa Eastern and Southern
1,AFE,2023,750503764.0,,,,0,SP.POP.TOTL,"Population, total",ZH,Africa Eastern and Southern
2,AFE,2022,731821393.0,,,,0,SP.POP.TOTL,"Population, total",ZH,Africa Eastern and Southern
3,AFE,2021,713090928.0,,,,0,SP.POP.TOTL,"Population, total",ZH,Africa Eastern and Southern
4,AFE,2020,694446100.0,,,,0,SP.POP.TOTL,"Population, total",ZH,Africa Eastern and Southern


In [136]:
df_gdp_pop_2012['countryiso3code'].unique()

array(['AFE', 'AFW', 'ARB', 'CSS', 'CEB', 'EAR', 'EAS', 'EAP', 'TEA',
       'EMU', 'ECS', 'ECA', 'TEC', 'EUU', 'FCS', 'HPC', '', 'IBD', 'IBT',
       'IDB', 'IDX', 'IDA', 'LTE', 'LCN', 'LAC', 'TLA', 'LDC', 'LMY',
       'MEA', 'MNA', 'TMN', 'MIC', 'NAC', 'OED', 'OSS', 'PSS', 'PST',
       'PRE', 'SST', 'SAS', 'TSA', 'SSF', 'SSA', 'TSS', 'WLD', 'AFG',
       'ALB', 'DZA', 'ASM', 'AND', 'AGO', 'ATG', 'ARG', 'ARM', 'ABW',
       'AUS', 'AUT', 'AZE', 'BHS', 'BHR', 'BGD', 'BRB', 'BLR', 'BEL',
       'BLZ', 'BEN', 'BMU', 'BTN', 'BOL', 'BIH', 'BWA', 'BRA', 'VGB',
       'BRN', 'BGR', 'BFA', 'BDI', 'CPV', 'KHM', 'CMR', 'CAN', 'CYM',
       'CAF', 'TCD', 'CHI', 'CHL', 'CHN', 'COL', 'COM', 'COD', 'COG',
       'CRI', 'CIV', 'HRV', 'CUB', 'CUW', 'CYP', 'CZE', 'DNK', 'DJI',
       'DMA', 'DOM', 'ECU', 'EGY', 'SLV', 'GNQ', 'ERI', 'EST', 'SWZ',
       'ETH', 'FRO', 'FJI', 'FIN', 'FRA', 'PYF', 'GAB', 'GMB', 'GEO',
       'DEU', 'GHA', 'GIB', 'GRC', 'GRL', 'GRD', 'GUM', 'GTM', 'GIN',
       'GNB', 'G

In [140]:
len(df_gdp_pop_2012['countryiso3code'].unique())

262

In [146]:
list_gdp = df_gdp_pop_2012['countryiso3code'].unique()
list_medals = df.country_code.unique()

In [147]:
common_elements = [item for item in list_gdp if item in list_medals]

In [148]:
len(common_elements)

61

## Exercise 3

**Carry out a simple supervised machine learning experiment, in which you train a model to predict the number of Gold medals a country wins at the Olympic Games 2012 based on demographic and economic features. Note: Since machine learning is not a focus topic of this course, you do not need to optimize the model. Just demonstrate that you are able to apply the steps we discussed in the course and correctly interpret the results.**

a) Train and evaluate a linear regression model: 1. Split your data into a training and a test set. 2. Train a linear regression model using population, life expectancy and the GDP per capita of a
country as features. 3. Evaluate the model using the root mean squared error as the performance
metric.

b) Discuss your results: How do you judge the performance? What are possible reasons for this
performance? How could the model be improved?

c) Due to an unfortunate “data error”, the country Netherlands was not included in the Olympic
Games data and is therefore not present in medal table. Use your trained machine learning model
to predict the number of Gold medals the Netherlands has won in 2012, just based on their demo-
graphic and economic characteristics.
