# Data Fetching

### Install necessary Python package

In [None]:
pip install wbdata pandas

### Import packages

In [1]:
import wbdata
import pandas as pd
import datetime

## 1. World Bank

### Set countries & dates

In [17]:
df.shape

(0, 6)

In [59]:
import wbdata
import pandas as pd
import datetime

# Year range
start = datetime.datetime(2000, 1, 1)
end = datetime.datetime(2023, 12, 31)
dates = (start, end)

# Indicators
indicators = {
    'SP.POP.TOTL': 'Population',
    'NY.GDP.MKTP.CD': 'GDP',
    'SP.DYN.LE00.IN': 'Life_Expectancy'
}

# Fetch data for all countries - use positional arguments
df = wbdata.get_dataframe(indicators, country="all", date=dates)
df.reset_index(inplace=True)

# Make sure 'date' is datetime
df['date'] = pd.to_datetime(df['date'])
df.head()

Unnamed: 0,country,date,Population,GDP,Life_Expectancy
0,Africa Eastern and Southern,2023-01-01,750503764.0,1133818000000.0,65.146291
1,Africa Eastern and Southern,2022-01-01,731821393.0,1191639000000.0,64.48702
2,Africa Eastern and Southern,2021-01-01,713090928.0,1085605000000.0,62.979999
3,Africa Eastern and Southern,2020-01-01,694446100.0,933407200000.0,63.766484
4,Africa Eastern and Southern,2019-01-01,675950189.0,1009747000000.0,63.857261


In [31]:
df.shape

(6384, 5)

In [11]:
df.to_csv("world_bank_data.csv", index=False)

In [45]:
df['country'].nunique()

194

In [37]:
import pandas as pd

# Example: your DataFrame is df
# Predefined list of 195 real countries (can be found online)
real_countries = [
    "Afghanistan", "Albania", "Algeria", "Andorra", "Angola", "Antigua and Barbuda", 
    "Argentina", "Armenia", "Australia", "Austria", "Azerbaijan", "Bahamas, The", 
    "Bahrain", "Bangladesh", "Barbados", "Belarus", "Belgium", "Belize", "Benin",
    "Bhutan", "Bolivia", "Bosnia and Herzegovina", "Botswana", "Brazil", "Brunei Darussalam",
    "Bulgaria", "Burkina Faso", "Burundi", "Cabo Verde", "Cambodia", "Cameroon", "Canada",
    "Central African Republic", "Chad", "Chile", "China", "Colombia", "Comoros", "Congo, Dem. Rep.",
    "Congo, Rep.", "Costa Rica", "Cote d'Ivoire", "Croatia", "Cuba", "Cyprus", "Czechia", 
    "Denmark", "Djibouti", "Dominica", "Dominican Republic", "Ecuador", "Egypt, Arab Rep.",
    "El Salvador", "Equatorial Guinea", "Eritrea", "Estonia", "Eswatini", "Ethiopia",
    "Fiji", "Finland", "France", "Gabon", "Gambia, The", "Georgia", "Germany", "Ghana",
    "Greece", "Grenada", "Guatemala", "Guinea", "Guinea-Bissau", "Guyana", "Haiti", "Honduras",
    "Hungary", "Iceland", "India", "Indonesia", "Iran, Islamic Rep.", "Iraq", "Ireland", "Israel",
    "Italy", "Jamaica", "Japan", "Jordan", "Kazakhstan", "Kenya", "Kiribati", "Korea, Dem. People's Rep.",
    "Korea, Rep.", "Kosovo", "Kuwait", "Kyrgyz Republic", "Lao PDR", "Latvia", "Lebanon", "Lesotho",
    "Liberia", "Libya", "Liechtenstein", "Lithuania", "Luxembourg", "Macao SAR, China", "Madagascar",
    "Malawi", "Malaysia", "Maldives", "Mali", "Malta", "Marshall Islands", "Mauritania", "Mauritius",
    "Mexico", "Micronesia, Fed. Sts.", "Moldova", "Monaco", "Mongolia", "Montenegro", "Morocco", "Mozambique",
    "Myanmar", "Namibia", "Nauru", "Nepal", "Netherlands", "New Zealand", "Nicaragua", "Niger", "Nigeria",
    "North Macedonia", "Norway", "Oman", "Pakistan", "Palau", "Panama", "Papua New Guinea", "Paraguay",
    "Peru", "Philippines", "Poland", "Portugal", "Qatar", "Romania", "Russian Federation", "Rwanda",
    "Samoa", "San Marino", "Sao Tome and Principe", "Saudi Arabia", "Senegal", "Serbia", "Seychelles",
    "Sierra Leone", "Singapore", "Slovak Republic", "Slovenia", "Solomon Islands", "Somalia, Fed. Rep.",
    "South Africa", "South Sudan", "Spain", "Sri Lanka", "St. Kitts and Nevis", "St. Lucia",
    "St. Vincent and the Grenadines", "Sudan", "Suriname", "Sweden", "Switzerland", "Syrian Arab Republic",
    "Tajikistan", "Tanzania", "Thailand", "Timor-Leste", "Togo", "Tonga", "Trinidad and Tobago",
    "Tunisia", "Turkiye", "Turkmenistan", "Tuvalu", "Uganda", "Ukraine", "United Arab Emirates",
    "United Kingdom", "United States", "Uruguay", "Uzbekistan", "Vanuatu", "Venezuela, RB", "Viet Nam",
    "Zambia", "Zimbabwe"
]

# Keep only rows with real countries
df = df[df['country'].isin(real_countries)]
df.reset_index(drop=True, inplace=True)

print(len(df['country'].unique()))  # should be 195
df.head()

194


Unnamed: 0,country,date,Population,GDP,Life_Expectancy
0,Afghanistan,2023-01-01,41454761.0,17152230000.0,66.035
1,Afghanistan,2022-01-01,40578842.0,14497240000.0,65.617
2,Afghanistan,2021-01-01,40000412.0,14260000000.0,60.417
3,Afghanistan,2020-01-01,39068979.0,19955930000.0,61.454
4,Afghanistan,2019-01-01,37856121.0,18799440000.0,62.941


In [47]:
df.shape

(4656, 5)

In [49]:
# Full country → continent mapping
continent_dict = {
    "Afghanistan": "Asia", "Albania": "Europe", "Algeria": "Africa", "Andorra": "Europe",
    "Angola": "Africa", "Antigua and Barbuda": "North America", "Argentina": "South America",
    "Armenia": "Asia", "Australia": "Oceania", "Austria": "Europe", "Azerbaijan": "Asia",
    "Bahamas, The": "North America", "Bahrain": "Asia", "Bangladesh": "Asia", "Barbados": "North America",
    "Belarus": "Europe", "Belgium": "Europe", "Belize": "North America", "Benin": "Africa",
    "Bhutan": "Asia", "Bolivia": "South America", "Bosnia and Herzegovina": "Europe", "Botswana": "Africa",
    "Brazil": "South America", "Brunei Darussalam": "Asia", "Bulgaria": "Europe", "Burkina Faso": "Africa",
    "Burundi": "Africa", "Cabo Verde": "Africa", "Cambodia": "Asia", "Cameroon": "Africa",
    "Canada": "North America", "Central African Republic": "Africa", "Chad": "Africa", "Chile": "South America",
    "China": "Asia", "Colombia": "South America", "Comoros": "Africa", "Congo, Dem. Rep.": "Africa",
    "Congo, Rep.": "Africa", "Costa Rica": "North America", "Cote d'Ivoire": "Africa", "Croatia": "Europe",
    "Cuba": "North America", "Cyprus": "Europe", "Czechia": "Europe", "Denmark": "Europe",
    "Djibouti": "Africa", "Dominica": "North America", "Dominican Republic": "North America", "Ecuador": "South America",
    "Egypt, Arab Rep.": "Africa", "El Salvador": "North America", "Equatorial Guinea": "Africa", "Eritrea": "Africa",
    "Estonia": "Europe", "Eswatini": "Africa", "Ethiopia": "Africa", "Fiji": "Oceania",
    "Finland": "Europe", "France": "Europe", "Gabon": "Africa", "Gambia, The": "Africa",
    "Georgia": "Europe", "Germany": "Europe", "Ghana": "Africa", "Greece": "Europe",
    "Grenada": "North America", "Guatemala": "North America", "Guinea": "Africa", "Guinea-Bissau": "Africa",
    "Guyana": "South America", "Haiti": "North America", "Honduras": "North America", "Hungary": "Europe",
    "Iceland": "Europe", "India": "Asia", "Indonesia": "Asia", "Iran, Islamic Rep.": "Asia",
    "Iraq": "Asia", "Ireland": "Europe", "Israel": "Asia", "Italy": "Europe",
    "Jamaica": "North America", "Japan": "Asia", "Jordan": "Asia", "Kazakhstan": "Asia",
    "Kenya": "Africa", "Kiribati": "Oceania", "Korea, Dem. People's Rep.": "Asia", "Korea, Rep.": "Asia",
    "Kosovo": "Europe", "Kuwait": "Asia", "Kyrgyz Republic": "Asia", "Lao PDR": "Asia",
    "Latvia": "Europe", "Lebanon": "Asia", "Lesotho": "Africa", "Liberia": "Africa",
    "Libya": "Africa", "Liechtenstein": "Europe", "Lithuania": "Europe", "Luxembourg": "Europe",
    "Macao SAR, China": "Asia", "Madagascar": "Africa", "Malawi": "Africa", "Malaysia": "Asia",
    "Maldives": "Asia", "Mali": "Africa", "Malta": "Europe", "Marshall Islands": "Oceania",
    "Mauritania": "Africa", "Mauritius": "Africa", "Mexico": "North America", "Micronesia, Fed. Sts.": "Oceania",
    "Moldova": "Europe", "Monaco": "Europe", "Mongolia": "Asia", "Montenegro": "Europe",
    "Morocco": "Africa", "Mozambique": "Africa", "Myanmar": "Asia", "Namibia": "Africa",
    "Nauru": "Oceania", "Nepal": "Asia", "Netherlands": "Europe", "New Zealand": "Oceania",
    "Nicaragua": "North America", "Niger": "Africa", "Nigeria": "Africa", "North Macedonia": "Europe",
    "Norway": "Europe", "Oman": "Asia", "Pakistan": "Asia", "Palau": "Oceania",
    "Panama": "North America", "Papua New Guinea": "Oceania", "Paraguay": "South America", "Peru": "South America",
    "Philippines": "Asia", "Poland": "Europe", "Portugal": "Europe", "Qatar": "Asia",
    "Romania": "Europe", "Russian Federation": "Europe", "Rwanda": "Africa", "Samoa": "Oceania",
    "San Marino": "Europe", "Sao Tome and Principe": "Africa", "Saudi Arabia": "Asia", "Senegal": "Africa",
    "Serbia": "Europe", "Seychelles": "Africa", "Sierra Leone": "Africa", "Singapore": "Asia",
    "Slovak Republic": "Europe", "Slovenia": "Europe", "Solomon Islands": "Oceania", "Somalia, Fed. Rep.": "Africa",
    "South Africa": "Africa", "South Sudan": "Africa", "Spain": "Europe", "Sri Lanka": "Asia",
    "St. Kitts and Nevis": "North America", "St. Lucia": "North America", "St. Vincent and the Grenadines": "North America",
    "Sudan": "Africa", "Suriname": "South America", "Sweden": "Europe", "Switzerland": "Europe",
    "Syrian Arab Republic": "Asia", "Tajikistan": "Asia", "Tanzania": "Africa", "Thailand": "Asia",
    "Timor-Leste": "Asia", "Togo": "Africa", "Tonga": "Oceania", "Trinidad and Tobago": "North America",
    "Tunisia": "Africa", "Turkiye": "Asia", "Turkmenistan": "Asia", "Tuvalu": "Oceania",
    "Uganda": "Africa", "Ukraine": "Europe", "United Arab Emirates": "Asia", "United Kingdom": "Europe",
    "United States": "North America", "Uruguay": "South America", "Uzbekistan": "Asia", "Vanuatu": "Oceania",
    "Venezuela, RB": "South America", "Viet Nam": "Asia", "Zambia": "Africa", "Zimbabwe": "Africa"
}

# Add continent column to your DataFrame
df['Continent'] = df['country'].map(continent_dict)

# Check
df[['country', 'Continent']].head()

Unnamed: 0,country,Continent
0,Afghanistan,Asia
1,Afghanistan,Asia
2,Afghanistan,Asia
3,Afghanistan,Asia
4,Afghanistan,Asia


In [53]:
df.head()

Unnamed: 0,country,date,Population,GDP,Life_Expectancy,Continent
0,Afghanistan,2023-01-01,41454761.0,17152230000.0,66.035,Asia
1,Afghanistan,2022-01-01,40578842.0,14497240000.0,65.617,Asia
2,Afghanistan,2021-01-01,40000412.0,14260000000.0,60.417,Asia
3,Afghanistan,2020-01-01,39068979.0,19955930000.0,61.454,Asia
4,Afghanistan,2019-01-01,37856121.0,18799440000.0,62.941,Asia


In [55]:
df.to_csv("world_bank_data.csv", index=False)

In [61]:
import requests
import pandas as pd

url = ("http://data.un.org/WS/rest/data/DF_SDG_GLH/"
       "..SI_POV_NAHC............?startPeriod=2000&endPeriod=2023&format=csv")

resp = requests.get(url)
resp.raise_for_status()

# Load into DataFrame
from io import StringIO
df_un = pd.read_csv(StringIO(resp.text))

# Clean/Transform
df_un.rename(columns={ 'REF_AREA': 'country_code', 'TIME_PERIOD': 'year', 'OBS_VALUE': 'value' },
             inplace=True)

# Optional: merge with country names via lookup, filter only real countries, add continent etc.

print(df_un.head())

                     DATAFLOW FREQ REPORTING_TYPE       SERIES  country_code  \
0  IAEG-SDGs:DF_SDG_GLH(1.21)    A              G  SI_POV_NAHC             4   
1  IAEG-SDGs:DF_SDG_GLH(1.21)    A              G  SI_POV_NAHC             4   
2  IAEG-SDGs:DF_SDG_GLH(1.21)    A              G  SI_POV_NAHC             4   
3  IAEG-SDGs:DF_SDG_GLH(1.21)    A              G  SI_POV_NAHC             8   
4  IAEG-SDGs:DF_SDG_GLH(1.21)    A              G  SI_POV_NAHC             8   

  SEX AGE URBANISATION INCOME_WEALTH_QUANTILE EDUCATION_LEV  ...  \
0  _T  _T           _T                     _T            _T  ...   
1  _T  _T           _T                     _T            _T  ...   
2  _T  _T           _T                     _T            _T  ...   
3  _T  _T           _T                     _T            _T  ...   
4  _T  _T           _T                     _T            _T  ...   

                                         COMMENT_OBS TIME_COVERAGE  \
0  Source: National Statistics and Infor

In [65]:
df_un.head()

Unnamed: 0,DATAFLOW,FREQ,REPORTING_TYPE,SERIES,country_code,SEX,AGE,URBANISATION,INCOME_WEALTH_QUANTILE,EDUCATION_LEV,...,COMMENT_OBS,TIME_COVERAGE,UPPER_BOUND,LOWER_BOUND,SOURCE_DETAIL,COMMENT_TS,GEO_INFO_URL,GEO_INFO_TYPE,CUST_BREAKDOWN_LB,DATA_LAST_UPDATE
0,IAEG-SDGs:DF_SDG_GLH(1.21),A,G,SI_POV_NAHC,4,_T,_T,_T,_T,_T,...,Source: National Statistics and Information Au...,,,,"Poverty and Inequality Platform, World Bank",,,,,2025-08-11T00:00:00
1,IAEG-SDGs:DF_SDG_GLH(1.21),A,G,SI_POV_NAHC,4,_T,_T,_T,_T,_T,...,Source: National Statistics and Information Au...,,,,"Poverty and Inequality Platform, World Bank",,,,,2025-08-11T00:00:00
2,IAEG-SDGs:DF_SDG_GLH(1.21),A,G,SI_POV_NAHC,4,_T,_T,_T,_T,_T,...,Source: National Statistics and Information Au...,,,,"Poverty and Inequality Platform, World Bank",,,,,2025-08-11T00:00:00
3,IAEG-SDGs:DF_SDG_GLH(1.21),A,G,SI_POV_NAHC,8,_T,_T,_T,_T,_T,...,Source: Albania: trends in poverty 2002-2005-2...,,,,"Poverty and Inequality Platform, World Bank",,,,,2025-08-11T00:00:00
4,IAEG-SDGs:DF_SDG_GLH(1.21),A,G,SI_POV_NAHC,8,_T,_T,_T,_T,_T,...,Source: Albania: trends in poverty 2002-2005-2...,,,,"Poverty and Inequality Platform, World Bank",,,,,2025-08-11T00:00:00


In [67]:
df_un.columns.tolist()

['DATAFLOW',
 'FREQ',
 'REPORTING_TYPE',
 'SERIES',
 'country_code',
 'SEX',
 'AGE',
 'URBANISATION',
 'INCOME_WEALTH_QUANTILE',
 'EDUCATION_LEV',
 'OCCUPATION',
 'CUST_BREAKDOWN',
 'COMPOSITE_BREAKDOWN',
 'DISABILITY_STATUS',
 'ACTIVITY',
 'PRODUCT',
 'year',
 'value',
 'OBS_STATUS',
 'UNIT_MULT',
 'UNIT_MEASURE',
 'BASE_PER',
 'NATURE',
 'TIME_DETAIL',
 'COMMENT_OBS',
 'TIME_COVERAGE',
 'UPPER_BOUND',
 'LOWER_BOUND',
 'SOURCE_DETAIL',
 'COMMENT_TS',
 'GEO_INFO_URL',
 'GEO_INFO_TYPE',
 'CUST_BREAKDOWN_LB',
 'DATA_LAST_UPDATE']

In [83]:
# Keep only relevant columns
df_un_simplified = df_un[['country_code', 'year', 'value']]

In [85]:
df_un_simplified.to_csv("un_data.csv", index=False)

In [87]:
df_un_simplified.sample

<bound method NDFrame.sample of       country_code  year  value
0                4  2007   33.7
1                4  2011   38.3
2                4  2016   54.5
3                8  2002   25.4
4                8  2005   18.5
...            ...   ...    ...
1040           887  2005   34.8
1041           887  2014   48.6
1042           894  2010   57.0
1043           894  2015   54.4
1044           894  2022   60.0

[1045 rows x 3 columns]>

In [91]:
df_un_simplified['country_code'].unique()

array([  4,   8,  12,  24,  31,  32,  40,  50,  51,  56,  64,  68,  70,
        72,  90, 100, 104, 108, 112, 116, 120, 132, 140, 144, 148, 152,
       156, 170, 174, 178, 180, 188, 191, 196, 203, 204, 208, 214, 218,
       222, 226, 231, 233, 242, 246, 250, 262, 266, 268, 270, 275, 276,
       288, 296, 300, 308, 320, 324, 332, 340, 348, 352, 356, 360, 368,
       372, 380, 384, 388, 398, 400, 404, 417, 418, 422, 426, 428, 430,
       440, 442, 450, 454, 458, 462, 466, 470, 478, 480, 484, 496, 498,
       499, 504, 508, 516, 524, 528, 548, 558, 562, 566, 578, 583, 584,
       585, 586, 591, 598, 600, 604, 608, 616, 620, 624, 626, 642, 643,
       646, 662, 678, 686, 688, 690, 694, 703, 704, 705, 706, 710, 716,
       724, 728, 729, 748, 752, 756, 760, 762, 764, 768, 776, 788, 792,
       798, 800, 804, 807, 818, 826, 834, 854, 858, 860, 862, 882, 887,
       894], dtype=int64)

In [95]:
import pandas as pd

# Suppose your DataFrame is df_un_simple with 'country_code', 'year', 'value'

# Full mapping of M49 codes to country names
country_map = {
    4: "Afghanistan", 8: "Albania", 12: "Algeria", 24: "Angola", 31: "Azerbaijan",
    32: "Argentina", 40: "Austria", 50: "Bangladesh", 51: "Armenia", 56: "Belgium",
    64: "Bhutan", 68: "Bolivia", 70: "Bosnia and Herzegovina", 72: "Botswana", 90: "Solomon Islands",
    100: "Bulgaria", 104: "Myanmar", 108: "Burundi", 112: "Belarus", 116: "Cambodia",
    120: "Cameroon", 132: "Cabo Verde", 140: "Central African Republic", 144: "Sri Lanka",
    148: "Chad", 152: "Chile", 156: "China", 170: "Colombia", 174: "Comoros", 178: "Congo",
    180: "Democratic Republic of the Congo", 188: "Costa Rica", 191: "Croatia", 196: "Cyprus",
    203: "Czechia", 204: "Benin", 208: "Denmark", 214: "Dominican Republic", 218: "Ecuador",
    222: "El Salvador", 226: "Equatorial Guinea", 231: "Ethiopia", 233: "Estonia",
    242: "Fiji", 246: "Finland", 250: "France", 262: "Djibouti", 266: "Gabon", 268: "Georgia",
    270: "Gambia", 275: "Palestine", 276: "Germany", 288: "Ghana", 296: "Kiribati",
    300: "Greece", 308: "Grenada", 320: "Guatemala", 324: "Guinea", 332: "Haiti",
    340: "Honduras", 348: "Hungary", 352: "Iceland", 356: "India", 360: "Indonesia",
    368: "Iraq", 372: "Ireland", 380: "Italy", 384: "Côte d’Ivoire", 388: "Jamaica",
    398: "Kazakhstan", 400: "Jordan", 404: "Kenya", 417: "Kyrgyzstan", 418: "Lao PDR",
    422: "Lebanon", 426: "Lesotho", 428: "Latvia", 430: "Liberia", 440: "Lithuania",
    442: "Luxembourg", 450: "Madagascar", 454: "Malawi", 458: "Malaysia", 462: "Maldives",
    466: "Mali", 470: "Malta", 478: "Mauritania", 480: "Mauritius", 484: "Mexico",
    496: "Mongolia", 498: "Republic of Moldova", 499: "Montenegro", 504: "Morocco",
    508: "Mozambique", 516: "Namibia", 524: "Nepal", 528: "Netherlands", 548: "Vanuatu",
    558: "Nicaragua", 562: "Niger", 566: "Nigeria", 578: "Norway", 583: "Micronesia",
    584: "Marshall Islands", 585: "Palau", 586: "Pakistan", 591: "Panama", 598: "Papua New Guinea",
    600: "Paraguay", 604: "Peru", 608: "Philippines", 616: "Poland", 620: "Portugal",
    624: "Guinea-Bissau", 626: "Timor-Leste", 642: "Romania", 643: "Russian Federation",
    646: "Rwanda", 662: "Saint Lucia", 678: "Sao Tome and Principe", 686: "Senegal",
    688: "Serbia", 690: "Seychelles", 694: "Sierra Leone", 703: "Slovakia", 704: "Vietnam",
    705: "Slovenia", 706: "Somalia", 710: "South Africa", 716: "Zimbabwe", 724: "Spain",
    728: "South Sudan", 729: "Sudan", 748: "Eswatini", 752: "Sweden", 756: "Switzerland",
    760: "Syrian Arab Republic", 762: "Tajikistan", 764: "Thailand", 768: "Togo",
    776: "Tonga", 788: "Tunisia", 792: "Türkiye", 798: "Tuvalu", 800: "Uganda",
    804: "Ukraine", 807: "North Macedonia", 818: "Egypt", 826: "United Kingdom",
    834: "Tanzania", 854: "Burkina Faso", 858: "Uruguay", 860: "Uzbekistan",
    862: "Venezuela", 882: "Samoa", 887: "Yemen", 894: "Zambia"
}

# Map country codes to names
df_un_simple['country'] = df_un_simple['country_code'].map(country_map)

# Drop rows where mapping failed (regions, unknowns)
df_un_simple = df_un_simple.dropna(subset=['country'])

df_un_simple.head()


NameError: name 'df_un_simple' is not defined