In [3]:
import pandas as pd
import requests
import json
from pprint import pprint
import matplotlib.pyplot as plt
import math


#This was a map option that I found first, but required new libraries
# # https://towardsdatascience.com/how-to-visualize-data-on-top-of-a-map-in-python-using-the-geoviews-library-part-2-e61a48ee6c3d
# import geopandas as gpd
# import geoviews as gv
# import geoviews.tile_sources as gvts
# from geoviews import opts
# gv.extension('bokeh')

#THis was a library for the above mapping, but could not get to work
# # https://github.com/Toblerity/Fiona
# import fiona
# from fiona import Feature, Geometry
# from shapely.geometry import mapping, shape
# import warnings
# warnings.filterwarnings('ignore')

#Pull in API key from personal file
from api_keys import geoapify_key

In [4]:

#Read in our base dataset as a dataframe
world_data_df = pd.read_csv('../Datasets/world_population.csv')

#set parameters for later API searches on GeoApify
base_url = "https://api.geoapify.com/v1/geocode/search"
params = {
    "apiKey":geoapify_key,
    "format":"json",
    "limit": 1
}

#Add columns for geoapify country name and iso_alpha to normalize across datasets
world_data_df['geo_country'] = ""
world_data_df['iso_two'] = ""

#Briefly explored this as a better way to add columns, but removed because it kept adding if code executed again
# world_data_df = world_data_df.reindex(columns = world_data_df.columns.tolist() + ['geo_country'])
# world_data_df = world_data_df.reindex(columns = world_data_df.columns.tolist() + ['iso_two'])
# # world_data_df = world_data_df.reindex(columns = world_data_df.columns.tolist() + ['iso_three'])

#Look at first five rows of base dataset to confirm new columns
world_data_df.head()

Unnamed: 0,Rank,CCA3,Country/Territory,Capital,Continent,2022 Population,2020 Population,2015 Population,2010 Population,2000 Population,1990 Population,1980 Population,1970 Population,Area (km²),Density (per km²),Growth Rate,World Population Percentage,geo_country,iso_two
0,36,AFG,Afghanistan,Kabul,Asia,41128771,38972230,33753499,28189672,19542982,10694796,12486631,10752971,652230,63.0587,1.0257,0.52,,
1,138,ALB,Albania,Tirana,Europe,2842321,2866849,2882481,2913399,3182021,3295066,2941651,2324731,28748,98.8702,0.9957,0.04,,
2,34,DZA,Algeria,Algiers,Africa,44903225,43451666,39543154,35856344,30774621,25518074,18739378,13795915,2381741,18.8531,1.0164,0.56,,
3,213,ASM,American Samoa,Pago Pago,Oceania,44273,46189,51368,54849,58230,47818,32886,27075,199,222.4774,0.9831,0.0,,
4,203,AND,Andorra,Andorra la Vella,Europe,79824,77700,71746,71519,66097,53569,35611,19860,468,170.5641,1.01,0.0,,


In [5]:
#This code was used to test specific API pulls to see how the errors and duplication were occuring in the data


#Manual country input to trace errors/dupes
params['country'] = 'New Caledonia'
current_country = requests.get(base_url, params).json()

pprint(current_country)

{'query': {'parsed': {'country': 'New Caledonia', 'expected_type': 'unknown'},
           'text': ''},
 'results': [{'address_line1': 'France',
              'address_line2': '',
              'archipelago': 'New Caledonia',
              'bbox': {'lat1': -23.2217509,
                       'lat2': -17.6868616,
                       'lon1': 157.9847541,
                       'lon2': 172.3057152},
              'category': 'administrative',
              'country': 'France',
              'country_code': 'fr',
              'datasource': {'attribution': '© OpenStreetMap contributors',
                             'license': 'Open Database License',
                             'sourcename': 'openstreetmap',
                             'url': 'https://www.openstreetmap.org/copyright'},
              'formatted': 'France',
              'lat': -21.3019905,
              'lon': 165.4880773,
              'place_id': '51702d49549eaf644059e333d93f4f4d35c0f00101f9011bff330000000000c0020b',

In [6]:


# Iterate dataset to remove non-countries and normalize name syntax and iso key
for index, row in world_data_df.iterrows():

    #Set country parameter for current row and call API
    params['country'] = world_data_df.loc[index, 'Country/Territory']
    current_country = requests.get(base_url, params).json()
    
    # Attempt to add proper syntax to geo_country and iso_two in base dataframe
    try: 
        world_data_df.loc[index, 'geo_country'] = current_country['results'][0]['country']
        world_data_df.loc[index, 'iso_two'] = current_country['results'][0]['country_code']
        # Show progress for each loop
        print(f"Indexing: {world_data_df.loc[index, 'geo_country']} as {world_data_df.loc[index, 'iso_two']} and confidence: {current_country['results'][0]['rank']['confidence']}")

        #If the serach confidence is zero, drop the row
        if current_country['results'][0]['rank']['confidence'] == 0:
            world_data_df = world_data_df.drop(labels=index, axis=0)

    #If the above try fails with no country/iso, delete the row
    except IndexError:
        print(f"Dropping: {current_country}")
        world_data_df = world_data_df.drop(labels=index, axis=0)
        


    


Indexing: Afghanistan as af and confidence: 1
Indexing: Albania as al and confidence: 1
Indexing: Algeria as dz and confidence: 1
Dropping: {'results': [], 'query': {'text': '', 'parsed': {'country': 'American Samoa', 'expected_type': 'unknown'}}}
Indexing: Andorra as ad and confidence: 1
Indexing: Angola as ao and confidence: 1
Indexing: Anguilla as ai and confidence: 1
Indexing: Antigua and Barbuda as ag and confidence: 1
Indexing: Argentina as ar and confidence: 1
Indexing: Armenia as am and confidence: 1
Indexing: Aruba as aw and confidence: 1
Indexing: Australia as au and confidence: 1
Indexing: Austria as at and confidence: 1
Indexing: Azerbaijan as az and confidence: 1
Indexing: The Bahamas as bs and confidence: 1
Indexing: Bahrain as bh and confidence: 1
Indexing: Bangladesh as bd and confidence: 1
Indexing: Barbados as bb and confidence: 1
Indexing: Belarus as by and confidence: 1
Indexing: Belgium as be and confidence: 1
Indexing: Belize as bz and confidence: 1
Indexing: Beni

Indexing: Rwanda as rw and confidence: 1
Indexing: St. Lucia as lc and confidence: 0
Indexing: Saint Kitts and Nevis as kn and confidence: 1
Indexing: Saint Lucia as lc and confidence: 1
Indexing: Sint Maarten as sx and confidence: 0
Dropping: {'results': [], 'query': {'text': '', 'parsed': {'country': 'Saint Pierre and Miquelon', 'expected_type': 'unknown'}}}
Indexing: Saint Vincent and the Grenadines as vc and confidence: 1
Indexing: Samoa as ws and confidence: 1
Indexing: San Marino as sm and confidence: 1
Indexing: São Tomé and Príncipe as st and confidence: 1
Indexing: Saudi Arabia as sa and confidence: 1
Indexing: Senegal as sn and confidence: 1
Indexing: Serbia as rs and confidence: 1
Indexing: Seychelles as sc and confidence: 1
Indexing: Sierra Leone as sl and confidence: 1
Indexing: Singapore as sg and confidence: 1
Indexing: Sint Maarten as sx and confidence: 1
Indexing: Slovakia as sk and confidence: 1
Indexing: Slovenia as si and confidence: 1
Indexing: Solomon Islands as s

In [7]:
# https://github.com/stefangabos/world_countries/blob/master/data/countries/_combined/countries.csv

# Bring in the countrynames dataset to merge on the two character iso
countrynames = pd.read_csv('../Datasets/countries.csv')

#Rename columns and reduce dataset to just the iso country codes
countrynames.rename(columns={'alpha2': 'iso_two', 'alpha3': 'iso_three'}, inplace=True)
countrynames = countrynames[['iso_two', 'iso_three']]

#Merge the base dataset with the new one containing both iso codes. Merge on two-character iso
countryiso = pd.merge(world_data_df, countrynames, on='iso_two', how='left')
countryiso['iso_two'].value_counts()
countryiso.head()

Unnamed: 0,Rank,CCA3,Country/Territory,Capital,Continent,2022 Population,2020 Population,2015 Population,2010 Population,2000 Population,1990 Population,1980 Population,1970 Population,Area (km²),Density (per km²),Growth Rate,World Population Percentage,geo_country,iso_two,iso_three
0,36,AFG,Afghanistan,Kabul,Asia,41128771,38972230,33753499,28189672,19542982,10694796,12486631,10752971,652230,63.0587,1.0257,0.52,Afghanistan,af,afg
1,138,ALB,Albania,Tirana,Europe,2842321,2866849,2882481,2913399,3182021,3295066,2941651,2324731,28748,98.8702,0.9957,0.04,Albania,al,alb
2,34,DZA,Algeria,Algiers,Africa,44903225,43451666,39543154,35856344,30774621,25518074,18739378,13795915,2381741,18.8531,1.0164,0.56,Algeria,dz,dza
3,203,AND,Andorra,Andorra la Vella,Europe,79824,77700,71746,71519,66097,53569,35611,19860,468,170.5641,1.01,0.0,Andorra,ad,and
4,42,AGO,Angola,Luanda,Africa,35588987,33428485,28127721,23364185,16394062,11828638,8330047,6029700,1246700,28.5466,1.0315,0.45,Angola,ao,ago


In [8]:
#Check for all rows having same CCA3 and three-character iso
for index, row in countryiso.iterrows():
    if countryiso.loc[index, 'CCA3'].lower() != countryiso.loc[index, 'iso_three']:
        print(f"{countryiso.loc[index, 'geo_country']} does not have matching iso codes: CCA3 {countryiso.loc[index, 'CCA3']} vs. iso_three {countryiso.loc[index, 'iso_three']}")
        #Delete the row if iso codes don't match
        countryiso = countryiso.drop(labels=index, axis=0)

Anguilla does not have matching iso codes: CCA3 AIA vs. iso_three nan
Aruba does not have matching iso codes: CCA3 ABW vs. iso_three nan
Bermuda does not have matching iso codes: CCA3 BMU vs. iso_three nan
British Virgin Islands does not have matching iso codes: CCA3 VGB vs. iso_three nan
Cayman Islands does not have matching iso codes: CCA3 CYM vs. iso_three nan
Cook Islands does not have matching iso codes: CCA3 COK vs. iso_three nan
Curacao does not have matching iso codes: CCA3 CUW vs. iso_three nan
Falkland Islands does not have matching iso codes: CCA3 FLK vs. iso_three nan
Faroe Islands does not have matching iso codes: CCA3 FRO vs. iso_three nan
Gibraltar does not have matching iso codes: CCA3 GIB vs. iso_three nan
Greenland does not have matching iso codes: CCA3 GRL vs. iso_three nan
Guernsey does not have matching iso codes: CCA3 GGY vs. iso_three nan
Hong Kong S.A.R. does not have matching iso codes: CCA3 HKG vs. iso_three nan
Isle of Man does not have matching iso codes: CC

In [None]:

# world_data_df.loc[world_data_df['geo_country'] == 'France']

In [None]:
# world_data_df = world_data_df.rename(columns={'Country/Territory': 'country'})


# world_data = pd.merge(world_data_df, mapcheck, on='geo_country', how='inner')

# world_data['country_y'].value_counts()
# for index, row in world_data.iterrows():
#     if world_data.loc[index, 'country_x'] != world_data.loc[index, 'country_y']:
#         print(world_data.loc[index, 'country_x'])
        


# duplicates = world_data.loc[world_data.duplicated(subset=['country_x', 'country_y']), 'country_x'].unique()

# duplicate_df = []
# for x in range(len(duplicates)):
#     countryid = duplicates[x]
#     for y in range(len(world_data)):
#         if countryid == world_data.loc[y, 'country_x']:
#             duplicate_df.append(world_data.loc[y, :])
        
# duplicate_all = pd.DataFrame(duplicate_df)
# duplicate_all

# for x in range(len(duplicates)):
#     countryid = duplicates[x]
#     for y in range(len(world_data)):
#         if countryid == world_data.loc[y, 'country_x']:
#             world_data.drop([y], inplace=True)

# duplicates = world_data.loc[world_data.duplicated(subset=['country_x', 'country_y']), 'country_x'].unique()

In [9]:
#Check to see if API call was useful. It was.
for index, row in world_data_df.iterrows():
    if world_data_df.loc[index, 'Country/Territory'] != world_data_df.loc[index, 'geo_country']:
        print(world_data_df.loc[index, 'Country/Territory'])

Bahamas
Czech Republic
DR Congo
Gambia
Hong Kong
Ivory Coast
Macau
Micronesia
Palestine
Republic of the Congo
Sao Tome and Principe
Timor-Leste


In [10]:
# This was just playing with the datframe trying to find other ways to chart/graph it.
# Didn't lead anywhere yet
world_data_list = []
for index, row in world_data_df.iterrows():
    world_data_list.append([world_data_df.loc[index, '1970 Population'], world_data_df.loc[index, '1980 Population'], world_data_df.loc[index, '1990 Population'],
                            world_data_df.loc[index, '2000 Population'], world_data_df.loc[index, '2010 Population'], world_data_df.loc[index, '2015 Population'],
                            world_data_df.loc[index, '2020 Population'], world_data_df.loc[index, '2022 Population']])

world_data_list
world_data_graph = pd.DataFrame({'country': world_data_df['geo_country'],
                                'area': world_data_df['Area (km²)'],
                                 'continent': world_data_df['Continent'],
                                'population': world_data_list})



In [None]:
# https://coderzcolumn.com/tutorials/data-science/how-to-convert-static-maps-geopandas-to-interactive-maps-hvplot
# https://towardsdatascience.com/how-to-visualize-data-on-top-of-a-map-in-python-using-the-geoviews-library-part-2-e61a48ee6c3d

# shapes_df = gpd.read_file('map_shapes/ne_50m_admin_0_countries.shp', driver='ESRI Shapefile')

In [11]:
import plotly.express as px

#Mapping figure from GET LINK
# dataframe is first variable, columns are customizations.
fig = px.choropleth(country_cleaned, locations="iso_map",
                    color="growth_rate_10_22", 
                    hover_name="country",
                    color_continuous_scale=px.colors.sequential.PuBu)
fig.show()

NameError: name 'country_cleaned' is not defined

In [12]:
# Clean the new dataframe 

countryiso.head()
countryiso.rename(columns={'Country/Territory': 'org_country',
                          'CCA3': 'iso_map',
                          '2022 Population': '2022_pop',
                           '2020 Population': '2020_pop',
                           '2015 Population': '2015_pop',
                           '2010 Population': '2010_pop',
                           '2000 Population': '2000_pop',
                           '1990 Population': '1990_pop',
                           '1980 Population': '1980_pop',
                           '1970 Population': '1970_pop',
                          'Area (km²)': 'area (km)',
                          'Density (per km²)': 'density',
                          'World Population Percentage': 'percent_pop',
                          'geo_country': 'country'}, inplace=True)

country_cleaned = countryiso.reindex(columns=['country', 'iso_map', '2022_pop', '2020_pop', '2015_pop', '2010_pop', '2000_pop', 
                            '1990_pop', '1980_pop', '1970_pop', 'area (km)', 'density', 'iso_two', 'iso_three',
                           'Capital', 'Continent'])

country_cleaned.head()

Unnamed: 0,country,iso_map,2022_pop,2020_pop,2015_pop,2010_pop,2000_pop,1990_pop,1980_pop,1970_pop,area (km),density,iso_two,iso_three,Capital,Continent
0,Afghanistan,AFG,41128771,38972230,33753499,28189672,19542982,10694796,12486631,10752971,652230,63.0587,af,afg,Kabul,Asia
1,Albania,ALB,2842321,2866849,2882481,2913399,3182021,3295066,2941651,2324731,28748,98.8702,al,alb,Tirana,Europe
2,Algeria,DZA,44903225,43451666,39543154,35856344,30774621,25518074,18739378,13795915,2381741,18.8531,dz,dza,Algiers,Africa
3,Andorra,AND,79824,77700,71746,71519,66097,53569,35611,19860,468,170.5641,ad,and,Andorra la Vella,Europe
4,Angola,AGO,35588987,33428485,28127721,23364185,16394062,11828638,8330047,6029700,1246700,28.5466,ao,ago,Luanda,Africa


In [13]:
country_cleaned['growth_rate_10_22'] = (((country_cleaned['2022_pop'] / country_cleaned['2010_pop']) ** (1/12)) - 1)  * 100
# https://ca.indeed.com/career-advice/career-development/how-to-calculate-growth-rate
country_cleaned.head()


country_cleaned.to_csv('../Datasets/cleaned_base.csv')
country_cleaned.head()

Unnamed: 0,country,iso_map,2022_pop,2020_pop,2015_pop,2010_pop,2000_pop,1990_pop,1980_pop,1970_pop,area (km),density,iso_two,iso_three,Capital,Continent,growth_rate_10_22
0,Afghanistan,AFG,41128771,38972230,33753499,28189672,19542982,10694796,12486631,10752971,652230,63.0587,af,afg,Kabul,Asia,3.198007
1,Albania,ALB,2842321,2866849,2882481,2913399,3182021,3295066,2941651,2324731,28748,98.8702,al,alb,Tirana,Europe,-0.205617
2,Algeria,DZA,44903225,43451666,39543154,35856344,30774621,25518074,18739378,13795915,2381741,18.8531,dz,dza,Algiers,Africa,1.892596
3,Andorra,AND,79824,77700,71746,71519,66097,53569,35611,19860,468,170.5641,ad,and,Andorra la Vella,Europe,0.919712
4,Angola,AGO,35588987,33428485,28127721,23364185,16394062,11828638,8330047,6029700,1246700,28.5466,ao,ago,Luanda,Africa,3.569151
