In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import geopandas as gpd


## Extracting Values

In [2]:
# Load the GeoJSON file from World Bank
world = gpd.read_file('..\Data_Sets\originalData\WB_countries_Admin0.geojson')

In [3]:
df_countriesSize = world[['WB_NAME', 'Shape_Leng', 'Shape_Area', 'SUBREGION']]

Renaming regions to have 2 levels of grouping

In [4]:
df_countriesSize['REGION'] = df_countriesSize['SUBREGION']

df_countriesSize['REGION'].replace(
    {
    'South-Eastern Asia': 'SEA_Oceania',
    'Australia and New Zealand': 'SEA_Oceania',
    'Melanesia': 'SEA_Oceania',
    'Micronesia': 'SEA_Oceania',
    'Polynesia': 'SEA_Oceania',
    'South America': 'SouthAmerica',
    'Central America': 'CentralAmerica_Caribbean',
    'Caribbean': 'CentralAmerica_Caribbean',
    'Northern America': 'NorthAmerica',
    'Eastern Europe': 'EasternEurope',
    'Western Europe': 'WesternEurope',
    'Northern Europe': 'WesternEurope',
    'Southern Europe': 'WesternEurope',
    'Eastern Asia': 'CentralAsia',
    'Central Asia': 'CentralAsia',
    'Southern Asia': 'SouthAsia',
    'Eastern Africa': 'SubSaharanAfrica',
    'Middle Africa': 'SubSaharanAfrica',
    'Southern Africa': 'SubSaharanAfrica',
    'Western Africa': 'SubSaharanAfrica',
    'Northern Africa': 'NorthAfrica_MiddleEast',
    'Western Asia': 'NorthAfrica_MiddleEast',
    'Seven seas (open ocean)': 'SS'
    }, inplace=True)

df_countriesSize = df_countriesSize[df_countriesSize['REGION'] != 'SS']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_countriesSize['REGION'] = df_countriesSize['SUBREGION']
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_countriesSize['REGION'].replace(


Let's delete countries we don't need in the database

In [5]:
#Renaming Countries from df_WB to df_EF standard
df_countriesSize.loc[df_countriesSize["WB_NAME"] == "Czechia", "WB_NAME"] = "Czech Republic"
df_countriesSize.loc[df_countriesSize["WB_NAME"] == "Cabo Verde", "WB_NAME"] = "Cape Verde"
df_countriesSize.loc[df_countriesSize["WB_NAME"] == "Congo, Democratic Republic of", "WB_NAME"] = "Democratic Republic of Congo"
df_countriesSize.loc[df_countriesSize["WB_NAME"] == "Congo, Rep. of", "WB_NAME"] = "Republic of Congo"
df_countriesSize.loc[df_countriesSize["WB_NAME"] == "Egypt, Arab Republic of", "WB_NAME"] = "Egypt"
df_countriesSize.loc[df_countriesSize["WB_NAME"] == "Hong Kong (SAR, China)", "WB_NAME"] = "Hong Kong"
df_countriesSize.loc[df_countriesSize["WB_NAME"] == "Iran, Islamic Republic of", "WB_NAME"] = "Iran"
df_countriesSize.loc[df_countriesSize["WB_NAME"] == "Lao People's Democratic Republic", "WB_NAME"] = "Laos"
df_countriesSize.loc[df_countriesSize["WB_NAME"] == "Macau (SAR, China)", "WB_NAME"] = "Macau"
df_countriesSize.loc[df_countriesSize["WB_NAME"] == "Micronesia, Federated States of", "WB_NAME"] = "Micronesia"
df_countriesSize.loc[df_countriesSize["WB_NAME"] == "Korea, Democratic People's Republic of", "WB_NAME"] = "North Korea"
df_countriesSize.loc[df_countriesSize["WB_NAME"] == "Korea, Republic of", "WB_NAME"] = "South Korea"
df_countriesSize.loc[df_countriesSize["WB_NAME"] == "Russian Federation", "WB_NAME"] = "Russia"
df_countriesSize.loc[df_countriesSize["WB_NAME"] == "St. Lucia", "WB_NAME"] = "Saint Lucia"
df_countriesSize.loc[df_countriesSize["WB_NAME"] == "St. Vincent and the Grenadines", "WB_NAME"] = "Saint Vincent and the Grenadines"
df_countriesSize.loc[df_countriesSize["WB_NAME"] == "Syrian Arab Republic", "WB_NAME"] = "Syria"
df_countriesSize.loc[df_countriesSize["WB_NAME"] == "Bahamas, The", "WB_NAME"] = "The Bahamas"
df_countriesSize.loc[df_countriesSize["WB_NAME"] == "Gambia, The", "WB_NAME"] = "The Gambia"
df_countriesSize.loc[df_countriesSize["WB_NAME"] == "Venezuela, Republica Bolivariana de", "WB_NAME"] = "Venezuela"
df_countriesSize.loc[df_countriesSize["WB_NAME"] == "Yemen, Republic of", "WB_NAME"] = "Yemen"
df_countriesSize.loc[df_countriesSize["WB_NAME"] == "São Tomé and Príncipe", "WB_NAME"] = "Sao Tome and Principe"
df_countriesSize.loc[df_countriesSize["WB_NAME"] == "Turkiye", "WB_NAME"] = "Turkey"
df_countriesSize.loc[df_countriesSize["WB_NAME"] == "Côte d'Ivoire", "WB_NAME"] = "Cote d'Ivoire"
df_countriesSize.loc[df_countriesSize["WB_NAME"] == "United States of America", "WB_NAME"] = "United States"
df_countriesSize.loc[df_countriesSize["WB_NAME"] == "eSwatini", "WB_NAME"] = "Eswatini"

In [6]:
# importing our data set, to compare country names
df_nameRef = pd.read_csv("..\Data_Sets\processed\economicData_1960-2022_noNaN-drops.csv") # Processed Data Set from an earlier notebook

eligibleCountries = df_nameRef["Country Name"].unique()
filtered_df_WB = df_countriesSize[df_countriesSize['WB_NAME'].isin(eligibleCountries)]

print('original names count  :', df_nameRef["Country Name"].nunique())
print('WB_beforeFilter count :', df_countriesSize['WB_NAME'].nunique())
print('filtered count        :', filtered_df_WB['WB_NAME'].nunique())

original names count  : 185
WB_beforeFilter count : 236
filtered count        : 185


In [7]:
# Let's rename some columns for consistency
filtered_df_WB.rename(columns={
    'WB_NAME': 'Country Name',
    'REGION': 'Region',
    'SUBREGION': 'SubRegion'
    }, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df_WB.rename(columns={


In [8]:
filtered_df_WB.to_csv('..\Data_Sets\processed\\addData_countriesSize_Regions.csv', index = False)