In [1]:
import pandas as pd

from wsi.mapping.iso_region import CODE_SUBREGION
from wsi.utils import raw_data_path, processed_data_path

## World Bank classifications

In [None]:
# Load the World Bank country classification workbook.
df = pd.read_excel(raw_data_path("mapping", "CLASS.xlsx"))

# Region overrides applied on top of the World Bank sheet:
#   - Malta (MLT): not Middle East & North Africa -> Europe & Central Asia
#   - Bermuda (BMU): -> Latin America & Caribbean
df['Region'] = df['Region'].mask(df['Code'].eq('MLT'), 'Europe & Central Asia')
df['Region'] = df['Region'].mask(df['Code'].eq('BMU'), 'Latin America & Caribbean')

# Attach our own subregion, looked up from the ISO code via CODE_SUBREGION.
df['Subregion'] = df['Code'].map(CODE_SUBREGION)

# Keep only the rows that carry a Region value.
df = df[df['Region'].notna()]

In [None]:
# 1) Count the distinct parent Regions each Subregion appears under,
#    then keep the Subregions that sit under more than one Region.
subregion_counts = (
    df.groupby('Subregion')['Region']
      .nunique()
      .rename('n_regions')
      .reset_index()
)
bad_subregions = subregion_counts['Subregion'][subregion_counts['n_regions'].gt(1)]

# 2) Restrict the country table to members of those ambiguous Subregions.
inconsistent_countries = df.loc[df['Subregion'].isin(bad_subregions)]

# 3) Print the offending countries with their Region and Subregion
#    (an empty frame means every Subregion maps to a single Region).
result = (
    inconsistent_countries
    .loc[:, ['Economy', 'Code', 'Region', 'Subregion']]
    .reset_index(drop=True)
)
print(result)


Empty DataFrame
Columns: [Economy, Code, Region, Subregion]
Index: []


In [None]:
def summarize_by_subregion(df):
    """Collapse a country-level table into one row per (Region, Subregion).

    Args:
        df: DataFrame with 'Economy', 'Code', 'Region' and 'Subregion' columns.

    Returns:
        DataFrame with columns 'Region', 'Subregion' and 'Economies', one row
        per group in sorted key order. 'Economies' is a comma-separated string
        of "Name (Code)" entries sorted by economy name. Rows whose Region or
        Subregion is missing are left out (groupby's default NaN handling).
        An empty input yields an empty frame with the same three columns.
    """
    def fmt_economies(subdf):
        # format "Name (Code)", sorted by name
        pairs = sorted(zip(subdf['Economy'], subdf['Code']), key=lambda x: x[0])
        return ', '.join(f"{name} ({code})" for name, code in pairs)

    # Iterate over the groups instead of calling groupby.apply: apply operating
    # on the grouping columns is deprecated in recent pandas and emits a
    # warning, and its return shape is ill-defined when there are no groups.
    rows = [
        {'Region': region, 'Subregion': subregion, 'Economies': fmt_economies(group)}
        for (region, subregion), group in df.groupby(['Region', 'Subregion'])
    ]
    # Passing columns explicitly keeps the schema stable even when rows is empty.
    summary = pd.DataFrame(rows, columns=['Region', 'Subregion', 'Economies'])
    return summary

# Build the per-(Region, Subregion) economies table from the country frame.
summary = df.pipe(summarize_by_subregion)

  .apply(lambda g: pd.Series({


In [8]:
# Show the Region/Subregion summary table as notebook output for a visual check.
display(summary)

Unnamed: 0,Region,Subregion,Economies
0,East Asia & Pacific,Australia and New Zealand,"Australia (AUS), New Zealand (NZL)"
1,East Asia & Pacific,East Asia,"China (CHN), Hong Kong SAR, China (HKG), Japan..."
2,East Asia & Pacific,Melanesia,"Fiji (FJI), New Caledonia (NCL), Papua New Gui..."
3,East Asia & Pacific,Micronesia,"Guam (GUM), Kiribati (KIR), Marshall Islands (..."
4,East Asia & Pacific,Polynesia,"American Samoa (ASM), French Polynesia (PYF), ..."
5,East Asia & Pacific,Southeast Asia,"Brunei Darussalam (BRN), Cambodia (KHM), Indon..."
6,Europe & Central Asia,Central Asia,"Kazakhstan (KAZ), Kyrgyz Republic (KGZ), Tajik..."
7,Europe & Central Asia,Eastern Europe,"Belarus (BLR), Bulgaria (BGR), Czechia (CZE), ..."
8,Europe & Central Asia,Northern Europe,"Denmark (DNK), Estonia (EST), Faroe Islands (F..."
9,Europe & Central Asia,Southern Europe,"Albania (ALB), Andorra (AND), Bosnia and Herze..."


In [9]:
# Write the summary table as CSV (without the row index) to the location
# returned by processed_data_path.
summary.to_csv(
    path_or_buf=processed_data_path("region_subregion_countries.csv"),
    index=False,
)

### Overwrites

Changes to Region:
- Bermuda (BMU) [Region was North America at WB -> we have Latin America & Caribbean]
- Malta (MLT) [Region was Middle East & North Africa at WB -> we have Europe & Central Asia]

Changes to Subregion (now updated in mapping):
- West Bank and Gaza (PSE) [Subregion was Western Asia -> Middle East]
- Sudan (SDN) [Subregion was Northern Africa -> Eastern Africa]
- Djibouti (DJI) [Subregion was Eastern Africa -> Northern Africa]