In [5]:
import pandas as pd
import glob
import os

In [6]:
DATA_DIR = "processed/main"   # folder with 3 wide CSVs
REGION_COLUMN = "Region"

In [7]:
csv_files = glob.glob(os.path.join(DATA_DIR, "*.csv"))
csv_files
print("CSV files found:")
for f in csv_files:
    print(" -", os.path.basename(f))


CSV files found:
 - iiasa_ssp_2024.csv
 - IPCC_AR6_Scenarios_Database_ISO.csv
 - NGFS_Phase-5.csv


In [125]:
def read_csv_safe(path):
    try:
        return pd.read_csv(path, encoding="utf-8", low_memory=False)
    except UnicodeDecodeError:
        return pd.read_csv(path, encoding="latin1", low_memory=False)

In [126]:
dfs = []

for file in csv_files:
    df = read_csv_safe(file)
    df["__source_file__"] = os.path.basename(file)
    dfs.append(df)

combined_df = pd.concat(dfs, ignore_index=True)

print(f"\nTotal rows loaded: {len(combined_df):,}")



Total rows loaded: 4,631,315


In [127]:
unique_regions = (
    combined_df[REGION_COLUMN]
    .dropna()
    .astype(str)
    .str.strip()
    .unique()
)

unique_regions = sorted(unique_regions)

print(f"Total unique regions found: {len(unique_regions)}\n")

for r in unique_regions:
    print(r)

Total unique regions found: 542

ABW
AFG
AGO
ALB
ARE
ARG
ARM
ATG
AUS
AUT
AZE
Afghanistan
Albania
Algeria
American Samoa
Andorra
Angola
Antigua and Barbuda
Argentina
Armenia
Aruba
Asia (R5)
Australia
Austria
Azerbaijan
BDI
BEL
BEN
BFA
BGD
BGR
BHR
BHS
BIH
BLR
BLZ
BOL
BRA
BRB
BRN
BTN
BWA
Bahamas
Bahrain
Bangladesh
Barbados
Belarus
Belgium
Belize
Benin
Bhutan
Bolivia
Bonaire, Sint Eustatius and Saba
Bosnia and Herzegovina
Botswana
Brazil
British Virgin Islands
Brunei Darussalam
Bulgaria
Burkina Faso
Burundi
C?te d'Ivoire
CAF
CAN
CHE
CHL
CHN
CIV
CMR
COD
COG
COL
COM
CPV
CRI
CUB
CYP
CZE
Cabo Verde
Cambodia
Cameroon
Canada
Central African Republic
Chad
Chile
China
Colombia
Comoros
Congo
Cook Islands
Costa Rica
Croatia
Cuba
Cura?ao
Cyprus
Czechia
DEU
DJI
DNK
DOM
DZA
Democratic Republic of the Congo
Denmark
Djibouti
Dominica
Dominican Republic
Downscaling|Countries without IEA statistics
ECU
EGY
ERI
ESH
ESP
EST
ETH
EU
EU27
Ecuador
Egypt
El Salvador
Equatorial Guinea
Eritrea
Estonia
Eswatini
Ethi

In [128]:
# Load ISO-3 country reference (AUTOMATIC)
import pycountry

In [129]:
def fix_utf8(text):
    if not isinstance(text, str):
        return text
    try:
        return text.encode("latin1").decode("utf-8")
    except (UnicodeEncodeError, UnicodeDecodeError):
        return text

In [130]:
ISO3_TO_NAME = {
    country.alpha_3: country.name
    for country in pycountry.countries
}

In [131]:
# Manual fixes (ONLY where needed)
MANUAL_COUNTRY_FIXES = {
    "United States of America": "United States",
    "Russian Federation": "Russia",
    "Iran, Islamic Republic of": "Iran",
    "Côte d'Ivoire": "Côte d’Ivoire",
    "Bolivia, Plurinational State of": "Bolivia",
    "Korea, Republic of": "South Korea",
    "Korea, Democratic People's Republic of": "North Korea",
}


In [132]:
# (C?te d'Ivoire, R?union
ENCODING_FIXES = {
    
    "C?te d'Ivoire": "Côte d’Ivoire",
    "R?union": "Réunion",
    "Cura?ao": "Curaçao",
    
    # ---- Turkey ----
    "TUR": "Türkiye",
    "Turkey": "Türkiye",
    "Türkiye": "Türkiye",
    "TÃ¼rkiye": "Türkiye",

    # ---- Vietnam ----
    "VNM": "Vietnam",
    "Viet Nam": "Vietnam",

    # ---- Russia ----
    "RUS": "Russia",
    "Russian Federation": "Russia",

    # ---- Taiwan ----
    "TWN": "Taiwan",
    "Taiwan, Province of China": "Taiwan",
    "Taiwan": "Taiwan",

    # ---- Tanzania ----
    "TZA": "Tanzania",
    "Tanzania, United Republic of": "Tanzania",
    "Tanzania": "Tanzania",

    # ---- Venezuela ----
    "VEN": "Venezuela",
    "Venezuela, Bolivarian Republic of": "Venezuela",
    "Venezuela": "Venezuela",

    # ---- Syria ----
    "SYR": "Syria",
    "Syrian Arab Republic": "Syria",
    "Syria": "Syria",

    # ---- COD COG ----
    "COD": "Congo, The Democratic Republic of the",
    "Democratic Republic of the Congo": "Democratic Republic of the Congo",
    "COG": "Congo",

}


In [133]:
# Model-specific region parsing
def normalize_model_region(region):
    if "|" in region:
        model, reg = region.split("|", 1)
        return f"{reg.replace('_', ' ')} ({model.split()[0]})"
    return region


In [134]:
# Master normalization function (THIS IS THE CORE)
def normalize_region(region):
    if not region or not isinstance(region, str):
        return region

    # region = region.strip()
    region = fix_utf8(region.strip())
    # Encoding cleanup
    region = ENCODING_FIXES.get(region, region)

    # ISO-3 → country name
    if region in ISO3_TO_NAME:
        region = ISO3_TO_NAME[region]

    # Manual country fixes
    region = MANUAL_COUNTRY_FIXES.get(region, region)

    # Model-prefixed regions
    region = normalize_model_region(region)

    return region


In [135]:
# Apply to 542 regions
normalized_regions = {
    r: normalize_region(r)
    for r in unique_regions
}
# Show sample
for k in list(normalized_regions)[:30]:
    print(f"{k} → {normalized_regions[k]}")


ABW → Aruba
AFG → Afghanistan
AGO → Angola
ALB → Albania
ARE → United Arab Emirates
ARG → Argentina
ARM → Armenia
ATG → Antigua and Barbuda
AUS → Australia
AUT → Austria
AZE → Azerbaijan
Afghanistan → Afghanistan
Albania → Albania
Algeria → Algeria
American Samoa → American Samoa
Andorra → Andorra
Angola → Angola
Antigua and Barbuda → Antigua and Barbuda
Argentina → Argentina
Armenia → Armenia
Aruba → Aruba
Asia (R5) → Asia (R5)
Australia → Australia
Austria → Austria
Azerbaijan → Azerbaijan
BDI → Burundi
BEL → Belgium
BEN → Benin
BFA → Burkina Faso
BGD → Bangladesh


In [136]:
# Quality checks (VERY IMPORTANT)

# No ISO codes left
remaining_iso = [r for r in normalized_regions.values() if len(r) == 3 and r.isupper()]
print("Remaining ISO-like codes:", remaining_iso)


# Check duplicates collapsing correctly
from collections import Counter
Counter(normalized_regions.values()).most_common(20)


Remaining ISO-like codes: []


[('Aruba', 2),
 ('Afghanistan', 2),
 ('Angola', 2),
 ('Albania', 2),
 ('United Arab Emirates', 2),
 ('Argentina', 2),
 ('Armenia', 2),
 ('Antigua and Barbuda', 2),
 ('Australia', 2),
 ('Austria', 2),
 ('Azerbaijan', 2),
 ('Algeria', 2),
 ('Burundi', 2),
 ('Belgium', 2),
 ('Benin', 2),
 ('Burkina Faso', 2),
 ('Bangladesh', 2),
 ('Bulgaria', 2),
 ('Bahrain', 2),
 ('Bahamas', 2)]

In [137]:
with open("region_mapping.py", "w", encoding="utf-8") as f:
    f.write("REGION_MAP = {\n")
    for k, v in sorted(normalized_regions.items()):
        f.write(f'    "{k}": "{v}",\n')
    f.write("}\n")

print("✅ Final REGION_MAP written (542 → canonical regions)")

✅ Final REGION_MAP written (542 → canonical regions)


In [138]:
from Datasets.region_mapping import REGION_MAP

In [139]:
# Detect Unmapped Regions
unmapped = sorted(
    r for r in unique_regions
    if r not in REGION_MAP
)

print(f"Unmapped regions: {len(unmapped)}\n")

for r in unmapped:
    print(r)


Unmapped regions: 0



In [140]:
# Normalize Regions in the Dataset
def normalize_region_in_dataset(region):
    if pd.isna(region):
        return region
    region = str(region).strip()
    return REGION_MAP.get(region, region)
combined_df["Region_normalized"] = combined_df["Region"].apply(normalize_region_in_dataset)


In [141]:
# Sanity Check (Before vs After)
comparison = (
    combined_df[["Region", "Region_normalized"]]
    .drop_duplicates()
    .sort_values("Region")
)

comparison.head(30)


Unnamed: 0,Region,Region_normalized
621502,ABW,Aruba
621506,AFG,Afghanistan
621510,AGO,Angola
621514,ALB,Albania
621518,ARE,United Arab Emirates
279551,ARG,Argentina
621526,ARM,Armenia
621530,ATG,Antigua and Barbuda
452178,AUS,Australia
621538,AUT,Austria


In [142]:
# Export Cleaned CSVs
OUTPUT_DIR = "processed/region_normalized"
os.makedirs(OUTPUT_DIR, exist_ok=True)

for file in csv_files:
    fname = os.path.basename(file)

    df = combined_df[
        combined_df["__source_file__"] == fname
    ].copy()

    df["Region"] = df["Region_normalized"]
    df.drop(columns=["Region_normalized", "__source_file__"], inplace=True)

    out_path = os.path.join(OUTPUT_DIR, fname)
    df.to_csv(out_path, index=False)

    print(f"✅ Written: {out_path}")


✅ Written: Datasets/processed/region_normalized\iiasa_ssp_2024.csv
✅ Written: Datasets/processed/region_normalized\IPCC_AR6_Scenarios_Database_ISO.csv
✅ Written: Datasets/processed/region_normalized\NGFS_Phase-5.csv
