In [8]:
import pandas as pd
import json
import unicodedata
import re

POLLUTION_JSON_PATH = "pollution.json"
GREEN_CSV_PATH = "green_areas.csv"

In [None]:
# Normalisation de base

def normalize_str(s):
    if pd.isna(s):
        return None
    s = str(s).replace("\xa0", " ").strip()
    s = unicodedata.normalize("NFKD", s)
    s = "".join(ch for ch in s if not unicodedata.combining(ch))
    s = re.sub(r"\s+", " ", s)
    return s.lower()

def harmonize_country(cn_norm: str) -> str:
    if cn_norm is None:
        return None
    COUNTRY_MAP = {
        # déjà utilisés
        "russian federation": "russia",
        "united states of america": "united states",
        "u.s.a.": "united states",
        "great britain": "united kingdom",
        "uk": "united kingdom",

        # ajoutés suite à vos exemples "ONU"
        "turkiye": "turkey",
        "iran (islamic republic of)": "iran",
        "syrian arab republic": "syria",
        "lao people's democratic republic": "laos",
        "bolivia (plurinational state of)": "bolivia",
        "venezuela (bolivarian republic of)": "venezuela",
        "tanzania, united republic of": "tanzania",
        "congo, democratic republic of the": "democratic republic of the congo",
        "democratic republic of the congo": "democratic republic of the congo",
        "cote d'ivoire": "ivory coast",
        "cabo verde": "cape verde",
        # ajoutez au besoin, mais gardez ça minimal

        "united kingdom of great britain and northern ireland": "united kingdom",
        "republic of korea": "south korea",   # à vérifier selon comment Kaggle le nomme
        "state of palestine": "palestine",
    }
    return COUNTRY_MAP.get(cn_norm, cn_norm)

def strip_parentheses(s: str) -> str:
    # "tiaret (tihert)" -> "tiaret"
    return re.sub(r"\s*\(.*?\)\s*", " ", s).strip()

def extract_parentheses(s: str) -> str | None:
    # "kobenhavn (copenhagen)" -> "copenhagen"
    m = re.search(r"\((.*?)\)", s)
    if not m:
        return None
    inside = m.group(1).strip()
    # "nishapur/neyshabur" -> "nishapur" (première variante)
    inside = inside.split("/")[0].strip()
    return inside if inside else None

def strip_after_comma(s: str) -> str:
    # "yulin, guangxi" -> "yulin"
    return s.split(",")[0].strip()

def normalize_city_candidates(city_raw: str) -> list[str]:
    """Génère plusieurs clés candidates (normalisées) pour maximiser le matching."""
    if city_raw is None or pd.isna(city_raw):
        return []
    raw = str(city_raw)
    
    cand = []

    # Candidate principale: normalisation basique
    base = normalize_str(raw)
    if base:
        cand.append(base)

    # Avant parenthèses
    main = normalize_str(strip_parentheses(raw))
    if main and main not in cand:
        cand.append(main)

    # Dans parenthèses (alias)
    inside = extract_parentheses(raw)
    if inside:
        inside_norm = normalize_str(inside)
        if inside_norm and inside_norm not in cand:
            cand.append(inside_norm)

    # Avant virgule
    before_comma = normalize_str(strip_after_comma(raw))
    if before_comma and before_comma not in cand:
        cand.append(before_comma)

    # Optionnel : enlever les tirets (parfois Kaggle a "san pedro sula" vs "san-pedro-sula")
    dehyphen = normalize_str(raw.replace("-", " "))
    if dehyphen and dehyphen not in cand:
        cand.append(dehyphen)

    return cand

def to_float(x):
    """Gère '  5.49 ' et aussi des virgules européennes '5,49'."""
    if pd.isna(x):
        return pd.NA
    s = str(x).strip().replace("\xa0", " ")
    s = s.replace(",", ".")
    s = re.sub(r"\s+", "", s)
    if s == "":
        return pd.NA
    return pd.to_numeric(s, errors="coerce")

In [10]:
# Charger pollution set

with open(POLLUTION_JSON_PATH, encoding="utf-8") as f:
    pollution = json.load(f)

df_poll = pd.json_normalize(pollution["measurements"])
df_poll["country_join"] = df_poll["country_name"].apply(normalize_str).apply(harmonize_country)
df_poll["city_join"] = df_poll["city_name"].apply(normalize_str)

poll_pairs = set(zip(df_poll["country_join"], df_poll["city_join"]))

print("Pollution pairs:", len(poll_pairs))

Pollution pairs: 23035


In [11]:
# Charger green

df_green = pd.read_csv(GREEN_CSV_PATH, encoding="utf-8-sig")

col_country = "Country or Territory Name"
col_citycode = "City Code"
col_cityname = "City Name"
col_share2020 = "Average share of green area in city/ urban area 2020 (%)"
col_percap2020 = "Green area per capita 2020 (m2/person)"

# Drop lignes incomplètes city name / code
df_green[col_cityname] = df_green[col_cityname].replace(r"^\s*$", pd.NA, regex=True)
df_green[col_citycode] = df_green[col_citycode].replace(r"^\s*$", pd.NA, regex=True)
df_green = df_green.dropna(subset=[col_cityname, col_citycode]).copy()

# Normaliser pays
df_green["country_join"] = df_green[col_country].apply(normalize_str).apply(harmonize_country)

# Construire une clé ville choisie "best match"
def choose_best_city_join(row):
    ctry = row["country_join"]
    candidates = normalize_city_candidates(row[col_cityname])
    for cand in candidates:
        if (ctry, cand) in poll_pairs:
            return cand
    # fallback : meilleure candidate "main" (sans parenthèses) sinon base
    return candidates[1] if len(candidates) > 1 else (candidates[0] if candidates else None)

df_green["city_join"] = df_green.apply(choose_best_city_join, axis=1)

# Convertir numeric 2020 (facultatif mais pratique)
df_green["green_share_2020"] = df_green[col_share2020].apply(to_float)
df_green["green_m2_per_capita_2020"] = df_green[col_percap2020].apply(to_float)


In [12]:
# Stats avant / après

green_pairs_after = set(zip(df_green["country_join"], df_green["city_join"]))
common_after = green_pairs_after & poll_pairs

print("Green pairs (after drop NA) :", len(green_pairs_after))
print("Common after improvements  :", len(common_after))

# Exemples non matchés (après)
only_green = list(green_pairs_after - poll_pairs)[:20]
print("Exemples green NON trouvés (après):", only_green)

Green pairs (after drop NA) : 659
Common after improvements  : 450
Exemples green NON trouvés (après): [('nicaragua', 'leon'), ('iraq', 'diwaniyah'), ('eswatini', 'mbabane'), ('australia', 'bunbury'), ('ireland', 'dublin'), ('iraq', 'amara'), ('chile', 'los angeles'), ('mexico', 'campeche_campeche'), ('chile', 'santiago'), ('pakistan', 'sheikhupura'), ('russia', 'moskva'), ('united states', 'toledo'), ('south korea', 'jeju'), ('canada', 'ottawa gatineau'), ('palestine', 'nabulus'), ('egypt', 'az zaqazig'), ('mexico', 'merida'), ('morocco', 'sidi slimane'), ('iran', 'bandar abbas'), ('philippines', 'davao city')]


In [None]:
# Export facelift (sans FootNote/Data Source par défaut)

out_cols = [
    col_country, col_citycode, col_cityname,
    "country_join", "city_join",
    "green_share_2020", "green_m2_per_capita_2020"
]
df_green[out_cols].to_csv("green_areas_cleaned.csv", index=False, encoding="utf-8")
print("Wrote: green_areas_cleaned.csv")

Wrote: green_areas_cleaned.csv
['south korea']
