In [3]:
import pandas as pd
import json
import unicodedata
import re
from collections import Counter

In [6]:
# Path configuration

# Pollution measurements (JSON file).
POLLUTION_JSON_PATH = "projet/data/pollution.json"

# Green-areas source selector.
USE_GREEN_XLSX = True  # set to False to use the CSV export instead

# Excel workbook, for reading the Excel file directly.
GREEN_XLSX_PATH = "projet/data/green_areas.xlsx"

# CSV file, for when the Excel workbook has been exported to CSV.
GREEN_CSV_PATH = "projet/data/green_areas.csv"

In [None]:
# Helpers de nettoyage / normalisation

def normalize_str(s):
    """Return a lowercase, accent-free, whitespace-normalised version of ``s``.

    Steps, in order: NBSP -> space, Unicode NFKD decomposition, removal of
    combining marks (accents), collapse of whitespace runs to a single space,
    trimming of leading/trailing whitespace, lowercasing.

    Parameters
    ----------
    s : object
        Scalar value to normalise. Non-string values are converted with
        ``str()``.

    Returns
    -------
    str or None
        The normalised string, or ``None`` when ``pd.isna(s)`` is true
        (e.g. ``None`` or ``NaN``).
    """
    if pd.isna(s):
        return None
    s = str(s).replace("\xa0", " ")  # NBSP -> regular space
    # Decompose characters so accents become separate combining marks,
    # then drop those marks.
    s = unicodedata.normalize("NFKD", s)
    s = "".join(ch for ch in s if not unicodedata.combining(ch))
    # Collapse and trim only AFTER normalisation: NFKD can itself introduce
    # spaces (e.g. U+00B4 decomposes to SPACE + combining acute), so trimming
    # earlier could leave a stray leading/trailing blank in the result.
    s = re.sub(r"\s+", " ", s).strip()
    return s.lower()

# Lookup table: lowercase country-name variant -> canonical name.
COUNTRY_MAP = {
    # Russia
    "russian federation": "russia",
    # United States
    "united states of america": "united states",
    "united states": "united states",
    "u.s.a.": "united states",
    # United Kingdom
    "great britain": "united kingdom",
    "uk": "united kingdom",
    # add further variants here as they are encountered
}


def harmonize_country_norm(cn):
    """Map a country name to its canonical form via ``COUNTRY_MAP``.

    Returns ``None`` when ``cn`` is ``None``; otherwise returns the mapped
    value, or ``cn`` itself when it has no entry in ``COUNTRY_MAP``.
    """
    return None if cn is None else COUNTRY_MAP.get(cn, cn)

In [None]:
# Load the pollution data ("facelift" JSON)

with open(POLLUTION_JSON_PATH, encoding="utf-8") as f:
    pollution = json.load(f)

# Flatten the records stored under the "measurements" key into a DataFrame,
# then add the normalised country / city keys used for joining.
# country_join is derived from country_norm, so it must come last in assign().
df_poll = pd.json_normalize(pollution["measurements"]).assign(
    country_norm=lambda frame: frame["country_name"].apply(normalize_str),
    city_norm=lambda frame: frame["city_name"].apply(normalize_str),
    country_join=lambda frame: frame["country_norm"].apply(harmonize_country_norm),
)

# Quick sanity check: row count and a preview of the raw name columns.
print("=== Pollution JSON ===")
print("Nb de lignes :", len(df_poll))
print(df_poll[["country_name", "city_name"]].head(), "\n")