In [1]:
# Bibliothèques standard
import os
import glob
import json
import time
from math import radians, cos, sin, asin, sqrt

# Bibliothèques tierces
import requests
import pandas as pd
import geopandas as gpd
from IPython.display import display

# PySpark
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import to_date, col, lit, substring, regexp_replace
from pyspark.sql.types import StructType, StructField, StringType, DoubleType
from pyspark.sql.window import Window

# Préparation des données

## Phase 1 : Initialisation de l'Environnement

In [2]:
# --- 1. Spark configuration and start-up ---
# Explicit executor/driver memory and longer network/RPC timeouts than Spark's
# defaults; the original author added them to avoid JVM crashes.
spark = (
    SparkSession.builder
    .appName("DataLake_NOAA_NYC_Prep")
    .config("spark.executor.memory", "3g")
    .config("spark.driver.memory", "2g")
    .config("spark.network.timeout", "800s")
    .config("spark.rpc.askTimeout", "800s")
    .getOrCreate()
)

# --- 2. Geographic and HDFS parameters ---
# Bounding box around the NYC region, used to restrict which NOAA stations are imported.
MIN_LAT = 40.0
MAX_LAT = 41.5
MIN_LON = -75.0
MAX_LON = -73.0

# HDFS location of the raw layer
RAW_OUTPUT_PATH = "hdfs://namenode:9000/user/mathis/datalake/noaa_gsod_nyc_raw_2005_2023.parquet"

# Year range (inclusive)
START_YEAR = 2005
END_YEAR = 2023

print("‚úÖ Session Spark configur√©e et initialis√©e.")

‚úÖ Session Spark configur√©e et initialis√©e.


## Phase 2 : Métadonnées et Identification des Stations NOAA

In [3]:
# --- 1. Download the station metadata ---
# USAF and WBAN are read as strings so the two identifiers can be concatenated
# verbatim into the station ID used in the GSOD file names.
stations_url = "https://www.ncei.noaa.gov/pub/data/noaa/isd-history.csv"
pdf_stations = pd.read_csv(stations_url, dtype={'USAF': str, 'WBAN': str})

pdf_stations['STN_ID'] = pdf_stations['USAF'].str.strip() + pdf_stations['WBAN'].str.strip()
pdf_stations = pdf_stations.rename(columns={'LAT': 'LATITUDE', 'LON': 'LONGITUDE'})
# Stations without coordinates or a name cannot be placed in the bounding box.
pdf_stations = pdf_stations.dropna(subset=['LATITUDE', 'LONGITUDE', 'STATION NAME'])
spark_stations_df = spark.createDataFrame(pdf_stations)

# --- 2. Geographic filter ---
nyc_stations_spark = spark_stations_df.filter(
    (F.col('LATITUDE') >= MIN_LAT) & (F.col('LATITUDE') <= MAX_LAT) &
    (F.col('LONGITUDE') >= MIN_LON) & (F.col('LONGITUDE') <= MAX_LON)
)

# IDs of the stations inside the box (used later to target files by name).
relevant_station_ids = [row.STN_ID for row in nyc_stations_spark.select("STN_ID").collect()]

# The collected list has one entry per filtered row, so its length is the
# station count; calling count() here would run the Spark filter a second time.
print(f"\n‚úÖ {len(relevant_station_ids)} Stations NOAA pertinentes trouv√©es pr√®s de New York.")
# nyc_stations_spark is kept for the coordinate join later on.


‚úÖ 93 Stations NOAA pertinentes trouv√©es pr√®s de New York.


## Phase 2 bis : Téléchargement des données

In [4]:
BASE_URL = "https://www.ncei.noaa.gov/data/global-summary-of-the-day/access"
LOCAL_BASE_DIR = "/home/jovyan/work/data/noaa_gsod"
START_YEAR = 2005
END_YEAR = 2023

# --- Station IDs to download ---
# Raise instead of calling exit(): in a notebook exit() shuts the kernel down,
# whereas an exception just stops this cell with a readable message.
if 'relevant_station_ids' not in locals():
    raise RuntimeError("‚ö†Ô∏è ATTENTION: La liste 'relevant_station_ids' n'est pas d√©finie. Veuillez ex√©cuter la Phase 2 en premier.")

print(f"D√©marrage du t√©l√©chargement pour {len(relevant_station_ids)} stations de {START_YEAR} √† {END_YEAR}.")

# Counts files that are available locally after this run (already present or fetched now).
downloaded_count = 0

# One HTTP session for the whole run so connections to the NOAA host are reused.
with requests.Session() as session:
    for year in range(START_YEAR, END_YEAR + 1):
        year_dir = os.path.join(LOCAL_BASE_DIR, str(year))

        # Lightweight progress indicator (one line per year).
        print(f"Traitement de l'ann√©e : {year}...")

        os.makedirs(year_dir, exist_ok=True)

        for station_id in relevant_station_ids:
            file_name = f"{station_id}.csv"
            local_path = os.path.join(year_dir, file_name)
            remote_url = f"{BASE_URL}/{year}/{file_name}"

            # Already downloaded by a previous run.
            if os.path.exists(local_path):
                downloaded_count += 1
                continue

            try:
                response = session.get(remote_url, timeout=10)
                response.raise_for_status()
            except requests.exceptions.HTTPError as errh:
                # A 404 only means the station has no file for that year: skip silently.
                status = errh.response.status_code if errh.response is not None else None
                if status != 404:
                    print(f"\n‚ùå Erreur HTTP pour {remote_url}: {errh}")
                continue
            except requests.exceptions.RequestException as e:
                print(f"\n‚ùå Erreur de Connexion/Timeout pour {remote_url}: {e}")
                continue

            # Write to a temporary name and rename, so an interrupted write never
            # leaves a truncated file that the existence check above would accept.
            partial_path = local_path + ".part"
            with open(partial_path, 'wb') as f:
                f.write(response.content)
            os.replace(partial_path, local_path)

            downloaded_count += 1

            # Short pause between successful downloads to stay polite with the server.
            time.sleep(0.05)

print(f"\n‚úÖ T√©l√©chargement termin√©. {downloaded_count} fichiers GSOD trait√©s (t√©l√©charg√©s ou existants).")

D√©marrage du t√©l√©chargement pour 93 stations de 2005 √† 2023.
Traitement de l'ann√©e : 2005...
Traitement de l'ann√©e : 2006...
Traitement de l'ann√©e : 2007...
Traitement de l'ann√©e : 2008...
Traitement de l'ann√©e : 2009...
Traitement de l'ann√©e : 2010...
Traitement de l'ann√©e : 2011...
Traitement de l'ann√©e : 2012...
Traitement de l'ann√©e : 2013...
Traitement de l'ann√©e : 2014...
Traitement de l'ann√©e : 2015...
Traitement de l'ann√©e : 2016...
Traitement de l'ann√©e : 2017...
Traitement de l'ann√©e : 2018...
Traitement de l'ann√©e : 2019...
Traitement de l'ann√©e : 2020...
Traitement de l'ann√©e : 2021...
Traitement de l'ann√©e : 2022...
Traitement de l'ann√©e : 2023...

‚úÖ T√©l√©chargement termin√©. 552 fichiers GSOD trait√©s (t√©l√©charg√©s ou existants).


## Phase 3 : Ingestion Ciblée et Persistance

In [5]:
# --- Path configuration ---
LOCAL_BASE_DIR = "/home/jovyan/work/data/noaa_gsod"
RAW_OUTPUT_PATH = "hdfs://namenode:9000/user/mathis/datalake/noaa_gsod_nyc_raw_2005_2023.parquet"
START_YEAR = 2005
END_YEAR = 2023

# --- 1. Targeted paths ---
# Every (year, station) pair is a candidate; only files present on disk are
# handed to Spark, so the read never receives a missing path.
candidate_paths = (
    f"{LOCAL_BASE_DIR}/{year}/{station_id}.csv"
    for year in range(START_YEAR, END_YEAR + 1)
    for station_id in relevant_station_ids
)
existing_targeted_paths = [path for path in candidate_paths if os.path.exists(path)]

# Fail early with an explicit message rather than letting the Spark read fail.
if len(existing_targeted_paths) == 0:
    raise FileNotFoundError("Aucun fichier GSOD cible n'a √©t√© trouv√© dans le r√©pertoire local.")

gsod_data_paths = existing_targeted_paths
print(f"Total de {len(gsod_data_paths)} fichiers existants seront lus par Spark.")

# --- 2. Schema and read ---
# Column order matters: the explicit schema is applied to the CSV columns in this order.
# NOTE(review): the last column is spelled FRSHHT here; the GSOD files appear to
# call it FRSHTT -- confirm whether the spelling is intentional before renaming.
gsod_column_types = [
    ("STATION", StringType),
    ("DATE", StringType),
    ("LATITUDE", DoubleType),
    ("LONGITUDE", DoubleType),
    ("ELEVATION", DoubleType),
    ("NAME", StringType),
    ("TEMP", DoubleType),
    ("TEMP_ATTRIBUTES", StringType),
    ("DEWP", DoubleType),
    ("DEWP_ATTRIBUTES", StringType),
    ("SLP", DoubleType),
    ("SLP_ATTRIBUTES", StringType),
    ("STP", DoubleType),
    ("STP_ATTRIBUTES", StringType),
    ("VISIB", DoubleType),
    ("VISIB_ATTRIBUTES", StringType),
    ("WDSP", DoubleType),
    ("WDSP_ATTRIBUTES", StringType),
    ("MXSPD", DoubleType),
    ("GUST", DoubleType),
    ("MAX", DoubleType),
    ("MAX_ATTRIBUTES", StringType),
    ("MIN", DoubleType),
    ("MIN_ATTRIBUTES", StringType),
    ("PRCP", DoubleType),
    ("PRCP_ATTRIBUTES", StringType),
    ("SNDP", DoubleType),
    ("FRSHHT", StringType),
]
gsod_schema = StructType([
    StructField(column_name, column_type(), True)
    for column_name, column_type in gsod_column_types
])

# Distributed read restricted to the targeted files.
all_gsod_data = spark.read.csv(gsod_data_paths, header=True, schema=gsod_schema, sep=',')

# The station identifier is exposed downstream as ID_STATION.
nyc_gsod_data = all_gsod_data.withColumnRenamed("STATION", "ID_STATION")


# --- 3. Persist the raw layer on HDFS ---
print(f"\nSauvegarde de la copie BRUTE filtr√©e (2005-2023) dans : {RAW_OUTPUT_PATH}...")
# Overwrites any previous copy at RAW_OUTPUT_PATH.
nyc_gsod_data.write.mode("overwrite").parquet(RAW_OUTPUT_PATH)
print("‚úÖ Copie brute sauvegard√©e sur HDFS. Le traitement peut se poursuivre.")

Total de 552 fichiers existants seront lus par Spark.

Sauvegarde de la copie BRUTE filtr√©e (2005-2023) dans : hdfs://namenode:9000/user/mathis/datalake/noaa_gsod_nyc_raw_2005_2023.parquet...
‚úÖ Copie brute sauvegard√©e sur HDFS. Le traitement peut se poursuivre.


## Phase 4 : Téléchargement et Nettoyage du JSON Socrata

In [6]:
# --- Path configuration ---
LOCAL_BASE_DIR = "/home/jovyan/work/data/air_quality"
LOCAL_JSON_PATH = os.path.join(LOCAL_BASE_DIR, "nyc_air_quality_raw.json")
AIR_QUALITY_URL = "https://data.cityofnewyork.us/api/views/c3uy-2p5r/rows.json?accessType=DOWNLOAD"

os.makedirs(LOCAL_BASE_DIR, exist_ok=True)

# --- 1. Download and structural clean-up of the Socrata JSON ---
print("‚¨áÔ∏è T√©l√©chargement du JSON Socrata depuis l'API de NYC...")
try:
    response = requests.get(AIR_QUALITY_URL, timeout=300)  # 5-minute timeout
    response.raise_for_status()
    data = response.json()
except (requests.exceptions.RequestException, ValueError) as e:
    # Network/HTTP failures and invalid JSON: report, then re-raise so the cell
    # stops with a traceback instead of shutting the kernel down via exit().
    print(f"‚ùå Erreur lors du t√©l√©chargement/nettoyage : {e}")
    raise

# Only the 'data' key (the list of raw records) is kept; the rest of the
# response is dropped.
raw_records = data.get('data', [])

if not raw_records:
    raise ValueError("‚ùå Erreur : La cl√© 'data' est vide dans le JSON t√©l√©charg√©. Arr√™t du processus.")

# Persist the bare record list; the ETL cell reloads this file with json.load.
with open(LOCAL_JSON_PATH, 'w') as f:
    json.dump(raw_records, f)

print(f"‚úÖ Fichier JSON brut sauvegard√© et nettoy√© structurellement √† : {LOCAL_JSON_PATH}")

‚¨áÔ∏è T√©l√©chargement du JSON Socrata depuis l'API de NYC...
‚úÖ Fichier JSON brut sauvegard√© et nettoy√© structurellement √† : /home/jovyan/work/data/air_quality/nyc_air_quality_raw.json


## Phase 5 - Transformation des données et Création du Dashboard

In [8]:
import os
import json
import geopandas as gpd
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import to_date, col, lit, substring, regexp_replace
from pyspark.sql.types import DoubleType, StructType, StructField, StringType

# ==============================================================================
# STEP 1: DATA PREPARATION (SPARK ETL)
# ==============================================================================
print("üîÑ D√©marrage de l'ETL Spark...")

# 1. Spark session: reuse the one already bound to `spark`, otherwise create one.
try:
    spark
except NameError:
    spark = SparkSession.builder.appName("OneCode_ETL").getOrCreate()

# 2. Paths
GEOJSON_URL = "https://raw.githubusercontent.com/nycehs/NYC_geography/master/UHF42.geo.json"
AIR_QUALITY_PATH = "/home/jovyan/work/data/air_quality/nyc_air_quality_raw.json"
WEATHER_PATH = "hdfs://namenode:9000/user/mathis/datalake/noaa_gsod_nyc_raw_2005_2023.parquet"

# Geography type of the zones in GEOJSON_URL. The air-quality dataset reports the
# same indicators for several geography types whose join IDs overlap, so records
# must be restricted to this type before joining on the ID alone.
# NOTE(review): label and column position (13) follow the dataset's published
# column order -- confirm against the `meta` section of the Socrata download.
AIR_QUALITY_GEO_TYPE = "UHF42"
AIR_QUALITY_GEO_TYPE_COLUMN = "col_13"

# --- A. Geography (GeoJSON) ---
print("   üìç Traitement du GeoJSON...")
gdf_quartiers = gpd.read_file(GEOJSON_URL)

# Centroids are computed in the projected CRS EPSG:2263 and only then converted
# to EPSG:4326; the original author notes this avoids a warning and is more precise.
gdf_quartiers = gdf_quartiers.to_crs(epsg=2263)
gdf_quartiers['centroid'] = gdf_quartiers.geometry.centroid
gdf_quartiers = gdf_quartiers.to_crs(epsg=4326)  # back to lat/lon for the map

# The 'centroid' column is reprojected explicitly (once) before reading its
# lat/lon, because to_crs on the frame was applied with geometry as the active column.
centroids_wgs84 = gdf_quartiers['centroid'].to_crs(epsg=4326)
gdf_quartiers['LATITUDE_ZONE'] = centroids_wgs84.y
gdf_quartiers['LONGITUDE_ZONE'] = centroids_wgs84.x

# Outputs: polygons for the dashboard map, centroid table for the Spark join.
gdf_quartiers[['GEOCODE', 'GEONAME', 'BOROUGH', 'geometry']].to_file("dashboard_map.geojson", driver='GeoJSON')
pdf_locations = pd.DataFrame(gdf_quartiers[['GEOCODE', 'GEONAME', 'LATITUDE_ZONE', 'LONGITUDE_ZONE']])
spark_locations = spark.createDataFrame(pdf_locations)

# --- B. Air quality ---
print("   üí® Traitement Air Quality (Mode Manuel)...")

# The file holds a list of positional records (list of lists), loaded by hand.
with open(AIR_QUALITY_PATH, 'r') as f:
    raw_data = json.load(f)

# Explicit all-string schema of 20 positional columns; the positions that are
# used get a meaningful name: 17 = date, 14 = geo ID, 10 = pollutant, 18 = value.
named_positions = {
    10: "NOM_POLLUANT",
    14: "GEOJOIN_ID_BRUT",
    17: "DATE_MESURE_BRUTE",
    18: "VALEUR_MESURE_BRUTE",
}
fields = [
    StructField(named_positions.get(i, f"col_{i}"), StringType(), True)
    for i in range(20)
]
schema = StructType(fields)

air_q_df = spark.createDataFrame(raw_data, schema=schema)

# Cleaning: keep the target geography type, parse the date, cast the value and
# drop rows whose value is not numeric.
air_quality_clean = (
    air_q_df
    .filter(col(AIR_QUALITY_GEO_TYPE_COLUMN) == AIR_QUALITY_GEO_TYPE)
    .withColumn("DATE_OBSERVATION", to_date(col("DATE_MESURE_BRUTE")))
    .select(
        col("GEOJOIN_ID_BRUT").alias("GEOJOIN_ID"),
        col("DATE_OBSERVATION"),
        col("NOM_POLLUANT"),
        col("VALEUR_MESURE_BRUTE").cast(DoubleType()).alias("VALEUR"),
    )
    .filter(col("VALEUR").isNotNull())
)

# --- C. Join air quality with zone coordinates ---
print("   üîó Jointure Air Quality + G√©ographie...")
final_air_data = air_quality_clean.join(
    spark_locations,
    air_quality_clean.GEOJOIN_ID == spark_locations.GEOCODE,
    "inner"
).drop("GEOCODE")

air_pdf = final_air_data.toPandas()
# An empty result means the geography filter or the join key matched nothing;
# stop here rather than ship an empty dataset to the dashboard.
if air_pdf.empty:
    raise ValueError(
        f"Aucune mesure de type '{AIR_QUALITY_GEO_TYPE}' ne correspond aux zones du GeoJSON."
    )
air_pdf.to_parquet("dashboard_data_air.parquet", index=False)

# --- D. Weather ---
print("   ‚òÄÔ∏è  Traitement M√©t√©o...")
weather_df = spark.read.parquet(WEATHER_PATH)
# Only the columns the dashboard reads are exported.
weather_lite = weather_df.select("ID_STATION", "NAME", "LATITUDE", "LONGITUDE", "DATE", "TEMP", "DEWP", "WDSP")
weather_lite.toPandas().to_parquet("dashboard_data_weather.parquet", index=False)

print("‚úÖ ETL termin√© ! Fichiers g√©n√©r√©s.")

# ==============================================================================
# √âTAPE 2 : G√âN√âRATION DU DASHBOARD (Identique)
# ==============================================================================
dashboard_code = """
import streamlit as st
import pandas as pd
import geopandas as gpd
import folium
from streamlit_folium import st_folium
import plotly.express as px
import plotly.graph_objects as go
import numpy as np

# ==============================================================================
# 1. CONFIGURATION & INITIALISATION
# ==============================================================================
st.set_page_config(layout="wide", page_title="NYC Environmental Dashboard")

# Session-state defaults: a key is only written when it is absent, so values
# already present in the session are left untouched.
for state_key, default_value in (
    ('selected_geocode', None),
    ('dropdown_selector', "Tous quartiers"),
):
    if state_key not in st.session_state:
        st.session_state[state_key] = default_value

# ==============================================================================
# 2. FONCTIONS UTILITAIRES
# ==============================================================================

@st.cache_data
def load_data():
    '''Load and normalise the three files produced by the ETL.

    Returns:
        (geo, air, weather, stations) where
        - geo: zones from dashboard_map.geojson, GEOCODE as str, with
          LATITUDE_ZONE / LONGITUDE_ZONE centroid columns;
        - air: dashboard_data_air.parquet with DATE_OBSERVATION as datetime;
        - weather: dashboard_data_weather.parquet with DATE as datetime, missing
          readings as NaN, TEMP/DEWP converted from Fahrenheit to Celsius and
          WDSP multiplied by 1.852 (knots to km/h);
        - stations: distinct (ID_STATION, NAME, LATITUDE, LONGITUDE) rows.
    '''
    geo = gpd.read_file("dashboard_map.geojson")
    air = pd.read_parquet("dashboard_data_air.parquet")
    weather = pd.read_parquet("dashboard_data_weather.parquet")

    # --- Cleaning & conversions ---
    # GEOCODE is compared with string IDs elsewhere in the app.
    geo['GEOCODE'] = geo['GEOCODE'].astype(str)

    if 'LATITUDE_ZONE' not in geo.columns:
        try:
            # Preferred: centroid in the projected CRS, converted back to lat/lon.
            geo_temp = geo.to_crs(epsg=2263)
            centroids = geo_temp.geometry.centroid.to_crs(epsg=4326)
        except Exception:
            # Best-effort fallback when reprojection is unavailable; unlike a
            # bare except, this no longer swallows KeyboardInterrupt/SystemExit.
            centroids = geo.geometry.centroid
        geo['LATITUDE_ZONE'] = centroids.y
        geo['LONGITUDE_ZONE'] = centroids.x

    air['DATE_OBSERVATION'] = pd.to_datetime(air['DATE_OBSERVATION'])
    weather['DATE'] = pd.to_datetime(weather['DATE'])

    # GSOD encodes a missing reading as 9999.9 (TEMP, DEWP) or 999.9 (WDSP).
    # Turn those into NaN before converting units, otherwise they are averaged
    # as if they were real measurements.
    for column, missing_marker in (('TEMP', 9999.9), ('DEWP', 9999.9), ('WDSP', 999.9)):
        weather[column] = weather[column].mask(weather[column] >= missing_marker)

    # Unit conversions
    weather['TEMP'] = (weather['TEMP'] - 32) * 5.0/9.0
    weather['DEWP'] = (weather['DEWP'] - 32) * 5.0/9.0
    weather['WDSP'] = weather['WDSP'] * 1.852

    stations = weather[['ID_STATION', 'NAME', 'LATITUDE', 'LONGITUDE']].drop_duplicates()
    return geo, air, weather, stations

def haversine_vectorized(lon1, lat1, lon2, lat2):
    '''Haversine distance between points given in degrees.

    Arguments may be scalars or NumPy arrays (NumPy broadcasting applies).
    The result is scaled by a sphere radius of 6371, i.e. kilometres on Earth.
    '''
    lam1, phi1, lam2, phi2 = (np.radians(angle) for angle in (lon1, lat1, lon2, lat2))
    delta_lam = lam2 - lam1
    delta_phi = phi2 - phi1
    hav = np.sin(delta_phi/2.0)**2 + np.cos(phi1) * np.cos(phi2) * np.sin(delta_lam/2.0)**2
    central_angle = 2 * np.arcsin(np.sqrt(hav))
    return 6371 * central_angle

def _weighted_mean_ignoring_nan(values, weights):
    '''Weighted mean of the non-NaN values rounded to 1 decimal; NaN if none remain.'''
    values = np.asarray(values, dtype=float)
    usable = ~np.isnan(values)
    if not usable.any():
        return float('nan')
    return round(float(np.average(values[usable], weights=weights[usable])), 1)


def calculate_global_metrics(geo_df, weather_df, stations_df, radius_km):
    '''Inverse-distance-weighted weather averages for every zone.

    Args:
        geo_df: one row per zone with GEOCODE, LATITUDE_ZONE, LONGITUDE_ZONE.
        weather_df: observations with ID_STATION, TEMP, WDSP, DEWP.
        stations_df: station table with ID_STATION, LATITUDE, LONGITUDE.
        radius_km: stations farther than this from the zone centroid are ignored.

    Returns:
        DataFrame with GEOCODE (str), W_TEMP, W_WIND, W_DEWP and NB_STATIONS,
        one row per zone; an empty DataFrame (no columns) when no station has
        weather data.
    '''
    weather_agg = weather_df.groupby('ID_STATION')[['TEMP', 'WDSP', 'DEWP']].mean().reset_index()
    # One row per station: a station listed several times (e.g. renamed) would
    # otherwise be weighted several times and inflate NB_STATIONS.
    unique_stations = stations_df.drop_duplicates(subset='ID_STATION')
    stations_w_weather = unique_stations.merge(weather_agg, on='ID_STATION')

    if stations_w_weather.empty:
        return pd.DataFrame()

    station_lons = stations_w_weather['LONGITUDE'].values
    station_lats = stations_w_weather['LATITUDE'].values

    results = []
    for _, row in geo_df.iterrows():
        dists = haversine_vectorized(
            row['LONGITUDE_ZONE'], row['LATITUDE_ZONE'], station_lons, station_lats
        )

        mask = dists <= radius_km
        nearby_stations = stations_w_weather[mask]
        # The +0.1 keeps the weight finite for a station at distance 0.
        weights = 1 / (dists[mask] + 0.1)

        # Each variable is averaged over the stations that actually report it,
        # so one station without, say, dew point does not turn the zone's
        # average into NaN.
        results.append({
            'GEOCODE': str(row['GEOCODE']),
            'W_TEMP': _weighted_mean_ignoring_nan(nearby_stations['TEMP'].values, weights),
            'W_WIND': _weighted_mean_ignoring_nan(nearby_stations['WDSP'].values, weights),
            'W_DEWP': _weighted_mean_ignoring_nan(nearby_stations['DEWP'].values, weights),
            'NB_STATIONS': int(mask.sum())
        })
    return pd.DataFrame(results)

# ==============================================================================
# 3. CHARGEMENT & FILTRES
# ==============================================================================

geo, df_air, df_weather, df_stations = load_data()

st.sidebar.header("üéõÔ∏è Filtres & Param√®tres")

# Dates
min_date, max_date = df_air['DATE_OBSERVATION'].min(), df_air['DATE_OBSERVATION'].max()
date_selection = st.sidebar.date_input(
    "P√©riode d'analyse", [min_date, max_date], min_value=min_date, max_value=max_date
)
# A range picker yields fewer than two dates while the user is still choosing
# the second one; unpacking it directly would raise. Wait for a complete range.
if not isinstance(date_selection, (list, tuple)) or len(date_selection) != 2:
    st.sidebar.info("Veuillez choisir les deux bornes de la plage de dates.")
    st.stop()
start_date, end_date = date_selection

# Pollutants that have at least one value in the selected period
mask_air_date = (df_air['DATE_OBSERVATION'].dt.date >= start_date) & (df_air['DATE_OBSERVATION'].dt.date <= end_date)
df_air_filtered = df_air[mask_air_date]

valid_pollutants = df_air_filtered[df_air_filtered['VALEUR'].notna()]['NOM_POLLUANT'].unique()
valid_pollutants = sorted(valid_pollutants)

if len(valid_pollutants) > 0:
    selected_polluant = st.sidebar.selectbox("Polluant (Dispo sur la p√©riode)", valid_pollutants)
else:
    st.sidebar.error("‚ö†Ô∏è Aucune donn√©e de pollution pour cette p√©riode.")
    selected_polluant = None

# Other filters
radius = st.sidebar.slider("Rayon des stations m√©t√©o (km)", 1, 100, 15)
meteo_vars = ['Temp√©rature', 'Vitesse Vent', 'Point de Ros√©e']
selected_meteo_vars = st.sidebar.multiselect("Graphiques M√©t√©o (Comparaison)", meteo_vars, default=['Temp√©rature'])

# ==============================================================================
# 4. ETL A LA VOL√âE (PR√âPARATION GLOBALE)
# ==============================================================================

# Nothing to show without a pollutant: ask for a wider period and end this run.
if selected_polluant is None:
    st.warning("Veuillez √©largir la plage de dates.")
    st.stop()

# Weather restricted to the selected period
mask_weather_date = (df_weather['DATE'].dt.date >= start_date) & (df_weather['DATE'].dt.date <= end_date)
df_weather_filtered = df_weather[mask_weather_date]

# --- Sidebar indicator: number of active stations within the radius ---
with st.sidebar:
    st.markdown("---")
    st.markdown("### ‚ÑπÔ∏è Info Stations")
    # Stations with at least one observation in the period, with their coordinates.
    active_stations = df_weather_filtered[['ID_STATION']].drop_duplicates()
    active_stations_coords = active_stations.merge(df_stations, on='ID_STATION')

    # Reference point: mean of all zone centroids (global mode) or the centroid
    # of the selected zone (local mode). No metric is shown when the selected
    # zone is not found.
    if st.session_state.selected_geocode is None:
        anchor = (geo['LATITUDE_ZONE'].mean(), geo['LONGITUDE_ZONE'].mean())
        metric_label = f"Stations (Centre NYC, {radius} km)"
    else:
        sel_geo = geo[geo['GEOCODE'] == st.session_state.selected_geocode]
        if sel_geo.empty:
            anchor = None
        else:
            anchor = (sel_geo.iloc[0]['LATITUDE_ZONE'], sel_geo.iloc[0]['LONGITUDE_ZONE'])
        metric_label = f"Stations (Quartier, {radius} km)"

    if anchor is not None:
        if active_stations_coords.empty:
            st.metric(metric_label, 0)
        else:
            anchor_lat, anchor_lon = anchor
            dists_s = haversine_vectorized(
                anchor_lon, anchor_lat,
                active_stations_coords['LONGITUDE'].values,
                active_stations_coords['LATITUDE'].values
            )
            nb_visible = np.sum(dists_s <= radius)
            st.metric(metric_label, nb_visible)

# --- Map data preparation ---
# Mean value of the selected pollutant per zone over the period.
df_air_map = df_air_filtered[df_air_filtered['NOM_POLLUANT'] == selected_polluant]
if not df_air_map.empty:
    air_agg = df_air_map.groupby('GEOJOIN_ID')['VALEUR'].mean().reset_index()
    air_agg.columns = ['GEOCODE', 'MEAN_POLLUANT']
    air_agg['GEOCODE'] = air_agg['GEOCODE'].astype(str)
else:
    # Typed empty frame so the numeric fill/round below still applies.
    air_agg = pd.DataFrame({
        'GEOCODE': pd.Series(dtype=str),
        'MEAN_POLLUANT': pd.Series(dtype=float),
    })

weather_metrics_df = calculate_global_metrics(geo, df_weather_filtered, df_stations, radius)

gdf_display = geo.merge(air_agg, on='GEOCODE', how='left')
if not weather_metrics_df.empty:
    gdf_display = gdf_display.merge(weather_metrics_df, on='GEOCODE', how='left')
else:
    # No weather metrics at all: create the columns anyway, because the lines
    # below, the tooltip and the KPIs all read them and would otherwise fail
    # with a KeyError.
    for weather_col in ('W_TEMP', 'W_WIND', 'W_DEWP'):
        gdf_display[weather_col] = np.nan
    gdf_display['NB_STATIONS'] = 0

gdf_display['MEAN_POLLUANT'] = gdf_display['MEAN_POLLUANT'].fillna(0).round(2)
gdf_display['W_TEMP'] = gdf_display['W_TEMP'].fillna(0)
gdf_display['NB_STATIONS'] = gdf_display['NB_STATIONS'].fillna(0).astype(int)

# --- PR√âPARATION DONN√âES GRAPHIQUES & KPIs (AVANT AFFICHAGE) ---
# Cela permet d'avoir les variables pr√™tes pour les deux colonnes

# Defaults, overwritten by the branch below
current_title, current_caption = "", ""
avg_polluant = avg_temp = avg_wind = 0

# Raw sources for the charts (narrowed according to the selection)
chart_air_src, chart_weather_src = pd.DataFrame(), pd.DataFrame()

is_selected_pollutant = df_air_filtered['NOM_POLLUANT'] == selected_polluant

if st.session_state.selected_geocode is not None:
    # LOCAL: a single zone
    current_geo_data = gdf_display[gdf_display['GEOCODE'] == st.session_state.selected_geocode].iloc[0]
    current_title = current_geo_data['GEONAME']
    current_caption = f"Borough: {current_geo_data['BOROUGH']} | Stations locales : {int(current_geo_data['NB_STATIONS'])}"

    avg_polluant = current_geo_data['MEAN_POLLUANT']
    avg_temp = current_geo_data['W_TEMP']
    avg_wind = current_geo_data['W_WIND']

    in_selected_zone = df_air_filtered['GEOJOIN_ID'] == st.session_state.selected_geocode
    chart_air_src = df_air_filtered[in_selected_zone & is_selected_pollutant].copy()

    # Weather series: stations within the radius of the zone centroid.
    station_dists = haversine_vectorized(
        current_geo_data['LONGITUDE_ZONE'], current_geo_data['LATITUDE_ZONE'],
        df_stations['LONGITUDE'].values, df_stations['LATITUDE'].values
    )
    nearby_ids = df_stations[station_dists <= radius]['ID_STATION'].unique()
    chart_weather_src = df_weather_filtered[df_weather_filtered['ID_STATION'].isin(nearby_ids)].copy()
else:
    # GLOBAL: all zones
    current_title = "New York City (Global)"
    current_caption = "Moyenne de tous les quartiers"

    # Only zones with a strictly positive pollutant mean enter the KPIs; zeros
    # in W_TEMP / W_WIND are treated as missing.
    valid_data = gdf_display[gdf_display['MEAN_POLLUANT'] > 0]
    if not valid_data.empty:
        avg_polluant = valid_data['MEAN_POLLUANT'].mean()
        avg_temp = valid_data['W_TEMP'].replace(0, np.nan).mean()
        avg_wind = valid_data['W_WIND'].replace(0, np.nan).mean()

    chart_air_src = df_air_filtered[is_selected_pollutant].copy()
    chart_weather_src = df_weather_filtered.copy()

# Shared resampling: the longer the period, the coarser the bucket
# (first threshold exceeded wins; daily otherwise).
delta_days = (end_date - start_date).days
resample_rule = next(
    (rule for min_days, rule in ((730, 'Q'), (180, 'M'), (60, 'W')) if delta_days > min_days),
    'D',
)

chart_air_final = (
    chart_air_src.set_index('DATE_OBSERVATION').resample(resample_rule)['VALEUR'].mean().reset_index()
    if not chart_air_src.empty
    else pd.DataFrame()
)

chart_weather_final = (
    chart_weather_src.set_index('DATE').resample(resample_rule)[['TEMP', 'WDSP', 'DEWP']].mean().reset_index()
    if not chart_weather_src.empty
    else pd.DataFrame()
)

# ==============================================================================
# 5. UI PRINCIPALE
# ==============================================================================

col1, col2 = st.columns([3, 2])

# --- COLONNE 1 : CARTE & GRAPHIQUES COMPARAISON ---
with col1:
    st.subheader(f"Carte : {selected_polluant}")

    m = folium.Map(location=[40.7128, -74.0060], zoom_start=10, tiles="CartoDB positron")

    # Coloured layer: mean pollutant value per zone.
    choropleth = folium.Choropleth(
        geo_data=gdf_display,
        data=gdf_display,
        columns=['GEOCODE', 'MEAN_POLLUANT'],
        key_on='feature.properties.GEOCODE',
        fill_color='YlOrRd',
        fill_opacity=0.7,
        line_opacity=0.2,
        legend_name=f"Concentration {selected_polluant}",
        highlight=True
    )
    choropleth.add_to(m)

    # Transparent layer on top that only carries the tooltips.
    style_function = lambda x: {'fillColor': '#ffffff', 'color':'#000000', 'fillOpacity': 0.0, 'weight': 0.1}
    tooltip_layer = folium.GeoJson(
        gdf_display,
        style_function=style_function,
        tooltip=folium.GeoJsonTooltip(
            fields=['GEONAME', 'BOROUGH', 'MEAN_POLLUANT', 'W_TEMP', 'W_WIND', 'NB_STATIONS'],
            aliases=['Quartier:', 'Borough:', f'{selected_polluant}:', 'Temp (¬∞C):', 'Vent (km/h):', 'Stations:'],
            localize=True
        )
    ).add_to(m)

    st_map = st_folium(m, width=None, height=600)

    # --- Map click -> dropdown synchronisation ---
    geo_options_df = geo[['GEOCODE', 'GEONAME']].sort_values('GEONAME')

    # The clicked GeoJSON feature (with its properties) is looked up under
    # 'last_active_drawing' first; 'last_object_clicked' is kept as a fallback
    # because the original code read it.
    # NOTE(review): per the streamlit-folium docs 'last_object_clicked' only
    # holds coordinates -- confirm for the installed version.
    clicked_props = None
    for result_key in ('last_active_drawing', 'last_object_clicked'):
        candidate = st_map.get(result_key) if st_map else None
        if isinstance(candidate, dict) and candidate.get('properties'):
            clicked_props = candidate['properties']
            break

    if clicked_props and 'GEOCODE' in clicked_props:
        clicked_code = str(clicked_props['GEOCODE'])
        # The component keeps returning the same feature on every rerun. Only a
        # click not handled yet may change the selection, otherwise a later
        # dropdown choice would be overwritten by the old click.
        if st.session_state.get('last_map_click') != clicked_code:
            st.session_state['last_map_click'] = clicked_code
            name_match = geo_options_df[geo_options_df['GEOCODE'] == clicked_code]['GEONAME']
            if not name_match.empty:
                clicked_name = name_match.values[0]
                if st.session_state.dropdown_selector != clicked_name:
                    st.session_state.dropdown_selector = clicked_name
                    st.session_state.selected_geocode = clicked_code
                    st.rerun()

    # --- Charts: pollutant vs weather variables (below the map) ---
    st.markdown("---")
    st.subheader("üìâ Facteurs M√©t√©orologiques")

    meteo_config = {
        'Temp√©rature': {'col': 'TEMP', 'color': 'orange', 'label': 'Temp (¬∞C)'},
        'Vitesse Vent': {'col': 'WDSP', 'color': 'blue', 'label': 'Vent (km/h)'},
        'Point de Ros√©e': {'col': 'DEWP', 'color': 'green', 'label': 'Ros√©e (¬∞C)'}
    }

    if not selected_meteo_vars:
        st.info("S√©lectionnez des variables m√©t√©o dans le menu pour voir les comparaisons.")
    else:
        for var_name in selected_meteo_vars:
            fig = go.Figure()

            # Pollutant curve (reference, left axis)
            if not chart_air_final.empty:
                fig.add_trace(go.Scatter(
                    x=chart_air_final['DATE_OBSERVATION'],
                    y=chart_air_final['VALEUR'],
                    name=selected_polluant,
                    mode='lines',
                    line=dict(color='red', width=1, dash='solid'),
                    opacity=0.5
                ))

            # Weather curve (right axis)
            if not chart_weather_final.empty and var_name in meteo_config:
                conf = meteo_config[var_name]
                fig.add_trace(go.Scatter(
                    x=chart_weather_final['DATE'],
                    y=chart_weather_final[conf['col']],
                    name=conf['label'],
                    mode='lines+markers',
                    marker=dict(size=4),
                    line=dict(color=conf['color'], width=2),
                    yaxis='y2'
                ))

            fig.update_layout(
                title=f"{selected_polluant} vs {var_name}",
                xaxis_title="Date",
                yaxis=dict(title=selected_polluant, showgrid=False),
                yaxis2=dict(title=var_name, overlaying='y', side='right', showgrid=True),
                legend=dict(orientation="h", y=1.1),
                height=350, margin=dict(t=40, b=0, l=0, r=0)
            )
            st.plotly_chart(fig, use_container_width=True)

# --- COLONNE 2 : D√âTAILS & GRAPHIQUE PRINCIPAL ---
with col2:
    st.markdown("### üìç D√©tails")

    all_options = ["Tous quartiers"] + geo_options_df['GEONAME'].tolist()

    selected_option = st.selectbox(
        "S√©lectionner une zone",
        options=all_options,
        key="dropdown_selector"
    )

    # Translate the dropdown choice into a geocode (None = all zones).
    if selected_option == "Tous quartiers":
        resolved_geocode = None
    else:
        code_match = geo_options_df[geo_options_df['GEONAME'] == selected_option]['GEOCODE']
        if not code_match.empty:
            resolved_geocode = str(code_match.values[0])
        else:
            resolved_geocode = st.session_state.selected_geocode

    # The title, KPIs and chart sources were computed earlier in this run from
    # the previous selected_geocode. When the choice changed, store it and rerun
    # so the page reflects it immediately instead of one interaction later.
    if resolved_geocode != st.session_state.selected_geocode:
        st.session_state.selected_geocode = resolved_geocode
        st.rerun()

    st.title(current_title)
    st.caption(current_caption)

    # KPIs ("N/A" when the value is missing)
    kpi1, kpi2, kpi3 = st.columns(3)
    val_p = f"{avg_polluant:.2f}" if pd.notnull(avg_polluant) else "N/A"
    val_t = f"{avg_temp:.1f} ¬∞C" if pd.notnull(avg_temp) else "N/A"
    val_w = f"{avg_wind:.1f} km/h" if pd.notnull(avg_wind) else "N/A"

    kpi1.metric(f"Moy. {selected_polluant}", val_p)
    kpi2.metric("Temp. Moy", val_t)
    kpi3.metric("Vent Moy", val_w)

    st.markdown("---")

    # --- Chart: pollutant trend alone (below the KPIs) ---
    st.subheader("üìà √âvolution du Polluant")

    fig_main = go.Figure()
    if not chart_air_final.empty:
        fig_main.add_trace(go.Scatter(
            x=chart_air_final['DATE_OBSERVATION'],
            y=chart_air_final['VALEUR'],
            name=selected_polluant,
            mode='lines+markers',
            marker=dict(size=8),
            line=dict(color='red', width=3)
        ))
        fig_main.update_layout(
            title=f"Tendance : {selected_polluant}",
            xaxis_title="Date",
            yaxis=dict(title="Concentration"),
            height=400,
            margin=dict(t=40, b=0, l=0, r=0)
        )
        st.plotly_chart(fig_main, use_container_width=True)
    else:
        st.info("Pas de donn√©es suffisantes pour afficher l'√©volution.")
"""

# Write the generated Streamlit script to app.py in the current working directory.
with open("app.py", "w", encoding='utf-8') as app_file:
    app_file.write(dashboard_code)

print("\nüöÄ Application g√©n√©r√©e ! Lancez dans le terminal : streamlit run app.py")

üîÑ D√©marrage de l'ETL Spark...
   üìç Traitement du GeoJSON...
   üí® Traitement Air Quality (Mode Manuel)...
   üîó Jointure Air Quality + G√©ographie...
   ‚òÄÔ∏è  Traitement M√©t√©o...
‚úÖ ETL termin√© ! Fichiers g√©n√©r√©s.

üöÄ Application g√©n√©r√©e ! Lancez dans le terminal : streamlit run app.py
