In [None]:
import os
import pandas as pd
import numpy as np
from tqdm import tqdm

# === CONFIGURATION ===
src_folder = "/data/weather_pq/cleaned_parquet"
dst_folder = "/data/weather_features"
os.makedirs(dst_folder, exist_ok=True)

FIRST_YEAR = 2010  # observations dated before this year are discarded
# Measurement columns every feature is derived from.
# NOTE(review): presumably daily min / max / mean temperature and rainfall — confirm units.
MEASURE_COLS = ["TN", "TX", "TM", "RR"]
# Fixed output schema, so that even a department with no rows yields a well-formed table.
OUTPUT_COLS = [
    "dept", "station", "year", "latitude", "longitude", "altitude",
    "GDD", "TM_summer", "TX_summer", "temp_amp_summer", "hot_days",
    "rainy_days_summer", "rain_June", "rain_SepOct", "frost_days_Apr", "avg_TM_Apr",
]


def _count_days(values, condition):
    """Count the observed days for which `condition` holds.

    Returns NaN (instead of 0) when `values` contains no observation at all, so
    that "no data" cannot be mistaken for "no such day".
    """
    observed = values.dropna()
    if observed.empty:
        return np.nan
    return int(condition(observed).sum())


def _station_year_features(g):
    """Derive the seasonal indicators for one (station, year) group of daily rows.

    `g` must carry a `month` column plus the TN, TX, TM and RR columns.
    Sums and day counts are NaN when the relevant months hold no observation;
    means are NaN in that case as well (pandas default).
    """
    grow = g[g["month"].between(4, 10)]    # April to October
    summer = g[g["month"].isin([7, 8])]    # July and August
    april = g[g["month"] == 4]
    june = g[g["month"] == 6]
    sep_oct = g[g["month"].isin([9, 10])]

    # Degree-day style sum: daily (TN + TX) / 2 in excess of 10, floored at 0.
    daily_excess = np.maximum((grow["TN"] + grow["TX"]) / 2 - 10, 0)

    # NOTE(review): the 35 / 10 / 0 thresholds assume temperatures in °C and
    # rainfall in mm — confirm against the dataset documentation.
    return {
        # Geolocation (when available): taken from the first row of the group.
        "latitude": g["LAT"].iloc[0] if "LAT" in g.columns else np.nan,
        "longitude": g["LON"].iloc[0] if "LON" in g.columns else np.nan,
        "altitude": g["ALTI"].iloc[0] if "ALTI" in g.columns else np.nan,
        # min_count=1 makes an empty / all-NaN selection sum to NaN rather than 0.
        "GDD": daily_excess.sum(min_count=1),
        "TM_summer": summer["TM"].mean(),
        "TX_summer": summer["TX"].mean(),
        "temp_amp_summer": (summer["TX"] - summer["TN"]).mean(),
        "hot_days": _count_days(summer["TX"], lambda s: s > 35),
        "rainy_days_summer": _count_days(summer["RR"], lambda s: s > 10),
        "rain_June": june["RR"].sum(min_count=1),
        "rain_SepOct": sep_oct["RR"].sum(min_count=1),
        "frost_days_Apr": _count_days(april["TN"], lambda s: s < 0),
        "avg_TM_Apr": april["TM"].mean(),
    }


# === PROCESS EVERY DEPARTMENT ===
# Sorted so that the processing order is reproducible across runs and machines.
parquet_files = sorted(f for f in os.listdir(src_folder) if f.endswith(".parquet"))

for file in tqdm(parquet_files, desc="Extracting station-level features"):
    name_parts = file.split("_")
    if len(name_parts) < 2:
        # Without an underscore there is no department code to extract.
        tqdm.write(f"Skipping {file}: no department code found in file name")
        continue
    dept = name_parts[1].split(".")[0]  # Example: weather_02.parquet -> 02

    df = pd.read_parquet(os.path.join(src_folder, file))

    # Dates
    # NOTE(review): if AAAAMMJJ were stored as integers (e.g. 20100101), to_datetime
    # would read them as epoch nanoseconds — confirm the column is string/datetime.
    df["AAAAMMJJ"] = pd.to_datetime(df["AAAAMMJJ"], errors="coerce")
    df = df.dropna(subset=["AAAAMMJJ"])
    df = df[df["AAAAMMJJ"].dt.year >= FIRST_YEAR].copy()
    df["Year"] = df["AAAAMMJJ"].dt.year
    df["month"] = df["AAAAMMJJ"].dt.month

    if "NUM_POSTE" not in df.columns:
        df["NUM_POSTE"] = "UNKNOWN_" + dept

    # Make the measurement columns uniformly numeric up front, rather than
    # swallowing per-group exceptions: an absent column is reported and becomes
    # NaN, and unparseable values are coerced to NaN.
    absent = [col for col in MEASURE_COLS if col not in df.columns]
    if absent:
        tqdm.write(f"{file}: missing columns {absent}; dependent features will be NaN")
    for col in MEASURE_COLS:
        df[col] = pd.to_numeric(df[col], errors="coerce") if col in df.columns else np.nan

    # === GROUPED BY STATION + YEAR ===
    rows = [
        {"dept": dept, "station": station, "year": year, **_station_year_features(g)}
        for (station, year), g in df.groupby(["NUM_POSTE", "Year"])
    ]

    # Save one feature table per department
    feat_df = pd.DataFrame(rows, columns=OUTPUT_COLS)
    output_path = os.path.join(dst_folder, f"weather_features_{dept}.parquet")
    feat_df.to_parquet(output_path, index=False)

print("Done: features saved for each department.")


Extracting station-level features: 100%|██████████| 28/28 [00:21<00:00,  1.33it/s]

Done: features saved for each department.





In [None]:
# Verify the number of usable stations per department

def count_usable_stations_per_dept(department_number, dst_folder='/data/weather_features'):
    """
    Count, for each year, the stations of one department whose feature rows are complete.

    input:
        department_number (str): the number of the department to check; the file
            read is ``weather_features_<department_number>.parquet``.
        dst_folder (str): folder holding the per-department feature files.
    output:
        DataFrame with columns ``year`` and ``usable_stations``: one row per year
        present in the file. A station is usable in a year when all of its rows
        for that year have every feature column non-null. Years in which no
        station is usable are reported with a count of 0 rather than omitted.
    """
    df = pd.read_parquet(os.path.join(dst_folder, f"weather_features_{department_number}.parquet"))

    if df.empty:
        # Nothing to group on (an empty file may not even carry the feature columns).
        return pd.DataFrame(columns=["year", "usable_stations"])

    feature_cols = ["GDD", "TM_summer", "TX_summer", "hot_days", "temp_amp_summer",
                    "rainy_days_summer", "rain_June", "rain_SepOct", "frost_days_Apr", "avg_TM_Apr"]

    # Row-level completeness, computed once for the whole table.
    row_complete = df[feature_cols].notna().all(axis=1)

    # A (year, station) pair is valid only if all of its rows are complete.
    station_valid = row_complete.groupby([df["year"], df["station"]]).all()

    # Summing the booleans per year keeps years whose count is zero.
    usable_per_year = station_valid.groupby(level="year").sum().astype("int64")

    return usable_per_year.reset_index(name="usable_stations")

# Show the per-year count of usable stations for department "32".
print(count_usable_stations_per_dept(department_number="32"))

    year  usable_stations
0   2010               11
1   2011               11
2   2012               11
3   2013               11
4   2014               11
5   2015               11
6   2016               11
7   2017                6
8   2018                6
9   2019                6
10  2020                8
11  2021               11
12  2022               12
13  2023               14
14  2024               14
