In [None]:
import pandas as pd

# Load the barley yield table (semicolon-separated CSV).
df_yield = pd.read_csv("raw_data/barley_yield_from_1982.csv", sep=";")

# Normalise column names: trim, lowercase, collapse whitespace runs to "_",
# then drop every character outside [a-z0-9_].
df_yield.columns = (
    df_yield.columns.str.strip()
    .str.lower()
    .str.replace(r"\s+", "_", regex=True)
    .str.replace(r"[^a-z0-9_]", "", regex=True)
)

# Coerce to numeric; unparseable entries become NaN and are dropped below.
df_yield["year"] = pd.to_numeric(df_yield["year"], errors="coerce")
df_yield["yield"] = pd.to_numeric(df_yield["yield"], errors="coerce")

# Rename the department column to the "nom_dep" key used for the joins.
df_yield = df_yield.rename(columns={"department": "nom_dep"})

# Keep complete rows only, then store the year as an integer. Coercion leaves
# the column as float64 whenever a NaN was present; once those rows are gone
# an integer key gives the later (nom_dep, year) joins a stable dtype.
# NOTE(review): assumes every remaining year is a whole number — a fractional
# value would be truncated by the cast; confirm against the raw file.
yield_df = (
    df_yield[["nom_dep", "year", "yield"]]
    .dropna()
    .astype({"year": "int64"})
)
yield_df.head()

In [None]:
# Read the raw climate table.
df_climate = pd.read_parquet("raw_data/climate_data_from_1982.parquet")

# Normalise column names step by step: trim, lowercase, whitespace -> "_",
# then strip anything outside [a-z0-9_].
normalized_names = df_climate.columns.str.strip().str.lower()
normalized_names = normalized_names.str.replace(r"\s+", "_", regex=True)
normalized_names = normalized_names.str.replace(r"[^a-z0-9_]", "", regex=True)
df_climate.columns = normalized_names

# Parse the timestamp once and derive the calendar year and month from it.
timestamps = pd.to_datetime(df_climate["time"])
df_climate["time"] = timestamps
df_climate["year"] = timestamps.dt.year
df_climate["month"] = timestamps.dt.month

# Keep only the "historical" scenario (used for training) and the columns
# needed to build the features.
is_historical = df_climate["scenario"] == "historical"
clim = df_climate.loc[
    is_historical, ["nom_dep", "year", "month", "metric", "value"]
].copy()
clim.head()

In [None]:
# Split the long-format climate table by metric family, using a literal
# substring match on the metric name.
# na=False: rows with a missing metric are excluded instead of putting NaN in
# the mask, which boolean indexing would reject with an error.
# NOTE(review): every metric whose name contains "temperature" ends up in
# `temp`; if the source holds several temperature metrics (e.g. mean and daily
# max) they are pooled in the aggregates computed from `temp` — confirm.
temp = clim[clim["metric"].str.contains("temperature", regex=False, na=False)]
precip = clim[clim["metric"].str.contains("precip", regex=False, na=False)]

In [None]:
# Annual temperature statistics per department and year.
temp_year = (
    temp.groupby(["nom_dep", "year"])
    .agg(
        temp_mean=("value", "mean"),
        temp_std=("value", "std"),
        temp_max=("value", "max"),
    )
    .reset_index()
)

# Growing-season temperature statistics (March through July, inclusive).
in_growing_season = temp["month"].between(3, 7)
temp_growing = (
    temp.loc[in_growing_season]
    .groupby(["nom_dep", "year"])
    .agg(
        temp_growing_mean=("value", "mean"),
        temp_growing_max=("value", "max"),
    )
    .reset_index()
)

In [None]:
# Annual precipitation total per department and year.
# min_count=1: a group whose values are all NaN stays NaN instead of summing
# to 0.0, which would look like a genuine zero-precipitation year.
# NOTE(review): summing assumes `value` is an additive amount per record
# (not a rate) — confirm the unit of the precipitation metric.
precip_year = (
    precip.groupby(["nom_dep", "year"])["value"]
    .sum(min_count=1)
    .reset_index()
    .rename(columns={"value": "precip_total"})
)

# Growing-season precipitation total (months 3 through 7, inclusive), with
# the same all-NaN handling as the annual total.
precip_growing = (
    precip[precip["month"].between(3, 7)]
    .groupby(["nom_dep", "year"])["value"]
    .sum(min_count=1)
    .reset_index()
    .rename(columns={"value": "precip_growing_total"})
)

In [None]:
# Assemble one climate feature table: start from the annual temperature
# statistics and left-join each remaining feature table on (nom_dep, year).
climate_features = temp_year
for feature_table in (temp_growing, precip_year, precip_growing):
    climate_features = climate_features.merge(
        feature_table, on=["nom_dep", "year"], how="left"
    )

climate_features.head()

In [None]:
# Attach the climate features to each yield row; the left join keeps every
# yield observation even when no climate row matches.
model_df = pd.merge(
    yield_df,
    climate_features,
    how="left",
    on=["nom_dep", "year"],
)

print(model_df.shape)
model_df.head()

In [None]:
# Share of missing values per column, largest first, shown as a percentage.
missing = model_df.isna().mean().sort_values(ascending=False)
missing_pct = missing.mul(100).to_frame("% missing")
display(missing_pct)

# Quick look at the correlation of each numeric column with yield.
yield_corr = model_df.corr(numeric_only=True)["yield"]
display(yield_corr.sort_values(ascending=False))

In [None]:
# Climate lag features (previous year's value for the same department).
#
# The lags are built with a year-keyed self-join rather than a positional
# shift: the value recorded for year Y is re-labelled as belonging to Y + 1 and
# joined back. This does not depend on row order, and a department with a
# missing year gets NaN instead of silently picking up an older year.
lag_sources = {"temp_mean": "temp_mean_lag1", "precip_total": "precip_total_lag1"}
lag_cols = list(lag_sources.values())

previous_year = (
    climate_features[["nom_dep", "year", *lag_sources]]
    .rename(columns=lag_sources)
    .assign(year=lambda frame: frame["year"] + 1)
)

# Drop any lag columns left by a previous run so re-running this cell does not
# create "_x"/"_y" duplicates.
climate_features = climate_features.drop(columns=lag_cols, errors="ignore").merge(
    previous_year, on=["nom_dep", "year"], how="left"
)

# model_df was built before the lags existed, so add them to it as well;
# otherwise they never reach the table that gets saved.
model_df = model_df.drop(columns=lag_cols, errors="ignore").merge(
    climate_features[["nom_dep", "year", *lag_cols]],
    on=["nom_dep", "year"],
    how="left",
)

In [None]:
import os

# Write the modelling table to the outputs folder, creating it if needed.
output_dir = "outputs"
output_file = "outputs/model_table_annual.parquet"

os.makedirs(output_dir, exist_ok=True)
model_df.to_parquet(output_file, index=False)

print("File saved: outputs/model_table_annual.parquet")