In [1]:
import pandas as pd

# Load the raw barley yield table (semicolon-delimited CSV).
df_yield = pd.read_csv("../raw_data/barley_yield_from_1982.csv", sep=";")

# Normalise the headers: trim, lowercase, collapse whitespace runs into "_",
# then drop every character outside [a-z0-9_].
df_yield = df_yield.set_axis(
    df_yield.columns.str.strip()
    .str.lower()
    .str.replace(r"\s+", "_", regex=True)
    .str.replace(r"[^a-z0-9_]", "", regex=True),
    axis="columns",
)

# Coerce the numeric fields; anything that cannot be parsed becomes NaN.
df_yield = df_yield.assign(
    **{
        numeric_col: pd.to_numeric(df_yield[numeric_col], errors="coerce")
        for numeric_col in ("year", "yield")
    }
)

# Use the same department column name as the climate table ("nom_dep").
df_yield = df_yield.rename(columns={"department": "nom_dep"})

# Modelling subset: only rows where department, year and yield are all present.
yield_df = df_yield.loc[:, ["nom_dep", "year", "yield"]].dropna()
yield_df.head()

Unnamed: 0,nom_dep,year,yield
0,Ain,1982,3.95008
2,Ain,1984,4.82258
3,Ain,1985,4.19677
4,Ain,1986,3.59845
5,Ain,1987,4.92


In [3]:
# Load the raw climate table.
df_climate = pd.read_parquet("../raw_data/climate_data_from_1982.parquet")

# Normalise the headers: trim, lowercase, collapse whitespace runs into "_",
# then drop every character outside [a-z0-9_].
df_climate = df_climate.set_axis(
    df_climate.columns.str.strip()
    .str.lower()
    .str.replace(r"\s+", "_", regex=True)
    .str.replace(r"[^a-z0-9_]", "", regex=True),
    axis="columns",
)

# Parse the timestamps, then derive the calendar year and month from them.
df_climate = df_climate.assign(time=pd.to_datetime(df_climate["time"]))
df_climate = df_climate.assign(
    year=df_climate["time"].dt.year,
    month=df_climate["time"].dt.month,
)

# Keep only the "historical" scenario for training.
clim = df_climate.loc[df_climate["scenario"].eq("historical")].copy()

# Retain just the join keys, the metric label and its value.
clim = clim.loc[:, ["nom_dep", "year", "month", "metric", "value"]]
clim.head()

Unnamed: 0,nom_dep,year,month,metric,value
29950,Ain,1982,1,near_surface_air_temperature,276.292267
29951,Ain,1982,1,near_surface_air_temperature,275.092407
29952,Ain,1982,1,near_surface_air_temperature,276.418518
29953,Ain,1982,1,near_surface_air_temperature,278.893372
29954,Ain,1982,1,near_surface_air_temperature,278.887207


In [4]:
# Split the long-format climate table by metric family, using the metric name.
# `regex=False` makes this a plain substring test, and `na=False` treats rows
# with a missing metric name as non-matching instead of putting NaN in the mask
# (which boolean indexing rejects).
# NOTE(review): every metric whose name contains "temperature" ends up in `temp`.
# If the source holds more than one such metric (e.g. a daily mean and a daily
# maximum), they are pooled in every aggregate computed from `temp` — confirm
# this is intended.
temp = clim[clim["metric"].str.contains("temperature", regex=False, na=False)]
precip = clim[clim["metric"].str.contains("precip", regex=False, na=False)]

In [5]:
# Yearly temperature statistics per department: mean, standard deviation, max.
temp_year = (
    temp.groupby(["nom_dep", "year"])
    .agg(
        temp_mean=("value", "mean"),
        temp_std=("value", "std"),
        temp_max=("value", "max"),
    )
    .reset_index()
)

# Growing-season temperature statistics (March through July, both inclusive).
temp_growing = (
    temp.loc[(temp["month"] >= 3) & (temp["month"] <= 7)]
    .groupby(["nom_dep", "year"])
    .agg(
        temp_growing_mean=("value", "mean"),
        temp_growing_max=("value", "max"),
    )
    .reset_index()
)

In [6]:
# Yearly precipitation per department: sum of all values in the year.
precip_year = (
    precip.groupby(["nom_dep", "year"])["value"]
    .sum()
    .reset_index(name="precip_total")
)

# Growing-season precipitation (March through July, both inclusive).
precip_growing = (
    precip.loc[(precip["month"] >= 3) & (precip["month"] <= 7)]
    .groupby(["nom_dep", "year"])["value"]
    .sum()
    .reset_index(name="precip_growing_total")
)

In [7]:
# Assemble one feature row per (department, year). The yearly temperature table
# is the base; the other tables are attached with left joins, so a key missing
# from any of them leaves NaN in that table's columns.
climate_features = (
    temp_year
    .merge(temp_growing, how="left", on=["nom_dep", "year"])
    .merge(precip_year, how="left", on=["nom_dep", "year"])
    .merge(precip_growing, how="left", on=["nom_dep", "year"])
)

climate_features.head()

Unnamed: 0,nom_dep,year,temp_mean,temp_std,temp_max,temp_growing_mean,temp_growing_max,precip_total,precip_growing_total
0,Ain,1982,285.130951,7.939871,305.540894,287.565948,304.145508,0.027494,0.01272
1,Ain,1983,283.978882,8.053026,302.047577,286.859497,302.047577,0.029012,0.012507
2,Ain,1984,284.348541,7.855242,302.437683,287.628204,302.437683,0.027781,0.013384
3,Ain,1985,283.904816,8.142635,301.748352,286.708191,301.748352,0.034206,0.013498
4,Ain,1986,284.867218,7.463179,303.979248,287.412842,302.967255,0.023955,0.010336


In [8]:
# Attach the climate features to every yield observation. The left join keeps
# all yield rows; rows with no matching (department, year) in the climate
# features get NaN in the climate columns.
model_df = pd.merge(
    yield_df,
    climate_features,
    how="left",
    on=["nom_dep", "year"],
)

print(model_df.shape)
model_df.head()

(3336, 10)


Unnamed: 0,nom_dep,year,yield,temp_mean,temp_std,temp_max,temp_growing_mean,temp_growing_max,precip_total,precip_growing_total
0,Ain,1982,3.95008,285.130951,7.939871,305.540894,287.565948,304.145508,0.027494,0.01272
1,Ain,1984,4.82258,284.348541,7.855242,302.437683,287.628204,302.437683,0.027781,0.013384
2,Ain,1985,4.19677,283.904816,8.142635,301.748352,286.708191,301.748352,0.034206,0.013498
3,Ain,1986,3.59845,284.867218,7.463179,303.979248,287.412842,302.967255,0.023955,0.010336
4,Ain,1987,4.92,284.075043,7.831433,302.777557,286.147888,300.959045,0.035535,0.016205


In [9]:
# Share of missing values per column, largest first, displayed as a percentage.
missing = model_df.isna().mean().sort_values(ascending=False)
display(missing.mul(100).to_frame(name="% missing"))

# Quick look at how each numeric column correlates with yield.
display(
    model_df.corr(numeric_only=True)
    .loc[:, "yield"]
    .sort_values(ascending=False)
)

Unnamed: 0,% missing
temp_std,15.017986
temp_mean,15.017986
precip_total,15.017986
temp_growing_max,15.017986
temp_max,15.017986
temp_growing_mean,15.017986
precip_growing_total,15.017986
nom_dep,0.0
year,0.0
yield,0.0


yield                   1.000000
temp_max                0.374915
temp_growing_max        0.357006
year                    0.351016
temp_growing_mean       0.166467
temp_std                0.147938
temp_mean               0.137344
precip_total           -0.201019
precip_growing_total   -0.224965
Name: yield, dtype: float64

In [1]:
# Climate lag features: value of the previous calendar year, per department.
#
# A bare positional `shift(1)` returns the previous *row* of the department,
# which is the previous year only when the rows are sorted by year and no year
# is missing. Here the rows are ordered by year first, and the shifted value is
# kept only where the preceding row is exactly one year earlier; otherwise the
# lag is NaN. The results are written back by index alignment, so the row order
# of `climate_features` is left untouched and re-running the cell is harmless.
#
# NOTE(review): `model_df` is merged from `climate_features` before this cell
# and is the table that gets exported, so these lag columns are not part of it
# unless that merge is re-run afterwards — confirm whether they should be.
_by_dep = climate_features.sort_values(["nom_dep", "year"]).groupby("nom_dep")
_prev_row_is_prev_year = _by_dep["year"].diff().eq(1)

climate_features["temp_mean_lag1"] = (
    _by_dep["temp_mean"].shift(1).where(_prev_row_is_prev_year)
)
climate_features["precip_total_lag1"] = (
    _by_dep["precip_total"].shift(1).where(_prev_row_is_prev_year)
)

NameError: name 'climate_features' is not defined

In [11]:
import os

# Ensure the output directory exists; an already-existing directory is fine.
os.makedirs(name="outputs", exist_ok=True)

# Write the modelling table as parquet, without the DataFrame index.
model_df.to_parquet(
    path="outputs/model_table_annual.parquet",
    index=False,
)

print("File saved: outputs/model_table_annual.parquet")

File saved: outputs/model_table_annual.parquet
