In [3]:
import pandas as pd
import numpy as np
from pathlib import Path

# 1) Point to your original file (read-only)
CSV = Path(r"../italy_core_data/n_italy_core_42_cells.csv")

# 2) Load and standardize column names in memory
df0 = pd.read_csv(CSV)
df = df0.copy()
df.columns = [c.strip().lower() for c in df.columns]
rename = {}
if "latitude" in df.columns and "lat" not in df.columns: rename["latitude"] = "lat"
if "longitude" in df.columns and "lon" not in df.columns: rename["longitude"] = "lon"
if "long" in df.columns and "lon" not in df.columns: rename["long"] = "lon"
df = df.rename(columns=rename)

# Keep only lat/lon and ensure numeric
assert {"lat","lon"} <= set(df.columns), f"Expected columns lat, lon. Got: {list(df.columns)}"
df = df[["lat","lon"]].copy()
df["lat"] = pd.to_numeric(df["lat"], errors="coerce")
df["lon"] = pd.to_numeric(df["lon"], errors="coerce")

# 3) Validations (handles 0.25°-offset 0.5° grids)

def infer_offset_deg(series, step=0.5):
    """Infer whether coords are aligned to n*step (+ 0) or n*step (+ step/2)."""
    frac = (series / step) % 1.0  # in [0,1)
    m = np.median(frac)
    # decide between 0.00 and 0.50
    return 0.0 if abs(m - 0.0) < 0.25 else step/2  # ~0.25 deg

def off_grid_with_offset(series, offset_deg, step=0.5, tol=5e-4):
    resid = ((series - offset_deg) / step) - np.round((series - offset_deg) / step)
    return (np.abs(resid) > tol).sum()

lat_off = infer_offset_deg(df["lat"])
lon_off = infer_offset_deg(df["lon"])

n_raw = len(df)
n_nan = int(df.isna().sum().sum())
n_unique = len(df.drop_duplicates(["lat","lon"]))

off_lat = off_grid_with_offset(df["lat"], lat_off)
off_lon = off_grid_with_offset(df["lon"], lon_off)

latmin, latmax = df["lat"].min(), df["lat"].max()
lonmin, lonmax = df["lon"].min(), df["lon"].max()

print(f"Rows: {n_raw}")
print(f"Unique (lat,lon) pairs: {n_unique}")
print(f"Missing values total: {n_nan}")
print(f"Inferred grid offsets: lat {lat_off:.2f}°, lon {lon_off:.2f}° (step=0.5°)")
print(f"Off-grid counts: lat {off_lat}, lon {off_lon}")
print(f"BBox: lat [{latmin}, {latmax}], lon [{lonmin}, {lonmax}]")

# Hard checks
assert n_nan == 0, "Found missing lat/lon values."
assert n_unique == 42, f"Expected 42 unique cells; found {n_unique}."
assert off_lat == 0 and off_lon == 0, "Some coordinates are not aligned to the inferred 0.5° grid."

# 4) Save canonical copies (original CSV untouched)
OUTDIR = CSV.parent / "derived"
OUTDIR.mkdir(exist_ok=True)

df_unique = (
    df.drop_duplicates(["lat","lon"])
      .sort_values(["lat","lon"])
      .reset_index(drop=True)
)

# (a) Verbatim copy: exactly as in the source
# df_unique.to_parquet(OUTDIR / "mask_core_42.parquet", index=False)
df_unique.to_csv(OUTDIR / "mask_core_42.csv", index=False)

# (b) Normalized copy: snapped to the inferred 0.5° grid (useful for exact joins later)
def snap_to_grid(series, offset_deg, step=0.5):
    return offset_deg + step * np.round((series - offset_deg) / step)

df_norm = df_unique.copy()
df_norm["lat"] = snap_to_grid(df_norm["lat"], lat_off)
df_norm["lon"] = snap_to_grid(df_norm["lon"], lon_off)
df_norm.to_csv(OUTDIR / "mask_core_42_normalized.csv", index=False)

print(f"Saved canonical copies to: {OUTDIR}")


Rows: 42
Unique (lat,lon) pairs: 42
Missing values total: 0
Inferred grid offsets: lat 0.25°, lon 0.25° (step=0.5°)
Off-grid counts: lat 0, lon 0
BBox: lat [43.75, 46.75], lon [6.75, 12.25]
Saved canonical copies to: ..\italy_core_data\derived


In [4]:
from pathlib import Path
import xarray as xr
import pandas as pd
import numpy as np
import json

# --- Update these two paths ---
GDHY_DIR = Path(r"..\data\maize")  # folder with yield_YYYY.nc4
CORE_DIR = Path(r"..\italy_core_data")  # folder with your CSV
# ------------------------------

# choose a representative file (use 1982 since 1981 is excluded downstream)
gdhyds = xr.open_dataset(GDHY_DIR / "yield_1982.nc4")

# extract grid coordinates (names should be 'lat'/'lon' in GDHY)
lat = gdhyds["lat"].values
lon = gdhyds["lon"].values

# Quick grid report
print(f"GDHY grid: lat n={lat.size}, lon n={lon.size}")
print(f"lat range [{lat.min()}, {lat.max()}]")
print(f"lon range [{lon.min()}, {lon.max()}]")

# detect 0.5° step & 0.25° (or 0.00°) offset
def infer_step_and_offset(arr, guess_step=0.5):
    step = float(np.round(np.median(np.diff(arr)), 3))
    # infer offset relative to multiples of step
    frac = (arr / guess_step) % 1.0
    m = float(np.median(frac))
    offset = 0.0 if abs(m - 0.0) < 0.25 else guess_step/2
    return step, offset

lat_step, lat_off = infer_step_and_offset(lat)
lon_step, lon_off = infer_step_and_offset(lon)

print(f"Inferred: step lat={lat_step}°, lon={lon_step}°; offsets lat={lat_off}°, lon={lon_off}°")

# Save a tiny "target grid" artifact we’ll reuse later (coords only)
DERIVED = CORE_DIR / "derived"
DERIVED.mkdir(exist_ok=True)

grid_meta = {
    "source": "GDHY maize",
    "file_used": str((GDHY_DIR / "yield_1982.nc4").resolve()),
    "lat_len": int(lat.size),
    "lon_len": int(lon.size),
    "lat_min": float(lat.min()),
    "lat_max": float(lat.max()),
    "lon_min": float(lon.min()),
    "lon_max": float(lon.max()),
    "step_lat": lat_step,
    "step_lon": lon_step,
    "offset_lat": lat_off,
    "offset_lon": lon_off,
}
with open(DERIVED / "grid_gdhy_0p5.json", "w") as f:
    json.dump(grid_meta, f, indent=2)

# also save a NetCDF with just the 1D coords (handy for regridders)
xr.Dataset(coords={"lat": (["lat"], lat), "lon": (["lon"], lon)}).to_netcdf(DERIVED / "grid_gdhy_0p5.nc")

print(f"Saved grid meta & coords to: {DERIVED}")

# ---- Verify the 42 cells sit on this grid and get their indices ----
mask42 = pd.read_csv(DERIVED / "mask_core_42_normalized.csv")  # from Step 0
# helper to convert lat/lon to integer indices using the inferred step/offset
def coord_to_index(val, arr, offset, step):
    # compute expected index analytically, then clip to bounds
    idx = int(round((val - offset) / step))
    # safety: ensure it maps to the actual array position
    idx = int(np.clip(idx, 0, len(arr)-1))
    # final sanity: if exact match fails due to tiny float drift, snap to nearest
    if not np.isclose(arr[idx], val, atol=5e-4):
        idx = int(np.argmin(np.abs(arr - val)))
    return idx

mask42["ilat"] = mask42["lat"].apply(lambda v: coord_to_index(v, lat, lat_off, lat_step))
mask42["ilon"] = mask42["lon"].apply(lambda v: coord_to_index(v, lon, lon_off, lon_step))

# check round-trip accuracy
lat_miss = (~np.isclose(lat[mask42["ilat"]], mask42["lat"], atol=5e-4)).sum()
lon_miss = (~np.isclose(lon[mask42["ilon"]], mask42["lon"], atol=5e-4)).sum()

print(f"Index mapping checks → lat mismatches: {lat_miss}, lon mismatches: {lon_miss}")
assert lat_miss == 0 and lon_miss == 0, "Some 42 cells do not map cleanly onto the GDHY grid."

# save the index mapping for later quick subsetting (new file; originals untouched)
mask42[["lat","lon","ilat","ilon"]].to_csv(DERIVED / "mask_core_42_on_gdhy.csv", index=False)
print(f"Wrote 42-cell index mapping: {DERIVED/'mask_core_42_on_gdhy.csv'}")


GDHY grid: lat n=360, lon n=720
lat range [-89.75, 89.75]
lon range [0.25, 359.75]
Inferred: step lat=0.5°, lon=0.5°; offsets lat=0.25°, lon=0.25°
Saved grid meta & coords to: ..\italy_core_data\derived
Index mapping checks → lat mismatches: 0, lon mismatches: 0
Wrote 42-cell index mapping: ..\italy_core_data\derived\mask_core_42_on_gdhy.csv


In [7]:
from pathlib import Path
import xarray as xr
import pandas as pd
import numpy as np

# --- Update these two paths (same as you used in Step 1) ---
GDHY_DIR = Path(r"..\data\maize")   # folder with yield_YYYY.nc4
CORE_DIR = Path(r"..\italy_core_data")
# -----------------------------------------------------------

DERIVED = CORE_DIR / "derived"
DERIVED.mkdir(exist_ok=True)

# 1) Load the 42-cell mapping (created in Step 1). We’ll also sort for deterministic order.
m42 = (
    pd.read_csv(DERIVED / "mask_core_42_on_gdhy.csv")
      .sort_values(["lat","lon"])
      .reset_index(drop=True)
)
ilat = m42["ilat"].to_numpy()
ilon = m42["ilon"].to_numpy()
latc = m42["lat"].to_numpy()
lonc = m42["lon"].to_numpy()

# 2) Helper to open one year's GDHY file and return a 1x42 vector
def load_yield_vector(year: int) -> np.ndarray:
    f = GDHY_DIR / f"yield_{year}.nc4"
    ds = xr.open_dataset(f)
    # find the yield variable (often 'var')
    varname = [v for v in ds.data_vars][0]
    da = ds[varname].squeeze()  # -> (lat, lon) or (lat, lon) after dropping singleton time
    # ensure we index in (lat, lon) order
    if set(("lat","lon")).issubset(da.dims):
        da = da.transpose("lat","lon")
    else:
        raise ValueError(f"{f} does not have expected lat/lon dims; got {da.dims}")
    arr = da.values  # shape [nlat, nlon]
    # advanced indexing: pairs of (ilat[k], ilon[k]) -> (42,)
    vals = arr[ilat, ilon]
    return vals.astype("float32")

# 3) Loop years 1982–2016 (exclude 1981)
years = list(range(1982, 2017))
rows = []
for y in years:
    rows.append(load_yield_vector(y))
data = np.vstack(rows)  # shape (35, 42)

# 4) Build xarray object with coords and save
yield_da = xr.DataArray(
    data,
    dims=("year", "cell"),
    coords={
        "year": years,
        "cell": np.arange(len(lonc)),
        "lat": ("cell", latc),
        "lon": ("cell", lonc),
    },
    name="yield_maize",  # tonnes per hectare (per GDHY)
)
yield_ds = yield_da.to_dataset()

# Write compact NetCDF (compressed)
encoding = {"yield_maize": {"zlib": True, "complevel": 4}}
out_nc = DERIVED / "yield_maize_core42_1982_2016.nc"
yield_ds.to_netcdf(out_nc, encoding=encoding)

# Also write a tidy CSV (rows = 42*35) for quick inspection
out_csv = DERIVED / "yield_maize_core42_1982_2016.csv"

# Build a tidy/long table; xarray includes 'lat' and 'lon' already
df_long = yield_da.to_dataframe(name="yield_maize").reset_index()

# Keep columns and sort
df_long = df_long[["lat", "lon", "year", "yield_maize"]].sort_values(["lat", "lon", "year"])

df_long.to_csv(out_csv, index=False)

In [9]:
from pathlib import Path
import xarray as xr
import pandas as pd
import numpy as np

# ---- paths (edit ERA5_DIR to your location) ----
ERA5_DIR = Path(r"..\data\climate_monthly_full")        # <-- folder with your .grib files
CORE_DIR = Path(r"..\italy_core_data")          # same as before
DERIVED  = CORE_DIR / "derived"
DERIVED.mkdir(exist_ok=True)

# ---- load artifacts from prior steps ----
m42 = pd.read_csv(DERIVED / "mask_core_42_on_gdhy.csv").sort_values(["lat","lon"]).reset_index(drop=True)
ilat, ilon = m42["ilat"].to_numpy(), m42["ilon"].to_numpy()
latc, lonc = m42["lat"].to_numpy(), m42["lon"].to_numpy()

grid_ds = xr.open_dataset(DERIVED / "grid_gdhy_0p5.nc")  # has 1D lat/lon for GDHY
lat_g, lon_g = grid_ds["lat"].values, grid_ds["lon"].values

# ---- helpers ----
def find_year_file(year: int, folder: Path) -> Path:
    cand = folder / f"era5_land_{year}.grib"
    if cand.exists():
        return cand
    # fallback: any file that contains the year in the name
    matches = list(folder.glob(f"*{year}*.grib"))
    if not matches:
        raise FileNotFoundError(f"No GRIB for year {year} in {folder}")
    return matches[0]

def open_era5_t2m_year(year: int) -> xr.DataArray:
    f = find_year_file(year, ERA5_DIR)
    ds = xr.open_dataset(
        f,
        engine="cfgrib",
        backend_kwargs=dict(indexpath="", filter_by_keys={"shortName": "2t"}),
        decode_timedelta=True,  # <- silences the FutureWarning
    )
    ds = ds.rename({"latitude": "lat", "longitude": "lon"})
    if ds.lat[0] > ds.lat[-1]:
        ds = ds.sortby("lat")
    if ds.lon.min() < 0:
        ds = ds.assign_coords(lon=((ds.lon % 360 + 360) % 360)).sortby("lon")
    da = ds["t2m"] - 273.15  # K -> °C
    return da

def bin_to_half_degree(da: xr.DataArray, step=0.5, offset=0.25) -> xr.DataArray:
    # Compute target bin centers as plain numpy arrays (not DataArrays)
    lat_bins = (offset + step * np.round((da["lat"].values - offset) / step)).astype(np.float64)
    lon_bins = (offset + step * np.round((da["lon"].values - offset) / step)).astype(np.float64)

    # Attach as coordinates aligned to the respective dimensions
    da = da.assign_coords(lat_bin=("lat", lat_bins), lon_bin=("lon", lon_bins))

    # Block-average by those bins
    da_c = da.groupby("lat_bin").mean("lat").groupby("lon_bin").mean("lon")
    da_c = da_c.rename({"lat_bin": "lat", "lon_bin": "lon"})
    return da_c

def seasonal_mean_mjjas(da_monthly_05: xr.DataArray) -> xr.DataArray:
    """Select May–Sep and mean over months (per year)."""
    sel = da_monthly_05.where(da_monthly_05["time"].dt.month.isin([5,6,7,8,9]), drop=True)
    out = sel.groupby("time.year").mean("time")
    return out.rename({"year": "year"})

# ---- process per year to keep memory low ----
years = list(range(1982, 2016 + 1))  # exclude 1981
pieces = []
for y in years:
    # 1) open monthly t2m for the year
    da_m = open_era5_t2m_year(y)  # dims: time, lat, lon (0.1°)

    # 2) regrid monthly to 0.5° block means
    da_m05 = bin_to_half_degree(da_m)  # dims: time, lat(0.5), lon(0.5)

    # 3) seasonal (May–Sep) mean for that year
    da_y = seasonal_mean_mjjas(da_m05)  # dims: year(=1), lat, lon

    # 4) align lat/lon exactly to GDHY grid (nearest) so indices match
    da_y = da_y.sel(lat=lat_g, lon=lon_g, method="nearest")

    pieces.append(da_y)

# ---- stack into one (year, lat, lon) cube on the GDHY grid ----
t2m_mjjas = xr.concat(pieces, dim="year")
t2m_mjjas = t2m_mjjas.assign_coords(year=("year", years))
t2m_mjjas.name = "t2m_MJJAS_C"

# ---- subset to the 42 cells by GDHY indices (vectorized) ----
t2m_42 = t2m_mjjas.isel(
    lat=xr.DataArray(ilat, dims="cell"),
    lon=xr.DataArray(ilon, dims="cell"),
)
# add lat/lon as coords on 'cell'
t2m_42 = t2m_42.assign_coords(cell=("cell", np.arange(len(ilat))))
t2m_42 = t2m_42.assign_coords(lat=("cell", latc), lon=("cell", lonc))

# ---- save outputs ----
out_nc = DERIVED / "t2m_MJJAS_core42_1982_2016.nc"
enc = {"t2m_MJJAS_C": {"zlib": True, "complevel": 4}}
t2m_42.to_dataset(name="t2m_MJJAS_C").to_netcdf(out_nc, encoding=enc)

out_csv = DERIVED / "t2m_MJJAS_core42_1982_2016.csv"
df_t = t2m_42.to_dataframe(name="temperature").reset_index()
df_t = df_t[["lat","lon","year","temperature"]].sort_values(["lat","lon","year"])
df_t.to_csv(out_csv, index=False)

# ---- QA ----
print(f"Saved: {out_nc.name}, {out_csv.name}  →  {DERIVED}")
print(f"Shape: years={t2m_42.sizes['year']} (expect 35), cells={t2m_42.sizes['cell']} (expect 42)")
print(f"NANs: {int(np.isnan(t2m_42.values).sum())}")
print(f"Year range: {t2m_42.year.values.min()}–{t2m_42.year.values.max()}")
print(f"Lat range: {t2m_42.lat.values.min()}–{t2m_42.lat.values.max()} | Lon range: {t2m_42.lon.values.min()}–{t2m_42.lon.values.max()}")


Saved: t2m_MJJAS_core42_1982_2016.nc, t2m_MJJAS_core42_1982_2016.csv  →  ..\italy_core_data\derived
Shape: years=35 (expect 35), cells=42 (expect 42)
NANs: 0
Year range: 1982–2016
Lat range: 43.75–46.75 | Lon range: 6.75–12.25


In [10]:
from pathlib import Path
import pandas as pd
import numpy as np

# --- paths ---
DERIVED = Path(r"..\italy_core_data\derived")
yield_csv = DERIVED / "yield_maize_core42_1982_2016.csv"
temp_csv  = DERIVED / "t2m_MJJAS_core42_1982_2016.csv"
out_csv   = DERIVED / "maize_ITnorth_core42_1982_2016.csv"

# --- load ---
y = pd.read_csv(yield_csv)   # cols: lat, lon, year, yield_maize
t = pd.read_csv(temp_csv)    # cols: lat, lon, year, temperature

# --- normalize dtypes ---
for df, name in [(y,"yield"), (t,"temp")]:
    # ensure keys exist
    assert {"lat","lon","year"}.issubset(df.columns), f"{name}: missing join keys"
    # consistent dtypes
    df["year"] = df["year"].astype(int)
    df["lat"]  = df["lat"].astype(float)
    df["lon"]  = df["lon"].astype(float)

# --- quick sanity on each input ---
def quick_report(df, label):
    n = len(df)
    nunique = df[["lat","lon","year"]].drop_duplicates().shape[0]
    n_nans = int(df.isna().sum().sum())
    print(f"{label}: rows={n}, unique_keys={nunique}, NaNs={n_nans}")
    assert n == 42*35, f"{label}: expected 1470 rows, got {n}"
    assert nunique == n, f"{label}: duplicate (lat,lon,year) rows found"
    assert n_nans == 0, f"{label}: contains missing values"

quick_report(y, "Yield")
quick_report(t, "Temperature")

# --- merge on exact keys ---
df = y.merge(t, on=["lat","lon","year"], how="inner")

# --- final checks ---
print(f"Merged rows: {len(df)} (expected 1470)")
assert len(df) == 42*35, "Merge did not produce 1470 rows—check key alignment"
assert int(df.isna().sum().sum()) == 0, "Merged table has missing values"

# order & save
df = df[["lat","lon","year","yield_maize","temperature"]].sort_values(["lat","lon","year"]).reset_index(drop=True)
df.to_csv(out_csv, index=False)

print(f"Saved combined dataset → {out_csv}")
print("Columns:", list(df.columns))
print(df.head(8))

Yield: rows=1470, unique_keys=1470, NaNs=0
Temperature: rows=1470, unique_keys=1470, NaNs=0
Merged rows: 1470 (expected 1470)
Saved combined dataset → ..\italy_core_data\derived\maize_ITnorth_core42_1982_2016.csv
Columns: ['lat', 'lon', 'year', 'yield_maize', 'temperature']
     lat    lon  year  yield_maize  temperature
0  43.75  11.75  1982     9.251022    18.613775
1  43.75  11.75  1983     9.876991    18.254950
2  43.75  11.75  1984    10.552830    16.382141
3  43.75  11.75  1985    10.486564    18.763770
4  43.75  11.75  1986    12.218849    18.099987
5  43.75  11.75  1987    11.250535    18.253310
6  43.75  11.75  1988    11.657367    18.235193
7  43.75  11.75  1989    10.706320    17.116800


In [1]:
from pathlib import Path
import xarray as xr
import pandas as pd
import numpy as np

# ------------ paths (edit if needed) ------------
ERA5_DIR = Path(r"..\data\climate_monthly_full")   # contains era5_land_monthly_YYYY.grib
CORE_DIR = Path(r"..\italy_core_data")
DERIVED  = CORE_DIR / "derived"
DERIVED.mkdir(exist_ok=True)

base_csv = DERIVED / "maize_ITnorth_core42_1982_2016.csv"  # yield + temperature (from Step 4)

# ------------ prior artifacts ------------
m42 = pd.read_csv(DERIVED / "mask_core_42_on_gdhy.csv").sort_values(["lat","lon"]).reset_index(drop=True)
latc, lonc = m42["lat"].to_numpy(), m42["lon"].to_numpy()

# Small safety margin so binning catches the edge cells
BBOX = dict(
    lat_min=float(latc.min() - 0.5),
    lat_max=float(latc.max() + 0.5),
    lon_min=float(lonc.min() - 0.5),
    lon_max=float(lonc.max() + 0.5),
)

years = list(range(1982, 2016 + 1))
season_months = [5,6,7,8,9]  # MJJAS

# ------------ helpers ------------
def find_year_file(year: int) -> Path:
    cand = ERA5_DIR / f"era5_land_monthly_{year}.grib"
    if cand.exists():
        return cand
    matches = list(ERA5_DIR.glob(f"*{year}*.grib"))
    if not matches:
        raise FileNotFoundError(f"No GRIB for year {year} in {ERA5_DIR}")
    return matches[0]

def open_era5_var_year(year: int, short: str, varname: str, bbox: dict) -> xr.DataArray:
    """Open one ERA5-Land variable for a year, standardize coords, crop to Italy bbox, return DA."""
    f = find_year_file(year)
    ds = xr.open_dataset(
        f,
        engine="cfgrib",
        backend_kwargs=dict(indexpath="", filter_by_keys={"shortName": short}),
        decode_timedelta=True,
    ).rename({"latitude":"lat","longitude":"lon"})

    # Ensure ascending latitude and 0..360 longitudes
    if ds.lat[0] > ds.lat[-1]:
        ds = ds.sortby("lat")
    if ds.lon.min() < 0:
        ds = ds.assign_coords(lon=((ds.lon % 360 + 360) % 360)).sortby("lon")

    # Crop to Italy bbox BEFORE regridding (massive speed-up)
    ds = ds.sel(lat=slice(bbox["lat_min"], bbox["lat_max"]),
                lon=slice(bbox["lon_min"], bbox["lon_max"]))

    da = ds[varname]
    if "expver" in da.dims:
        da = da.isel(expver=-1)  # pick analysis member if present

    return da  # dims: time, lat, lon (regional)

def bin_to_half_degree(da: xr.DataArray, step=0.5, offset=0.25) -> xr.DataArray:
    """Block-average 0.1° to 0.5° by labeling to nearest 0.25 + 0.5*k centers, then mean."""
    lat_bins = (offset + step * np.round((da["lat"].values - offset) / step)).astype(np.float64)
    lon_bins = (offset + step * np.round((da["lon"].values - offset) / step)).astype(np.float64)
    da = da.assign_coords(lat_bin=("lat", lat_bins), lon_bin=("lon", lon_bins))
    da_c = da.groupby("lat_bin").mean("lat").groupby("lon_bin").mean("lon")
    return da_c.rename({"lat_bin":"lat","lon_bin":"lon"})  # dims: time, lat(0.5), lon(0.5)

def seasonal_mean(da_m05: xr.DataArray) -> xr.DataArray:
    """Mean over MJJAS months (for state variables)."""
    sel = da_m05.where(da_m05["time"].dt.month.isin(season_months), drop=True)
    return sel.groupby("time.year").mean("time").rename({"year":"year"})

def seasonal_total_from_daily_means(da_m05: xr.DataArray) -> xr.DataArray:
    """
    ERA5 Monthly Averaged fluxes are DAILY means.
    Convert to monthly totals by multiplying by days_in_month, then sum MJJAS.
    """
    sel = da_m05.where(da_m05["time"].dt.month.isin(season_months), drop=True)
    days = xr.DataArray(
        sel["time"].dt.days_in_month,
        coords={"time": sel["time"]},
        dims=["time"]
    ).astype(np.float64)
    return (sel * days).groupby("time.year").sum("time").rename({"year":"year"})

def process_var(short: str, xr_name: str, out_col: str, how: str, unit_conv=None, post_conv=None):
    """
    short/xr_name : GRIB key & xarray variable name (often the same)
    out_col       : output column name
    how           : 'mean' (state) or 'sum' (flux) over season
    unit_conv     : function applied to the **monthly** field (e.g., K->°C, m->mm, sign flips)
    post_conv     : function applied after seasonal reduce (e.g., convert J -> MJ)
    """
    pieces = []
    for y in years:
        da = open_era5_var_year(y, short, xr_name, BBOX)   # regional 0.1°
        if unit_conv is not None:
            da = unit_conv(da)                             # per-month conversion
        da05 = bin_to_half_degree(da)                      # regional 0.5°

        if how == "mean":
            ya = seasonal_mean(da05)                       # (year, lat, lon)
        elif how == "sum":
            ya = seasonal_total_from_daily_means(da05)    # (year, lat, lon)
        else:
            raise ValueError("how must be 'mean' or 'sum'")

        # Select the exact same 42 cells by value (no global expansion)
        ya42 = ya.sel(lat=xr.DataArray(latc, dims="cell"),
                      lon=xr.DataArray(lonc, dims="cell"),
                      method="nearest")
        ya42 = ya42.assign_coords(cell=("cell", np.arange(len(latc))),
                                  lat=("cell", latc), lon=("cell", lonc))
        pieces.append(ya42)

        if (y - years[0]) % 5 == 0:
            print(f"{out_col}: processed {y}", flush=True)

    da_all = xr.concat(pieces, dim="year").assign_coords(year=("year", years))

    if post_conv is not None:
        da_all = post_conv(da_all)

    out_nc  = DERIVED / f"{out_col}_MJJAS_core42_1982_2016.nc"
    out_csv = DERIVED / f"{out_col}_MJJAS_core42_1982_2016.csv"
    da_all.to_dataset(name=xr_name).to_netcdf(out_nc, encoding={xr_name: {"zlib": True, "complevel": 4}})
    df = da_all.to_dataframe(name=out_col).reset_index()[["lat","lon","year",out_col]]
    df = df.sort_values(["lat","lon","year"])
    df.to_csv(out_csv, index=False)
    print(f"Saved {out_col}: {out_csv.name}  (rows={len(df)})")
    return df

# ------------ recompute all stressors with correct handling ------------
# precipitation: m -> mm (per month), then hours-weighted seasonal total
df_tp   = process_var(short="tp",    xr_name="tp",    out_col="precipitation",
                      how="sum",
                      unit_conv=lambda da: da * 1000.0,   # mm per hour-mean
                      post_conv=None)

# soil water layer 1: mean MJJAS (m3/m3), no hours weighting
df_swvl = process_var(short="swvl1", xr_name="swvl1", out_col="soil_water",
                      how="mean",
                      unit_conv=None,
                      post_conv=None)

# solar radiation: J/m² per hour-mean -> hours-weighted seasonal total
# (optional: convert to MJ/m² for readability with post_conv=lambda da: da/1e6)
df_ssr  = process_var(short="ssr",   xr_name="ssr",   out_col="solar_radiation",
                      how="sum",
                      unit_conv=None,
                      post_conv=None)  # or post_conv=lambda da: da/1e6

# potential evaporation: m (negative) -> **positive mm**, then hours-weighted seasonal total
df_pev  = process_var(short="pev",   xr_name="pev",   out_col="potential_evaporation",
                      how="sum",
                      unit_conv=lambda da: -da * 1000.0,  # flip sign & mm per hour-mean
                      post_conv=None)

# ------------ merge with existing yield+temperature ------------
base = pd.read_csv(base_csv)  # lat, lon, year, yield_maize, temperature
df = (
    base.merge(df_tp,   on=["lat","lon","year"], how="inner")
        .merge(df_swvl, on=["lat","lon","year"], how="inner")
        .merge(df_ssr,  on=["lat","lon","year"], how="inner")
        .merge(df_pev,  on=["lat","lon","year"], how="inner")
        .sort_values(["lat","lon","year"])
        .reset_index(drop=True)
)

# Final checks and save
assert len(df) == 42*35, f"Expected 1470 rows, got {len(df)}"
assert int(df.isna().sum().sum()) == 0, "Merged table has NaNs"

final_csv = DERIVED / "maize_ITnorth_core42_1982_2016_allstressors.csv"
df.to_csv(final_csv, index=False)

print("\nFinal table saved →", final_csv)
print("Columns:", list(df.columns))
print(df.head(8))


precipitation: processed 1982
precipitation: processed 1987
precipitation: processed 1992
precipitation: processed 1997
precipitation: processed 2002
precipitation: processed 2007
precipitation: processed 2012
Saved precipitation: precipitation_MJJAS_core42_1982_2016.csv  (rows=1470)
soil_water: processed 1982
soil_water: processed 1987
soil_water: processed 1992
soil_water: processed 1997
soil_water: processed 2002
soil_water: processed 2007
soil_water: processed 2012
Saved soil_water: soil_water_MJJAS_core42_1982_2016.csv  (rows=1470)
solar_radiation: processed 1982
solar_radiation: processed 1987
solar_radiation: processed 1992
solar_radiation: processed 1997
solar_radiation: processed 2002
solar_radiation: processed 2007
solar_radiation: processed 2012
Saved solar_radiation: solar_radiation_MJJAS_core42_1982_2016.csv  (rows=1470)
potential_evaporation: processed 1982
potential_evaporation: processed 1987
potential_evaporation: processed 1992
potential_evaporation: processed 1997
po

In [2]:
from pathlib import Path
import xarray as xr
import pandas as pd
import numpy as np

# ---------- paths ----------
ERA5_DIR = Path(r"..\data\climate_monthly_full")   # folder with era5_land_monthly_YYYY.grib
CORE_DIR = Path(r"..\italy_core_data")
DERIVED  = CORE_DIR / "derived"
DERIVED.mkdir(exist_ok=True)

base_csv = DERIVED / "maize_ITnorth_core42_1982_2016_allstressors.csv"  # seasonal file (already created)
out_csv  = DERIVED / "maize_ITnorth_core42_1982_2016_allstressors_with_monthly.csv"

# ---------- constants ----------
years = list(range(1982, 2016 + 1))
months = [5, 6, 7, 8, 9]  # May–Sep
mon_name = {5: "May", 6: "Jun", 7: "Jul", 8: "Aug", 9: "Sep"}

# ---------- 42-cell mask (values) for exact selection ----------
m42 = pd.read_csv(DERIVED / "mask_core_42_on_gdhy.csv").sort_values(["lat","lon"]).reset_index(drop=True)
latc, lonc = m42["lat"].to_numpy(), m42["lon"].to_numpy()

# Small bbox so we only process Italy region
BBOX = dict(
    lat_min=float(latc.min() - 0.5),
    lat_max=float(latc.max() + 0.5),
    lon_min=float(lonc.min() - 0.5),
    lon_max=float(lonc.max() + 0.5),
)

# ---------- helpers ----------
def find_year_file(year: int) -> Path:
    cand = ERA5_DIR / f"era5_land_monthly_{year}.grib"
    if cand.exists():
        return cand
    matches = list(ERA5_DIR.glob(f"*{year}*.grib"))
    if not matches:
        raise FileNotFoundError(f"No GRIB for year {year} in {ERA5_DIR}")
    return matches[0]

def open_era5_var_year(year: int, short: str, varname: str, bbox: dict) -> xr.DataArray:
    f = find_year_file(year)
    ds = xr.open_dataset(
        f,
        engine="cfgrib",
        backend_kwargs=dict(indexpath="", filter_by_keys={"shortName": short}),
        decode_timedelta=True,
    ).rename({"latitude":"lat","longitude":"lon"})
    # ensure ascending lat, 0..360 lon, and crop
    if ds.lat[0] > ds.lat[-1]:
        ds = ds.sortby("lat")
    if ds.lon.min() < 0:
        ds = ds.assign_coords(lon=((ds.lon % 360 + 360) % 360)).sortby("lon")
    ds = ds.sel(lat=slice(bbox["lat_min"], bbox["lat_max"]),
                lon=slice(bbox["lon_min"], bbox["lon_max"]))
    da = ds[varname]
    if "expver" in da.dims:
        da = da.isel(expver=-1)
    return da  # dims: time, lat, lon

def bin_to_half_degree(da: xr.DataArray, step=0.5, offset=0.25) -> xr.DataArray:
    """Block-average 0.1° to 0.5°: label to nearest 0.25+0.5*k centers, then mean."""
    lat_bins = (offset + step * np.round((da["lat"].values - offset) / step)).astype(np.float64)
    lon_bins = (offset + step * np.round((da["lon"].values - offset) / step)).astype(np.float64)
    da = da.assign_coords(lat_bin=("lat", lat_bins), lon_bin=("lon", lon_bins))
    da_c = da.groupby("lat_bin").mean("lat").groupby("lon_bin").mean("lon")
    return da_c.rename({"lat_bin":"lat","lon_bin":"lon"})  # dims: time, lat(0.5), lon(0.5)

def monthly_means(da_m05: xr.DataArray) -> xr.DataArray:
    """Return monthly means for MJJAS (state variables)."""
    return da_m05.where(da_m05["time"].dt.month.isin(months), drop=True)

def monthly_totals_from_daily_means(da_m05: xr.DataArray) -> xr.DataArray:
    """For ERA5 Monthly Averaged fluxes: daily mean × days_in_month -> monthly totals."""
    sel = da_m05.where(da_m05["time"].dt.month.isin(months), drop=True)
    days = xr.DataArray(sel["time"].dt.days_in_month, coords={"time": sel["time"]}, dims=["time"]).astype(np.float64)
    return sel * days

def make_monthly_wide(short: str, xr_name: str, out_col: str, how: str, unit_conv=None):
    """
    Build a wide dataframe with columns {out_col}_May … {out_col}_Sep.
    how = 'mean' for state vars; 'total' for flux vars.
    unit_conv is applied to the **monthly** field prior to means/totals (e.g., K->°C; m->mm and sign flip).
    """
    rows = []
    for y in years:
        da = open_era5_var_year(y, short, xr_name, BBOX)
        if unit_conv is not None:
            da = unit_conv(da)
        da05 = bin_to_half_degree(da)

        if how == "mean":
            dam = monthly_means(da05)                 # time (MJJAS), lat, lon
        elif how == "total":
            dam = monthly_totals_from_daily_means(da05)
        else:
            raise ValueError("how must be 'mean' or 'total'")

        # select the 42 cells by value (nearest to GDHY centers)
        sub = dam.sel(lat=xr.DataArray(latc, dims="cell"),
                      lon=xr.DataArray(lonc, dims="cell"),
                      method="nearest")
        sub = sub.assign_coords(cell=("cell", np.arange(len(latc))),
                                lat=("cell", latc), lon=("cell", lonc))

        df = sub.to_dataframe(name=out_col).reset_index()
        df["year"] = pd.to_datetime(df["time"]).dt.year
        df["month"] = pd.to_datetime(df["time"]).dt.month
        df = df[df["month"].isin(months)].copy()
        df["month_name"] = df["month"].map(mon_name)
        rows.append(df[["lat","lon","year","month_name",out_col]])

        if (y - years[0]) % 5 == 0:
            print(f"{out_col}: processed {y}", flush=True)

    df_all = pd.concat(rows, ignore_index=True)
    # pivot to wide: columns = month names
    wide = df_all.pivot_table(index=["lat","lon","year"], columns="month_name", values=out_col, aggfunc="first").reset_index()
    # ensure month column order and rename with prefix
    month_cols = ["May","Jun","Jul","Aug","Sep"]
    for m in month_cols:
        if m not in wide.columns:
            wide[m] = np.nan
    wide = wide[["lat","lon","year"] + month_cols]
    wide = wide.rename(columns={m: f"{out_col}_{m}" for m in month_cols})
    return wide

# ---------- build monthly tables ----------
# temperature: K -> °C, monthly means
df_t2m_mon = make_monthly_wide(short="2t", xr_name="t2m", out_col="temperature",
                               how="mean", unit_conv=lambda da: da - 273.15)

# soil water: monthly means (m3/m3)
df_swvl_mon = make_monthly_wide(short="swvl1", xr_name="swvl1", out_col="soil_water",
                                how="mean", unit_conv=None)

# precipitation: m -> mm, daily mean × days -> monthly totals
df_tp_mon = make_monthly_wide(short="tp", xr_name="tp", out_col="precipitation",
                              how="total", unit_conv=lambda da: da * 1000.0)

# solar radiation: J/m² daily mean × days -> monthly totals (kept in J/m²)
df_ssr_mon = make_monthly_wide(short="ssr", xr_name="ssr", out_col="solar_radiation",
                               how="total", unit_conv=None)

# potential evaporation: m -> **mm** and flip sign to positive, daily mean × days -> monthly totals
df_pev_mon = make_monthly_wide(short="pev", xr_name="pev", out_col="potential_evaporation",
                               how="total", unit_conv=lambda da: -da * 1000.0)

# ---------- merge with seasonal base and save ----------
base = pd.read_csv(base_csv)  # lat, lon, year, seasonal columns already present

enriched = (
    base.merge(df_t2m_mon, on=["lat","lon","year"], how="inner")
        .merge(df_tp_mon,   on=["lat","lon","year"], how="inner")
        .merge(df_swvl_mon, on=["lat","lon","year"], how="inner")
        .merge(df_ssr_mon,  on=["lat","lon","year"], how="inner")
        .merge(df_pev_mon,  on=["lat","lon","year"], how="inner")
        .sort_values(["lat","lon","year"])
        .reset_index(drop=True)
)

# sanity checks
assert len(enriched) == 42*35, f"Expected 1470 rows, got {len(enriched)}"
assert int(enriched.isna().sum().sum()) == 0, "Found NaNs in enriched dataset"

enriched.to_csv(out_csv, index=False)
print(f"Saved enriched dataset with monthly columns → {out_csv}")
print("Columns:", list(enriched.columns)[:12], "...")  # preview

temperature: processed 1982
temperature: processed 1987
temperature: processed 1992
temperature: processed 1997
temperature: processed 2002
temperature: processed 2007
temperature: processed 2012
soil_water: processed 1982
soil_water: processed 1987
soil_water: processed 1992
soil_water: processed 1997
soil_water: processed 2002
soil_water: processed 2007
soil_water: processed 2012
precipitation: processed 1982
precipitation: processed 1987
precipitation: processed 1992
precipitation: processed 1997
precipitation: processed 2002
precipitation: processed 2007
precipitation: processed 2012
solar_radiation: processed 1982
solar_radiation: processed 1987
solar_radiation: processed 1992
solar_radiation: processed 1997
solar_radiation: processed 2002
solar_radiation: processed 2007
solar_radiation: processed 2012
potential_evaporation: processed 1982
potential_evaporation: processed 1987
potential_evaporation: processed 1992
potential_evaporation: processed 1997
potential_evaporation: proces