In [None]:
# === STEP 7.1 — USER INPUTS ===
from pathlib import Path

# Folder with your final CSVs.
# Built from path components rather than a back-slashed raw string: on POSIX a
# backslash is an ordinary file-name character, so r"..\a\b" would name a single
# (non-existent) entry instead of two nested directories. On Windows both
# spellings resolve to the same location.
DERIVED = Path("..") / "italy_core_data" / "derived"

# >>> Pick ONE of your datasets to prepare for modeling <<<
# Examples:
#INPUT_CSV = DERIVED / "maize_ITnorth_core42_1982_2016_allstressors_with_monthly.csv"
#INPUT_CSV = DERIVED / "rice_ITnorth_core41_1982_2016_allstressors_with_monthly.csv"
#INPUT_CSV = DERIVED / "soybean_ITnorth_core41_1982_2016_allstressors_with_monthly.csv"
#INPUT_CSV = DERIVED / "wheat_spring_ITnorth_core42_1982_2016_allstressors_with_monthly.csv"
#INPUT_CSV = DERIVED / "wheat_winter_ITnorth_core42_1982_2016_allstressors_with_monthly.csv"
#INPUT_CSV = DERIVED / "wheat_both_ITnorth_core42_1982_2016_allstressors_with_monthly_long.csv"

# ← choose one:
INPUT_CSV = DERIVED / "maize_ITnorth_core42_1982_2016_allstressors_with_monthly.csv"

# Where to save the modeling table.
# parents=True so the cell also succeeds when DERIVED itself has not been created yet.
OUT_DIR = DERIVED / "modeling"
OUT_DIR.mkdir(parents=True, exist_ok=True)


In [None]:
import pandas as pd
import numpy as np
import json

# -------- read the chosen dataset --------
df = pd.read_csv(INPUT_CSV)

# The yield column is named differently per dataset, so locate it by its
# "yield_" prefix and insist that exactly one candidate exists.
yield_cols = [name for name in df.columns if name.startswith("yield_")]
assert len(yield_cols) == 1, f"Expected one yield_* column, found {yield_cols}"
YIELD_COL = yield_cols[0]

# Panel keys are mandatory; a wheat_type column marks the stacked long wheat table.
assert {"lat","lon","year"} <= set(df.columns)
has_type = "wheat_type" in df.columns

# -------- cell_id: one integer per distinct (lat, lon) pair --------
# IDs follow the (lat, lon) sort order, so the same grid gets the same
# numbering on every run.
cells = df[["lat","lon"]].drop_duplicates()
cells = cells.sort_values(["lat","lon"]).reset_index(drop=True)
cells["cell_id"] = np.arange(len(cells))

df = pd.merge(df, cells, how="left", on=["lat","lon"])

# -------- outcome: log yield --------
# The log is only defined for strictly positive yields. The comparison is
# written as "min > 0" so that a NaN minimum also lands in the error branch.
min_y = df[YIELD_COL].min()
if min_y > 0:
    df["log_yield"] = np.log(df[YIELD_COL].values)
else:
    raise ValueError(f"{YIELD_COL} has non-positive values (min={min_y}). Cannot log-transform safely.")

def twoway_demean(s, ids, years, max_iter=100, tol=1e-10):
    """Remove additive cell and year effects from ``s`` (two-way within transform).

    Group means by ``ids`` and by ``years`` are subtracted alternately until the
    cell means of the residual are negligible. On a balanced panel a single
    sweep already reproduces the closed form ``s - cell_mean - year_mean +
    grand_mean``. On an unbalanced panel (missing cell-years, or NaN values in
    ``s``, which the group means skip) that closed form leaves non-zero group
    means, so the sweeps are repeated until they vanish.

    Parameters
    ----------
    s : pandas.Series
        Values to demean.
    ids, years : pandas.Series
        Grouping keys for the two effects, aligned with ``s``.
    max_iter : int, default 100
        Upper bound on the number of cell/year sweeps.
    tol : float, default 1e-10
        Convergence threshold on the largest absolute cell mean, relative to
        the largest absolute deviation of ``s`` from its mean.

    Returns
    -------
    pandas.Series
        Float residual with the same index as ``s``; NaN inputs stay NaN.
    """
    resid = s.astype(float) - s.mean()

    # Scale the tolerance to the data so raw yields and log yields converge alike.
    scale = resid.abs().max()
    limit = tol * scale if scale > 0 else tol

    for _ in range(max_iter):
        resid = resid - resid.groupby(ids).transform("mean")
        resid = resid - resid.groupby(years).transform("mean")
        # Year means are zero right after the year sweep, so only the cell
        # means can still be off. "not (x > limit)" also stops on NaN (empty input).
        drift = resid.groupby(ids).transform("mean").abs().max()
        if not (drift > limit):
            break
    return resid

# Two-way demeaned outcomes: raw yield first, then log yield.
for source_col, fe_col in ((YIELD_COL, "yield_fe"), ("log_yield", "log_yield_fe")):
    df[fe_col] = twoway_demean(df[source_col], df["cell_id"], df["year"])

# -------- seasonal predictors (raw values are kept next to the derived ones) --------
seasonal_vars = ["temperature","precipitation","soil_water","solar_radiation","potential_evaporation"]
missing = [name for name in seasonal_vars if name not in df.columns]
assert not missing, f"Missing seasonal columns: {missing}"

# Aridity index = precipitation / potential evaporation. Exact zeros in the
# denominator are turned into NaN first so the ratio is NaN rather than inf.
pev = df["potential_evaporation"].replace({0: np.nan})
df["aridity_index"] = df["precipitation"].div(pev)

# z-scores use the population std (ddof=0) over every row of the chosen dataset.
# The mean/std pairs are collected so they can be written to the JSON sidecar.
z_stats = {}
for col in [*seasonal_vars, "aridity_index"]:
    values = df[col].astype(float)
    center = float(values.mean())
    spread = float(values.std(ddof=0))
    # A constant (std == 0) or undefined (std is NaN) column gets an all-zero
    # z-score instead of a division by zero.
    degenerate = np.isnan(spread) or spread == 0
    df[f"{col}_z"] = (
        pd.Series(np.zeros(len(values)), index=values.index, dtype=float)
        if degenerate
        else (values - center) / spread
    )
    z_stats[col] = {"mean": center, "std": spread}
params = {"standardization": z_stats}

# Mean-centred copies (original units) of the seasonal variables, for interaction terms.
for col in seasonal_vars:
    df[f"{col}_c"] = df[col].sub(z_stats[col]["mean"])

# -------- light integrity checks (reported, not enforced) --------
key_cols = ["lat","lon","year"]
if has_type:
    key_cols.append("wheat_type")
core_cols = ["log_yield","yield_fe","log_yield_fe", *seasonal_vars]

n_rows = df.shape[0]
dups = df.duplicated(subset=key_cols).sum()
nans = int(df[core_cols].isna().to_numpy().sum())

print(f"Rows: {n_rows}")
print(f"Duplicate keys: {dups} (expect 0)")
print(f"NaNs in core fields: {nans} (expect 0)")
print("Yield col:", YIELD_COL)

# -------- save the table and its parameter sidecar --------
# Output names reuse the input stem with the "_allstressors..." marker removed.
stem = INPUT_CSV.stem
for marker in ("_allstressors_with_monthly", "_allstressors"):
    stem = stem.replace(marker, "")
out_csv  = OUT_DIR / f"{stem}_modeling.csv"
out_json = OUT_DIR / f"{stem}_modeling_params.json"

df.to_csv(out_csv, index=False)

run_info = {
    "input_file": str(INPUT_CSV),
    "yield_col": YIELD_COL,
    "n_rows": n_rows,
    "n_cells": int(cells.shape[0]),
    "years": {"min": int(df["year"].min()), "max": int(df["year"].max())},
    "has_wheat_type": bool(has_type),
}
run_info.update(params)
with out_json.open("w") as fh:
    json.dump(run_info, fh, indent=2)

print("\nSaved:")
print(" •", out_csv)
print(" •", out_json)

# Preview: keys, the four outcome columns, then every z-scored predictor.
show_cols = ["lat","lon","cell_id","year"]
if has_type:
    show_cols.append("wheat_type")
show_cols += [YIELD_COL,"log_yield","yield_fe","log_yield_fe"]
show_cols += [f"{col}_z" for col in [*seasonal_vars, "aridity_index"]]
print("\nPreview:")
print(df[show_cols].head(8))


In [None]:
# === STEP 7.2 — Baseline seasonal multiple regression (clustered by cell) ===
from pathlib import Path
import pandas as pd
import numpy as np
import statsmodels.api as sm

# Built from path components: a back-slashed raw string is one file name on
# POSIX, while on Windows both spellings resolve to the same folder.
DERIVED = Path("..") / "italy_core_data" / "derived"
MODEL_DIR = DERIVED / "modeling"
MODEL_DIR.mkdir(parents=True, exist_ok=True)

# Pick the modeling CSV you just created in 7.1 (maize here, but any dataset works)
model_csv = MODEL_DIR / "maize_ITnorth_core42_1982_2016_modeling.csv"
df = pd.read_csv(model_csv)

# Columns
y = "log_yield_fe"   # two-way demeaned log yield
X_base = ["temperature_z","precipitation_z","soil_water_z","solar_radiation_z","potential_evaporation_z"]

# Add a simple interaction (centered vars so main effects are interpretable at avg climate)
df["temp_x_precip"] = (df["temperature_c"] * df["precipitation_c"]).astype(float)
X = X_base + ["temp_x_precip"]

# Estimation sample: complete cases only. This is done up front, rather than
# with OLS(missing="drop"), because the cluster labels handed over in cov_kwds
# are not filtered along with the rows; one dropped row would leave them
# misaligned with the data actually used in the fit.
est = df.dropna(subset=[y, "cell_id"] + X)
n_dropped = len(df) - len(est)
if n_dropped:
    print(f"Dropped {n_dropped} rows with missing values before fitting")

# Build design matrix
Xmat = sm.add_constant(est[X].astype(float))
yvec = est[y].astype(float)

# Fit OLS with cluster-robust SE by cell
model = sm.OLS(yvec, Xmat)
res = model.fit(cov_type="cluster", cov_kwds={"groups": est["cell_id"]})

# Print compact summary
print(res.summary())

# Tidy coefficient table with clustered SE and 95% CI.
# Term labels come from the fitted parameters themselves, so they always match
# the design matrix. The interval comes from the results object, so it uses the
# same reference distribution as the reported p-values.
ci = res.conf_int(alpha=0.05)
coefs = pd.DataFrame({
    "term": list(res.params.index),
    "estimate": res.params.values,
    "std_error": res.bse.values,
    "t_value": res.tvalues.values,
    "p_value": res.pvalues.values,
    "ci_lo": ci.iloc[:, 0].values,
    "ci_hi": ci.iloc[:, 1].values,
})

# NOTE(review): the output name is fixed to "maize"; rename it when fitting another dataset.
out_coef = MODEL_DIR / "maize_baseline_coefficients.csv"
coefs.to_csv(out_coef, index=False)

# Simple fit diagnostics
print("\nN =", int(res.nobs), "| R-squared (within-FE spec proxy):", f"{res.rsquared:.3f}")
print("Saved coef table →", out_coef)

# Optional quick sanity: group means of yield_fe should be ~0
cell_means = df.groupby("cell_id")["yield_fe"].mean().abs().max()
year_means = df.groupby("year")["yield_fe"].mean().abs().max()
print(f"Max |mean(yield_fe)| by cell: {cell_means:.3e} ; by year: {year_means:.3e}")
