In [460]:
import os
import re
from math import sqrt
import pandas as pd
import numpy as np
import geopandas as gpd
from shapely import wkt
from pysal.lib import weights
from pysal.explore import esda
import matplotlib.pyplot as plt
import seaborn as sns
import contextily
from spreg import OLS_Regimes
from pysal.model import spreg
import statsmodels.formula.api as smf
import contextily as ctx
from libpysal.weights import KNN
from spreg import OLS
from libpysal.weights import lag_spatial
from spreg import GM_Lag
from libpysal.weights import Kernel
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import RandomizedSearchCV, TimeSeriesSplit
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# Spacial Regression

--> Embed location through the model

## Load data

In [461]:
well_filter = 1

current_dir = os.getcwd()
path = os.path.join(current_dir, '../../data/aligned', f"merged_dataset_{well_filter}.csv")
df = pd.read_csv(path)
len(df)

192

In [462]:
df = df[df["soil region"] != "veen"]
len(df)

175

## Sort by date

In [463]:
# sort by date
df["date"] = pd.to_datetime(df["date"])
df = df.sort_values("date").reset_index(drop=True)

In [464]:
df['year_month'] = df['date'].dt.to_period('M')  # year + month (e.g., 2012-03)

# Count number of data points per year-month
counts = df['year_month'].value_counts().sort_index()
print(counts)

year_month
2012-02     7
2012-03    23
2012-04     4
2012-06     7
2012-10     2
2013-10     8
2014-09     6
2015-06    37
2015-07     1
2015-08     1
2015-09     1
2016-09     6
2017-08     2
2017-12     6
2018-05    40
2018-06     4
2018-08     6
2019-07     8
2020-05     2
2020-07     4
Freq: M, Name: count, dtype: int64


  df['year_month'] = df['date'].dt.to_period('M')  # year + month (e.g., 2012-03)


## Remove outlier based on "Soil region"

In [465]:
print(df["soil region"].value_counts())

soil region
zand    132
klei     43
Name: count, dtype: int64


## Add month and year as features

In [466]:
# df["month"] = df["date"].dt.month
df["year"] = df["date"].dt.year

## Log transform

In [467]:
df["nitrate"] = np.log1p(df["nitrate"])

## Remove columns

In [468]:
df = df.drop(columns=["date", 'bro-id', 'lon', 'lat'])
print(df.columns)
len(df)

Index(['nitrate', 'geometry', 'soil region', 'landuse code', 'population',
       'groundwater depth', 'elevation', 'precipitation', 'temperature',
       'n deposition', 'mainsoilclassification_1', 'organicmattercontent_1',
       'density_1', 'acidity_1', 'year_month', 'year'],
      dtype='object')


175

## Change column type

In [469]:
# convert to categorical
predefined_categories = {
    "soil region": None,
    "landuse code": None,
    "mainsoilclassification_1": None 
}


for col, fixed_categories in predefined_categories.items():
    if col in df.columns:
        categories = sorted(df[col].dropna().unique().tolist())
        
        df[col] = pd.Categorical(df[col], categories=categories)

In [470]:
df.dtypes

nitrate                       float64
geometry                       object
soil region                  category
landuse code                 category
population                    float64
groundwater depth             float64
elevation                     float64
precipitation                 float64
temperature                   float64
n deposition                  float64
mainsoilclassification_1     category
organicmattercontent_1        float64
density_1                     float64
acidity_1                     float64
year_month                  period[M]
year                            int32
dtype: object

## Remove spatial outlier

In [471]:
# GMW000000024093

# target_well = ['GAR000000000307', 'GAR000000000308', 'GAR000000000309']
# df = df.drop(df[df["bro-id"].isin(target_well)].index)
# len(df)

## Preprocess (alternative)

In [472]:
def split_and_preprocess(df, cols_to_drop, holdout_cols = None):
    if holdout_cols is None:
        holdout_cols = []

    df = df.drop(columns=cols_to_drop)
    df = df.dropna()
    len(df)

    n = len(df)
    train_size = int(n * 0.7)

    train_df = df.iloc[:train_size]
    test_df = df.iloc[train_size:]

    print(f"Train: {len(train_df)}")
    print(f"Test: {len(test_df)}")

    holdout_train = train_df[holdout_cols].copy()
    holdout_test = test_df[holdout_cols].copy()

    X_train_raw = train_df.drop(columns=["nitrate", "year"] + holdout_cols).copy()
    y_train = train_df["nitrate"].values.reshape(-1, 1)

    X_test_raw = test_df.drop(columns=["nitrate", "year"] + holdout_cols).copy()
    y_test = test_df["nitrate"].values.reshape(-1, 1)

    categorical_cols = X_train_raw.select_dtypes(include="category").columns.tolist()
    numerical_cols = X_train_raw.select_dtypes(include=["float64", "int64"]).columns.tolist()

    # construct a transformer
    transformers = []

    if categorical_cols:
        transformers.append(
            ("cat_ohe", OneHotEncoder(handle_unknown="ignore", drop='first', sparse_output=False), categorical_cols)
        )

    if numerical_cols:
        transformers.append(
            ("num_scaler", StandardScaler(), numerical_cols)
        )

    preprocessor = ColumnTransformer(transformers)

    print(X_train_raw)
    X_train = preprocessor.fit_transform(X_train_raw)
    X_test = preprocessor.transform(X_test_raw)

    # extract feature names
    if categorical_cols:
        cat_ohe = preprocessor.named_transformers_["cat_ohe"]
        ohe_feature_names = cat_ohe.get_feature_names_out(categorical_cols).tolist()
    else:
        ohe_feature_names = []

    if numerical_cols:
        num_feature_names = numerical_cols.copy()
    else:
        numerical_cols = []
    
    feature_names = ohe_feature_names + num_feature_names
    print(feature_names)

    # convert back to pd Dataframe
    X_train = pd.DataFrame(X_train, columns=feature_names, index=X_train_raw.index)
    X_test = pd.DataFrame(X_test, columns=feature_names, index=X_test_raw.index)
    y_train = pd.DataFrame(y_train, columns=["nitrate"], index=X_train_raw.index)
    y_test = pd.DataFrame(y_test, columns=["nitrate"], index=X_test_raw.index)

    print(X_train.shape)

    X_train = pd.concat([X_train, holdout_train], axis=1)
    X_test = pd.concat([X_test, holdout_test], axis=1)

    return X_train, y_train, X_test, y_test, feature_names, preprocessor

## Simple OLS regression

In [473]:
cols_to_drop_1 = ['landuse code', 'soil region']
X_train, y_train, X_test, y_test, feature_names, preprocessor = split_and_preprocess(df, cols_to_drop_1)

Train: 82
Test: 36
                             geometry  population  groundwater depth  \
0    POINT (5.600793941 51.962453539)        40.0           6.838178   
1    POINT (5.601761312 51.976104861)        30.0           6.838178   
3    POINT (5.206717551 52.113087483)         0.0           3.067872   
4    POINT (5.187673269 52.052827185)        15.0           0.842530   
7    POINT (5.273031977 52.149856205)         0.0           3.071355   
..                                ...         ...                ...   
114  POINT (5.299047893 52.146637386)        40.0           3.058056   
115  POINT (5.273031977 52.149856205)         0.0           3.058056   
116  POINT (5.189137388 52.107199019)      1345.0           0.707292   
118  POINT (5.221186316 52.116677057)         0.0           3.057153   
120  POINT (5.249927484 52.073041222)       100.0           0.705625   

     elevation  precipitation  temperature  n deposition  \
0    12.048532           8.75    29.466667     27.383685

In [474]:
print(feature_names)

ols_model = spreg.OLS(y_train, X_train, name_y='nitrate', name_x=feature_names)
print(ols_model.summary)

['mainsoilclassification_1_Kalkloze zandgronden', 'mainsoilclassification_1_Moerige gronden', 'mainsoilclassification_1_Podzolgronden', 'mainsoilclassification_1_Rivierkleigronden', 'population', 'groundwater depth', 'elevation', 'precipitation', 'temperature', 'n deposition', 'organicmattercontent_1', 'density_1', 'acidity_1']
REGRESSION RESULTS
------------------

SUMMARY OF OUTPUT: ORDINARY LEAST SQUARES
-----------------------------------------
Data set            :     unknown
Weights matrix      :        None
Dependent Variable  :     nitrate                Number of Observations:          82
Mean dependent var  :      1.0552                Number of Variables   :          14
S.D. dependent var  :      1.0128                Degrees of Freedom    :          68
R-squared           :      0.7292
Adjusted R-squared  :      0.6774
Sum squared residual:     22.5001                F-statistic           :     14.0864
Sigma-square        :       0.331                Prob(F-statistic)     

In [475]:
beta = ols_model.betas.flatten()

X_test_with_const = np.hstack([np.ones((X_test.shape[0], 1)), X_test])

y_pred = X_test_with_const @ beta

print("Test R2:", r2_score(np.expm1(y_test),  np.expm1(y_pred)))
print("Test MAE:",mean_absolute_error(np.expm1(y_test), np.expm1(y_pred)))
print("Test RMSE:", sqrt(mean_squared_error(np.expm1(y_test), np.expm1(y_pred))))

Test R2: 0.8305987289082357
Test MAE: 1.2536285248504342
Test RMSE: 1.7301847938141008


## Learning Curve

In [476]:
def learning_curve(X_train, y_train):
    print("Creating learning curve...")

    n = len(X_train)

    train_errors = []
    val_errors = []

    train_sizes = np.linspace(0.1, 1.0, 10)

    for frac in train_sizes:
        split_idx = int(n * frac)

        X_subset = X_train.iloc[:split_idx]
        y_subset = y_train.iloc[:split_idx]

        tscv = TimeSeriesSplit(n_splits=7)
        fold_train_scores = []
        fold_val_scores = []

        for train_idx, val_idx in tscv.split(X_subset):
            X_tr, X_val = X_subset.iloc[train_idx], X_subset.iloc[val_idx]
            y_tr, y_val = y_subset.iloc[train_idx], y_subset.iloc[val_idx]

            X_tr = preprocessor.fit_transform(X_tr)
            X_val = preprocessor.transform(X_val)

            y_tr = y_tr.values.reshape(-1, 1)
            y_val = y_val.values.reshape(-1, 1)
            
            # create fresh model
            curr_model = OLS(y_tr, X_tr, name_y='nitrate', name_x=feature_names)

            beta = curr_model.betas.flatten()
            X_train_with_const = np.hstack([np.ones((X_tr.shape[0], 1)), X_tr])
            X_val_with_const = np.hstack([np.ones((X_val.shape[0], 1)), X_val])

            y_tr_pred = X_train_with_const @ beta
            y_val_pred = X_val_with_const @ beta
            
            fold_train_scores.append(mean_absolute_error(np.expm1(y_tr), np.expm1(y_tr_pred)))
            fold_val_scores.append(mean_absolute_error(np.expm1(y_val), np.expm1(y_val_pred)))

        train_errors.append(np.mean(fold_train_scores))
        val_errors.append(np.mean(fold_val_scores))

    plt.plot(train_sizes * len(X_train), train_errors, label="Train MAE")
    plt.plot(train_sizes * len(X_train), val_errors, label="Validation MAE")
    plt.xlabel("Training Set Size")
    plt.ylabel("Train MAE")
    plt.title("learning curve")
    plt.legend()
    plt.grid(True)
    plt.show()

# def get_feature_names():
#     ohe = preprocessor.named_transformers_["cat_ohe"]
#     cat_features = ohe.get_feature_names_out()
#     return list(cat_features) + preprocessor.transformers_[1][2]  # numerical feature names

In [477]:
feature_names

['mainsoilclassification_1_Kalkloze zandgronden',
 'mainsoilclassification_1_Moerige gronden',
 'mainsoilclassification_1_Podzolgronden',
 'mainsoilclassification_1_Rivierkleigronden',
 'population',
 'groundwater depth',
 'elevation',
 'precipitation',
 'temperature',
 'n deposition',
 'organicmattercontent_1',
 'density_1',
 'acidity_1']

In [478]:
learning_curve(X_train, y_train)

Creating learning curve...


ValueError: A given column is not a column of the dataframe

## Spatial fixed effects

--> We influence constant term

--> Instead of assuming the dependent variable behaves uniformly over space, there are systematic effects following a geographical pattern

--> Constant term to vary geographically

--> **Other** elements of the regression are left untouched and hence apply **uniformly** across space

--> Each region gets its own intercept

--> Control for spatial heterogeneity

In [20]:
cols_to_drop_2 = ['landuse code']
holdout_cols = ["soil region"]

X_train, y_train, X_test, y_test, feature_names = split_and_preprocess(df, cols_to_drop_2, holdout_cols)

Train: 82
Test: 36
     population  groundwater depth  elevation  precipitation  temperature  \
0     -0.160085           1.477253   0.428715      -0.363449    -1.487296   
1     -0.199804           1.477253  -0.062626      -0.363449    -1.487296   
3     -0.318960          -0.181306  -0.457742      -0.497288    -1.498585   
4     -0.259382          -1.160235  -0.496156      -0.497288    -1.498585   
7     -0.318960          -0.179774  -0.175461      -0.492970    -1.503565   
..          ...                ...        ...            ...          ...   
114   -0.160085          -0.185624   0.380966      -0.514557    -0.069533   
115   -0.318960          -0.185624  -0.175461      -0.514557    -0.069533   
116    5.023197          -1.219726  -0.500833      -0.514557    -0.069533   
118   -0.318960          -0.186021  -0.229700      -0.410940     0.094489   
120    0.078226          -1.220459  -0.237608      -0.410940     0.094489   

     n deposition  organicmattercontent_1  density_1  ac

In [21]:
train_df_for_formula = X_train.copy()
train_df_for_formula['nitrate'] = y_train

test_df_for_formula = X_test.copy()
test_df_for_formula['nitrate'] = y_test

In [22]:
variable_names = ['population',
'groundwater depth',
'elevation',
'precipitation',
'temperature',
'n deposition',
'organicmattercontent_1',
'density_1',
'acidity_1']


formula = "nitrate ~ " + " + ".join([f"Q('{col}')" for col in variable_names]) + " + C(Q('soil region')) - 1"

In [23]:
model_2 = smf.ols(formula, data=train_df_for_formula).fit()

In [24]:
print(model_2.summary())

                            OLS Regression Results                            
Dep. Variable:                nitrate   R-squared:                       0.599
Model:                            OLS   Adj. R-squared:                  0.543
Method:                 Least Squares   F-statistic:                     10.62
Date:                Mon, 09 Jun 2025   Prob (F-statistic):           9.62e-11
Time:                        10:47:46   Log-Likelihood:                -79.405
No. Observations:                  82   AIC:                             180.8
Df Residuals:                      71   BIC:                             207.3
Df Model:                          10                                         
Covariance Type:            nonrobust                                         
                                  coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------------
C(Q('soil region')

#### Wells on **zand** soils have significantly **higher nitrate** concentrations, even when controlling for depth, population, temperature

--> The log nitrate concentration is, on average, 0.7 units lower in klei (clay) regions

In [25]:
# Step 1: Extract fixed effect names
sfe_names = [i for i in model_2.params.index if "C(Q('soil region'))" in i]

# Step 2: Extract soil region names using regex
soil_region_labels = []
for name in sfe_names:
    match = re.search(r"\[T\.(.*)\]", name)
    if match:
        soil_region_labels.append(match.group(1))
    else:
        # If no match, fallback to raw name (for safety)
        soil_region_labels.append(name)

# Step 3: Create DataFrame
soil_fixed_effects = pd.DataFrame({
    "soil_region": soil_region_labels,
    "Coef.": model_2.params[sfe_names],
    "Std. Error": model_2.bse[sfe_names],
    "P-Value": model_2.pvalues[sfe_names],
})

soil_fixed_effects

Unnamed: 0,soil_region,Coef.,Std. Error,P-Value
C(Q('soil region'))[klei],C(Q('soil region'))[klei],1.118401,0.206734,8.079357e-07
C(Q('soil region'))[zand],C(Q('soil region'))[zand],1.020628,0.129626,2.868208e-11


In [26]:
y_pred = model_2.predict(test_df_for_formula)

In [27]:
print("Test R2:", r2_score(np.expm1(y_test),  np.expm1(y_pred)))
print("Test MAE:",mean_absolute_error(y_test, y_pred))
print("Test RMSE:", sqrt(mean_squared_error(np.expm1(y_test), np.expm1(y_pred))))

Test R2: 0.4689043397499747
Test MAE: 0.638099480301184
Test RMSE: 3.0635186938003383


## Spatial Regimes

--> We modify set of coeficients depending on the location

--> Spatial regimes (SRs) is to generalize the spatial FE approach to allow not only the constant term to vary but also any other explanatory variable

--> We want the whole model — intercept and slopes — to vary between geographic groups

In [28]:
cols_to_drop_3 = ['landuse code']
holdout_cols = ['soil region']

X_train, y_train, X_test, y_test, feature_names = split_and_preprocess(df, cols_to_drop_3, holdout_cols)

Train: 82
Test: 36
     population  groundwater depth  elevation  precipitation  temperature  \
0     -0.160085           1.477253   0.428715      -0.363449    -1.487296   
1     -0.199804           1.477253  -0.062626      -0.363449    -1.487296   
3     -0.318960          -0.181306  -0.457742      -0.497288    -1.498585   
4     -0.259382          -1.160235  -0.496156      -0.497288    -1.498585   
7     -0.318960          -0.179774  -0.175461      -0.492970    -1.503565   
..          ...                ...        ...            ...          ...   
114   -0.160085          -0.185624   0.380966      -0.514557    -0.069533   
115   -0.318960          -0.185624  -0.175461      -0.514557    -0.069533   
116    5.023197          -1.219726  -0.500833      -0.514557    -0.069533   
118   -0.318960          -0.186021  -0.229700      -0.410940     0.094489   
120    0.078226          -1.220459  -0.237608      -0.410940     0.094489   

     n deposition  organicmattercontent_1  density_1  ac

In [29]:
variable_names = ['population',
'groundwater depth',
'elevation',
'precipitation',
'temperature',
'n deposition',
'organicmattercontent_1',
'density_1',
'acidity_1']

In [30]:
m5 = spreg.OLS_Regimes(
    # Dependent variable
    y_train.values,
    # Independent variables
    X_train[variable_names].values,
    # Variable specifying neighborhood membership
    X_train["soil region"].tolist(),
    # Allow the constant term to vary by group/regime
    constant_regi="many",
    # Allow separate sigma coefficients to be estimated
    # by regime (False so a single sigma)
    regime_err_sep=False,
    # Dependent variable name
    name_y="nitrate",
    # Independent variables names
    name_x=variable_names
)

In [31]:
m5.name_x

['klei_CONSTANT',
 'klei_population',
 'klei_groundwater depth',
 'klei_elevation',
 'klei_precipitation',
 'klei_temperature',
 'klei_n deposition',
 'klei_organicmattercontent_1',
 'klei_density_1',
 'klei_acidity_1',
 'zand_CONSTANT',
 'zand_population',
 'zand_groundwater depth',
 'zand_elevation',
 'zand_precipitation',
 'zand_temperature',
 'zand_n deposition',
 'zand_organicmattercontent_1',
 'zand_density_1',
 'zand_acidity_1']

In [32]:
print(m5.summary)

REGRESSION RESULTS
------------------

SUMMARY OF OUTPUT: ORDINARY LEAST SQUARES - REGIMES
---------------------------------------------------
Data set            :     unknown
Weights matrix      :        None
Dependent Variable  :     nitrate                Number of Observations:          82
Mean dependent var  :      1.0552                Number of Variables   :          20
S.D. dependent var  :      1.0128                Degrees of Freedom    :          62
R-squared           :      0.7338
Adjusted R-squared  :      0.6522
Sum squared residual:     22.1203                F-statistic           :      8.9946
Sigma-square        :       0.357                Prob(F-statistic)     :   1.586e-11
S.E. of regression  :       0.597                Log likelihood        :     -62.634
Sigma-square ML     :       0.270                Akaike info criterion :     165.267
S.E of regression ML:      0.5194                Schwarz criterion     :     213.402

----------------------------------------

In [33]:
res = pd.DataFrame({
    "Coeff.": m5.betas.flatten(),
    "Std. Error": m5.std_err.flatten(),
    "P-Value": [i[1] for i in m5.t_stat],
}, index=m5.name_x)

soil_types = df["soil region"].unique()

regime_tables = []
for soil in soil_types:
    matching_rows = [i for i in res.index if i.startswith(f"{soil}_")]
    regime_res = res.loc[matching_rows].rename(lambda i: i.replace(f"{soil}_", ""))
    regime_res.columns = pd.MultiIndex.from_product([[soil], regime_res.columns])
    regime_tables.append(regime_res)

comparison_table = pd.concat(regime_tables, axis=1)

comparison_table

Unnamed: 0_level_0,zand,zand,zand,klei,klei,klei
Unnamed: 0_level_1,Coeff.,Std. Error,P-Value,Coeff.,Std. Error,P-Value
CONSTANT,1.136407,0.238208,1.2e-05,1.180115,0.374247,0.002489
population,0.227914,0.089811,0.013686,0.537535,1.340136,0.689722
groundwater depth,0.682457,0.135373,4e-06,-0.243392,0.307713,0.431977
elevation,-0.045837,0.081234,0.574617,1.465052,1.016377,0.154491
precipitation,-0.188506,0.099233,0.062135,-0.083031,0.114566,0.471334
temperature,-0.21285,0.089033,0.019867,-0.110685,0.127641,0.389196
n deposition,-0.151544,0.102678,0.145029,-0.201811,0.383018,0.600146
organicmattercontent_1,-1.19104,0.575951,0.042823,0.017716,0.772354,0.981773
density_1,-1.064009,0.699219,0.133166,0.197899,0.348921,0.572644
acidity_1,-0.735358,0.159191,2e-05,-0.21852,0.221525,0.327754


CHOW test check if coefficients for each predictor is statistically different across the groups (klei vs zand)

In [34]:
# ‣ 1. Flatten the coefficient vector from the fitted model
beta_vec = m5.betas.flatten()  
#    shape = (R*(1 + len(variable_names)),)  

# ‣ 2. Recover the exact “regime order” that PySAL used internally.
#    m5.name_x is a list like ["clay_CONSTANT", "clay_population", …, "sand_CONSTANT", …, …].
regime_order = []
for nm in m5.name_x:
    soil = nm.split("_")[0]
    if soil not in regime_order:
        regime_order.append(soil)
#    Now regime_order = [ "clay", "sand", "peaty", … ] in the precise sequence PySAL stacked them.

R = len(regime_order)
K = len(variable_names)
n_test = X_test.shape[0]

# ‣ 3. Build the “extended” test‐matrix of size (n_test, R*(1+K)).
#    Column‐block layout: for regime i = 0..R−1,
#      column  i*(K+1)   = intercept_dummy_i  = (soil_region == regime_i ? 1 : 0)
#      columns i*(K+1)+1 ... i*(K+1)+K =  (soil_region == regime_i) *  X_test[var_k]
X_test_ext = np.zeros((n_test, R * (K + 1)))

for i, regime in enumerate(regime_order):
    # Boolean mask of shape (n_test,): 1 if this test‐row belongs to `regime`, else 0
    mask = (X_test["soil region"] == regime).astype(int).values

    # intercept dummy for regime i
    X_test_ext[:, i * (K + 1)] = mask

    # for each covariate k, multiply mask * X_test[var_k]
    for j, var in enumerate(variable_names):
        X_test_ext[:, i * (K + 1) + 1 + j] = mask * X_test[var].values

In [35]:
# ‣ 4. Form y_test as a 1D array of the log1p(nitrate) target
y_test_log = y_test["nitrate"].values.flatten()

# ‣ 5. Compute predicted log1p(nitrate) on the test set
y_pred_log = X_test_ext.dot(beta_vec)

# ‣ 6. Evaluate in log‐space (for example, RMSE in log1p scale)
mse_log = mean_squared_error(y_test_log, y_pred_log)
rmse_log = np.sqrt(mse_log)
print(f"Test RMSE (log1p‐nitrate): {rmse_log:.4f}")

# ‣ 7. If you also want an “original‐scale” error, back‐transform both sides:
y_test_orig = np.expm1(y_test_log)     # invert log1p
y_pred_orig = np.expm1(y_pred_log)

mse_orig = mean_squared_error(y_test_orig, y_pred_orig)
rmse_orig = np.sqrt(mse_orig)
print(f"Test RMSE (original‐nitrate): {rmse_orig:.4f}")

r2_log = r2_score(y_test_log, y_pred_log)
print(f"Test  R² (log1p‐nitrate): {r2_log:.4f}")

# 2) R² on the original‐nitrate scale
r2_orig = r2_score(y_test_orig, y_pred_orig)
print(f"Test  R² (original‐nitrate): {r2_orig:.4f}")

Test RMSE (log1p‐nitrate): 0.5832
Test RMSE (original‐nitrate): 2.2610
Test  R² (log1p‐nitrate): 0.5693
Test  R² (original‐nitrate): 0.7107


In [36]:
m5.chow.joint

(31.476053838398492, 0.0004893772098237195)

In [37]:
pd.DataFrame(
    # Chow results by variable
    m5.chow.regi,
    # Name of variables
    index=m5.name_x_r,
    # Column names
    columns=["Statistic", "P-value"],
)

Unnamed: 0,Statistic,P-value
CONSTANT,0.009707,0.921516
population,0.053139,0.817688
groundwater depth,7.58491,0.005886
elevation,2.195787,0.138389
precipitation,0.484275,0.486492
temperature,0.430966,0.511515
n deposition,0.016069,0.899129
organicmattercontent_1,1.574028,0.209623
density_1,2.607721,0.106345
acidity_1,3.589612,0.058142
