In [464]:
import os
import re
from math import sqrt
import pandas as pd
import numpy as np
import geopandas as gpd
from shapely import wkt
from pysal.lib import weights
from pysal.explore import esda
import matplotlib.pyplot as plt
import seaborn as sns
import contextily
from spreg import OLS_Regimes
from pysal.model import spreg
import statsmodels.formula.api as smf
import contextily as ctx
from libpysal.weights import KNN
from spreg import OLS
from libpysal.weights import lag_spatial
from spreg import GM_Lag
from libpysal.weights import Kernel
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import RandomizedSearchCV, TimeSeriesSplit
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# Spacial Regression

--> Embed location through the model

## Load data

In [465]:
well_filter = 1

current_dir = os.getcwd()
path = os.path.join(current_dir, '../../data/aligned', f"merged_dataset_{well_filter}.csv")
df = pd.read_csv(path)
len(df)

378

In [466]:
# df = df[df["soil region"] != "veen"]
# len(df)

## Sort by date

In [467]:
# sort by date
df["date"] = pd.to_datetime(df["date"])
df = df.sort_values("date").reset_index(drop=True)

In [468]:
df['year_month'] = df['date'].dt.to_period('M')  # year + month (e.g., 2012-03)

# Count number of data points per year-month
counts = df['year_month'].value_counts().sort_index()
print(counts)

year_month
2008-08     6
2008-09    39
2009-06     9
2010-03    48
2010-08     7
2011-01     8
2011-05     6
2012-02     8
2012-03    25
2012-04     4
2012-06     8
2012-10     2
2013-10     9
2014-09     7
2015-06    40
2015-07     1
2015-08     1
2015-09     1
2016-09     7
2017-08     2
2017-11     1
2017-12     6
2018-05    43
2018-06     4
2018-08     7
2019-07     9
2020-05     2
2020-07     4
2020-08     1
2021-03    10
2021-04    13
2021-05    23
2022-07     3
2022-08     4
2023-07     8
2023-08     2
Freq: M, Name: count, dtype: int64


  df['year_month'] = df['date'].dt.to_period('M')  # year + month (e.g., 2012-03)


## Remove outlier based on "Soil region"

In [469]:
print(df["soil region"].value_counts())

soil region
zand    263
klei     82
veen     33
Name: count, dtype: int64


## Add month and year as features

In [470]:
# df["month"] = df["date"].dt.month
df["year"] = df["date"].dt.year

## Log transform

In [471]:
df["nitrate"] = np.log1p(df["nitrate"])

## Remove columns

In [472]:
df = df.drop(columns=["date", 'bro-id', 'lon', 'lat'])
print(df.columns)
len(df)

Index(['nitrate', 'geometry', 'soil region', 'landuse code', 'population',
       'groundwater depth', 'elevation', 'precipitation', 'temperature',
       'n deposition', 'mainsoilclassification_1', 'organicmattercontent_1',
       'density_1', 'acidity_1', 'year_month', 'year'],
      dtype='object')


378

## Change column type

In [473]:
# convert to categorical
predefined_categories = {
    "soil region": None,
    "landuse code": None,
    "mainsoilclassification_1": None 
}


for col, fixed_categories in predefined_categories.items():
    if col in df.columns:
        categories = sorted(df[col].dropna().unique().tolist())
        
        df[col] = pd.Categorical(df[col], categories=categories)

In [474]:
df.dtypes

nitrate                       float64
geometry                       object
soil region                  category
landuse code                 category
population                    float64
groundwater depth             float64
elevation                     float64
precipitation                 float64
temperature                   float64
n deposition                  float64
mainsoilclassification_1     category
organicmattercontent_1        float64
density_1                     float64
acidity_1                     float64
year_month                  period[M]
year                            int32
dtype: object

## Remove spatial outlier

In [475]:
# GMW000000024093

# target_well = ['GAR000000000307', 'GAR000000000308', 'GAR000000000309']
# df = df.drop(df[df["bro-id"].isin(target_well)].index)
# len(df)

## Preprocess (alternative)

In [476]:
def split_and_preprocess(df, cols_to_drop, holdout_cols = None):
    if holdout_cols is None:
        holdout_cols = []

    df = df.drop(columns=cols_to_drop)
    df = df.dropna()
    len(df)

    # n = len(df)
    # train_size = int(n * 0.8)

    # train_df = df.iloc[:train_size]
    # test_df = df.iloc[train_size:]

    # print(f"Train: {len(train_df)}")
    # print(f"Test: {len(test_df)}")

    train_years = list(range(2008, 2018))
    test_years = [2018]

    # Filter datasets
    train_df = df[df["year"].isin(train_years)].copy()
    test_df = df[df["year"].isin(test_years)].copy()

    # print(test_df)

    print(f"Train: {len(train_df)}")
    print(f"Test: {len(test_df)}")

    holdout_train = train_df[holdout_cols].copy()
    holdout_test = test_df[holdout_cols].copy()

    X_train_raw = train_df.drop(columns=["nitrate", "year"] + holdout_cols).copy()
    y_train = train_df["nitrate"].values.reshape(-1, 1)

    X_test_raw = test_df.drop(columns=["nitrate", "year"] + holdout_cols).copy()
    y_test = test_df["nitrate"].values.reshape(-1, 1)

    categorical_cols = X_train_raw.select_dtypes(include="category").columns.tolist()
    numerical_cols = X_train_raw.select_dtypes(include=["float64", "int64"]).columns.tolist()

    # construct a transformer
    transformers = []

    if categorical_cols:
        transformers.append(
            ("cat_ohe", OneHotEncoder(handle_unknown="ignore", drop='first', sparse_output=False), categorical_cols)
        )

    if numerical_cols:
        transformers.append(
            ("num_scaler", StandardScaler(), numerical_cols)
        )

    preprocessor = ColumnTransformer(transformers)

    X_train = preprocessor.fit_transform(X_train_raw)
    X_test = preprocessor.transform(X_test_raw)

    # extract feature names
    if categorical_cols:
        cat_ohe = preprocessor.named_transformers_["cat_ohe"]
        ohe_feature_names = cat_ohe.get_feature_names_out(categorical_cols).tolist()
    else:
        ohe_feature_names = []

    if numerical_cols:
        num_feature_names = numerical_cols.copy()
    else:
        numerical_cols = []
    
    feature_names = ohe_feature_names + num_feature_names
    print(feature_names)

    # convert back to pd Dataframe
    X_train = pd.DataFrame(X_train, columns=feature_names, index=X_train_raw.index)
    X_test = pd.DataFrame(X_test, columns=feature_names, index=X_test_raw.index)
    y_train = pd.DataFrame(y_train, columns=["nitrate"], index=X_train_raw.index)
    y_test = pd.DataFrame(y_test, columns=["nitrate"], index=X_test_raw.index)

    print(X_train.shape)

    X_train = pd.concat([X_train, holdout_train], axis=1)
    X_test = pd.concat([X_test, holdout_test], axis=1)

    return X_train, y_train, X_test, y_test, feature_names, preprocessor

## Simple OLS regression

In [477]:
cols_to_drop_1 = ['landuse code', 'soil region'] # 'soil region'
X_train, y_train, X_test, y_test, feature_names, preprocessor = split_and_preprocess(df, cols_to_drop_1)

Train: 156
Test: 39
['mainsoilclassification_1_Kalkloze zandgronden', 'mainsoilclassification_1_Moerige gronden', 'mainsoilclassification_1_Podzolgronden', 'mainsoilclassification_1_Rivierkleigronden', 'mainsoilclassification_1_Veengronden', 'mainsoilclassification_1_Zeekleigronden', 'population', 'groundwater depth', 'elevation', 'precipitation', 'temperature', 'n deposition', 'organicmattercontent_1', 'density_1', 'acidity_1']
(156, 15)


In [478]:
print(feature_names)

ols_model = spreg.OLS(y_train, X_train, name_y='nitrate', name_x=feature_names)
print(ols_model.summary)

['mainsoilclassification_1_Kalkloze zandgronden', 'mainsoilclassification_1_Moerige gronden', 'mainsoilclassification_1_Podzolgronden', 'mainsoilclassification_1_Rivierkleigronden', 'mainsoilclassification_1_Veengronden', 'mainsoilclassification_1_Zeekleigronden', 'population', 'groundwater depth', 'elevation', 'precipitation', 'temperature', 'n deposition', 'organicmattercontent_1', 'density_1', 'acidity_1']
REGRESSION RESULTS
------------------

SUMMARY OF OUTPUT: ORDINARY LEAST SQUARES
-----------------------------------------
Data set            :     unknown
Weights matrix      :        None
Dependent Variable  :     nitrate                Number of Observations:         156
Mean dependent var  :      0.8999                Number of Variables   :          16
S.D. dependent var  :      1.0164                Degrees of Freedom    :         140
R-squared           :      0.6186
Adjusted R-squared  :      0.5777
Sum squared residual:     61.0732                F-statistic           : 

In [479]:
beta = ols_model.betas.flatten()

X_test_with_const = np.hstack([np.ones((X_test.shape[0], 1)), X_test])

y_pred = X_test_with_const @ beta
y_pred = np.clip(y_pred, 0, None)

print("Test R2:", r2_score(np.expm1(y_test),  np.expm1(y_pred)))
print("Test MAE:",mean_absolute_error(np.expm1(y_test), np.expm1(y_pred)))
print("Test RMSE:", sqrt(mean_squared_error(np.expm1(y_test), np.expm1(y_pred))))

Test R2: 0.4271987960051514
Test MAE: 2.273544554611816
Test RMSE: 3.2340745525589156


In [480]:
np.expm1(y_pred)

array([ 0.58521247,  0.36200281,  0.2339999 ,  3.45395182,  1.10637231,
        4.02586107,  4.04928208,  6.51738287,  6.84537973,  3.51535306,
        6.01168985,  0.87311405,  0.90643021,  0.94893384,  2.07623007,
        7.51831649,  3.28108303,  0.        , 12.12566615, 19.65258877,
        7.17954127,  5.14136689,  1.13443061,  2.244572  ,  0.46272943,
        0.0995947 ,  2.69128947,  1.06576637,  0.        , 10.28108022,
       11.39129964, 25.75723906,  0.14882355,  0.        ,  0.16470173,
        2.6785966 ,  2.73948601,  1.2398464 ,  0.        ])

## Learning Curve

In [481]:
def learning_curve(X_train, y_train):
    print("Creating learning curve...")

    n = len(X_train)

    train_errors = []
    val_errors = []

    train_sizes = np.linspace(0.1, 1.0, 10)

    for frac in train_sizes:
        split_idx = int(n * frac)

        X_subset = X_train.iloc[:split_idx]
        y_subset = y_train.iloc[:split_idx]

        tscv = TimeSeriesSplit(n_splits=7)
        fold_train_scores = []
        fold_val_scores = []

        for train_idx, val_idx in tscv.split(X_subset):
            X_tr, X_val = X_subset.iloc[train_idx], X_subset.iloc[val_idx]
            y_tr, y_val = y_subset.iloc[train_idx], y_subset.iloc[val_idx]

            X_tr = preprocessor.fit_transform(X_tr)
            X_val = preprocessor.transform(X_val)

            y_tr = y_tr.values.reshape(-1, 1)
            y_val = y_val.values.reshape(-1, 1)
            
            # create fresh model
            curr_model = OLS(y_tr, X_tr, name_y='nitrate', name_x=feature_names)

            beta = curr_model.betas.flatten()
            X_train_with_const = np.hstack([np.ones((X_tr.shape[0], 1)), X_tr])
            X_val_with_const = np.hstack([np.ones((X_val.shape[0], 1)), X_val])

            y_tr_pred = X_train_with_const @ beta
            y_val_pred = X_val_with_const @ beta
            
            fold_train_scores.append(mean_absolute_error(np.expm1(y_tr), np.expm1(y_tr_pred)))
            fold_val_scores.append(mean_absolute_error(np.expm1(y_val), np.expm1(y_val_pred)))

        train_errors.append(np.mean(fold_train_scores))
        val_errors.append(np.mean(fold_val_scores))

    plt.plot(train_sizes * len(X_train), train_errors, label="Train MAE")
    plt.plot(train_sizes * len(X_train), val_errors, label="Validation MAE")
    plt.xlabel("Training Set Size")
    plt.ylabel("Train MAE")
    plt.title("learning curve")
    plt.legend()
    plt.grid(True)
    plt.show()

# def get_feature_names():
#     ohe = preprocessor.named_transformers_["cat_ohe"]
#     cat_features = ohe.get_feature_names_out()
#     return list(cat_features) + preprocessor.transformers_[1][2]  # numerical feature names

In [482]:
feature_names

['mainsoilclassification_1_Kalkloze zandgronden',
 'mainsoilclassification_1_Moerige gronden',
 'mainsoilclassification_1_Podzolgronden',
 'mainsoilclassification_1_Rivierkleigronden',
 'mainsoilclassification_1_Veengronden',
 'mainsoilclassification_1_Zeekleigronden',
 'population',
 'groundwater depth',
 'elevation',
 'precipitation',
 'temperature',
 'n deposition',
 'organicmattercontent_1',
 'density_1',
 'acidity_1']

In [483]:
# learning_curve(X_train, y_train)

## Spatial fixed effects

--> We influence constant term

--> Instead of assuming the dependent variable behaves uniformly over space, there are systematic effects following a geographical pattern

--> Constant term to vary geographically

--> **Other** elements of the regression are left untouched and hence apply **uniformly** across space

--> Each region gets its own intercept

--> Control for spatial heterogeneity

In [484]:
cols_to_drop_2 = ['landuse code']
holdout_cols = ["soil region"]

X_train, y_train, X_test, y_test, feature_names, preprocessor = split_and_preprocess(df, cols_to_drop_2, holdout_cols)

Train: 156
Test: 39
['mainsoilclassification_1_Kalkloze zandgronden', 'mainsoilclassification_1_Moerige gronden', 'mainsoilclassification_1_Podzolgronden', 'mainsoilclassification_1_Rivierkleigronden', 'mainsoilclassification_1_Veengronden', 'mainsoilclassification_1_Zeekleigronden', 'population', 'groundwater depth', 'elevation', 'precipitation', 'temperature', 'n deposition', 'organicmattercontent_1', 'density_1', 'acidity_1']
(156, 15)


In [485]:
train_df_for_formula = X_train.copy()
train_df_for_formula['nitrate'] = y_train

test_df_for_formula = X_test.copy()
test_df_for_formula['nitrate'] = y_test

In [486]:
variable_names = ['population',
'groundwater depth',
'elevation',
'precipitation',
'temperature',
'n deposition',
'organicmattercontent_1',
'density_1',
'acidity_1']


formula = "nitrate ~ " + " + ".join([f"Q('{col}')" for col in variable_names]) + " + C(Q('soil region')) - 1"

In [487]:
model_2 = smf.ols(formula, data=train_df_for_formula).fit()

In [488]:
print(model_2.summary())

                            OLS Regression Results                            
Dep. Variable:                nitrate   R-squared:                       0.460
Model:                            OLS   Adj. R-squared:                  0.419
Method:                 Least Squares   F-statistic:                     11.16
Date:                Sun, 15 Jun 2025   Prob (F-statistic):           8.95e-15
Time:                        13:43:11   Log-Likelihood:                -175.31
No. Observations:                 156   AIC:                             374.6
Df Residuals:                     144   BIC:                             411.2
Df Model:                          11                                         
Covariance Type:            nonrobust                                         
                                  coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------------
C(Q('soil region')

#### Wells on **zand** soils have significantly **higher nitrate** concentrations, even when controlling for depth, population, temperature

--> The log nitrate concentration is, on average, 0.7 units lower in klei (clay) regions

In [489]:
sfe_names = [i for i in model_2.params.index if "C(Q('soil region'))" in i]

soil_region_labels = []
for name in sfe_names:
    match = re.search(r"\[T\.(.*)\]", name)
    if match:
        soil_region_labels.append(match.group(1))
    else:
        soil_region_labels.append(name)

soil_fixed_effects = pd.DataFrame({
    "soil_region": soil_region_labels,
    "Coef.": model_2.params[sfe_names],
    "Std. Error": model_2.bse[sfe_names],
    "P-Value": model_2.pvalues[sfe_names],
})

soil_fixed_effects

Unnamed: 0,soil_region,Coef.,Std. Error,P-Value
C(Q('soil region'))[klei],C(Q('soil region'))[klei],0.707411,0.153742,9.130711e-06
C(Q('soil region'))[veen],C(Q('soil region'))[veen],1.047345,0.285668,0.0003453856
C(Q('soil region'))[zand],C(Q('soil region'))[zand],0.985805,0.108608,7.750353e-16


In [490]:
y_pred = model_2.predict(test_df_for_formula)

In [491]:
print("Test R2:", r2_score(np.expm1(y_test),  np.expm1(y_pred)))
print("Test MAE:",mean_absolute_error(y_test, y_pred))
print("Test RMSE:", sqrt(mean_squared_error(np.expm1(y_test), np.expm1(y_pred))))

Test R2: 0.12686596063764288
Test MAE: 0.648370108355545
Test RMSE: 3.9929008477469328


## Spatial Regimes

--> We modify set of coeficients depending on the location

--> Spatial regimes (SRs) is to generalize the spatial FE approach to allow not only the constant term to vary but also any other explanatory variable

--> We want the whole model — intercept and slopes — to vary between geographic groups

In [492]:
cols_to_drop_3 = ['landuse code']
holdout_cols = ['soil region']

X_train, y_train, X_test, y_test, feature_names, preprocessor = split_and_preprocess(df, cols_to_drop_3, holdout_cols)

Train: 156
Test: 39
['mainsoilclassification_1_Kalkloze zandgronden', 'mainsoilclassification_1_Moerige gronden', 'mainsoilclassification_1_Podzolgronden', 'mainsoilclassification_1_Rivierkleigronden', 'mainsoilclassification_1_Veengronden', 'mainsoilclassification_1_Zeekleigronden', 'population', 'groundwater depth', 'elevation', 'precipitation', 'temperature', 'n deposition', 'organicmattercontent_1', 'density_1', 'acidity_1']
(156, 15)


In [493]:
variable_names = ['population',
'groundwater depth',
'elevation',
'precipitation',
'temperature',
'n deposition',
'organicmattercontent_1',
'density_1',
'acidity_1']

In [494]:
m5 = spreg.OLS_Regimes(
    y_train.values,
    X_train[variable_names].values,
    X_train["soil region"].tolist(),
    constant_regi="many",
    regime_err_sep=False,
    name_y="nitrate",
    name_x=variable_names
)

In [496]:
print(m5.summary)

REGRESSION RESULTS
------------------

SUMMARY OF OUTPUT: ORDINARY LEAST SQUARES - REGIMES
---------------------------------------------------
Data set            :     unknown
Weights matrix      :        None
Dependent Variable  :     nitrate                Number of Observations:         156
Mean dependent var  :      0.8999                Number of Variables   :          30
S.D. dependent var  :      1.0164                Degrees of Freedom    :         126
R-squared           :      0.5595
Adjusted R-squared  :      0.4581
Sum squared residual:     70.5337                F-statistic           :      5.5043
Sigma-square        :       0.560                Prob(F-statistic)     :   6.789e-12
S.E. of regression  :       0.748                Log likelihood        :    -159.441
Sigma-square ML     :       0.452                Akaike info criterion :     378.881
S.E of regression ML:      0.6724                Schwarz criterion     :     470.377

----------------------------------------

In [497]:
res = pd.DataFrame({
    "Coeff.": m5.betas.flatten(),
    "Std. Error": m5.std_err.flatten(),
    "P-Value": [i[1] for i in m5.t_stat],
}, index=m5.name_x)

soil_types = df["soil region"].unique()

regime_tables = []
for soil in soil_types:
    matching_rows = [i for i in res.index if i.startswith(f"{soil}_")]
    regime_res = res.loc[matching_rows].rename(lambda i: i.replace(f"{soil}_", ""))
    regime_res.columns = pd.MultiIndex.from_product([[soil], regime_res.columns])
    regime_tables.append(regime_res)

comparison_table = pd.concat(regime_tables, axis=1)

comparison_table

Unnamed: 0_level_0,zand,zand,zand,veen,veen,veen,klei,klei,klei
Unnamed: 0_level_1,Coeff.,Std. Error,P-Value,Coeff.,Std. Error,P-Value,Coeff.,Std. Error,P-Value
CONSTANT,1.4974,0.231673,2.034788e-09,-0.125,9130383.0,1.0,1.252303,0.246222,1e-06
population,0.00387,0.066829,0.9539168,-2.167432,54.9401,0.968593,1.349905,1.058916,0.204726
groundwater depth,0.141354,0.107535,0.1910668,0.075103,1.644834,0.963653,0.067034,0.440063,0.879171
elevation,0.114902,0.083735,0.1724342,0.40625,9731274.0,1.0,1.052444,0.90385,0.246461
precipitation,-0.075172,0.086928,0.3888115,0.001397,0.665443,0.998328,-0.028737,0.103073,0.780852
temperature,-0.277304,0.08233,0.00100409,0.006181,0.3672071,0.986596,-0.056081,0.135252,0.67911
n deposition,-0.195119,0.09621,0.04466224,-0.027093,1.055101,0.979555,0.0357,0.280011,0.898752
organicmattercontent_1,-1.572345,0.54631,0.004700925,0.102539,552281.2,1.0,0.100864,0.585055,0.863399
density_1,-1.523511,0.680685,0.02696373,0.025391,818685.9,1.0,0.359111,0.293081,0.22275
acidity_1,-0.272258,0.088821,0.002661321,-0.015947,0.6545416,0.980601,-0.105485,0.212831,0.621022


CHOW test check if coefficients for each predictor is statistically different across the groups (klei vs zand)

In [498]:
# ‣ 1. Flatten the coefficient vector from the fitted model
beta_vec = m5.betas.flatten()  
#    shape = (R*(1 + len(variable_names)),)  

# ‣ 2. Recover the exact “regime order” that PySAL used internally.
#    m5.name_x is a list like ["clay_CONSTANT", "clay_population", …, "sand_CONSTANT", …, …].
regime_order = []
for nm in m5.name_x:
    soil = nm.split("_")[0]
    if soil not in regime_order:
        regime_order.append(soil)
#    Now regime_order = [ "clay", "sand", "peaty", … ] in the precise sequence PySAL stacked them.

R = len(regime_order)
K = len(variable_names)
n_test = X_test.shape[0]

# ‣ 3. Build the “extended” test‐matrix of size (n_test, R*(1+K)).
#    Column‐block layout: for regime i = 0..R−1,
#      column  i*(K+1)   = intercept_dummy_i  = (soil_region == regime_i ? 1 : 0)
#      columns i*(K+1)+1 ... i*(K+1)+K =  (soil_region == regime_i) *  X_test[var_k]
X_test_ext = np.zeros((n_test, R * (K + 1)))

for i, regime in enumerate(regime_order):
    # Boolean mask of shape (n_test,): 1 if this test‐row belongs to `regime`, else 0
    mask = (X_test["soil region"] == regime).astype(int).values

    # intercept dummy for regime i
    X_test_ext[:, i * (K + 1)] = mask

    # for each covariate k, multiply mask * X_test[var_k]
    for j, var in enumerate(variable_names):
        X_test_ext[:, i * (K + 1) + 1 + j] = mask * X_test[var].values

In [499]:
# ‣ 4. Form y_test as a 1D array of the log1p(nitrate) target
y_test_log = y_test["nitrate"].values.flatten()

# ‣ 5. Compute predicted log1p(nitrate) on the test set
y_pred_log = X_test_ext.dot(beta_vec)

# ‣ 6. Evaluate in log‐space (for example, RMSE in log1p scale)
mse_log = mean_squared_error(y_test_log, y_pred_log)
rmse_log = np.sqrt(mse_log)
print(f"Test RMSE (log1p‐nitrate): {rmse_log:.4f}")

# ‣ 7. If you also want an “original‐scale” error, back‐transform both sides:
y_test_orig = np.expm1(y_test_log)     # invert log1p
y_pred_orig = np.expm1(y_pred_log)

mse_orig = mean_squared_error(y_test_orig, y_pred_orig)
rmse_orig = np.sqrt(mse_orig)
print(f"Test RMSE (original‐nitrate): {rmse_orig:.4f}")

r2_log = r2_score(y_test_log, y_pred_log)
print(f"Test  R² (log1p‐nitrate): {r2_log:.4f}")

# 2) R² on the original‐nitrate scale
r2_orig = r2_score(y_test_orig, y_pred_orig)
print(f"Test  R² (original‐nitrate): {r2_orig:.4f}")

Test RMSE (log1p‐nitrate): 0.7186
Test RMSE (original‐nitrate): 3.2975
Test  R² (log1p‐nitrate): 0.3709
Test  R² (original‐nitrate): 0.4045


In [500]:
m5.chow.joint

(31.328117806090198, 0.051005100356342176)

In [501]:
pd.DataFrame(
    m5.chow.regi,
    index=m5.name_x_r,
    columns=["Statistic", "P-value"],
)

Unnamed: 0,Statistic,P-value
CONSTANT,0.525576,0.768905
population,1.610967,0.446872
groundwater depth,0.028333,0.985933
elevation,1.066788,0.586611
precipitation,0.125934,0.938974
temperature,2.309925,0.315069
n deposition,0.626158,0.731192
organicmattercontent_1,4.36934,0.112515
density_1,6.453163,0.039693
acidity_1,0.646182,0.723908
