In [1]:
import os
import re
from math import sqrt
import pandas as pd
import numpy as np
import geopandas as gpd
from shapely import wkt
from pysal.lib import weights
from pysal.explore import esda
import matplotlib.pyplot as plt
import seaborn as sns
import contextily
from spreg import OLS_Regimes
from pysal.model import spreg
import statsmodels.formula.api as smf
import contextily as ctx
from libpysal.weights import KNN
from spreg import OLS
from libpysal.weights import lag_spatial
from spreg import GM_Lag
from libpysal.weights import Kernel
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import RandomizedSearchCV, TimeSeriesSplit
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error



# Spacial Regression

--> Embed location through the model

## Load data

In [2]:
well_filter = 1

current_dir = os.getcwd()
path = os.path.join(current_dir, '../../data/aligned', f"merged_dataset_{well_filter}.csv")
df = pd.read_csv(path)
len(df)

378

In [3]:
# df = df[df["soil region"] != "veen"]
# len(df)

## Sort by date

In [4]:
# sort by date
df["date"] = pd.to_datetime(df["date"])
df = df.sort_values("date").reset_index(drop=True)

In [5]:
df['year_month'] = df['date'].dt.to_period('M')  # year + month (e.g., 2012-03)

# Count number of data points per year-month
counts = df['year_month'].value_counts().sort_index()
print(counts)

year_month
2008-08     6
2008-09    39
2009-06     9
2010-03    48
2010-08     7
2011-01     8
2011-05     6
2012-02     8
2012-03    25
2012-04     4
2012-06     8
2012-10     2
2013-10     9
2014-09     7
2015-06    40
2015-07     1
2015-08     1
2015-09     1
2016-09     7
2017-08     2
2017-11     1
2017-12     6
2018-05    43
2018-06     4
2018-08     7
2019-07     9
2020-05     2
2020-07     4
2020-08     1
2021-03    10
2021-04    13
2021-05    23
2022-07     3
2022-08     4
2023-07     8
2023-08     2
Freq: M, Name: count, dtype: int64


  df['year_month'] = df['date'].dt.to_period('M')  # year + month (e.g., 2012-03)


## Remove outlier based on "Soil region"

In [6]:
print(df["soil region"].value_counts())

soil region
zand    263
klei     82
veen     33
Name: count, dtype: int64


## Add month and year as features

In [7]:
# df["month"] = df["date"].dt.month
df["year"] = df["date"].dt.year

## Log transform

In [8]:
df["nitrate"] = np.log1p(df["nitrate"])

## Remove columns

In [9]:
df = df.drop(columns=["date", 'bro-id', 'lon', 'lat'])
print(df.columns)
len(df)

Index(['nitrate', 'geometry', 'soil region', 'landuse code', 'population',
       'groundwater depth', 'elevation', 'precipitation', 'temperature',
       'n deposition', 'mainsoilclassification_1', 'organicmattercontent_1',
       'density_1', 'acidity_1', 'year_month', 'year'],
      dtype='object')


378

## Change column type

In [10]:
# convert to categorical
predefined_categories = {
    "soil region": None,
    "landuse code": None,
    "mainsoilclassification_1": None 
}


for col, fixed_categories in predefined_categories.items():
    if col in df.columns:
        categories = sorted(df[col].dropna().unique().tolist())
        
        df[col] = pd.Categorical(df[col], categories=categories)

In [11]:
df.dtypes

nitrate                       float64
geometry                       object
soil region                  category
landuse code                 category
population                    float64
groundwater depth             float64
elevation                     float64
precipitation                 float64
temperature                   float64
n deposition                  float64
mainsoilclassification_1     category
organicmattercontent_1        float64
density_1                     float64
acidity_1                     float64
year_month                  period[M]
year                            int32
dtype: object

## Remove spatial outlier

In [12]:
# GMW000000024093

# target_well = ['GAR000000000307', 'GAR000000000308', 'GAR000000000309']
# df = df.drop(df[df["bro-id"].isin(target_well)].index)
# len(df)

## Preprocess (alternative)

In [13]:
def split_and_preprocess(df, cols_to_drop, holdout_cols = None):
    if holdout_cols is None:
        holdout_cols = []

    df = df.drop(columns=cols_to_drop)
    df = df.dropna()
    len(df)

    n = len(df)
    train_size = int(n * 0.8)

    train_df = df.iloc[:train_size]
    test_df = df.iloc[train_size:]

    print(f"Train: {len(train_df)}")
    print(f"Test: {len(test_df)}")

    holdout_train = train_df[holdout_cols].copy()
    holdout_test = test_df[holdout_cols].copy()

    X_train_raw = train_df.drop(columns=["nitrate", "year"] + holdout_cols).copy()
    y_train = train_df["nitrate"].values.reshape(-1, 1)

    X_test_raw = test_df.drop(columns=["nitrate", "year"] + holdout_cols).copy()
    y_test = test_df["nitrate"].values.reshape(-1, 1)

    categorical_cols = X_train_raw.select_dtypes(include="category").columns.tolist()
    numerical_cols = X_train_raw.select_dtypes(include=["float64", "int64"]).columns.tolist()

    # construct a transformer
    transformers = []

    if categorical_cols:
        transformers.append(
            ("cat_ohe", OneHotEncoder(handle_unknown="ignore", drop='first', sparse_output=False), categorical_cols)
        )

    if numerical_cols:
        transformers.append(
            ("num_scaler", StandardScaler(), numerical_cols)
        )

    preprocessor = ColumnTransformer(transformers)

    print(X_train_raw)
    X_train = preprocessor.fit_transform(X_train_raw)
    X_test = preprocessor.transform(X_test_raw)

    # extract feature names
    if categorical_cols:
        cat_ohe = preprocessor.named_transformers_["cat_ohe"]
        ohe_feature_names = cat_ohe.get_feature_names_out(categorical_cols).tolist()
    else:
        ohe_feature_names = []

    if numerical_cols:
        num_feature_names = numerical_cols.copy()
    else:
        numerical_cols = []
    
    feature_names = ohe_feature_names + num_feature_names
    print(feature_names)

    # convert back to pd Dataframe
    X_train = pd.DataFrame(X_train, columns=feature_names, index=X_train_raw.index)
    X_test = pd.DataFrame(X_test, columns=feature_names, index=X_test_raw.index)
    y_train = pd.DataFrame(y_train, columns=["nitrate"], index=X_train_raw.index)
    y_test = pd.DataFrame(y_test, columns=["nitrate"], index=X_test_raw.index)

    print(X_train.shape)

    X_train = pd.concat([X_train, holdout_train], axis=1)
    X_test = pd.concat([X_test, holdout_test], axis=1)

    return X_train, y_train, X_test, y_test, feature_names, preprocessor

## Simple OLS regression

In [14]:
cols_to_drop_1 = ['landuse code'] # 'soil region'
X_train, y_train, X_test, y_test, feature_names, preprocessor = split_and_preprocess(df, cols_to_drop_1)

Train: 191
Test: 48
                                          geometry soil region  population  \
3     POINT (4014966.1068907287 3225589.623440071)        zand         0.0   
5      POINT (3997707.6233134544 3237167.74386636)        zand         0.0   
9     POINT (4005247.963671945 3227908.7268077363)        zand         5.0   
10   POINT (4003895.6499366937 3218837.4487597076)        klei        25.0   
11    POINT (4010447.023226969 3224591.8496082774)        zand        10.0   
..                                             ...         ...         ...   
283      POINT (3999421.92552147 3217878.07804419)        klei        20.0   
285    POINT (4001917.418034808 3229791.966176053)        zand         0.0   
286   POINT (4008905.0068351263 3221211.301439656)        zand         0.0   
287    POINT (4015139.3574900557 3215672.75246142)        zand        55.0   
289    POINT (3998784.8558375193 3218938.01763749)        klei        10.0   

     groundwater depth  elevation  precipit

In [15]:
print(feature_names)

ols_model = spreg.OLS(y_train, X_train, name_y='nitrate', name_x=feature_names)
print(ols_model.summary)

['soil region_veen', 'soil region_zand', 'mainsoilclassification_1_Kalkloze zandgronden', 'mainsoilclassification_1_Moerige gronden', 'mainsoilclassification_1_Podzolgronden', 'mainsoilclassification_1_Rivierkleigronden', 'mainsoilclassification_1_Veengronden', 'mainsoilclassification_1_Zeekleigronden', 'population', 'groundwater depth', 'elevation', 'precipitation', 'temperature', 'n deposition', 'organicmattercontent_1', 'density_1', 'acidity_1']
REGRESSION RESULTS
------------------

SUMMARY OF OUTPUT: ORDINARY LEAST SQUARES
-----------------------------------------
Data set            :     unknown
Weights matrix      :        None
Dependent Variable  :     nitrate                Number of Observations:         191
Mean dependent var  :      0.8972                Number of Variables   :          18
S.D. dependent var  :      0.9966                Degrees of Freedom    :         173
R-squared           :      0.6393
Adjusted R-squared  :      0.6039
Sum squared residual:     68.0566

In [16]:
beta = ols_model.betas.flatten()

X_test_with_const = np.hstack([np.ones((X_test.shape[0], 1)), X_test])

y_pred = X_test_with_const @ beta

print("Test R2:", r2_score(np.expm1(y_test),  np.expm1(y_pred)))
print("Test MAE:",mean_absolute_error(np.expm1(y_test), np.expm1(y_pred)))
print("Test RMSE:", sqrt(mean_squared_error(np.expm1(y_test), np.expm1(y_pred))))

Test R2: 0.6974415521579832
Test MAE: 1.362882327601916
Test RMSE: 2.0396793611984263


## Learning Curve

In [17]:
def learning_curve(X_train, y_train):
    print("Creating learning curve...")

    n = len(X_train)

    train_errors = []
    val_errors = []

    train_sizes = np.linspace(0.1, 1.0, 10)

    for frac in train_sizes:
        split_idx = int(n * frac)

        X_subset = X_train.iloc[:split_idx]
        y_subset = y_train.iloc[:split_idx]

        tscv = TimeSeriesSplit(n_splits=7)
        fold_train_scores = []
        fold_val_scores = []

        for train_idx, val_idx in tscv.split(X_subset):
            X_tr, X_val = X_subset.iloc[train_idx], X_subset.iloc[val_idx]
            y_tr, y_val = y_subset.iloc[train_idx], y_subset.iloc[val_idx]

            X_tr = preprocessor.fit_transform(X_tr)
            X_val = preprocessor.transform(X_val)

            y_tr = y_tr.values.reshape(-1, 1)
            y_val = y_val.values.reshape(-1, 1)
            
            # create fresh model
            curr_model = OLS(y_tr, X_tr, name_y='nitrate', name_x=feature_names)

            beta = curr_model.betas.flatten()
            X_train_with_const = np.hstack([np.ones((X_tr.shape[0], 1)), X_tr])
            X_val_with_const = np.hstack([np.ones((X_val.shape[0], 1)), X_val])

            y_tr_pred = X_train_with_const @ beta
            y_val_pred = X_val_with_const @ beta
            
            fold_train_scores.append(mean_absolute_error(np.expm1(y_tr), np.expm1(y_tr_pred)))
            fold_val_scores.append(mean_absolute_error(np.expm1(y_val), np.expm1(y_val_pred)))

        train_errors.append(np.mean(fold_train_scores))
        val_errors.append(np.mean(fold_val_scores))

    plt.plot(train_sizes * len(X_train), train_errors, label="Train MAE")
    plt.plot(train_sizes * len(X_train), val_errors, label="Validation MAE")
    plt.xlabel("Training Set Size")
    plt.ylabel("Train MAE")
    plt.title("learning curve")
    plt.legend()
    plt.grid(True)
    plt.show()

# def get_feature_names():
#     ohe = preprocessor.named_transformers_["cat_ohe"]
#     cat_features = ohe.get_feature_names_out()
#     return list(cat_features) + preprocessor.transformers_[1][2]  # numerical feature names

In [18]:
feature_names

['soil region_veen',
 'soil region_zand',
 'mainsoilclassification_1_Kalkloze zandgronden',
 'mainsoilclassification_1_Moerige gronden',
 'mainsoilclassification_1_Podzolgronden',
 'mainsoilclassification_1_Rivierkleigronden',
 'mainsoilclassification_1_Veengronden',
 'mainsoilclassification_1_Zeekleigronden',
 'population',
 'groundwater depth',
 'elevation',
 'precipitation',
 'temperature',
 'n deposition',
 'organicmattercontent_1',
 'density_1',
 'acidity_1']

In [19]:
learning_curve(X_train, y_train)

Creating learning curve...


ValueError: A given column is not a column of the dataframe

## Spatial fixed effects

--> We influence constant term

--> Instead of assuming the dependent variable behaves uniformly over space, there are systematic effects following a geographical pattern

--> Constant term to vary geographically

--> **Other** elements of the regression are left untouched and hence apply **uniformly** across space

--> Each region gets its own intercept

--> Control for spatial heterogeneity

In [22]:
cols_to_drop_2 = ['landuse code']
holdout_cols = ["soil region"]

X_train, y_train, X_test, y_test, feature_names, preprocessor = split_and_preprocess(df, cols_to_drop_2, holdout_cols)

Train: 152
Test: 66
                             geometry  population  groundwater depth  \
2     POINT (5.535157514 52.05581825)         0.0           5.154708   
5    POINT (5.273031977 52.149856205)         0.0           2.917014   
6    POINT (5.295321071 52.033349247)        20.0           2.267003   
8    POINT (5.391578485 52.071179671)         5.0           2.641944   
9    POINT (5.380287236 51.989073002)        25.0           3.075021   
..                                ...         ...                ...   
227  POINT (5.206717551 52.113087483)         0.0           2.137031   
229  POINT (5.299047893 52.146637386)        40.0           3.058125   
230  POINT (5.273031977 52.149856205)         0.0           3.058125   
231  POINT (5.189137388 52.107199019)      1345.0           0.886458   
232  POINT (5.209847732 52.100510748)        45.0           0.883819   

     elevation  precipitation  temperature  n deposition  \
2     7.000000          34.50   174.733333     25.75887

In [23]:
train_df_for_formula = X_train.copy()
train_df_for_formula['nitrate'] = y_train

test_df_for_formula = X_test.copy()
test_df_for_formula['nitrate'] = y_test

In [24]:
variable_names = ['population',
'groundwater depth',
'elevation',
'precipitation',
'temperature',
'n deposition',
'organicmattercontent_1',
'density_1',
'acidity_1']


formula = "nitrate ~ " + " + ".join([f"Q('{col}')" for col in variable_names]) + " + C(Q('soil region')) - 1"

In [25]:
model_2 = smf.ols(formula, data=train_df_for_formula).fit()

In [26]:
print(model_2.summary())

                            OLS Regression Results                            
Dep. Variable:                nitrate   R-squared:                       0.517
Model:                            OLS   Adj. R-squared:                  0.483
Method:                 Least Squares   F-statistic:                     15.12
Date:                Sat, 14 Jun 2025   Prob (F-statistic):           4.41e-18
Time:                        16:13:31   Log-Likelihood:                -162.34
No. Observations:                 152   AIC:                             346.7
Df Residuals:                     141   BIC:                             379.9
Df Model:                          10                                         
Covariance Type:            nonrobust                                         
                                  coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------------
C(Q('soil region')

#### Wells on **zand** soils have significantly **higher nitrate** concentrations, even when controlling for depth, population, temperature

--> The log nitrate concentration is, on average, 0.7 units lower in klei (clay) regions

In [27]:
# Step 1: Extract fixed effect names
sfe_names = [i for i in model_2.params.index if "C(Q('soil region'))" in i]

# Step 2: Extract soil region names using regex
soil_region_labels = []
for name in sfe_names:
    match = re.search(r"\[T\.(.*)\]", name)
    if match:
        soil_region_labels.append(match.group(1))
    else:
        # If no match, fallback to raw name (for safety)
        soil_region_labels.append(name)

# Step 3: Create DataFrame
soil_fixed_effects = pd.DataFrame({
    "soil_region": soil_region_labels,
    "Coef.": model_2.params[sfe_names],
    "Std. Error": model_2.bse[sfe_names],
    "P-Value": model_2.pvalues[sfe_names],
})

soil_fixed_effects

Unnamed: 0,soil_region,Coef.,Std. Error,P-Value
C(Q('soil region'))[klei],C(Q('soil region'))[klei],0.977049,0.163435,1.759688e-08
C(Q('soil region'))[zand],C(Q('soil region'))[zand],0.999472,0.098932,2.176484e-18


In [28]:
y_pred = model_2.predict(test_df_for_formula)

In [29]:
print("Test R2:", r2_score(np.expm1(y_test),  np.expm1(y_pred)))
print("Test MAE:",mean_absolute_error(y_test, y_pred))
print("Test RMSE:", sqrt(mean_squared_error(np.expm1(y_test), np.expm1(y_pred))))

Test R2: 0.1130297918502795
Test MAE: 0.6593966216265692
Test RMSE: 4.049174734288938


## Spatial Regimes

--> We modify set of coeficients depending on the location

--> Spatial regimes (SRs) is to generalize the spatial FE approach to allow not only the constant term to vary but also any other explanatory variable

--> We want the whole model — intercept and slopes — to vary between geographic groups

In [31]:
cols_to_drop_3 = ['landuse code']
holdout_cols = ['soil region']

X_train, y_train, X_test, y_test, feature_names, preprocessor = split_and_preprocess(df, cols_to_drop_3, holdout_cols)

Train: 152
Test: 66
                             geometry  population  groundwater depth  \
2     POINT (5.535157514 52.05581825)         0.0           5.154708   
5    POINT (5.273031977 52.149856205)         0.0           2.917014   
6    POINT (5.295321071 52.033349247)        20.0           2.267003   
8    POINT (5.391578485 52.071179671)         5.0           2.641944   
9    POINT (5.380287236 51.989073002)        25.0           3.075021   
..                                ...         ...                ...   
227  POINT (5.206717551 52.113087483)         0.0           2.137031   
229  POINT (5.299047893 52.146637386)        40.0           3.058125   
230  POINT (5.273031977 52.149856205)         0.0           3.058125   
231  POINT (5.189137388 52.107199019)      1345.0           0.886458   
232  POINT (5.209847732 52.100510748)        45.0           0.883819   

     elevation  precipitation  temperature  n deposition  \
2     7.000000          34.50   174.733333     25.75887

In [32]:
variable_names = ['population',
'groundwater depth',
'elevation',
'precipitation',
'temperature',
'n deposition',
'organicmattercontent_1',
'density_1',
'acidity_1']

In [33]:
m5 = spreg.OLS_Regimes(
    # Dependent variable
    y_train.values,
    # Independent variables
    X_train[variable_names].values,
    # Variable specifying neighborhood membership
    X_train["soil region"].tolist(),
    # Allow the constant term to vary by group/regime
    constant_regi="many",
    # Allow separate sigma coefficients to be estimated
    # by regime (False so a single sigma)
    regime_err_sep=False,
    # Dependent variable name
    name_y="nitrate",
    # Independent variables names
    name_x=variable_names
)

In [34]:
m5.name_x

['klei_CONSTANT',
 'klei_population',
 'klei_groundwater depth',
 'klei_elevation',
 'klei_precipitation',
 'klei_temperature',
 'klei_n deposition',
 'klei_organicmattercontent_1',
 'klei_density_1',
 'klei_acidity_1',
 'zand_CONSTANT',
 'zand_population',
 'zand_groundwater depth',
 'zand_elevation',
 'zand_precipitation',
 'zand_temperature',
 'zand_n deposition',
 'zand_organicmattercontent_1',
 'zand_density_1',
 'zand_acidity_1']

In [35]:
print(m5.summary)

REGRESSION RESULTS
------------------

SUMMARY OF OUTPUT: ORDINARY LEAST SQUARES - REGIMES
---------------------------------------------------
Data set            :     unknown
Weights matrix      :        None
Dependent Variable  :     nitrate                Number of Observations:         152
Mean dependent var  :      0.9918                Number of Variables   :          20
S.D. dependent var  :      1.0168                Degrees of Freedom    :         132
R-squared           :      0.5888
Adjusted R-squared  :      0.5296
Sum squared residual:     64.1928                F-statistic           :      9.9487
Sigma-square        :       0.486                Prob(F-statistic)     :   1.798e-17
S.E. of regression  :       0.697                Log likelihood        :    -150.168
Sigma-square ML     :       0.422                Akaike info criterion :     340.335
S.E of regression ML:      0.6499                Schwarz criterion     :     400.813

----------------------------------------

In [36]:
res = pd.DataFrame({
    "Coeff.": m5.betas.flatten(),
    "Std. Error": m5.std_err.flatten(),
    "P-Value": [i[1] for i in m5.t_stat],
}, index=m5.name_x)

soil_types = df["soil region"].unique()

regime_tables = []
for soil in soil_types:
    matching_rows = [i for i in res.index if i.startswith(f"{soil}_")]
    regime_res = res.loc[matching_rows].rename(lambda i: i.replace(f"{soil}_", ""))
    regime_res.columns = pd.MultiIndex.from_product([[soil], regime_res.columns])
    regime_tables.append(regime_res)

comparison_table = pd.concat(regime_tables, axis=1)

comparison_table

Unnamed: 0_level_0,zand,zand,zand,klei,klei,klei
Unnamed: 0_level_1,Coeff.,Std. Error,P-Value,Coeff.,Std. Error,P-Value
CONSTANT,1.508297,0.191237,1.032915e-12,1.436699,0.257529,1.3171e-07
population,0.06448,0.066915,0.3369987,1.648361,1.106526,0.1386958
groundwater depth,0.212985,0.09355,0.02441413,0.152842,0.385764,0.692593
elevation,0.016693,0.079069,0.8331182,0.550849,0.933706,0.5562263
precipitation,-0.099966,0.083628,0.2340899,-0.037143,0.089374,0.6783845
temperature,-0.191505,0.073051,0.009781179,-0.035071,0.107202,0.7440732
n deposition,-0.225525,0.081406,0.006407193,0.023343,0.224644,0.917398
organicmattercontent_1,-1.644705,0.424474,0.0001673836,-0.049233,0.492355,0.9205007
density_1,-1.76325,0.530636,0.001152961,0.247361,0.257286,0.3380959
acidity_1,-0.582573,0.123355,5.866189e-06,-0.243682,0.204772,0.2361766


CHOW test check if coefficients for each predictor is statistically different across the groups (klei vs zand)

In [37]:
# ‣ 1. Flatten the coefficient vector from the fitted model
beta_vec = m5.betas.flatten()  
#    shape = (R*(1 + len(variable_names)),)  

# ‣ 2. Recover the exact “regime order” that PySAL used internally.
#    m5.name_x is a list like ["clay_CONSTANT", "clay_population", …, "sand_CONSTANT", …, …].
regime_order = []
for nm in m5.name_x:
    soil = nm.split("_")[0]
    if soil not in regime_order:
        regime_order.append(soil)
#    Now regime_order = [ "clay", "sand", "peaty", … ] in the precise sequence PySAL stacked them.

R = len(regime_order)
K = len(variable_names)
n_test = X_test.shape[0]

# ‣ 3. Build the “extended” test‐matrix of size (n_test, R*(1+K)).
#    Column‐block layout: for regime i = 0..R−1,
#      column  i*(K+1)   = intercept_dummy_i  = (soil_region == regime_i ? 1 : 0)
#      columns i*(K+1)+1 ... i*(K+1)+K =  (soil_region == regime_i) *  X_test[var_k]
X_test_ext = np.zeros((n_test, R * (K + 1)))

for i, regime in enumerate(regime_order):
    # Boolean mask of shape (n_test,): 1 if this test‐row belongs to `regime`, else 0
    mask = (X_test["soil region"] == regime).astype(int).values

    # intercept dummy for regime i
    X_test_ext[:, i * (K + 1)] = mask

    # for each covariate k, multiply mask * X_test[var_k]
    for j, var in enumerate(variable_names):
        X_test_ext[:, i * (K + 1) + 1 + j] = mask * X_test[var].values

In [38]:
# ‣ 4. Form y_test as a 1D array of the log1p(nitrate) target
y_test_log = y_test["nitrate"].values.flatten()

# ‣ 5. Compute predicted log1p(nitrate) on the test set
y_pred_log = X_test_ext.dot(beta_vec)

# ‣ 6. Evaluate in log‐space (for example, RMSE in log1p scale)
mse_log = mean_squared_error(y_test_log, y_pred_log)
rmse_log = np.sqrt(mse_log)
print(f"Test RMSE (log1p‐nitrate): {rmse_log:.4f}")

# ‣ 7. If you also want an “original‐scale” error, back‐transform both sides:
y_test_orig = np.expm1(y_test_log)     # invert log1p
y_pred_orig = np.expm1(y_pred_log)

mse_orig = mean_squared_error(y_test_orig, y_pred_orig)
rmse_orig = np.sqrt(mse_orig)
print(f"Test RMSE (original‐nitrate): {rmse_orig:.4f}")

r2_log = r2_score(y_test_log, y_pred_log)
print(f"Test  R² (log1p‐nitrate): {r2_log:.4f}")

# 2) R² on the original‐nitrate scale
r2_orig = r2_score(y_test_orig, y_pred_orig)
print(f"Test  R² (original‐nitrate): {r2_orig:.4f}")

Test RMSE (log1p‐nitrate): 0.6729
Test RMSE (original‐nitrate): 3.3080
Test  R² (log1p‐nitrate): 0.3924
Test  R² (original‐nitrate): 0.4080


In [39]:
m5.chow.joint

(22.941621365924235, 0.010964126795725757)

In [40]:
pd.DataFrame(
    # Chow results by variable
    m5.chow.regi,
    # Name of variables
    index=m5.name_x_r,
    # Column names
    columns=["Statistic", "P-value"],
)

Unnamed: 0,Statistic,P-value
CONSTANT,0.04982,0.823376
population,2.041441,0.153065
groundwater depth,0.022957,0.879569
elevation,0.324947,0.56865
precipitation,0.26344,0.607767
temperature,1.454171,0.22786
n deposition,1.084834,0.297619
organicmattercontent_1,6.023624,0.014116
density_1,11.624191,0.000651
acidity_1,2.009625,0.156304
