In [20]:
import os
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

In [11]:
current_dir = os.getcwd()
path = os.path.join(current_dir, '../data/clean', "merged_dataset_1.csv")
df = pd.read_csv(path)

In [12]:
df = df[df["Year"] == 2018].copy()
len(df)

54

In [13]:
cols_to_drop = [
    'Well_ID', 'BRO-ID', 'geometry', 'Filter', 'Date', 'distance_m', 'Year',
    'Unnamed: 0'
]

df = df.drop(columns=cols_to_drop, errors="ignore")

In [14]:
categorical_cols = ["HGRnaam", "Landuse_Code"]

In [15]:
encoder = OrdinalEncoder()
df[['HGRnaam', 'Landuse_Code']] = encoder.fit_transform(df[['HGRnaam', 'Landuse_Code']])

In [22]:
X_reduced = df[['avg_depth_m', 'Population', 'HGRnaam', 'Landuse_Code',
       'avg_temp_mean', 'avg_precip_sum','lon', 'lat',
       'Elevation_m']]

y_log = np.log1p(df["Nitrate"].values.reshape(-1, 1))

In [23]:
target_scaler = StandardScaler()
y_scaled = target_scaler.fit_transform(y_log).ravel()

In [24]:
X_train, X_test, y_train, y_test = train_test_split(X_reduced, y_scaled, test_size=0.2, random_state=1)

In [25]:
# Define model
xgb = XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=6, random_state=42)

# Categorical preprocessing
categorical_pipeline = Pipeline([
    ('ordinal', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
])

preprocessor = ColumnTransformer([
    ('cat', categorical_pipeline, categorical_cols)
])

# Final pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', xgb)
])

# Fit
pipeline.fit(X_train, y_train)

# Predict
y_pred_scaled = pipeline.predict(X_test)

# Inverse transform and evaluate
y_pred_log = target_scaler.inverse_transform(y_pred_scaled.reshape(-1, 1))
y_test_log = target_scaler.inverse_transform(y_test.reshape(-1, 1))

y_pred_original = np.expm1(y_pred_log)
y_test_original = np.expm1(y_test_log)

print("MSE:", mean_squared_error(y_test_original, y_pred_original))
print("R2 Score:", r2_score(y_test_original, y_pred_original))


MSE: 9.565544281175095
R2 Score: 0.6522377959723851
