# In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from keras.api.callbacks import EarlyStopping, ReduceLROnPlateau
from keras.api.layers import BatchNormalization, Dense, Dropout, Input
from keras.api.models import Sequential
from sklearn.compose import ColumnTransformer
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, PowerTransformer, StandardScaler
from xgboost import XGBRegressor

# Load the dataset from the local CSV file
data = pd.read_csv("./lucas_pre.csv")

# The regression target; rows where it is missing cannot be used for training.
target_column = "pH_H2O"

# Columns removed from the feature matrix: the target itself plus the columns
# the original author flagged as high-missingness.
columns_to_drop = [target_column, "Depth", "CaCO3 (20-30 cm)", "OC (20-30 cm)", "Ox_Al", "Ox_Fe"]

data_clean = data.dropna(subset=[target_column])
y = data_clean[target_column]
X = data_clean.drop(columns=columns_to_drop)

# Enhanced Feature Engineering
# Work on an independent copy so the new columns are not written into a frame
# derived from data_clean.
X = X.copy()

# Quintile-style binning: each listed column gets a "<col>_cat" companion whose
# bin edges are [min, 20%, 40%, 60%, 80%, max] of that column.
bin_labels = ["low", "moderately_low", "medium", "moderately_high", "high"]
quantile_levels = [0.2, 0.4, 0.6, 0.8]
for feature in ("CaCO3", "OC", "EC", "P", "K", "N"):
    values = X[feature]
    edges = [values.min()] + values.quantile(quantile_levels).tolist() + [values.max()]
    # Repeated edges are dropped by pd.cut (duplicates="drop"), which leaves
    # fewer bins; trim the label list so it matches the number of unique bins.
    n_bins = len(set(edges)) - 1
    X[f"{feature}_cat"] = pd.cut(
        values,
        bins=edges,
        labels=bin_labels[:n_bins],
        include_lowest=True,
        duplicates="drop",
    )

# Combined CaCO3/OC category, e.g. "low_high".
# NOTE(review): astype(str) turns a missing bin into the literal string "nan",
# so rows with a missing CaCO3 or OC bin get labels such as "nan_low" instead
# of a missing value -- confirm this is intended.
caco3_bin = X["CaCO3_cat"].astype(str)
oc_bin = X["OC_cat"].astype(str)
X["CaCO3_OC_interaction"] = caco3_bin + "_" + oc_bin

# Additional numerical features.
# Zero denominators are replaced by NaN first, so the ratio becomes NaN rather
# than infinite.
X["CaCO3_OC_ratio"] = X["CaCO3"] / X["OC"].replace(0, np.nan)
X["N_K_ratio"] = X["N"] / X["K"].replace(0, np.nan)
X["EC_CaCO3_interaction"] = X["EC"] * X["CaCO3"]

# log(1 + x) transforms; negative inputs are clipped to 0 before the log.
for feature in ("EC", "OC", "K", "P", "CaCO3"):
    non_negative = X[feature].clip(lower=0)
    X[f"log_{feature}"] = np.log1p(non_negative)


# Preprocessing
# Numeric columns: KNN imputation -> Yeo-Johnson power transform -> standardisation.
num_cols = X.select_dtypes(include=["float64", "int64"]).columns
# Categorical columns: the binned "*_cat" features created by pd.cut have pandas
# "category" dtype, so it must be selected alongside "object"; otherwise those
# columns match neither selector and ColumnTransformer's default
# remainder="drop" silently discards them.
cat_cols = X.select_dtypes(include=["object", "category"]).columns
preprocessor = ColumnTransformer(
    transformers=[
        (
            "num",
            Pipeline(
                [
                    ("imputer", KNNImputer(n_neighbors=5)),
                    ("power", PowerTransformer(method="yeo-johnson")),
                    ("scaler", StandardScaler()),
                ]
            ),
            num_cols,
        ),
        (
            "cat",
            Pipeline(
                [
                    ("imputer", SimpleImputer(strategy="most_frequent")),
                    # handle_unknown="ignore" lets categories that appear only in
                    # the test rows be encoded instead of raising at transform time.
                    ("encoder", OneHotEncoder(drop="first", sparse_output=False, handle_unknown="ignore")),
                ]
            ),
            cat_cols,
        ),
    ]
)

# Split data BEFORE fitting the preprocessor, so that imputation neighbours,
# power-transform parameters, scaling statistics and one-hot vocabularies are
# learned from the training rows only and nothing from the test set leaks in.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit on the training rows, then apply the fitted transformation to the test rows.
# Any NaN that survives the pipeline is replaced by 0.0.
X_train = np.nan_to_num(preprocessor.fit_transform(X_train_raw), nan=0.0)
X_test = np.nan_to_num(preprocessor.transform(X_test_raw), nan=0.0)

# Feed-forward network (FNN) regressor: 256 -> 128 -> 64 -> 32 -> 1
# Seed the Python, NumPy and backend random generators so that weight
# initialisation, dropout and batch shuffling are repeatable, in line with the
# random_state=42 used for the train/test split and the XGBoost baseline.
tf.keras.utils.set_random_seed(42)

model = Sequential(
    [
        # Explicit Input layer: passing input_shape= to the first Dense layer is
        # deprecated in Keras 3.
        Input(shape=(X_train.shape[1],)),
        Dense(256, activation="relu"),
        BatchNormalization(),
        Dropout(0.3),
        Dense(128, activation="relu"),
        BatchNormalization(),
        Dropout(0.2),
        Dense(64, activation="relu"),
        BatchNormalization(),
        Dropout(0.1),
        Dense(32, activation="relu"),
        Dense(1),  # single linear output unit: the predicted target value
    ]
)
# MSE is the optimised loss; MAE is tracked as an additional metric.
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001), loss="mse", metrics=["mae"])

# Callbacks (both watch the validation loss)
# Stop after 20 epochs without improvement and roll back to the best weights.
early_stopping = EarlyStopping(
    monitor="val_loss",
    patience=20,
    restore_best_weights=True,
)
# Halve the learning rate after 5 epochs without improvement, down to 1e-5.
reduce_lr = ReduceLROnPlateau(
    monitor="val_loss",
    factor=0.5,
    patience=5,
    min_lr=1e-5,
)

# Train, holding out 20% of the training data for validation.
# epochs=400 is an upper bound; early stopping may end training sooner.
fit_options = {
    "validation_split": 0.2,
    "epochs": 400,
    "batch_size": 32,
    "callbacks": [early_stopping, reduce_lr],
    "verbose": 1,
}
history = model.fit(X_train, y_train, **fit_options)

# Evaluate the FNN on the test set
loss, mae = model.evaluate(X_test, y_test, verbose=0)

# The network ends in Dense(1), so predict() returns one column per sample;
# flatten it to 1-D so it lines up element-wise with y_test.
y_pred = model.predict(X_test).ravel()

# NaN predictions indicate a numerical problem in the model. Report them
# instead of masking them silently, then fall back to the mean of the valid
# predictions so the remaining metrics can still be computed.
nan_count = int(np.isnan(y_pred).sum())
if nan_count:
    print(f"Warning: {nan_count} of {y_pred.size} FNN predictions are NaN; replacing them with the mean prediction.")
y_pred = np.nan_to_num(y_pred, nan=np.nanmean(y_pred))

r2 = r2_score(y_test, y_pred)
print(f"\nImproved FNN Test MAE: {mae:.3f}")
print(f"Improved FNN R² Score: {r2:.3f}")

# XGBoost baseline trained on the same features, reported with the same two
# metrics as the FNN so the models can be compared like for like.
xgb_model = XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
xgb_model.fit(X_train, y_train)
y_pred_xgb = xgb_model.predict(X_test)
mae_xgb = mean_absolute_error(y_test, y_pred_xgb)
r2_xgb = r2_score(y_test, y_pred_xgb)
print(f"XGBoost Test MAE: {mae_xgb:.3f}")
print(f"XGBoost R² Score: {r2_xgb:.3f}")

# Plot training history: one panel per tracked quantity, each showing the
# training curve and the validation curve over the epochs.
# Each entry: (history key, panel title, y-axis label, legend suffix)
history_panels = [
    ("loss", "Model Loss", "Loss (MSE)", "Loss"),
    ("mae", "Model MAE", "MAE", "MAE"),
]

plt.figure(figsize=(12, 4))
for panel_index, (metric_key, panel_title, y_axis_label, legend_suffix) in enumerate(history_panels, start=1):
    plt.subplot(1, len(history_panels), panel_index)
    plt.plot(history.history[metric_key], label=f"Training {legend_suffix}")
    plt.plot(history.history[f"val_{metric_key}"], label=f"Validation {legend_suffix}")
    plt.title(panel_title)
    plt.xlabel("Epoch")
    plt.ylabel(y_axis_label)
    plt.legend()
plt.tight_layout()
plt.show()

# Plot predictions: FNN predictions against the measured values.
# The dashed red diagonal spans the observed range of y_test and marks where
# predicted == actual.
actual_range = [y_test.min(), y_test.max()]

plt.figure(figsize=(6, 6))
plt.scatter(y_test, y_pred, alpha=0.5)
plt.plot(actual_range, actual_range, "r--", lw=2)
plt.title("Predicted vs Actual pH")
plt.xlabel("Actual pH")
plt.ylabel("Predicted pH")
plt.show()