# In [None]:
# %%
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PowerTransformer
from sklearn.impute import KNNImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import tensorflow as tf
from keras import layers, models, regularizers, callbacks
from keras.api.optimizers import Adam
from sklearn.metrics import r2_score
import matplotlib.pyplot as plt


# %%
# Seed NumPy and TensorFlow so repeated runs draw the same random numbers
np.random.seed(42)
tf.random.set_seed(42)

# Read the dataset from the CSV file in the working directory
df = pd.read_csv("./lucas_pre.csv")

# Report the raw dimensions as (rows, columns)
print(f"Dataset shape: {df.shape}")

# Regression target; every other numeric column is treated as a candidate feature.
# NOTE: numeric_cols is recomputed further down the file once engineered columns exist,
# so the count printed below is the pre-engineering one.
target = "pH_H2O"
numeric_cols = [
    name for name in df.select_dtypes(include=[np.number]).columns.tolist() if name != target
]

# Candidate categorical features, narrowed to the ones actually present in the file
candidate_cat_cols = ["Depth", "LC", "LU", "USDA", "ISSS", "NUTS_0", "LC0_Desc", "LC1_Desc", "LU1_Desc"]
categorical_cols = [name for name in candidate_cat_cols if name in df.columns]

# Keep a categorical column only if it has fewer than 30 distinct values
# and less than 30% of its rows are missing
n_rows = len(df)
filtered_cat_cols = [
    name
    for name in categorical_cols
    if df[name].nunique() < 30 and df[name].isna().sum() / n_rows < 0.3
]

print(f"Using {len(numeric_cols)} numeric columns and {len(filtered_cat_cols)} categorical columns")

# Rows without a target value cannot be used for supervised training
df = df.dropna(subset=[target])

# Feature engineering: each derived column is added only when all of its source
# columns are present. The insertion order below fixes the final column order of df.
available = set(df.columns)


def _ratio(numerator, denominator):
    """Divide two columns after replacing exact zeros in the denominator with 0.001."""
    return numerator / denominator.replace(0, 0.001)


if {"EC", "OC"} <= available:
    df["EC_OC_ratio"] = _ratio(df["EC"], df["OC"])

if {"Clay", "Sand"} <= available:
    df["Clay_Sand_ratio"] = _ratio(df["Clay"], df["Sand"])

if {"N", "P", "K"} <= available:
    # Author's rationale: NPK balance is important for soil chemistry.
    # The sum is only created when each of the three columns has at least one non-null value.
    if all(df[nutrient].notnull().sum() > 0 for nutrient in ("N", "P", "K")):
        df["NPK_sum"] = df["N"] + df["P"] + df["K"]

if "CaCO3" in available:
    # Author's rationale: soil pH is strongly related to CaCO3; log1p(x) = log(1 + x)
    df["log_CaCO3"] = np.log1p(df["CaCO3"])

if {"OC", "N"} <= available:
    # Author's rationale: the C:N ratio is important for soil biology
    df["CN_ratio"] = _ratio(df["OC"], df["N"])

if {"Clay", "OC"} <= available:
    # Author's rationale: clay-organic matter interactions affect pH
    df["Clay_OC_interaction"] = df["Clay"] * df["OC"]

# Target vector, taken from the frame after rows with a missing target were dropped
y = df[target].values

# Re-derive the numeric feature list so columns added to df since the first pass are included
numeric_frame = df.select_dtypes(include=[np.number])
numeric_cols = [name for name in numeric_frame.columns if name != target]

# Numeric branch: KNN imputation (5 neighbours) -> Yeo-Johnson power transform -> standardisation.
# NOTE(review): the imputer sees unscaled features, so large-magnitude columns may dominate
# the neighbour distances — confirm this is intended.
# NOTE(review): PowerTransformer appears to standardise its output by default, which would make
# the trailing StandardScaler redundant — confirm against the installed scikit-learn version.
numeric_steps = [
    ("imputer", KNNImputer(n_neighbors=5)),
    ("power", PowerTransformer(method="yeo-johnson")),
    ("scaler", StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

# Categorical branch: dense one-hot encoding; categories not seen during fit are ignored
onehot_encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
categorical_transformer = Pipeline(steps=[("onehot", onehot_encoder)])

# Route each column group through its own branch
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_cols),
        ("cat", categorical_transformer, filtered_cat_cols),
    ]
)

# Feature frame: numeric columns first, then the retained categorical columns
feature_cols = numeric_cols + filtered_cat_cols
X = df[feature_cols]

# Hold out 20% for testing BEFORE fitting any transformer, so the preprocessing
# statistics are learned from the training rows only
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    shuffle=True,
)

# Fit on the training split, then apply the fitted transformers to the test split
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

print(f"Processed feature shape: {X_train_processed.shape}")


# %%
def create_cnn_model(
    input_dim,
    neurons_layers=(128, 64, 32),
    dropout_rates=(0.4, 0.3, 0.2),
    activation="relu",
    learning_rate=0.001,
    l2_reg=0.001,
    cnn_filters=32,
    kernel_size=3,
):
    """Build and compile a 1D-CNN + dense regression model for tabular input.

    Architecture, in order: the flat feature vector is reshaped to
    ``(input_dim, 1)``; two ``Conv1D`` layers (``cnn_filters`` then
    ``2 * cnn_filters`` filters, ``padding="same"``) each followed by batch
    normalisation; global average pooling; one Dense -> BatchNormalization ->
    Dropout stack per entry of ``neurons_layers``; and a single linear output
    unit.

    Args:
        input_dim: Number of features per sample.
        neurons_layers: Sequence with the unit count of each hidden dense layer.
        dropout_rates: Sequence of dropout rates, one per hidden dense layer.
            Must contain at least as many entries as ``neurons_layers``;
            surplus entries are ignored.
        activation: Activation used by the convolutional and hidden dense layers.
        learning_rate: Learning rate handed to the Adam optimizer.
        l2_reg: L2 penalty factor applied to the hidden dense kernels.
        cnn_filters: Filter count of the first convolution (the second uses twice this).
        kernel_size: Kernel width of both convolutions.

    Returns:
        A compiled Keras ``Sequential`` model using MSE loss and reporting MAE.

    Raises:
        ValueError: If ``dropout_rates`` has fewer entries than ``neurons_layers``.
    """
    # Materialise the sequences once; tuple defaults avoid shared mutable default arguments.
    neurons_layers = list(neurons_layers)
    dropout_rates = list(dropout_rates)

    # Fail fast with a clear message instead of an IndexError halfway through model construction.
    if len(dropout_rates) < len(neurons_layers):
        raise ValueError(
            f"dropout_rates has {len(dropout_rates)} entries but neurons_layers has "
            f"{len(neurons_layers)}; provide one dropout rate per dense layer"
        )

    model = models.Sequential()

    # Declare the input explicitly rather than passing `input_shape` to the first layer,
    # then turn each sample into a length-`input_dim` sequence with a single channel.
    model.add(layers.Input(shape=(input_dim,)))
    model.add(layers.Reshape((input_dim, 1)))

    # First 1D convolution over the feature axis
    model.add(layers.Conv1D(filters=cnn_filters, kernel_size=kernel_size, padding="same", activation=activation))
    model.add(layers.BatchNormalization())

    # Second convolution with twice as many filters
    model.add(layers.Conv1D(filters=cnn_filters * 2, kernel_size=kernel_size, padding="same", activation=activation))
    model.add(layers.BatchNormalization())

    # Collapse the sequence axis so the dense layers receive one vector per sample
    model.add(layers.GlobalAveragePooling1D())

    # Hidden dense stack; zip pairs each layer width with its dropout rate
    for neurons, dropout_rate in zip(neurons_layers, dropout_rates):
        model.add(
            layers.Dense(
                neurons,
                activation=activation,
                kernel_regularizer=regularizers.l2(l2_reg),
                kernel_initializer="he_normal",
            )
        )
        model.add(layers.BatchNormalization())
        model.add(layers.Dropout(dropout_rate))

    # Single linear unit for the regression output
    model.add(layers.Dense(1))

    model.compile(optimizer=Adam(learning_rate=learning_rate), loss="mse", metrics=["mae"])
    return model


# Hyperparameters for this run, gathered in one mapping
cnn_params = {
    "neurons_layers": [256, 128, 64, 32],
    "dropout_rates": [0.5, 0.4, 0.3, 0.2],
    "activation": "elu",
    "learning_rate": 0.0005,
    "l2_reg": 0.0005,
    "cnn_filters": 32,
    "kernel_size": 3,
}

# The network input width is the column count of the preprocessed training matrix
n_features = X_train_processed.shape[1]
model = create_cnn_model(input_dim=n_features, **cnn_params)

# Show the layer-by-layer structure
model.summary()

# Stop once validation loss has not improved for 20 epochs and restore the best weights
early_stopping = callbacks.EarlyStopping(
    monitor="val_loss",
    patience=20,
    restore_best_weights=True,
    verbose=1,
)

# Multiply the learning rate by 0.2 after 5 epochs without improvement, never below 1e-5
reduce_lr = callbacks.ReduceLROnPlateau(
    monitor="val_loss",
    factor=0.2,
    patience=5,
    min_lr=0.00001,
    verbose=1,
)

# Fit for at most 200 epochs, with 20% of the training arrays set aside for validation
fit_options = {
    "validation_split": 0.2,
    "epochs": 200,
    "batch_size": 32,
    "callbacks": [early_stopping, reduce_lr],
    "verbose": 1,
}
history = model.fit(X_train_processed, y_train, **fit_options)

# %%
# Score the held-out split; the model was compiled with one loss and one metric (MAE),
# so evaluate() yields exactly those two values in that order
test_scores = model.evaluate(X_test_processed, y_test)
loss, mae = test_scores
print(f"Test Mean Absolute Error: {mae:.4f}")

# Predict on the held-out split and compute the coefficient of determination
y_pred = model.predict(X_test_processed)
r2 = r2_score(y_test, y_pred)
print(f"R² Score: {r2:.4f}")

# %%
# Training curves: loss in the left panel, MAE in the right panel
fig, (loss_ax, mae_ax) = plt.subplots(1, 2, figsize=(15, 5))
panels = (
    (loss_ax, "loss", "val_loss", "Model Loss", "Loss"),
    (mae_ax, "mae", "val_mae", "Model MAE", "MAE"),
)
for ax, train_key, val_key, title, y_label in panels:
    ax.plot(history.history[train_key])
    ax.plot(history.history[val_key])
    ax.set_title(title)
    ax.set_ylabel(y_label)
    ax.set_xlabel("Epoch")
    ax.legend(["Train", "Validation"], loc="upper right")
plt.tight_layout()
plt.show()

# Predicted vs actual target values on the test split
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_test, y_pred, alpha=0.5)
# Dashed red identity line spanning the observed range of the actual values
ph_low, ph_high = min(y_test), max(y_test)
ax.plot([ph_low, ph_high], [ph_low, ph_high], "r--")
ax.set_xlabel("Actual pH")
ax.set_ylabel("Predicted pH")
ax.set_title(f"Actual vs Predicted pH (R² = {r2:.4f})")
ax.grid(True)
plt.tight_layout()
plt.show()
