In [141]:
"""
GeoChemBoost-GCV Training Notebook
"""

'\nGeoChemBoost-GCV Training Notebook\n'

In [142]:
# ----------------------------------
# Step 0: Imports
# ----------------------------------
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import pickle  # For saving models and scaler
import os
import matplotlib.pyplot as plt # Added for plotting

In [143]:
# ----------------------------------
# Step 2: Data Loading & Initial Prep
# ----------------------------------
df = pd.read_excel("Dataset4.xlsx")
print(f"Successfully loaded data with shape: {df.shape}")
print("First 5 rows:\n", df.head())

Successfully loaded data with shape: (331, 6)
First 5 rows:
    Moisture    Ash     VM     FC   GCV  Serial
0      3.58  55.04  16.80  24.58  2928       9
1      3.77  40.87  21.28  34.08  4152       6
2      3.23  56.67  14.46  23.64  2826       9
3      3.90  44.15  20.59  31.36  3844       7
4      3.71  45.35  20.66  30.28  3718       7


In [144]:
# --- Column Names in your Excel file ---
# Make sure these match your Excel columns EXACTLY
moisture_col = 'Moisture' # Example: As Received Basis
ash_col = 'Ash'           # Example: As Received Basis
vm_col = 'VM'             # Example: As Received Basis
fc_col = 'FC'
gcv_col = 'GCV'           # Example: GCV in MJ/kg or kcal/kg (must be consistent)

In [145]:
# --- Identify Feature and Target Columns ---
feature_cols = [moisture_col, ash_col, vm_col, fc_col]
target_col = gcv_col

In [146]:
# # --- Calculate Fixed Carbon (FC) ---
# fc_col = 'FC_calculated'
# df[fc_col] = 100 - (df[moisture_col] + df[ash_col] + df[vm_col])
# # Ensure FC is not negative (can happen with measurement errors)
# df[fc_col] = df[fc_col].clip(lower=0)
# feature_cols.append(fc_col) # Add FC to our features
# print(f"\nCalculated '{fc_col}' and added to features.")
print("Columns used as features:", feature_cols)
print("Column used as target:", target_col)

Columns used as features: ['Moisture', 'Ash', 'VM', 'FC']
Column used as target: GCV


In [147]:
# --- Handle Missing Values (Example: Simple Imputation or Dropping) ---
df.dropna(inplace=True)

In [148]:
# --- Define X (features) and y (target) ---
X = df[feature_cols]
y = df[target_col]

In [149]:
print("\n--- Data Preparation Summary ---")
print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")


--- Data Preparation Summary ---
Features shape: (331, 4)
Target shape: (331,)


In [150]:
# --- Model & Training Parameters ---
TEST_SIZE = 0.20           # Percentage of data for testing
RANDOM_STATE = 42        # For reproducibility

In [151]:
# ----------------------------------
# Step 3: Data Splitting
# ----------------------------------
print("\n--- Splitting Data into Training and Testing Sets ---")
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE
)
print(f"Training set shape: X={X_train.shape}, y={y_train.shape}")
print(f"Testing set shape: X={X_test.shape}, y={y_test.shape}")


--- Splitting Data into Training and Testing Sets ---
Training set shape: X=(264, 4), y=(264,)
Testing set shape: X=(67, 4), y=(67,)


In [152]:
# ----------------------------------
# Step 4: Preprocessing (Scaling)
# ----------------------------------
print("\n--- Scaling Feature Data ---")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
print("Data successfully scaled.")


--- Scaling Feature Data ---
Data successfully scaled.


In [153]:
# ----------------------------------
# Step 5: Define the GFEN Model (Keras) - Revised Approach
# ----------------------------------
print("\n--- Defining GFEN Model (Keras) ---")


--- Defining GFEN Model (Keras) ---


In [154]:
# GFEN Parameters
embedding_dim = 8        # Dimension of the learned embedding vector
gfen_hidden_units = [32, 16] # Number of neurons in GFEN hidden layers
gfen_epochs = 50           # Number of training epochs for GFEN
gfen_batch_size = 16       # Batch size for GFEN training
gfen_learning_rate = 0.001
dropout_rate = 0.1         # Dropout rate for GFEN regularization

In [155]:
# Define the input layer *separately*
input_shape = (X_train_scaled.shape[1],)
inputs = keras.Input(shape=input_shape, name="Input_Layer")

In [156]:
# Build the GFEN layers starting from the defined input
x = inputs
for units in gfen_hidden_units:
    x = layers.Dense(units, activation="relu")(x)
    x = layers.Dropout(dropout_rate)(x)

In [157]:
# Embedding Layer
embedding_output = layers.Dense(embedding_dim, activation="linear", name="Embedding_Layer")(x)

In [158]:
# Output Layer (Predicting GCV for training purposes)
gcv_prediction_output = layers.Dense(1, activation="linear", name="Output_GCV_Prediction")(embedding_output)

In [159]:
# Create the *predictor* model (Input -> GCV Prediction)
gfen_predictor_model = keras.Model(inputs=inputs, outputs=gcv_prediction_output, name="GFEN_Predictor")

In [160]:
# Compile the predictor model
optimizer = tf.keras.optimizers.Adam(learning_rate=gfen_learning_rate)
gfen_predictor_model.compile(loss='mean_squared_error', optimizer=optimizer, metrics=['mae', 'mse'])

In [161]:
print("\nGFEN Predictor Model Summary:")
gfen_predictor_model.summary()


GFEN Predictor Model Summary:


In [162]:
# ----------------------------------
# Step 6: Train the GFEN Predictor Model
# ----------------------------------
print("\n--- Training GFEN Predictor Model ---")
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)


--- Training GFEN Predictor Model ---


In [163]:
history = gfen_predictor_model.fit(
    X_train_scaled, y_train,
    epochs=gfen_epochs,
    batch_size=gfen_batch_size,
    validation_split=0.15, # Use part of training data for validation
    callbacks=[early_stopping],
    verbose=1 # Set to 0 for less output, 1 for progress bar
)

Epoch 1/50
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 24ms/step - loss: 21858030.0000 - mae: 4561.0854 - mse: 21858030.0000 - val_loss: 20934044.0000 - val_mae: 4485.1084 - val_mse: 20934044.0000
Epoch 2/50
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - loss: 21740394.0000 - mae: 4550.2642 - mse: 21740394.0000 - val_loss: 20928166.0000 - val_mae: 4484.4561 - val_mse: 20928166.0000
Epoch 3/50
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 22017578.0000 - mae: 4592.0854 - mse: 22017578.0000 - val_loss: 20919850.0000 - val_mae: 4483.5430 - val_mse: 20919850.0000
Epoch 4/50
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - loss: 22086186.0000 - mae: 4579.0190 - mse: 22086186.0000 - val_loss: 20907370.0000 - val_mae: 4482.1763 - val_mse: 20907370.0000
Epoch 5/50
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 21694720.0000 - mae: 4542.0981 - mse: 21694

In [164]:
print("GFEN Predictor training finished.")

GFEN Predictor training finished.


In [165]:
# ----------------------------------
# Step 7: Create the Embedder Model & Extract Embeddings
# ----------------------------------
print("\n--- Creating GFEN Embedder Model & Extracting Embeddings ---")


--- Creating GFEN Embedder Model & Extracting Embeddings ---


In [166]:
# Create the *embedder* model (Input -> Embedding Layer Output)
# We reuse the 'inputs' tensor and the 'embedding_output' tensor defined earlier
gfen_embedder_model = keras.Model(inputs=inputs, outputs=embedding_output, name="GFEN_Embedder")

In [167]:
print("\nGFEN Embedder Model Summary:")
gfen_embedder_model.summary() # Show the embedder model structure


GFEN Embedder Model Summary:


In [168]:
# Extract Embeddings using the embedder model
X_train_embeddings = gfen_embedder_model.predict(X_train_scaled)
X_test_embeddings = gfen_embedder_model.predict(X_test_scaled)

[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step 
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step


In [169]:
print(f"Shape of training embeddings: {X_train_embeddings.shape}")
print(f"Shape of testing embeddings: {X_test_embeddings.shape}")

Shape of training embeddings: (264, 8)
Shape of testing embeddings: (67, 8)


In [170]:
# ----------------------------------
# Step 8: Prepare Data for GBM
# ----------------------------------
print("\n--- Preparing Combined Features for XGBoost ---")
# Concatenate original scaled features and the learned embeddings
X_train_combined = np.concatenate([X_train_scaled, X_train_embeddings], axis=1)
X_test_combined = np.concatenate([X_test_scaled, X_test_embeddings], axis=1)


--- Preparing Combined Features for XGBoost ---


In [171]:
print(f"Shape of combined training features: {X_train_combined.shape}")
print(f"Shape of combined testing features: {X_test_combined.shape}")

Shape of combined training features: (264, 12)
Shape of combined testing features: (67, 12)


In [172]:
# XGBoost Parameters (These often require tuning)
xgb_n_estimators = 200
xgb_learning_rate = 0.1
xgb_max_depth = 5
xgb_subsample = 0.8
xgb_colsample_bytree = 0.8

In [173]:
from xgboost.callback import EarlyStopping

In [174]:
# ----------------------------------
# Step 9: Define and Train the GBM Model (XGBoost)
# ----------------------------------
print("\n--- Defining and Training Final XGBoost Model ---")
xgb_model = xgb.XGBRegressor(
    objective='reg:squarederror', # Objective function for regression
    n_estimators=xgb_n_estimators,
    learning_rate=xgb_learning_rate,
    max_depth=xgb_max_depth,
    subsample=xgb_subsample,
    colsample_bytree=xgb_colsample_bytree,
    random_state=RANDOM_STATE,
    n_jobs=-1 # Use all available CPU cores
)


--- Defining and Training Final XGBoost Model ---


In [175]:
# Create the EarlyStopping callback instance
# Note: The parameter inside the callback is 'rounds'
early_stopping_callback = EarlyStopping(rounds=15, # Use 'rounds' here
                                       save_best=True) # Optionally save the best model internally

print("Training XGBoost model with Early Stopping Callback...") # Added print statement


Training XGBoost model with Early Stopping Callback...


In [176]:
# ----------------------------------
# Step 9: Define and Train the GBM Model (XGBoost)
# ----------------------------------
print("\n--- Defining and Training Final XGBoost Model (using constructor for early stopping) ---")

# Pass early_stopping_rounds during initialization
xgb_model = xgb.XGBRegressor(
    objective='reg:squarederror',      # Objective function for regression
    n_estimators=xgb_n_estimators,
    learning_rate=xgb_learning_rate,
    max_depth=xgb_max_depth,
    subsample=xgb_subsample,
    colsample_bytree=xgb_colsample_bytree,
    random_state=RANDOM_STATE,
    n_jobs=-1,                         # Use all available CPU cores
    early_stopping_rounds=15           # <<< Pass early stopping here
)

print("Training XGBoost model...")

# Train the XGBoost model. eval_set is still needed for early stopping to function.
xgb_model.fit(X_train_combined, y_train,
              eval_set=[(X_test_combined, y_test)], # Evaluate on test set during training
              verbose=False)                         # Set to True to see training progress

# With early_stopping_rounds in the constructor, the model automatically uses
# the eval_set provided in fit() to perform early stopping and retains the best model.
# You can still access the best iteration if needed:
# print(f"XGBoost Best Iteration: {xgb_model.best_iteration}")


print("XGBoost model training finished.")


--- Defining and Training Final XGBoost Model (using constructor for early stopping) ---
Training XGBoost model...
XGBoost model training finished.


In [177]:
print("XGBoost model training finished.")

XGBoost model training finished.


In [178]:
# ----------------------------------
# Step 10: Make Predictions
# ----------------------------------
print("\n--- Making Predictions on Test Set ---")
y_pred = xgb_model.predict(X_test_combined)


--- Making Predictions on Test Set ---


In [179]:
# ----------------------------------
# Step 11: Evaluate the Model
# ----------------------------------
print("\n--- Evaluating GeoChemBoost-GCV Model Performance ---")
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)


--- Evaluating GeoChemBoost-GCV Model Performance ---


In [180]:
print(f"Evaluation Metrics on Test Set:")
print(f"  RMSE: {rmse:.4f}")
print(f"  MAE:  {mae:.4f}")
print(f"  R²:   {r2:.4f}")

Evaluation Metrics on Test Set:
  RMSE: 131.6472
  MAE:  91.0155
  R²:   0.9837


In [181]:
print("\n--- GeoChemBoost-GCV Task Completed ---")


--- GeoChemBoost-GCV Task Completed ---


In [182]:
# Save Keras embedding model in native format
gfen_embedder_model.save('gfen_embedder_model4.keras')  # recommended

# Save XGBoost model
import joblib
joblib.dump(xgb_model, 'xgb_model4.pkl')

# Save scaler
joblib.dump(scaler, 'scaler4.pkl')


['scaler4.pkl']