# Importing Packages

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scienceplots


import sklearn
from sklearn.model_selection import *
from sklearn.metrics import *
from sklearn.preprocessing import *
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from scipy import stats

import tensorflow as tf
import keras_tuner

import optuna

import missingno
import warnings
import gc

# Plotting Configuration

In [2]:
# Reset matplotlib to its built-in defaults before applying project settings.
plt.rcdefaults()

# Project-wide rcParams overrides, grouped by what they control.
mpl_global_config = {
    # figure geometry and on-screen resolution
    "figure.figsize": (7, 7),
    "figure.dpi": 1000,
    # text sizes
    "font.size": 16,
    "axes.labelsize": 14,
    "axes.titlesize": 14,
    "xtick.labelsize": 12,
    "ytick.labelsize": 12,
    "legend.fontsize": 8,
    # line, marker and grid weights
    "lines.linewidth": 2,
    "lines.markersize": 3,
    "grid.linewidth": 0.75,
    # export settings used by plt.savefig
    "savefig.dpi": 1000,
    "savefig.transparent": False,
    "savefig.bbox": "tight",
    "pdf.compression": 9,
    # draw grid lines and ticks underneath the plotted data
    "axes.axisbelow": True,
}
plt.rcParams.update(mpl_global_config)

# NOTE(review): the style sheets are applied after the overrides above, so any
# rcParam they also define presumably wins over the value set here - confirm
# this ordering is intended.
plt.style.use(["science", "nature", "high-contrast", "no-latex"])


# Named hex colours referenced by the evaluation plots.
colors = dict(
    yellow="#DDAA33",
    red="#BB5566",
    blue="#004488",
    black="#000000",
    white="#FFFFFF",
)

# Global Configuration Settings Controlling Randomness, Trials, etc.

In [3]:
# --- library behaviour ---
# scikit-learn transformers return pandas objects instead of NumPy arrays.
sklearn.set_config(transform_output="pandas")
# Do not report floating-point underflow.
np.seterr(under="ignore")
# Silence every Python warning for the remainder of the session.
warnings.filterwarnings("ignore")

# --- reproducibility ---
SEED = 42
tf.keras.utils.set_random_seed(SEED)

# --- hyperparameter-search budget ---
n_trials = 50

# Read the Preprocessed Data

In [4]:
# Load the preprocessed dataset; the "Date" column becomes the index and is
# parsed as dates with the day written first.
df = pd.read_csv(
    "df.csv",
    index_col="Date",
    parse_dates=True,
    dayfirst=True,
)

In [None]:
# Show the column labels of the loaded dataset.
df.columns

In [None]:
# Show the (rows, columns) size of the loaded dataset.
df.shape

In [None]:
# Show pandas' summary statistics for the loaded dataset.
df.describe()

# Separating X and y

In [None]:
# Column the model is trained to predict.
TARGET_COLUMN = "Total Biogas Flowrate (m3/d)"

# Select features and target WITHOUT mutating `df`. The previous `df.pop(...)`
# removed the target column in place, so running this cell a second time failed
# with a KeyError in `df.drop(...)` and left `df` without its target.
X = df.drop(columns=[TARGET_COLUMN])
y = df[TARGET_COLUMN]
print(f"dataframe shape: {df.shape}\n"
      f"features shape: {X.shape}\n"
      f"target shape: {y.shape}")

# Train and Test Split and Normalization

In [None]:
# Separate the training samples from the test samples (70 % train, rest test).
# NOTE(review): train_test_split shuffles rows by default; the data is indexed
# by "Date", so confirm that a random (non-chronological) split is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.70,
    random_state=SEED
)
# Report the resulting shapes once (the identical second print was removed).
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

In [10]:
# Scaler choice: StandardScaler suits normally distributed features; MinMaxScaler
# suits features that are not normally distributed, or cases where every feature
# must lie within a specific range.
scaler = MinMaxScaler()

# Fit on the training split only, then apply the same mapping to both splits.
scaler.fit(X_train)
feature_scaling = scaler  # fit() returns the scaler itself; keep both names bound
X_train_scaled, X_test_scaled = (
    feature_scaling.transform(split) for split in (X_train, X_test)
)

In [11]:

# The target gets its own MinMaxScaler, fitted on the training labels only.
# reshape(-1, 1) turns the 1-D label values into a single-column 2-D array.
mms_label = MinMaxScaler()
mms_label.fit(y_train.values.reshape(-1, 1))
y_train_scaled = mms_label.transform(y_train.values.reshape(-1, 1))
y_test_scaled = mms_label.transform(y_test.values.reshape(-1, 1))


# Preparing Data for Modeling

In [None]:
# Number of leading rows of the scaled training split used to fit the network;
# every remaining row forms the validation set.
N_ANN_TRAIN_ROWS = 600

X_train_ann, X_valid_ann = X_train_scaled[:N_ANN_TRAIN_ROWS], X_train_scaled[N_ANN_TRAIN_ROWS:]
y_train_ann, y_valid_ann = y_train_scaled[:N_ANN_TRAIN_ROWS], y_train_scaled[N_ANN_TRAIN_ROWS:]

# Fail early instead of training against an empty validation set, which is what
# the fixed cut-off produces when the training split has N_ANN_TRAIN_ROWS rows or fewer.
assert len(X_valid_ann) > 0, (
    f"validation slice is empty: training split has {len(X_train_scaled)} rows, "
    f"cut-off is {N_ANN_TRAIN_ROWS}"
)
assert len(X_train_ann) == len(y_train_ann) and len(X_valid_ann) == len(y_valid_ann), (
    "features and labels are misaligned after slicing"
)

X_train_ann.shape, X_valid_ann.shape, y_train_ann.shape, y_valid_ann.shape

In [13]:
def _as_batched_dataset(features, labels):
    """Build a tf.data pipeline from (features, labels): slice, batch by 32, prefetch."""
    pairs = tf.data.Dataset.from_tensor_slices((features, labels))
    return pairs.batch(32).prefetch(tf.data.AUTOTUNE)


# Identical pipelines for the fitting rows and the held-out validation rows.
train_dataset = _as_batched_dataset(X_train_ann, y_train_ann)
valid_dataset = _as_batched_dataset(X_valid_ann, y_valid_ann)

# Optuna Optimization

In [14]:
# Objective Function for Optuna
def objective_ann(trial):
    """Train one candidate network and return its best validation MSE.

    The trial samples the depth, width, activation, learning rate, Huber delta
    and L1/L2 penalties; the network is fitted on ``train_dataset`` with early
    stopping on ``val_mse`` computed from ``valid_dataset``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyperparameters.

    Returns
    -------
    float
        The lowest ``val_mse`` recorded during training (to be minimised).
    """
    # Drop Keras state left behind by earlier trials so that models from
    # previous trials do not accumulate over the course of the study.
    tf.keras.backend.clear_session()

    # Define hyperparameters
    params = {
        "number_of_layers": trial.suggest_int("number_of_layers", 2, 10, step=1),
        "activation": trial.suggest_categorical("activation", ["relu", "swish"]),
        "units": trial.suggest_int("units", 10, 100, step=10),
        "lr": trial.suggest_float("lr", 0.0001, 0.01, log=True),
        "huber_delta": trial.suggest_float("huber_delta", 0.1, 2.0),
        "l1": trial.suggest_float("l1", 0.0001, 0.001, log=True),
        "l2": trial.suggest_float("l2", 0.0001, 0.01, log=True),
    }

    # Build the model: input -> Gaussian noise -> N identical regularised Dense
    # layers -> single linear output unit.
    model = tf.keras.Sequential()
    model.add(tf.keras.layers.InputLayer(input_shape=(X_train_ann.shape[1],)))
    model.add(tf.keras.layers.GaussianNoise(stddev=0.01))
    for _ in range(params["number_of_layers"]):
        model.add(
            tf.keras.layers.Dense(
                units=params["units"],
                activation=params["activation"],
                kernel_initializer="he_uniform",
                kernel_regularizer=tf.keras.regularizers.l1_l2(
                    l1=params["l1"], l2=params["l2"]
                ),
            )
        )
    model.add(tf.keras.layers.Dense(1))

    # Compile the model
    optimizer = tf.keras.optimizers.Adam(learning_rate=params["lr"])
    loss = tf.keras.losses.Huber(delta=params["huber_delta"])
    model.compile(loss=loss, optimizer=optimizer, metrics=["mse"])

    # Early stopping callback
    es = tf.keras.callbacks.EarlyStopping(
        monitor="val_mse",
        patience=50,
        verbose=1,
        restore_best_weights=True,
    )

    # Train the model
    history = model.fit(
        train_dataset,
        validation_data=valid_dataset,
        epochs=1000,
        callbacks=[es],
        verbose=0,
    )

    # Score the trial by its best epoch. Early stopping restores the weights of
    # the best epoch, whereas the last logged epoch lies `patience` epochs past
    # it whenever early stopping fires, so `[-1]` under-rated every such trial.
    val_mse = min(history.history["val_mse"])
    return val_mse


In [None]:
# Optuna Study
# The sampler seed and the trial budget come from the global configuration cell
# (SEED, n_trials) instead of repeating the literals 42 and 50 here.
# NOTE(review): objective_ann does not report intermediate values to the trial,
# so the HyperbandPruner presumably never prunes anything - confirm.
study_ann = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=SEED),
    pruner=optuna.pruners.HyperbandPruner(),
)

study_ann.optimize(objective_ann, n_trials=n_trials)

# Fitting the Model with Best HPs

In [17]:
# Hyperparameter values of the best trial found by the Optuna study.
best_hps = study_ann.best_params

In [None]:
# Display the hyperparameters currently bound to `best_hps`.
best_hps

In [15]:
# Hard-coded hyperparameter set. This assignment replaces whatever `best_hps`
# was read from `study_ann.best_params`, so the final model is built from these
# literals regardless of the study's outcome.
# NOTE(review): presumably pasted from an earlier study run - confirm it is
# still the intended configuration whenever the study is re-run.
best_hps = {'number_of_layers': 4,
            'activation': 'relu',
            'units': 30,
            'lr': 0.0008911042438126581,
            'huber_delta': 0.8104462665441812,
            'l1': 0.00011362602193572822,
            'l2': 0.0001468622055305611}

In [16]:
# Final Model Using Best Hyperparameters
# Architecture: input -> Gaussian noise -> `number_of_layers` identical
# regularised Dense layers -> single linear output unit.

model = tf.keras.Sequential()
model.add(tf.keras.layers.InputLayer(input_shape=(X_train_ann.shape[1],)))
model.add(tf.keras.layers.GaussianNoise(stddev=0.01))
for _ in range(best_hps["number_of_layers"]):
    model.add(
        tf.keras.layers.Dense(
            units=best_hps["units"],
            activation=best_hps["activation"],
            kernel_initializer="he_uniform",
            kernel_regularizer=tf.keras.regularizers.l1_l2(
                l1=best_hps["l1"], l2=best_hps["l2"]
            ),
        )
    )
model.add(tf.keras.layers.Dense(1))

# Compile the model
# NOTE(review): the tuning objective optimises with Adam while this cell uses
# AdamW - confirm the tuned learning rate is meant to carry over.
optimizer = tf.keras.optimizers.AdamW(learning_rate=best_hps["lr"])
loss = tf.keras.losses.Huber(delta=best_hps["huber_delta"])
model.compile(loss=loss, optimizer=optimizer, metrics=["mae", "mse"])

# Callbacks
# Both callbacks watch the validation MSE, as the tuning objective does.
# Monitoring the training "mse" ignored the validation data passed to fit(),
# so training neither stopped nor lowered the learning rate when the model
# started to overfit.
es = tf.keras.callbacks.EarlyStopping(
    monitor="val_mse",
    patience=100,
    verbose=1,
    restore_best_weights=True,
)

# Halve the learning rate after 20 epochs without validation improvement.
lr_schedule = tf.keras.callbacks.ReduceLROnPlateau(
    monitor="val_mse",
    patience=20,
    verbose=1,
    factor=0.5,
)


In [None]:
# Train the final model on the fitting rows, validating on the held-out rows
# after every epoch; the two callbacks control stopping and learning-rate decay.
fit_options = dict(
    validation_data=valid_dataset,
    epochs=1000,
    callbacks=[es, lr_schedule],
    verbose=1,
)
history = model.fit(train_dataset, **fit_options)


# Model Evaluation

In [None]:
# Predict on the scaled test features and display R² in scaled units.
test_pred_scaled = model.predict(X_test_scaled)
r2_score(y_true=y_test_scaled, y_pred=test_pred_scaled)

In [None]:
# Predict on the full scaled training split and display R² in scaled units.
train_pred_scaled = model.predict(X_train_scaled)
r2_score(y_true=y_train_scaled, y_pred=train_pred_scaled)

In [None]:
def _regression_scores(y_true, y_pred):
    """Return r2, mae, mse, rmse and mape for one set of predictions as a dict."""
    mse = mean_squared_error(y_true, y_pred)
    return {
        "r2": r2_score(y_true, y_pred),
        "mae": mean_absolute_error(y_true, y_pred),
        "mse": mse,
        "rmse": mse ** 0.5,  # derived from the MSE above instead of recomputing it
        "mape": mean_absolute_percentage_error(y_true, y_pred),
    }


_METRIC_ORDER = ("r2", "mae", "mse", "rmse", "mape")

scores_train_scaled = _regression_scores(y_train_scaled, train_pred_scaled)
scores_test_scaled = _regression_scores(y_test_scaled, test_pred_scaled)

# Keep the individual names bound; the plotting cells read the R² values.
(r2_train_scaled, mae_train_scaled, mse_train_scaled,
 rmse_train_scaled, mape_train_scaled) = (scores_train_scaled[k] for k in _METRIC_ORDER)
(r2_test_scaled, mae_test_scaled, mse_test_scaled,
 rmse_test_scaled, mape_test_scaled) = (scores_test_scaled[k] for k in _METRIC_ORDER)

# NOTE(review): the target was min-max scaled, so the scaled training labels
# contain an exact 0; MAPE divides by |y_true| and is therefore unreliable in
# scaled units - prefer the MAPE computed on the original units.

# Build the table in one step instead of growing it cell by cell.
results = pd.DataFrame(
    [scores_train_scaled, scores_test_scaled],
    index=["train", "test"],
    columns=list(_METRIC_ORDER),
)

results.round(3).to_csv("model_performance_scaled")
results.round(3)

In [None]:
# Actual-vs-predicted scatter for the training split, in scaled units.
PredictionErrorDisplay.from_predictions(
    y_train_scaled,
    train_pred_scaled.squeeze(),
    kind="actual_vs_predicted",
    scatter_kwargs={
        "color": colors["blue"],
        "alpha": 0.8,
        # three decimals are enough for a legend entry
        "label": f"R-Squared: {round(r2_train_scaled, 3)}",
    },
    line_kwargs={"alpha": 1},
)
plt.legend()
# Train-specific file name: this figure used to be written to
# "Scaled Test Evaluation.svg", which the test plot then overwrote.
plt.savefig("Scaled Train Evaluation.svg")

In [None]:
# Actual-vs-predicted scatter for the test split, in scaled units.
PredictionErrorDisplay.from_predictions(
    y_test_scaled,
    test_pred_scaled.squeeze(),
    kind="actual_vs_predicted",
    scatter_kwargs={
        "color": colors["blue"],
        "alpha": 0.8,
        # built-in round() works for plain floats and NumPy floats alike,
        # unlike the `.round(3)` method call that had been commented out
        "label": f"R-Squared: {round(r2_test_scaled, 3)}",
    },
    line_kwargs={"alpha": 1},
)
plt.legend()
plt.savefig("Scaled Test Evaluation.svg")

In [None]:
# Collect the per-epoch training log in a DataFrame and list the recorded series.
history_df = pd.DataFrame(history.history)
print(history_df.columns)

In [None]:
# Learning-rate trajectory of the final fit, on a logarithmic y-axis.
_fit_log = pd.DataFrame(history.history)
# The series is logged as "learning_rate"; fall back to "lr", the key that
# older tf.keras releases are understood to have used.
_lr_key = "learning_rate" if "learning_rate" in _fit_log.columns else "lr"

# Draw on a dedicated figure so the curve cannot land on a leftover axes.
fig, ax = plt.subplots()
_fit_log[_lr_key].plot(ax=ax)
ax.set_yscale("log")
ax.set_ylabel("learning_rate")
ax.set_xlabel("Epoch")
fig.savefig("learning_rate_decaying_process.svg")

In [None]:
# Training MSE per epoch of the final fit, on a linear y-axis.
# Draw on a dedicated figure so the curve cannot land on a leftover axes.
fig, ax = plt.subplots()
pd.DataFrame(history.history)["mse"].plot(ax=ax)
ax.set_yscale("linear")
ax.set_ylabel("Mean Squared Error")
ax.set_xlabel("Epoch")
fig.savefig("training_process.svg")

In [None]:
# Map the scaled predictions back to the original target units with the label
# scaler, then print R² against the unscaled targets.

# --- training split ---
train_pred_scaled_1 = model.predict(X_train_scaled)
train_pred_org_1 = mms_label.inverse_transform(train_pred_scaled_1)
r2_train_org = r2_score(y_true=y_train, y_pred=train_pred_org_1)
print(r2_train_org)

# --- test split ---
test_pred_scaled_1 = model.predict(X_test_scaled)
test_pred_org_1 = mms_label.inverse_transform(test_pred_scaled_1)
r2_test_org = r2_score(y_true=y_test, y_pred=test_pred_org_1)
print(r2_test_org)

In [None]:
# Metrics in original target units for both splits, computed once each.
_org_metric_fns = {
    "r2": r2_score,
    "mae": mean_absolute_error,
    "mse": mean_squared_error,
    "mape": mean_absolute_percentage_error,
}
_org_splits = {
    "train": (y_train, train_pred_org_1),
    "test": (y_test, test_pred_org_1),
}

_org_scores = {}
for _split, (_actual, _predicted) in _org_splits.items():
    _row = {name: fn(_actual, _predicted) for name, fn in _org_metric_fns.items()}
    _row["rmse"] = _row["mse"] ** 0.5  # derived from the MSE instead of recomputing it
    _org_scores[_split] = _row

# Keep the individual names bound; the plotting cells read the MAE and R² values.
r2_train_org, r2_test_org = _org_scores["train"]["r2"], _org_scores["test"]["r2"]
mae_train_org, mae_test_org = _org_scores["train"]["mae"], _org_scores["test"]["mae"]
mse_train_org, mse_test_org = _org_scores["train"]["mse"], _org_scores["test"]["mse"]
rmse_train_org, rmse_test_org = _org_scores["train"]["rmse"], _org_scores["test"]["rmse"]
mape_train_org, mape_test_org = _org_scores["train"]["mape"], _org_scores["test"]["mape"]

# Build the table in one step (rows: train/test) instead of growing it cell by cell.
results = pd.DataFrame(
    [_org_scores["train"], _org_scores["test"]],
    index=["train", "test"],
    columns=["r2", "mae", "mse", "rmse", "mape"],
)

results.round(3).to_csv("model_performance")
results.round(3)

In [28]:
# Re-apply the same style-sheet list that the plotting-configuration cell uses.
plt.style.use(['science', 'nature', 'high-contrast', "no-latex"])


In [None]:

def _plot_actual_vs_predicted_mae(y_true, y_pred, legend_label, out_path):
    """Draw an actual-vs-predicted scatter in original units and save it to `out_path`."""
    PredictionErrorDisplay.from_predictions(
        y_true,
        y_pred.squeeze(),
        kind="actual_vs_predicted",
        scatter_kwargs={"color": colors["blue"], "alpha": 0.8, "label": legend_label},
        line_kwargs={"alpha": 1},
    )
    plt.xlabel("Predicted Total Biogas Flowrate (m3/d)")
    plt.ylabel("Actual Total Biogas Flowrate (m3/d)")
    plt.legend()
    plt.savefig(out_path)


# The MAE-labelled figures get their own file names: "Real Train Evaluation.svg"
# and "Real Test Evaluation.svg" are also written by the R²-labelled plots that
# follow, which used to overwrite the figures produced here.

# Evaluation for Train Dataset
_plot_actual_vs_predicted_mae(
    y_train,
    train_pred_org_1,
    f"Train MAE: {int(mae_train_org)}",
    "Real Train Evaluation MAE.svg",
)

# Evaluation for Test Dataset
_plot_actual_vs_predicted_mae(
    y_test,
    test_pred_org_1,
    f"Test MAE: {int(mae_test_org)}",
    "Real Test Evaluation MAE.svg",
)

In [None]:
# Actual-vs-predicted scatter for the training split, in original units.
PredictionErrorDisplay.from_predictions(
    y_train,
    train_pred_org_1.squeeze(),
    kind="actual_vs_predicted",
    scatter_kwargs={
        "color": colors["blue"],
        "alpha": 0.8,
        # built-in round() works for plain floats and NumPy floats alike,
        # unlike the `.round(3)` method call that had been commented out
        "label": f"R-Squared: {round(r2_train_org, 3)}",
    },
    line_kwargs={"alpha": 1},
)
plt.legend()
plt.savefig("Real Train Evaluation.svg")

In [None]:
# Actual-vs-predicted scatter for the test split, in original units.
PredictionErrorDisplay.from_predictions(
    y_test,
    test_pred_org_1.squeeze(),
    kind="actual_vs_predicted",
    scatter_kwargs={
        "color": colors["blue"],
        "alpha": 0.8,
        # built-in round() works for plain floats and NumPy floats alike,
        # unlike the `.round(3)` method call that had been commented out
        "label": f"R-Squared: {round(r2_test_org, 3)}",
    },
    line_kwargs={"alpha": 1},
)
plt.legend()
plt.savefig("Real Test Evaluation.svg")