In [5]:
#
# Libraries
#

# General
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Visualisation
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
from plotly import figure_factory as ff
from plotly.subplots import make_subplots
import plotly.io as pio

# Sklearn
from sklearn.model_selection import *
from sklearn.feature_extraction import *
from sklearn.metrics import *
from sklearn.metrics import pairwise
from sklearn.preprocessing import *
from sklearn.utils import *
from sklearn.pipeline import *
from sklearn.compose import *
from sklearn.covariance import *
from sklearn.multioutput import *
from sklearn.ensemble import *
from sklearn.tree import *

# Stats
import scipy
from scipy.stats import *
from scipy.sparse import csr_matrix

# Optuna
import optuna

ModuleNotFoundError: No module named 'optuna'

In [3]:
#
# Data
#

# base path
base_path = 'data/'

# ds
ds = pd.read_csv(base_path+'ENB2012_data.csv')


In [6]:
seed = 100

# drop NULLs (if any)
ds.dropna(inplace=True)

In [7]:
# sampling
df = ds.sample(frac=1.0,random_state=seed).reset_index(drop=True)

# View
df.head()

Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,target_1,target_2
0,0.71,710.5,269.5,220.5,3.5,3,0.1,3,10.68,13.77
1,0.82,612.5,318.5,147.0,7.0,3,0.1,5,23.89,24.77
2,0.82,612.5,318.5,147.0,7.0,5,0.1,4,24.24,25.16
3,0.79,637.0,343.0,147.0,7.0,3,0.4,5,41.96,37.7
4,0.62,808.5,367.5,220.5,3.5,5,0.1,3,12.73,13.89


In [8]:
#
# Correlation Plot
#

X = df.drop(columns=['Y1','Y2'])
# Compute the correlation matrix
corr = X.select_dtypes(include = ['float']).corr()

# Generate a mask for the upper triangle
mask = np.triu(np.ones_like(corr, dtype=bool))

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(11, 9))

# Generate a custom diverging colormap
cmap = sns.diverging_palette(230, 20, as_cmap=True)

# Draw the heatmap with the mask and correct aspect ratio
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5})

KeyError: "['Y1', 'Y2'] not found in axis"

In [24]:
#
# Feature Engineering & Data Split
#

y = df[['Y1','Y2']]  # Target
x = df.drop(columns=['Y1','Y2'])  # Features

# Train-test split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=seed)
print(f"Training Dataset: {x_train.shape[0]} | Testing Dataset: {x_test.shape[0]}")

Training Dataset: 614 | Testing Dataset: 154


In [25]:
#
# Scaling
#

scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
X_test_scaled = scaler.transform(x_test)

In [26]:
#
# MOR Model Training
#

# Create multi-target regression model (RandomForest Regressor)
mo_model = MultiOutputRegressor(RandomForestRegressor(n_estimators=10, max_depth=3, min_samples_split=3, random_state=seed, n_jobs=-1,criterion='squared_error'),
                                n_jobs=3)

# Train
mo_model.fit(x_train_scaled,y_train)

0,1,2
,estimator,RandomForestR...dom_state=100)
,n_jobs,3

0,1,2
,n_estimators,10
,criterion,'squared_error'
,max_depth,3
,min_samples_split,3
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [27]:
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import mean_squared_error
import numpy as np

In [28]:
#
# Prediction & Evaluation
#

# Predictions
y_pred = mo_model.predict(X_test_scaled)

print("MOR MAE for Target 1:", mean_absolute_error(y_test.iloc[:, 0], y_pred[:, 0]))
print("MOR MAE for Target 2:", mean_absolute_error(y_test.iloc[:, 1], y_pred[:, 1]))

print('')

print("MOR MAPE for Target 1:", mean_absolute_percentage_error(y_test.iloc[:, 0], y_pred[:, 0]) * 100)
print("MOR MAPE for Target 2:", mean_absolute_percentage_error(y_test.iloc[:, 1], y_pred[:, 1]) * 100)

print('')

print("MOR RMSE for Target 1:", np.sqrt(np.mean((y_test.iloc[:, 0] - y_pred[:, 0])**2)))
print("MOR RMSE for Target 2:", np.sqrt(np.mean((y_test.iloc[:, 1] - y_pred[:, 1])**2)))

print('')

print("MOR MSE for Target 1:", mean_squared_error(y_test.iloc[:, 0], y_pred[:, 0]))
print("MOR MSE for Target 2:", mean_squared_error(y_test.iloc[:, 1], y_pred[:, 1]))

MOR MAE for Target 1: 1.7178860960159978
MOR MAE for Target 2: 2.121386181576255

MOR MAPE for Target 1: 8.7021014706894
MOR MAPE for Target 2: 8.326880364053292

MOR RMSE for Target 1: 2.302577387590086
MOR RMSE for Target 2: 2.8199223938739713

MOR MSE for Target 1: 5.301862625841186
MOR MSE for Target 2: 7.951962307471909


In [29]:
#
# LR Model Training
#

from sklearn.linear_model import LinearRegression

# Create multi-target regression model (RandomForest Regressor)
lr_model = MultiOutputRegressor(LinearRegression(), n_jobs=3)

# Train
lr_model.fit(x_train_scaled,y_train)

0,1,2
,estimator,LinearRegression()
,n_jobs,3

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [30]:
#
# Prediction & Evaluation
#

# Predictions
y_pred = lr_model.predict(X_test_scaled)

print("LR MAE for Target 1:", mean_absolute_error(y_test.iloc[:, 0], y_pred[:, 0]))
print("LR MAE for Target 2:", mean_absolute_error(y_test.iloc[:, 1], y_pred[:, 1]))

print('')

print("LR MAPE for Target 1:", mean_absolute_percentage_error(y_test.iloc[:, 0], y_pred[:, 0]) * 100)
print("LR MAPE for Target 2:", mean_absolute_percentage_error(y_test.iloc[:, 1], y_pred[:, 1]) * 100)

print('')

print("LR RMSE for Target 1:", np.sqrt(np.mean((y_test.iloc[:, 0] - y_pred[:, 0])**2)))
print("LR RMSE for Target 2:", np.sqrt(np.mean((y_test.iloc[:, 1] - y_pred[:, 1])**2)))

print('')

print("LR MSE for Target 1:", mean_squared_error(y_test.iloc[:, 0], y_pred[:, 0]))
print("LR MSE for Target 2:", mean_squared_error(y_test.iloc[:, 1], y_pred[:, 1]))

LR MAE for Target 1: 2.23303002844557
LR MAE for Target 2: 2.4343981758192648

LR MAPE for Target 1: 10.250534003619098
LR MAPE for Target 2: 9.39144073585848

LR RMSE for Target 1: 3.1459806064798324
LR RMSE for Target 2: 3.490331873311021

LR MSE for Target 1: 9.897193976347214
LR MSE for Target 2: 12.182416585850822


In [31]:
from sklearn.neural_network import MLPRegressor

# Create multi-target regression model (MLP)
mlp_model = MLPRegressor(
    hidden_layer_sizes=(100, 100),   # one hidden layer: 100 neurons
    activation='relu',
    solver='adam',
    learning_rate='adaptive',
    max_iter=500,
    random_state=seed
)

# Train
mlp_model.fit(x_train_scaled, y_train)



0,1,2
,loss,'squared_error'
,hidden_layer_sizes,"(100, ...)"
,activation,'relu'
,solver,'adam'
,alpha,0.0001
,batch_size,'auto'
,learning_rate,'adaptive'
,learning_rate_init,0.001
,power_t,0.5
,max_iter,500


In [32]:
#
# Prediction & Evaluation
#

# Predictions
y_pred = mlp_model.predict(X_test_scaled)

print("MLP MAE for Target 1:", mean_absolute_error(y_test.iloc[:, 0], y_pred[:, 0]))
print("MLP MAE for Target 2:", mean_absolute_error(y_test.iloc[:, 1], y_pred[:, 1]))

print('')

print("MLP MAPE for Target 1:", mean_absolute_percentage_error(y_test.iloc[:, 0], y_pred[:, 0]) * 100)
print("MLP MAPE for Target 2:", mean_absolute_percentage_error(y_test.iloc[:, 1], y_pred[:, 1]) * 100)

print('')

print("MLP RMSE for Target 1:", np.sqrt(np.mean((y_test.iloc[:, 0] - y_pred[:, 0])**2)))
print("MLP RMSE for Target 2:", np.sqrt(np.mean((y_test.iloc[:, 1] - y_pred[:, 1])**2)))

print('')

print("MLP MSE for Target 1:", mean_squared_error(y_test.iloc[:, 0], y_pred[:, 0]))
print("MLP MSE for Target 2:", mean_squared_error(y_test.iloc[:, 1], y_pred[:, 1]))

MLP MAE for Target 1: 0.9609198095187925
MLP MAE for Target 2: 1.2923492996076513

MLP MAPE for Target 1: 4.082566458233003
MLP MAPE for Target 2: 4.666523826500289

MLP RMSE for Target 1: 1.3853908953784633
MLP RMSE for Target 2: 1.9443386904181743

MLP MSE for Target 1: 1.9193079329975402
MLP MSE for Target 2: 3.780452943057061


In [33]:
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, WhiteKernel, ConstantKernel
from sklearn.multioutput import MultiOutputRegressor

# Define a kernel: Constant * RBF + WhiteKernel (noise)
kernel = ConstantKernel(1.0) * RBF(length_scale=1.0) + WhiteKernel(noise_level=1.0)

# Create single-target GP regressor
gp = GaussianProcessRegressor(kernel=kernel, n_restarts_optimizer=2, random_state=42)

# Multi-output GP
mo_gp = MultiOutputRegressor(gp)

# Train
mo_gp.fit(x_train_scaled, y_train)



0,1,2
,estimator,GaussianProce...ndom_state=42)
,n_jobs,

0,1,2
,kernel,1**2 * RBF(le...noise_level=1)
,alpha,1e-10
,optimizer,'fmin_l_bfgs_b'
,n_restarts_optimizer,2
,normalize_y,False
,copy_X_train,True
,n_targets,
,random_state,42
,kernel__k1,1**2 * RBF(length_scale=1)
,kernel__k2,WhiteKernel(noise_level=1)


In [34]:
#
# Prediction & Evaluation
#

# Predictions
y_pred = mo_gp.predict(X_test_scaled)

print("GP MAE for Target 1:", mean_absolute_error(y_test.iloc[:, 0], y_pred[:, 0]))
print("GP MAE for Target 2:", mean_absolute_error(y_test.iloc[:, 1], y_pred[:, 1]))

print('')

print("GP MAPE for Target 1:", mean_absolute_percentage_error(y_test.iloc[:, 0], y_pred[:, 0]) * 100)
print("GP MAPE for Target 2:", mean_absolute_percentage_error(y_test.iloc[:, 1], y_pred[:, 1]) * 100)

print('')

print("GP RMSE for Target 1:", np.sqrt(np.mean((y_test.iloc[:, 0] - y_pred[:, 0])**2)))
print("GP RMSE for Target 2:", np.sqrt(np.mean((y_test.iloc[:, 1] - y_pred[:, 1])**2)))

print('')

print("GP MSE for Target 1:", mean_squared_error(y_test.iloc[:, 0], y_pred[:, 0]))
print("GP MSE for Target 2:", mean_squared_error(y_test.iloc[:, 1], y_pred[:, 1]))

GP MAE for Target 1: 0.34649863446396756
GP MAE for Target 2: 0.8672880407247735

GP MAPE for Target 1: 1.7204535683456674
GP MAPE for Target 2: 3.455651581931212

GP RMSE for Target 1: 0.4744946039774124
GP RMSE for Target 2: 1.2548340121128756

GP MSE for Target 1: 0.22514512920368143
GP MSE for Target 2: 1.5746083979552965
