In [54]:
from pathlib import Path

import numpy as np
import plotly.graph_objs as go
from keras.backend import clear_session
from keras.layers import Dense
from keras.models import Sequential
from keras.optimizers import SGD, RMSprop, Adam, Optimizer
from sklearn.datasets import make_friedman1
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [55]:
# Tell XLA where the CUDA toolkit lives so training can run on the local GPU.
# NOTE(review): /opt/cuda/ is specific to the author's machine; adjust or drop
# this cell when running elsewhere.
%set_env XLA_FLAGS=--xla_gpu_cuda_data_dir=/opt/cuda/

env: XLA_FLAGS=--xla_gpu_cuda_data_dir=/opt/cuda/


In [56]:

# Number of generated input features and width of the network's output layer.
input_dim = 10
output_dim = 1
# Strength of the sine-shaped widening applied to layer widths in compile().
max_layer_scale_factor = 2

# Training schedule shared by every model in the notebook.
epochs = 100
batch_size = 1000

# Hyper-parameter grid: every optimizer x learning rate x depth combination is trained.
# Tip: slice any list with [0:1] for a quick single-configuration smoke run.
optimizers = [SGD, RMSprop, Adam]

learning_rates = [0.1, 0.01, 0.001]

# Number of hidden layers per architecture.
architectures = [1, 5, 25]

Friedman #1 dataset with 10 features, of which only the first five influence the target. The regression target is given by

$$y(X) = 10 \sin(\pi X_{:,0} X_{:,1}) + 20 (X_{:,2} - 0.5)^2 + 10 X_{:,3} + 5 X_{:,4} + \text{noise} \cdot \mathcal{N}(0, 1)$$

In [57]:
# Draw the synthetic Friedman #1 regression problem; the fixed seed keeps the
# generated samples identical between runs.
X, y = make_friedman1(
    n_samples=150_000,
    n_features=input_dim,
    noise=1.0,
    random_state=42,
)

In [58]:
# Rescale features and target to [0, 1]. MinMaxScaler needs a 2-D input, so the
# target is temporarily turned into a column and flattened again afterwards.
# NOTE(review): both scalers are fitted on the full dataset before the
# train/test split, so test-set statistics leak into the scaling.
# NOTE(review): the smallest target becomes exactly 0 after scaling, which makes
# the "mape" metric unstable (see the very large training MAPE values in the logs).
X = MinMaxScaler().fit_transform(X)
y = MinMaxScaler().fit_transform(y[:, np.newaxis]).ravel()

In [59]:
# Hold out 20 % of the samples for validation; seeded so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=int(len(X) * 0.8),
    random_state=42,
)

In [60]:
def compile_linear():
    """Build the linear-regression baseline as a one-layer Keras model.

    A Keras model is used instead of scikit-learn's ``LinearRegression`` so that
    training yields a per-epoch history, which lets the baseline reuse the same
    learning-curve plotting code as the deep networks.

    Returns:
        A compiled ``Sequential`` model made of a single linear ``Dense`` layer
        with ``output_dim`` units, using the ``"rmsprop"`` optimizer, MSE loss
        and the MAPE / MSE / R2 metrics.
    """
    baseline = Sequential([Dense(output_dim, activation="linear")])
    baseline.compile(
        loss="mean_squared_error",
        optimizer="rmsprop",
        metrics=["mape", "mse", "r2_score"],
    )
    return baseline

In [61]:
def compile(
    optimizer: type[Optimizer], learning_rate: float, hidden_layers: int
) -> Sequential:
    """Build and compile a fully connected regression network.

    Layer widths are interpolated linearly from ``input_dim`` down to
    ``output_dim`` and multiplied by
    ``1 + sin(pi * relative_depth) * max_layer_scale_factor``, which makes the
    network wider in the middle and narrower at the ends. Hidden layers use
    ReLU; the output layer has exactly ``output_dim`` sigmoid units, so
    predictions lie in (0, 1).

    Note:
        The name shadows the built-in ``compile`` inside this notebook; it is
        kept because other cells call it under this name.

    Args:
        optimizer: Optimizer *class* (e.g. ``Adam``), not an instance. It is
            instantiated here with ``learning_rate``.
        learning_rate: Learning rate passed to the optimizer constructor.
        hidden_layers: Number of hidden layers; ``0`` yields a model with only
            the output layer.

    Returns:
        The compiled ``Sequential`` model (MSE loss; MAPE, MSE and R2 metrics).

    Raises:
        ValueError: If ``hidden_layers`` is negative.
    """
    if hidden_layers < 0:
        raise ValueError(f"hidden_layers must be >= 0, got {hidden_layers}")

    print(
        f"=== Compiling network with {hidden_layers} hidden layer(s), {optimizer.__name__} optimizer and LR={learning_rate} ==="
    )
    model = Sequential()

    # plus input and output layers
    total_layers = hidden_layers + 2
    output_index = total_layers - 1
    for i in range(1, total_layers):
        if i == output_index:
            # Pin the output width explicitly instead of relying on sin(pi)
            # evaluating to a tiny *positive* number before int() truncation.
            layer_size = output_dim
            activation = "sigmoid"
        else:
            # a little formula to make the network wider in the middle and
            # narrower at the ends
            base_width = np.interp(i, [0, output_index], [input_dim, output_dim])
            widening = 1.0 + np.sin(i / output_index * np.pi) * max_layer_scale_factor
            # Never create a zero-unit layer, whatever the constants are set to.
            layer_size = max(1, int(base_width * widening))
            activation = "relu"

        model.add(Dense(layer_size, activation=activation))

    optimizer_instance = optimizer(learning_rate=learning_rate)
    model.compile(
        optimizer=optimizer_instance,
        loss="mean_squared_error",
        metrics=["mape", "mse", "r2_score"],
    )

    return model

In [62]:
def train(model: Sequential):
    """Fit ``model`` on the training split and validate on the test split.

    Reads the notebook-level ``X_train``, ``y_train``, ``X_test``, ``y_test``,
    ``epochs`` and ``batch_size``. A banner describing the model is printed
    first; the hidden-layer count in it is ``len(model.layers) - 1``.

    Args:
        model: A compiled Keras model.

    Returns:
        None. The training history is read later from ``model.history``.
    """
    print(
        f"=== Training network with {len(model.layers) - 1} hidden layer(s), {model.optimizer.__class__.__name__} optimizer and LR={round(float(model.optimizer.learning_rate.numpy()), 3)} ==="
    )
    fit_options = dict(
        epochs=epochs,
        batch_size=batch_size,
        validation_data=(X_test, y_test),
    )
    model.fit(X_train, y_train, **fit_options)


In [63]:
def create_plot(model: Sequential):
    """Plot training and validation learning curves of a fitted model.

    R2, MAPE and MSE are drawn against the epoch number, each metric on its own
    y axis. Both curves of a metric share the colour of that axis; the
    validation curve is dashed.

    Args:
        model: A Keras model that has already been fitted with validation data,
            so that ``model.history.history`` holds ``r2_score``, ``mape``,
            ``mse`` and their ``val_`` counterparts.

    Returns:
        The assembled ``plotly.graph_objs.Figure``.
    """
    history = model.history.history

    # (history key, display label, trace axis id, layout axis key, colour)
    metric_specs = [
        ("r2_score", "R2", "y1", "yaxis", "#1f77b4"),
        ("mape", "MAPE", "y2", "yaxis2", "#ff7f0e"),
        ("mse", "MSE", "y3", "yaxis3", "#d62728"),
    ]
    # (history key prefix, legend prefix, line dash style)
    split_specs = [("", "Training", "solid"), ("val_", "Validation", "dash")]

    # Derive the x axis from what was actually recorded rather than from the
    # global `epochs`, which may have changed (or training may have stopped
    # early) since the model was fitted.
    train_step = np.arange(1, len(history["r2_score"]) + 1)

    fig = go.Figure()
    for key, label, trace_axis, _, color in metric_specs:
        for key_prefix, split_name, dash in split_specs:
            fig.add_trace(
                go.Scatter(
                    x=train_step,
                    y=history[key_prefix + key],
                    mode="lines",
                    name=f"{split_name} {label}",
                    yaxis=trace_axis,
                    line=dict(color=color, dash=dash),
                )
            )

    # Different y-axes for different metrics
    axis_layout = {}
    for _, label, _, layout_key, color in metric_specs:
        # `title=dict(text=..., font=...)` replaces the deprecated `titlefont`.
        axis = dict(
            title=dict(text=label, font=dict(color=color)),
            tickfont=dict(color=color),
        )
        if layout_key != "yaxis":
            # Secondary axes float over the primary one and are shifted
            # automatically so their tick labels do not overlap.
            axis.update(anchor="free", overlaying="y", autoshift=True)
        axis_layout[layout_key] = axis
    fig.update_layout(**axis_layout)

    fig.update_layout(
        title=f"DNN with {len(model.layers) - 1} hidden Layer(s), {model.optimizer.__class__.__name__} optimizer, learning rate = {round(float(model.optimizer.learning_rate.numpy()), 3)}",
        xaxis_title="Epoch",
        legend_title="Metric",
    )

    return fig

In [64]:
# Baseline: train the single-layer linear model and display its learning curves.
linreg = compile_linear()
train(linreg)
fig = create_plot(linreg)

fig.show()

=== Training network with 0 hidden layer(s), RMSprop optimizer and LR=0.001 ===
Epoch 1/100


[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 0.1082 - mape: 1092.8607 - mse: 0.1082 - r2_score: -3.4686 - val_loss: 0.0724 - val_mape: 48.2630 - val_mse: 0.0724 - val_r2_score: -1.9974
Epoch 2/100
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.0633 - mape: 5355.1587 - mse: 0.0633 - r2_score: -1.6021 - val_loss: 0.0405 - val_mape: 36.2074 - val_mse: 0.0405 - val_r2_score: -0.6783
Epoch 3/100
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.0349 - mape: 5799.1558 - mse: 0.0349 - r2_score: -0.4481 - val_loss: 0.0209 - val_mape: 26.1589 - val_mse: 0.0209 - val_r2_score: 0.1337
Epoch 4/100
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 959us/step - loss: 0.0178 - mape: 2318.9890 - mse: 0.0178 - r2_score: 0.2645 - val_loss: 0.0109 - val_mape: 18.8899 - val_mse: 0.0109 - val_r2_score: 0.5474
Epoch 5/100
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s

In [65]:
# Output directory for the saved models and their HTML learning-curve plots.
# exist_ok=True makes the cell safe to re-run.
Path("trained").mkdir(parents=True, exist_ok=True)

In [66]:
# Train one network per (optimizer, learning rate, depth) combination, then save
# the model and its learning-curve plot side by side under trained/.
for optimizer in optimizers:
    for lr in learning_rates:
        for hidden_layers in architectures:
            # Many models are built in this loop; release the global Keras
            # state left behind by the previous one so memory does not pile up.
            clear_session()

            # One stem for both artifacts, so the .keras and .html files of a
            # run always match. The previous HTML name rounded the learning
            # rate to 3 decimals, which would collide for smaller rates.
            artifact_stem = f"trained/model_{optimizer.__name__}_{lr}_{hidden_layers}"
            model_name = f"{artifact_stem}.keras"

            model = compile(optimizer, lr, hidden_layers)
            train(model)

            model.save(model_name)

            fig = create_plot(model)
            fig.write_html(f"{artifact_stem}.html")
            fig.show()

=== Compiling network with 1 hidden layer(s), SGD optimizer and LR=0.1 ===
=== Training network with 1 hidden layer(s), SGD optimizer and LR=0.1 ===
Epoch 1/100


[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 0.0366 - mape: 3367.1392 - mse: 0.0366 - r2_score: -0.5198 - val_loss: 0.0250 - val_mape: 32.6059 - val_mse: 0.0250 - val_r2_score: -0.0374
Epoch 2/100
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 914us/step - loss: 0.0237 - mape: 5322.2480 - mse: 0.0237 - r2_score: 0.0173 - val_loss: 0.0208 - val_mape: 29.9120 - val_mse: 0.0208 - val_r2_score: 0.1399
Epoch 3/100
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 948us/step - loss: 0.0201 - mape: 5968.2734 - mse: 0.0201 - r2_score: 0.1715 - val_loss: 0.0182 - val_mape: 27.9362 - val_mse: 0.0182 - val_r2_score: 0.2470
Epoch 4/100
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 911us/step - loss: 0.0176 - mape: 10005.1680 - mse: 0.0176 - r2_score: 0.2678 - val_loss: 0.0162 - val_mape: 26.3478 - val_mse: 0.0162 - val_r2_score: 0.3278
Epoch 5/100
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m

=== Compiling network with 5 hidden layer(s), SGD optimizer and LR=0.1 ===
=== Training network with 5 hidden layer(s), SGD optimizer and LR=0.1 ===
Epoch 1/100










[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 4ms/step - loss: 0.0248 - mape: 4049.7871 - mse: 0.0248 - r2_score: -0.0234 - val_loss: 0.0241 - val_mape: 31.8517 - val_mse: 0.0241 - val_r2_score: 0.0018
Epoch 2/100
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.0240 - mape: 2069.5017 - mse: 0.0240 - r2_score: 0.0078 - val_loss: 0.0234 - val_mape: 31.6799 - val_mse: 0.0234 - val_r2_score: 0.0305
Epoch 3/100
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.0233 - mape: 1281.3517 - mse: 0.0233 - r2_score: 0.0386 - val_loss: 0.0226 - val_mape: 31.0672 - val_mse: 0.0226 - val_r2_score: 0.0644
Epoch 4/100
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.0225 - mape: 3913.1604 - mse: 0.0225 - r2_score: 0.0730 - val_loss: 0.0216 - val_mape: 30.3200 - val_mse: 0.0216 - val_r2_score: 0.1068
Epoch 5/100
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9

=== Compiling network with 25 hidden layer(s), SGD optimizer and LR=0.1 ===
=== Training network with 25 hidden layer(s), SGD optimizer and LR=0.1 ===
Epoch 1/100








































[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 11ms/step - loss: 0.0243 - mape: 389.4103 - mse: 0.0243 - r2_score: -9.0672e-04 - val_loss: 0.0241 - val_mape: 31.6990 - val_mse: 0.0241 - val_r2_score: -3.7432e-05
Epoch 2/100
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.0240 - mape: 3374.0388 - mse: 0.0240 - r2_score: -9.4444e-05 - val_loss: 0.0241 - val_mape: 31.7648 - val_mse: 0.0241 - val_r2_score: 1.6689e-06
Epoch 3/100
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.0242 - mape: 824.5826 - mse: 0.0242 - r2_score: -1.5828e-05 - val_loss: 0.0241 - val_mape: 31.7817 - val_mse: 0.0241 - val_r2_score: -2.8610e-06
Epoch 4/100
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.0242 - mape: 908.3946 - mse: 0.0242 - r2_score: -7.9910e-05 - val_loss: 0.0241 - val_mape: 31.7691 - val_mse: 0.0241 - val_r2_score: 1.0729e-06
Epoch 5/100
[1m120/120[0m [32m━━━━━━━━━━

=== Compiling network with 1 hidden layer(s), SGD optimizer and LR=0.01 ===
=== Training network with 1 hidden layer(s), SGD optimizer and LR=0.01 ===
Epoch 1/100
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 0.0272 - mape: 9987.8906 - mse: 0.0272 - r2_score: -0.1223 - val_loss: 0.0250 - val_mape: 34.8364 - val_mse: 0.0250 - val_r2_score: -0.0350
Epoch 2/100
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 955us/step - loss: 0.0246 - mape: 415.7113 - mse: 0.0246 - r2_score: -0.0217 - val_loss: 0.0236 - val_mape: 33.2238 - val_mse: 0.0236 - val_r2_score: 0.0216
Epoch 3/100
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 970us/step - loss: 0.0233 - mape: 1835.2920 - mse: 0.0233 - r2_score: 0.0283 - val_loss: 0.0229 - val_mape: 32.2461 - val_mse: 0.0229 - val_r2_score: 0.0511
Epoch 4/100
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.0229 - mape: 8985.5234 - mse: 0.0229 - r2_score

=== Compiling network with 5 hidden layer(s), SGD optimizer and LR=0.01 ===
=== Training network with 5 hidden layer(s), SGD optimizer and LR=0.01 ===
Epoch 1/100
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - loss: 0.0245 - mape: 16281.1611 - mse: 0.0245 - r2_score: -0.0104 - val_loss: 0.0244 - val_mape: 32.1704 - val_mse: 0.0244 - val_r2_score: -0.0099
Epoch 2/100
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.0244 - mape: 14852.6182 - mse: 0.0244 - r2_score: -0.0093 - val_loss: 0.0244 - val_mape: 32.0994 - val_mse: 0.0244 - val_r2_score: -0.0088
Epoch 3/100
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.0245 - mape: 8121.0103 - mse: 0.0245 - r2_score: -0.0076 - val_loss: 0.0243 - val_mape: 32.0398 - val_mse: 0.0243 - val_r2_score: -0.0078
Epoch 4/100
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.0244 - mape: 1352.4353 - mse: 0.0244 - r2_sco

=== Compiling network with 25 hidden layer(s), SGD optimizer and LR=0.01 ===
=== Training network with 25 hidden layer(s), SGD optimizer and LR=0.01 ===
Epoch 1/100
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 10ms/step - loss: 0.0241 - mape: 5368.0610 - mse: 0.0241 - r2_score: -6.7076e-04 - val_loss: 0.0241 - val_mape: 31.6141 - val_mse: 0.0241 - val_r2_score: 3.5709e-04
Epoch 2/100
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.0241 - mape: 1078.0308 - mse: 0.0241 - r2_score: 2.4799e-04 - val_loss: 0.0241 - val_mape: 31.6943 - val_mse: 0.0241 - val_r2_score: 5.8907e-04
Epoch 3/100
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.0241 - mape: 1800.7002 - mse: 0.0241 - r2_score: 5.7442e-04 - val_loss: 0.0241 - val_mape: 31.7331 - val_mse: 0.0241 - val_r2_score: 8.2141e-04
Epoch 4/100
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.0241 - mape: 1854.3231 - 

=== Compiling network with 1 hidden layer(s), SGD optimizer and LR=0.001 ===
=== Training network with 1 hidden layer(s), SGD optimizer and LR=0.001 ===
Epoch 1/100
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 0.0386 - mape: 9663.6855 - mse: 0.0386 - r2_score: -0.5939 - val_loss: 0.0368 - val_mape: 43.8111 - val_mse: 0.0368 - val_r2_score: -0.5248
Epoch 2/100
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.0365 - mape: 8138.0234 - mse: 0.0365 - r2_score: -0.5045 - val_loss: 0.0349 - val_mape: 42.4658 - val_mse: 0.0349 - val_r2_score: -0.4455
Epoch 3/100
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.0344 - mape: 1374.5221 - mse: 0.0344 - r2_score: -0.4246 - val_loss: 0.0333 - val_mape: 41.2801 - val_mse: 0.0333 - val_r2_score: -0.3788
Epoch 4/100
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.0329 - mape: 5983.5752 - mse: 0.0329 - r2_sco

=== Compiling network with 5 hidden layer(s), SGD optimizer and LR=0.001 ===
=== Training network with 5 hidden layer(s), SGD optimizer and LR=0.001 ===
Epoch 1/100
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - loss: 0.0242 - mape: 6397.0186 - mse: 0.0242 - r2_score: -0.0020 - val_loss: 0.0242 - val_mape: 32.8740 - val_mse: 0.0242 - val_r2_score: -0.0018
Epoch 2/100
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.0242 - mape: 3308.8352 - mse: 0.0242 - r2_score: -0.0011 - val_loss: 0.0242 - val_mape: 32.8315 - val_mse: 0.0242 - val_r2_score: -8.8465e-04
Epoch 3/100
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.0242 - mape: 7437.2798 - mse: 0.0242 - r2_score: -0.0013 - val_loss: 0.0241 - val_mape: 32.7914 - val_mse: 0.0241 - val_r2_score: -2.7537e-05
Epoch 4/100
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.0240 - mape: 4419.7153 - mse: 0.0240 

=== Compiling network with 25 hidden layer(s), SGD optimizer and LR=0.001 ===
=== Training network with 25 hidden layer(s), SGD optimizer and LR=0.001 ===
Epoch 1/100
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 9ms/step - loss: 0.0242 - mape: 9427.4834 - mse: 0.0242 - r2_score: -0.0012 - val_loss: 0.0242 - val_mape: 31.4582 - val_mse: 0.0242 - val_r2_score: -9.7382e-04
Epoch 2/100
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.0242 - mape: 941.2215 - mse: 0.0242 - r2_score: -8.5887e-04 - val_loss: 0.0242 - val_mape: 31.4621 - val_mse: 0.0242 - val_r2_score: -9.3067e-04
Epoch 3/100
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.0241 - mape: 3270.0273 - mse: 0.0241 - r2_score: -0.0013 - val_loss: 0.0242 - val_mape: 31.4663 - val_mse: 0.0242 - val_r2_score: -8.9049e-04
Epoch 4/100
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.0242 - mape: 205.3076 - mse:

=== Compiling network with 1 hidden layer(s), RMSprop optimizer and LR=0.1 ===
=== Training network with 1 hidden layer(s), RMSprop optimizer and LR=0.1 ===
Epoch 1/100
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 0.0427 - mape: 1289.0507 - mse: 0.0427 - r2_score: -0.7844 - val_loss: 0.0133 - val_mape: 19.5996 - val_mse: 0.0133 - val_r2_score: 0.4475
Epoch 2/100
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.0122 - mape: 181.3484 - mse: 0.0122 - r2_score: 0.4954 - val_loss: 0.0117 - val_mape: 17.5125 - val_mse: 0.0117 - val_r2_score: 0.5172
Epoch 3/100
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 963us/step - loss: 0.0109 - mape: 5136.8618 - mse: 0.0109 - r2_score: 0.5471 - val_loss: 0.0102 - val_mape: 16.5293 - val_mse: 0.0102 - val_r2_score: 0.5790
Epoch 4/100
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 988us/step - loss: 0.0103 - mape: 1769.6848 - mse: 0.0103 - r2_s

=== Compiling network with 5 hidden layer(s), RMSprop optimizer and LR=0.1 ===
=== Training network with 5 hidden layer(s), RMSprop optimizer and LR=0.1 ===
Epoch 1/100
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - loss: 0.0334 - mape: 3218.8926 - mse: 0.0334 - r2_score: -0.3740 - val_loss: 0.0241 - val_mape: 31.9208 - val_mse: 0.0241 - val_r2_score: -2.5547e-04
Epoch 2/100
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.0244 - mape: 530.3024 - mse: 0.0244 - r2_score: -0.0068 - val_loss: 0.0241 - val_mape: 31.8446 - val_mse: 0.0241 - val_r2_score: -7.0572e-05
Epoch 3/100
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.0242 - mape: 136.4698 - mse: 0.0242 - r2_score: -0.0057 - val_loss: 0.0242 - val_mape: 32.3309 - val_mse: 0.0242 - val_r2_score: -0.0030
Epoch 4/100
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.0243 - mape: 5248.1538 - mse: 0.024

=== Compiling network with 25 hidden layer(s), RMSprop optimizer and LR=0.1 ===
=== Training network with 25 hidden layer(s), RMSprop optimizer and LR=0.1 ===
Epoch 1/100
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 12ms/step - loss: 0.0244 - mape: 518.7344 - mse: 0.0244 - r2_score: -0.0110 - val_loss: 0.0242 - val_mape: 32.1912 - val_mse: 0.0242 - val_r2_score: -0.0017
Epoch 2/100
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - loss: 0.0244 - mape: 3044.8206 - mse: 0.0244 - r2_score: -0.0068 - val_loss: 0.0241 - val_mape: 31.7956 - val_mse: 0.0241 - val_r2_score: -1.0729e-05
Epoch 3/100
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.0244 - mape: 1350.8842 - mse: 0.0244 - r2_score: -0.0078 - val_loss: 0.0245 - val_mape: 30.7004 - val_mse: 0.0245 - val_r2_score: -0.0162
Epoch 4/100
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.0245 - mape: 585.4556 - mse: 0.0245

=== Compiling network with 1 hidden layer(s), RMSprop optimizer and LR=0.01 ===
=== Training network with 1 hidden layer(s), RMSprop optimizer and LR=0.01 ===
Epoch 1/100
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 0.0158 - mape: 2082.5957 - mse: 0.0158 - r2_score: 0.3499 - val_loss: 0.0068 - val_mape: 15.6099 - val_mse: 0.0068 - val_r2_score: 0.7195
Epoch 2/100
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.0070 - mape: 3274.4363 - mse: 0.0070 - r2_score: 0.7131 - val_loss: 0.0061 - val_mape: 14.9120 - val_mse: 0.0061 - val_r2_score: 0.7490
Epoch 3/100
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.0059 - mape: 1020.5554 - mse: 0.0059 - r2_score: 0.7547 - val_loss: 0.0062 - val_mape: 15.4126 - val_mse: 0.0062 - val_r2_score: 0.7439
Epoch 4/100
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 972us/step - loss: 0.0052 - mape: 2099.4316 - mse: 0.0052 - r2_s

=== Compiling network with 5 hidden layer(s), RMSprop optimizer and LR=0.01 ===
=== Training network with 5 hidden layer(s), RMSprop optimizer and LR=0.01 ===
Epoch 1/100
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - loss: 0.0184 - mape: 1921.5669 - mse: 0.0184 - r2_score: 0.2432 - val_loss: 0.0092 - val_mape: 20.0585 - val_mse: 0.0092 - val_r2_score: 0.6199
Epoch 2/100
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.0097 - mape: 917.3790 - mse: 0.0097 - r2_score: 0.5974 - val_loss: 0.0077 - val_mape: 17.8348 - val_mse: 0.0077 - val_r2_score: 0.6819
Epoch 3/100
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.0076 - mape: 1825.2858 - mse: 0.0076 - r2_score: 0.6849 - val_loss: 0.0060 - val_mape: 15.6898 - val_mse: 0.0060 - val_r2_score: 0.7527
Epoch 4/100
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.0058 - mape: 3253.7947 - mse: 0.0058 - r2_scor

=== Compiling network with 25 hidden layer(s), RMSprop optimizer and LR=0.01 ===
=== Training network with 25 hidden layer(s), RMSprop optimizer and LR=0.01 ===
Epoch 1/100
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 10ms/step - loss: 0.0240 - mape: 4290.2476 - mse: 0.0240 - r2_score: -6.4053e-04 - val_loss: 0.0241 - val_mape: 31.7776 - val_mse: 0.0241 - val_r2_score: -1.3113e-06
Epoch 2/100
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.0241 - mape: 16237.0361 - mse: 0.0241 - r2_score: -4.1021e-04 - val_loss: 0.0242 - val_mape: 31.4701 - val_mse: 0.0242 - val_r2_score: -9.3865e-04
Epoch 3/100
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.0242 - mape: 1175.5391 - mse: 0.0242 - r2_score: -4.4040e-04 - val_loss: 0.0241 - val_mape: 31.8647 - val_mse: 0.0241 - val_r2_score: -1.0860e-04
Epoch 4/100
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.0242 - mape

=== Compiling network with 1 hidden layer(s), RMSprop optimizer and LR=0.001 ===
=== Training network with 1 hidden layer(s), RMSprop optimizer and LR=0.001 ===
Epoch 1/100
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 0.0241 - mape: 285.4189 - mse: 0.0241 - r2_score: 3.7286e-04 - val_loss: 0.0134 - val_mape: 23.6500 - val_mse: 0.0134 - val_r2_score: 0.4437
Epoch 2/100
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.0119 - mape: 1020.7427 - mse: 0.0119 - r2_score: 0.5090 - val_loss: 0.0085 - val_mape: 17.9009 - val_mse: 0.0085 - val_r2_score: 0.6470
Epoch 3/100
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.0081 - mape: 180.0020 - mse: 0.0081 - r2_score: 0.6686 - val_loss: 0.0070 - val_mape: 15.2901 - val_mse: 0.0070 - val_r2_score: 0.7117
Epoch 4/100
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.0069 - mape: 222.2740 - mse: 0.0069 - r2_

=== Compiling network with 5 hidden layer(s), RMSprop optimizer and LR=0.001 ===
=== Training network with 5 hidden layer(s), RMSprop optimizer and LR=0.001 ===
Epoch 1/100
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - loss: 0.0220 - mape: 5872.4131 - mse: 0.0220 - r2_score: 0.0859 - val_loss: 0.0109 - val_mape: 20.4195 - val_mse: 0.0109 - val_r2_score: 0.5490
Epoch 2/100
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.0089 - mape: 4756.6768 - mse: 0.0089 - r2_score: 0.6333 - val_loss: 0.0070 - val_mape: 14.5854 - val_mse: 0.0070 - val_r2_score: 0.7109
Epoch 3/100
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.0067 - mape: 4028.7043 - mse: 0.0067 - r2_score: 0.7215 - val_loss: 0.0064 - val_mape: 14.6557 - val_mse: 0.0064 - val_r2_score: 0.7345
Epoch 4/100
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.0063 - mape: 923.8016 - mse: 0.0063 - r2_sc

=== Compiling network with 25 hidden layer(s), RMSprop optimizer and LR=0.001 ===
=== Training network with 25 hidden layer(s), RMSprop optimizer and LR=0.001 ===
Epoch 1/100
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 10ms/step - loss: 0.0243 - mape: 18142.4316 - mse: 0.0243 - r2_score: -2.7106e-04 - val_loss: 0.0241 - val_mape: 31.7571 - val_mse: 0.0241 - val_r2_score: 2.0266e-06
Epoch 2/100
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.0243 - mape: 5159.7305 - mse: 0.0243 - r2_score: -7.0505e-05 - val_loss: 0.0241 - val_mape: 31.7973 - val_mse: 0.0241 - val_r2_score: -1.2159e-05
Epoch 3/100
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.0242 - mape: 5958.4185 - mse: 0.0242 - r2_score: -5.4654e-05 - val_loss: 0.0241 - val_mape: 31.7472 - val_mse: 0.0241 - val_r2_score: 1.7881e-07
Epoch 4/100
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.0242 - mape

=== Compiling network with 1 hidden layer(s), Adam optimizer and LR=0.1 ===
=== Training network with 1 hidden layer(s), Adam optimizer and LR=0.1 ===
Epoch 1/100
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 0.0204 - mape: 8052.9067 - mse: 0.0204 - r2_score: 0.1575 - val_loss: 0.0069 - val_mape: 14.0853 - val_mse: 0.0069 - val_r2_score: 0.7140
Epoch 2/100
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 963us/step - loss: 0.0063 - mape: 2002.8805 - mse: 0.0063 - r2_score: 0.7373 - val_loss: 0.0060 - val_mape: 14.1107 - val_mse: 0.0060 - val_r2_score: 0.7511
Epoch 3/100
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 932us/step - loss: 0.0058 - mape: 1498.7238 - mse: 0.0058 - r2_score: 0.7589 - val_loss: 0.0058 - val_mape: 13.1443 - val_mse: 0.0058 - val_r2_score: 0.7602
Epoch 4/100
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 943us/step - loss: 0.0058 - mape: 4406.9565 - mse: 0.0058 - r2_score

=== Compiling network with 5 hidden layer(s), Adam optimizer and LR=0.1 ===
=== Training network with 5 hidden layer(s), Adam optimizer and LR=0.1 ===
Epoch 1/100
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - loss: 0.0262 - mape: 1826.3822 - mse: 0.0262 - r2_score: -0.0814 - val_loss: 0.0242 - val_mape: 32.0434 - val_mse: 0.0242 - val_r2_score: -7.7736e-04
Epoch 2/100
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.0243 - mape: 1899.6428 - mse: 0.0243 - r2_score: -0.0015 - val_loss: 0.0241 - val_mape: 31.5860 - val_mse: 0.0241 - val_r2_score: -3.2663e-04
Epoch 3/100
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.0241 - mape: 4813.3623 - mse: 0.0241 - r2_score: -7.8946e-04 - val_loss: 0.0242 - val_mape: 31.4846 - val_mse: 0.0242 - val_r2_score: -8.4388e-04
Epoch 4/100
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.0242 - mape: 7437.7246 - mse: 0

=== Compiling network with 25 hidden layer(s), Adam optimizer and LR=0.1 ===
=== Training network with 25 hidden layer(s), Adam optimizer and LR=0.1 ===
Epoch 1/100
[1m 66/120[0m [32m━━━━━━━━━━━[0m[37m━━━━━━━━━[0m [1m0s[0m 2ms/step - loss: 0.0244 - mape: 6219.0137 - mse: 0.0244 - r2_score: -0.0026




[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 10ms/step - loss: 0.0243 - mape: 5909.4727 - mse: 0.0243 - r2_score: -0.0025 - val_loss: 0.0242 - val_mape: 31.5309 - val_mse: 0.0242 - val_r2_score: -5.7590e-04
Epoch 2/100
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.0243 - mape: 6844.9946 - mse: 0.0243 - r2_score: -0.0013 - val_loss: 0.0241 - val_mape: 31.7970 - val_mse: 0.0241 - val_r2_score: -1.2040e-05
Epoch 3/100
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.0242 - mape: 1894.9414 - mse: 0.0242 - r2_score: -0.0017 - val_loss: 0.0242 - val_mape: 32.0212 - val_mse: 0.0242 - val_r2_score: -6.6340e-04
Epoch 4/100
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.0241 - mape: 3596.0867 - mse: 0.0241 - r2_score: -0.0010 - val_loss: 0.0241 - val_mape: 31.6876 - val_mse: 0.0241 - val_r2_score: -5.3644e-05
Epoch 5/100
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━

=== Compiling network with 1 hidden layer(s), Adam optimizer and LR=0.01 ===
=== Training network with 1 hidden layer(s), Adam optimizer and LR=0.01 ===
Epoch 1/100
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 0.0115 - mape: 1294.8542 - mse: 0.0115 - r2_score: 0.5285 - val_loss: 0.0052 - val_mape: 12.5277 - val_mse: 0.0052 - val_r2_score: 0.7834
Epoch 2/100
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.0048 - mape: 518.9036 - mse: 0.0048 - r2_score: 0.8022 - val_loss: 0.0027 - val_mape: 9.1140 - val_mse: 0.0027 - val_r2_score: 0.8869
Epoch 3/100
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.0024 - mape: 1475.1696 - mse: 0.0024 - r2_score: 0.8991 - val_loss: 0.0021 - val_mape: 7.7067 - val_mse: 0.0021 - val_r2_score: 0.9150
Epoch 4/100
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.0019 - mape: 677.2529 - mse: 0.0019 - r2_score: 0.9200

=== Compiling network with 5 hidden layer(s), Adam optimizer and LR=0.01 ===
=== Training network with 5 hidden layer(s), Adam optimizer and LR=0.01 ===
Epoch 1/100
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - loss: 0.0098 - mape: 5419.0645 - mse: 0.0098 - r2_score: 0.5945 - val_loss: 0.0020 - val_mape: 8.0947 - val_mse: 0.0020 - val_r2_score: 0.9176
Epoch 2/100
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.0018 - mape: 1477.0881 - mse: 0.0018 - r2_score: 0.9257 - val_loss: 0.0018 - val_mape: 8.0939 - val_mse: 0.0018 - val_r2_score: 0.9254
Epoch 3/100
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.0014 - mape: 1972.0770 - mse: 0.0014 - r2_score: 0.9404 - val_loss: 0.0013 - val_mape: 6.7115 - val_mse: 0.0013 - val_r2_score: 0.9448
Epoch 4/100
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.0015 - mape: 1514.7765 - mse: 0.0015 - r2_score: 0.938

=== Compiling network with 25 hidden layer(s), Adam optimizer and LR=0.01 ===
=== Training network with 25 hidden layer(s), Adam optimizer and LR=0.01 ===
Epoch 1/100
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 10ms/step - loss: 0.0242 - mape: 1224.3534 - mse: 0.0242 - r2_score: -2.7013e-04 - val_loss: 0.0241 - val_mape: 31.6917 - val_mse: 0.0241 - val_r2_score: -4.7565e-05
Epoch 2/100
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.0242 - mape: 352.5218 - mse: 0.0242 - r2_score: -3.7521e-04 - val_loss: 0.0241 - val_mape: 31.8253 - val_mse: 0.0241 - val_r2_score: -4.1246e-05
Epoch 3/100
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.0241 - mape: 6107.6328 - mse: 0.0241 - r2_score: -7.4831e-05 - val_loss: 0.0241 - val_mape: 31.7351 - val_mse: 0.0241 - val_r2_score: -4.5300e-06
Epoch 4/100
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.0241 - mape: 4433.

=== Compiling network with 1 hidden layer(s), Adam optimizer and LR=0.001 ===
=== Training network with 1 hidden layer(s), Adam optimizer and LR=0.001 ===
Epoch 1/100
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 0.0252 - mape: 2700.5193 - mse: 0.0252 - r2_score: -0.0379 - val_loss: 0.0115 - val_mape: 21.4544 - val_mse: 0.0115 - val_r2_score: 0.5231
Epoch 2/100
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.0105 - mape: 8364.8184 - mse: 0.0105 - r2_score: 0.5666 - val_loss: 0.0084 - val_mape: 17.4833 - val_mse: 0.0084 - val_r2_score: 0.6507
Epoch 3/100
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.0081 - mape: 1298.0178 - mse: 0.0081 - r2_score: 0.6656 - val_loss: 0.0071 - val_mape: 15.7252 - val_mse: 0.0071 - val_r2_score: 0.7041
Epoch 4/100
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.0070 - mape: 3488.7251 - mse: 0.0070 - r2_score:

=== Compiling network with 5 hidden layer(s), Adam optimizer and LR=0.001 ===
=== Training network with 5 hidden layer(s), Adam optimizer and LR=0.001 ===
Epoch 1/100
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - loss: 0.0203 - mape: 3248.9729 - mse: 0.0203 - r2_score: 0.1564 - val_loss: 0.0124 - val_mape: 24.3282 - val_mse: 0.0124 - val_r2_score: 0.4849
Epoch 2/100
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.0115 - mape: 7186.2402 - mse: 0.0115 - r2_score: 0.5281 - val_loss: 0.0087 - val_mape: 20.3431 - val_mse: 0.0087 - val_r2_score: 0.6376
Epoch 3/100
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.0079 - mape: 2832.7100 - mse: 0.0079 - r2_score: 0.6758 - val_loss: 0.0056 - val_mape: 16.2362 - val_mse: 0.0056 - val_r2_score: 0.7675
Epoch 4/100
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.0052 - mape: 236.0970 - mse: 0.0052 - r2_score: 0

=== Compiling network with 25 hidden layer(s), Adam optimizer and LR=0.001 ===
=== Training network with 25 hidden layer(s), Adam optimizer and LR=0.001 ===
Epoch 1/100
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 10ms/step - loss: 0.0241 - mape: 4128.6562 - mse: 0.0241 - r2_score: -2.2896e-04 - val_loss: 0.0241 - val_mape: 31.7550 - val_mse: 0.0241 - val_r2_score: 1.7881e-06
Epoch 2/100
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.0243 - mape: 16216.3516 - mse: 0.0243 - r2_score: -4.6796e-05 - val_loss: 0.0241 - val_mape: 31.8037 - val_mse: 0.0241 - val_r2_score: -1.7524e-05
Epoch 3/100
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.0239 - mape: 5178.5630 - mse: 0.0239 - r2_score: -6.9951e-05 - val_loss: 0.0241 - val_mape: 31.7271 - val_mse: 0.0241 - val_r2_score: -9.6560e-06
Epoch 4/100
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.0242 - mape: 78

### We can thus conclude that:
- DNNs with 1 and 5 hidden layers performed better than the linear-regression baseline, confirming the non-linearity of the dataset (as is obvious from the sine term in the Friedman formula).
- In the experiments performed, the best number of hidden layers is 5.
- Networks with 25 layers perform the worst, never reaching an MSE lower than about 0.024. The R2 scores for this configuration stay around zero (mostly slightly negative), i.e. the fit is no better than a horizontal line at the mean of the target.
- Picking a higher learning rate sometimes results in an inability to find the minimum, so training remains stuck at a high loss (about 0.2 MSE), as with 5 hidden layers, the Adam optimizer and LR=0.1.
- The absolute best choice of hyperparameters is 5 hidden layers, the Adam optimizer and LR=0.001. With it, the loss is 0.001 and the R2 score is 0.95+.
- A number of layers higher than 5 causes the training process to become unstable: it does not find the minimum, and the learning process is erratic.
- The data distribution influenced the validity of some scores (unnormalized data yielded NaN values) and the loss descent rates. Normalizing both the input and the output data per dimension yielded the best results.