In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Dropout, LayerNormalization, MultiHeadAttention, Embedding
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping
from sklearn.model_selection import train_test_split
import numpy as np
import time
import sklearn.metrics
from sklearn.metrics import mean_squared_error

# Training configuration shared by every ensemble member
num_epochs = 100  # epoch budget per model; also reused as the EarlyStopping patience
random_seeds = [42, 0, 17, 9, 3, 16, 2]  # one model is trained per seed (deep ensembling)
lr = 1e-3  # initial learning rate handed to Adam
WD = 1e-3  # weight-decay coefficient handed to the Adam optimizer
DO = 1e-1  # default dropout rate of the model head (dropout is only active while training)

# Transformer model
def create_transformer_model(input_shape, num_heads=4, ff_dim=64, num_transformer_blocks=2, do=DO):
    """Build an uncompiled Keras model: self-attention blocks plus a dense head.

    Each block applies multi-head self-attention and a two-layer feed-forward
    network, both wrapped in a residual connection followed by layer
    normalization. The head is Dense(100, relu) -> Dropout -> Dense(1).

    Args:
        input_shape: Per-sample input shape (batch axis excluded) passed to
            `Input`. Its last entry is used as the attention `key_dim` and as
            the output width of every feed-forward network.
        num_heads: Number of attention heads in each block.
        ff_dim: Hidden width of each block's feed-forward network.
        num_transformer_blocks: How many attention/feed-forward blocks to stack.
        do: Dropout rate applied just before the output layer.

    Returns:
        A `Model` mapping `inputs` to the single-unit output. The head's Dense
        layers are applied to the tensor as-is (no pooling or flattening), so
        the output keeps the leading axes of the input.

    Note:
        The feed-forward activation `sinOcos` is not defined in this function;
        it must be available at module level when the function is called.
    """
    feature_dim = input_shape[-1]
    inputs = Input(shape=input_shape)

    hidden = inputs
    for _ in range(num_transformer_blocks):
        # Self-attention sub-layer: query and value are both the current tensor.
        self_attention = MultiHeadAttention(num_heads=num_heads, key_dim=feature_dim)
        attended = self_attention(hidden, hidden)
        hidden = LayerNormalization(epsilon=1e-6)(hidden + attended)

        # Feed-forward sub-layer: expand to ff_dim, project back to feature_dim
        # so the residual addition below is shape-compatible.
        feed_forward = tf.keras.Sequential([
            Dense(ff_dim, activation=sinOcos),
            Dense(feature_dim),
        ])
        projected = feed_forward(hidden)
        hidden = LayerNormalization(epsilon=1e-6)(hidden + projected)

    # Regression head
    head = Dense(100, activation='relu')(hidden)
    head = Dropout(do)(head)
    outputs = Dense(1)(head)

    return Model(inputs=inputs, outputs=outputs)

# Learning rate scheduler
# Staircase exponential decay: the rate is multiplied by 0.96 after every
# 100,000 optimizer steps.
# NOTE(review): `lr_schedule` is not referenced by the training code visible in
# this cell (Adam is given the constant `lr`) -- confirm whether it is used elsewhere.
initial_learning_rate = 0.001
lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate=initial_learning_rate,
    decay_steps=100_000,
    decay_rate=0.96,
    staircase=True,
)

# Callbacks
# Scale the learning rate by 0.96 whenever val_loss has not improved for
# 10 epochs, never going below 1e-7.
reduce_lr = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.96,
    patience=10,
    min_lr=1e-7,
    verbose=1,
)
# The patience equals the full epoch budget, so this callback is presumably
# here for `restore_best_weights` rather than to cut training short.
early_stop = EarlyStopping(
    monitor="val_loss",
    patience=num_epochs,
    restore_best_weights=True,
    verbose=1,
)

# Split the data
# Hold out 10% of the training data for validation. The split is seeded with
# the first ensemble seed so the validation set (and therefore val_loss, the
# LR reductions and the restored weights) is the same on every run.
X_train_s, X_val, y_train_s, y_val = train_test_split(
    X_train, y_train, test_size=0.1, random_state=random_seeds[0]
)

# Reshape input data for Transformer
# Transformer expects 3D input: (batch_size, sequence_length, features).
# Every sample becomes a sequence of length 1. np.asarray is a no-op for
# ndarrays and lets array-likes without a .reshape method through as well.
X_train_reshaped, X_val_reshaped, X_test_reshaped = (
    np.asarray(features).reshape(features.shape[0], 1, features.shape[1])
    for features in (X_train_s, X_val, X_test)
)

# Train multiple models with different random seeds
# One model per entry of `random_seeds`; the trained models are kept in
# `models` for ensembling and the wall-clock training time is start..end_train.
models = []
start = time.time()
for seed in random_seeds:
    tf.random.set_seed(seed)
    model = create_transformer_model(input_shape=(1, X_train_s.shape[1]))
    # Use `weight_decay` (available in tf.keras Adam since TF 2.11): the legacy
    # `decay` argument was learning-rate decay, not weight decay, and the newer
    # Keras optimizers reject or ignore it.
    model.compile(loss="mse", optimizer=Adam(learning_rate=lr, weight_decay=WD))

    # Give the targets the same per-sample shape as the model output. With a
    # rank mismatch (e.g. 1-D targets vs. a 3-D output) the loss can silently
    # broadcast and compare every target with every prediction in the batch.
    target_shape = (-1,) + tuple(model.output_shape[1:])
    history = model.fit(
        x=X_train_reshaped,
        y=np.asarray(y_train_s).reshape(target_shape),
        validation_data=(X_val_reshaped, np.asarray(y_val).reshape(target_shape)),
        epochs=num_epochs,
        batch_size=500,
        callbacks=[reduce_lr, early_stop]
    )
    models.append(model)
end_train = time.time()

# Aggregate predictions from the models
# Prediction matrix: one row per test sample, one column per ensemble member.
n_test_rows = X_test_reshaped.shape[0]
y_predictions = np.zeros((n_test_rows, len(models)))
for member_idx, model in enumerate(models):
    member_output = model.predict(X_test_reshaped)
    y_predictions[:, member_idx] = member_output.reshape(-1)

# Compute the ensemble prediction
# Unweighted average across members; the prediction timer stops afterwards.
ensemble_prediction = y_predictions.mean(axis=1)
end_predict = time.time()

# Model performance
# Score the ensemble once and reuse the values for the results table and the
# printout. RMSE is computed as sqrt(MSE) because the `squared=False` argument
# of mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6.
ensemble_r2 = sklearn.metrics.r2_score(y_test, ensemble_prediction)
ensemble_rmse = np.sqrt(mean_squared_error(y_test, ensemble_prediction))

# Row layout: R^2, RMSE, training time, prediction time, total time (seconds).
model_performance.loc['Transformer+MSE'] = [
    ensemble_r2,
    ensemble_rmse,
    end_train - start,
    end_predict - end_train,
    end_predict - start
]

print('R-squared error: ' + "{:.2%}".format(ensemble_r2))
print('Root Mean Squared Error: ' + "{:.2f}".format(ensemble_rmse))