## Attention model

Reference: https://machinelearningmastery.com/encoder-decoder-attention-sequence-to-sequence-prediction-keras/

The encoder-decoder model for recurrent neural networks is an architecture for sequence-to-sequence prediction problems.

Encoder: The encoder is responsible for stepping through the input time steps and encoding the entire sequence into a fixed length vector called a context vector.

Decoder: The decoder is responsible for stepping through the output time steps while reading from the context vector.

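A problem with this architecture is that performance is poor on long input or output sequences, because the encoder must compress the entire input sequence into a single fixed-length context vector.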

Attention is an extension to the architecture that addresses this limitation. It works in two parts: first, the encoder provides a richer context to the decoder; second, the decoder learns where in that richer encoding to pay attention when predicting each time step of the output sequence.

In [1]:
!pip install keras-self-attention

Collecting keras-self-attention
  Downloading keras-self-attention-0.51.0.tar.gz (11 kB)
  Preparing metadata (setup.py) ... done
Building wheels for collected packages: keras-self-attention
  Building wheel for keras-self-attention (setup.py) ... done
  Created wheel for keras-self-attention: filename=keras_self_attention-0.51.0-py3-none-any.whl size=18894 sha256=6d542357a0a6deec1d0959198606f1cfe7805083be26911d41dc06b2a1a5f1b2
  Stored in directory: /root/.cache/pip/wheels/b8/f7/24/607b483144fb9c47b4ba2c5fba6b68e54aeee2d5bf6c05302e
Successfully built keras-self-attention
Installing collected packages: keras-self-attention
Successfully installed keras-self-attention-0.51.0


In [25]:
from random import randint

In [3]:
!pip install keras-attention

Collecting keras-attention
  Downloading keras_attention-1.0.0-py3-none-any.whl.metadata (642 bytes)
Downloading keras_attention-1.0.0-py3-none-any.whl (7.0 kB)
Installing collected packages: keras-attention
Successfully installed keras-attention-1.0.0


In [8]:
from numpy import array
from numpy import argmax
from numpy import array_equal
from keras.models import Sequential
from keras.layers import LSTM
from tensorflow.keras.layers import Attention

In [28]:
import numpy as np

In [9]:
def generate_sequence(length, n_unique):
    """Return a list of `length` random integers drawn from [0, n_unique - 1].

    Each element is produced by one call to `randint(0, n_unique - 1)`,
    so both bounds are inclusive.
    """
    drawn = []
    for _ in range(length):
        drawn.append(randint(0, n_unique - 1))
    return drawn

In [10]:
def one_hot_encode(sequence, n_unique):
    """One-hot encode a sequence of integer class indices.

    Args:
        sequence: iterable of integers; each is used as a list index into
            a row of `n_unique` zeros.
        n_unique: width of every one-hot row.

    Returns:
        A NumPy array built from one row per element of `sequence`, each
        row holding a single 1 at the element's index.
    """
    def _row(index):
        # A row of zeros with a single 1 at the requested position.
        row = [0] * n_unique
        row[index] = 1
        return row

    return array([_row(value) for value in sequence])


In [11]:
def one_hot_decode(encoded_seq):
    """Return, for each vector in `encoded_seq`, the index of its largest entry.

    This inverts `one_hot_encode` for one-hot rows and also works on
    probability rows, since only the `argmax` of each row is taken.
    """
    return list(map(argmax, encoded_seq))


In [12]:
def get_pair(n_in, n_out, cardinality):
    """Build one random (X, y) sample for the sequence-echo task.

    The input is a random sequence of `n_in` integers. The target keeps
    the first `n_out` of them and pads the remainder with zeros so that
    it has the same length as the input.

    Args:
        n_in: number of time steps in the input sequence.
        n_out: number of leading input steps copied into the target.
        cardinality: number of distinct integer values / one-hot width.

    Returns:
        Tuple `(X, y)` of one-hot arrays, each reshaped to
        `(1, time steps, cardinality)`, i.e. with a leading batch axis of 1.
    """
    source = generate_sequence(n_in, cardinality)
    zero_padding = [0] * (n_in - n_out)
    target = source[:n_out] + zero_padding

    X = one_hot_encode(source, cardinality)
    y = one_hot_encode(target, cardinality)

    # Prepend a batch axis of size 1 so samples can later be stacked.
    X = X.reshape((1, X.shape[0], X.shape[1]))
    y = y.reshape((1, y.shape[0], y.shape[1]))
    return X, y

In [13]:
# Problem dimensions used by the model-building cell below.
n_timesteps_in = 5   # time steps per input sequence (first axis of the Input shape)
n_timesteps_out = 2  # output steps to keep; presumably meant as get_pair's n_out argument
n_features = 20      # width of each one-hot vector (last axis of the Input shape)

In [22]:
from keras.models import Model
from keras.layers import Input, LSTM, Dense, Attention

# Build an LSTM model with self-attention over the LSTM outputs.

# Input: each sample is n_timesteps_in steps of n_features-wide vectors.
inputs = Input(shape=(n_timesteps_in, n_features))

# return_sequences=True keeps the LSTM output at every time step, which the
# attention layer needs (it attends over the whole sequence).
lstm_out = LSTM(150, return_sequences=True)(inputs)

# Self-attention: the LSTM outputs are passed as both query and value.
attention = Attention()([lstm_out, lstm_out])

# Per-time-step softmax over the output classes. The width is taken from
# n_features instead of a hard-coded 20 so the output layer always matches
# the one-hot width declared with the input shape.
dense_out = Dense(n_features, activation='softmax')(attention)

# Define the model
model = Model(inputs=inputs, outputs=dense_out)

# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Summary
model.summary()

In [31]:
# Build 1000 training samples whose shapes match the model defined above.
# The output length and one-hot width come from the shared constants
# (n_timesteps_out, n_features) rather than the literals 3 and 10: with
# n_features = 20 the model expects 20-wide one-hot vectors, so 10-wide
# data could not be fed to it.
X_train, y_train = [], []
X_test, y_test = [], []  # only initialised here; not filled in this cell
for _ in range(1000):
    X, y = get_pair(n_timesteps_in, n_timesteps_out, n_features)
    X_train.append(X)
    y_train.append(y)

# Each sample has a leading batch axis of 1, so stacking along axis 0
# yields arrays of shape (1000, n_timesteps_in, n_features).
X_train = np.vstack(X_train)
y_train = np.vstack(y_train)


In [33]:
import numpy as np
from random import randint
from numpy import array, argmax
from keras.models import Model
from keras.layers import Input, LSTM, Dense, Attention
from keras.utils import to_categorical

def generate_sequence(length, n_unique):
    """Draw `length` random integers, each in the inclusive range [0, n_unique - 1]."""
    def draw():
        # One uniform integer draw; randint includes both end points.
        return randint(0, n_unique - 1)

    return [draw() for _ in range(length)]

def one_hot_encode(sequence, n_unique):
    """Convert a sequence of integer indices into an array of one-hot rows.

    Args:
        sequence: iterable of integers, each used as an index into a row
            of `n_unique` zeros.
        n_unique: length of every one-hot row.

    Returns:
        NumPy array with one row per element of `sequence`.
    """
    rows = []
    for position in sequence:
        one_hot = [0] * n_unique
        one_hot[position] = 1
        rows.append(one_hot)
    return array(rows)

def one_hot_decode(encoded_seq):
    """Map each vector in `encoded_seq` to the index of its maximum entry."""
    indices = []
    for row in encoded_seq:
        indices.append(argmax(row))
    return indices

def get_pair(n_in, n_out, cardinality):
    """Create one random input/target pair for the LSTM.

    The target repeats the first `n_out` values of the random input and is
    zero-padded up to the input length `n_in`.

    Args:
        n_in: length of the random input sequence.
        n_out: how many leading input values the target echoes.
        cardinality: number of distinct values, also the one-hot width.

    Returns:
        `(X, y)`: one-hot encoded input and target, each reshaped to
        `(1, time steps, cardinality)`.
    """
    def _with_batch_axis(matrix):
        # (steps, width) -> (1, steps, width): a single-sample batch.
        return matrix.reshape((1, matrix.shape[0], matrix.shape[1]))

    values_in = generate_sequence(n_in, cardinality)
    values_out = values_in[:n_out] + [0] * (n_in - n_out)

    encoded_in = one_hot_encode(values_in, cardinality)
    encoded_out = one_hot_encode(values_out, cardinality)

    return _with_batch_axis(encoded_in), _with_batch_axis(encoded_out)

# --- Problem dimensions ------------------------------------------------------
n_timesteps_in = 5  # time steps in every input sequence
n_features = 10     # width of the vectors the model reads and emits
n_out = 3           # leading input steps echoed in the target (rest is zero-padded)
cardinality = 10    # distinct integer values, i.e. the one-hot width of the data

# --- Model: LSTM -> self-attention -> per-step softmax -----------------------
inputs = Input(shape=(n_timesteps_in, n_features))
# Keep the LSTM output at every time step so attention can look across them.
lstm_out = LSTM(150, return_sequences=True)(inputs)
# The same tensor is used as query and value (self-attention).
attention = Attention()([lstm_out, lstm_out])
dense_out = Dense(n_features, activation='softmax')(attention)

model = Model(inputs=inputs, outputs=dense_out)
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
model.summary()

# --- Training data: 1000 random pairs stacked along the batch axis -----------
train_pairs = [get_pair(n_timesteps_in, n_out, cardinality) for _ in range(1000)]
X_train, y_train = (np.vstack(part) for part in zip(*train_pairs))

# Hold out the last 20% of the training samples for validation.
history = model.fit(
    X_train,
    y_train,
    epochs=10,
    batch_size=32,
    validation_split=0.2,
)

# --- Test data: 100 fresh random pairs ---------------------------------------
test_pairs = [get_pair(n_timesteps_in, n_out, cardinality) for _ in range(100)]
X_test, y_test = (np.vstack(part) for part in zip(*test_pairs))

loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Loss: {loss}, Test Accuracy: {accuracy}")

# --- Single-sample prediction, decoded back to integer sequences -------------
X_sample, y_sample = get_pair(n_timesteps_in, n_out, cardinality)
y_pred = model.predict(X_sample)

# Index 0 selects the only sample in each single-sample batch.
decoded_input = one_hot_decode(X_sample[0])
decoded_output = one_hot_decode(y_sample[0])
decoded_pred = one_hot_decode(y_pred[0])

print(f"Input Sequence: {decoded_input}")
print(f"True Output Sequence: {decoded_output}")
print(f"Predicted Output Sequence: {decoded_pred}")


Epoch 1/10
25/25 ━━━━━━━━━━━━━━━━━━━━ 3s 27ms/step - accuracy: 0.3737 - loss: 2.2174 - val_accuracy: 0.4540 - val_loss: 1.7169
Epoch 2/10
25/25 ━━━━━━━━━━━━━━━━━━━━ 1s 22ms/step - accuracy: 0.4612 - loss: 1.7200 - val_accuracy: 0.4540 - val_loss: 1.6579
Epoch 3/10
25/25 ━━━━━━━━━━━━━━━━━━━━ 1s 22ms/step - accuracy: 0.4626 - loss: 1.6209 - val_accuracy: 0.4540 - val_loss: 1.5822
Epoch 4/10
25/25 ━━━━━━━━━━━━━━━━━━━━ 1s 22ms/step - accuracy: 0.4612 - loss: 1.5329 - val_accuracy: 0.4540 - val_loss: 1.4707
Epoch 5/10
25/25 ━━━━━━━━━━━━━━━━━━━━ 1s 21ms/step - accuracy: 0.4606 - loss: 1.4163 - val_accuracy: 0.4690 - val_loss: 1.3478
Epoch 6/10
25/25 ━━━━━━━━━━━━━━━━━━━━ 0s 16ms/step - accuracy: 0.4840 - loss: 1.3052 - val_accuracy: 0.4990 - val_loss: 1.2572
Epoch 7/10
[1m25/25[0m [32m━━━━