In [24]:
# Simple implementation of a custom layer
import tensorflow as tf
import numpy as np
from keras import Model, layers, Input
from keras.optimizers import Adam
import math
from typing import Optional

import torch
from torch import nn
from typing import List

In [25]:
from keras.datasets import imdb
from keras_preprocessing.sequence import pad_sequences


# Load the IMDB review dataset, already split into train and test parts.
(x_train, y_train), (x_test, y_test) = imdb.load_data()

# Give every review the same length (200 tokens) so both splits can be
# handled as rectangular arrays.
x_train, x_test = (
    pad_sequences(reviews, maxlen=200) for reviews in (x_train, x_test)
)


In [26]:
class PrepareForMultiHeadAttention(nn.Module):
    """
    <a id="PrepareMHA"></a>

    ## Prepare for multi-head attention

    Projects the incoming vectors with one linear layer and then reshapes the
    projected feature dimension into `heads` separate vectors of size `d_k`.
    The same module is used for the **key**, **query** and **value** inputs.
    """

    def __init__(self, d_model: int, heads: int, d_k: int, bias: bool):
        """
        * `d_model` is the number of input features
        * `heads` is the number of heads to split the projection into
        * `d_k` is the number of features of each head
        * `bias` controls whether the projection has a bias term
        """
        super().__init__()
        # Size of every per-head vector
        self.d_k = d_k
        # How many heads the projection is split into
        self.heads = heads
        # A single projection that produces the features of all heads at once
        self.linear = nn.Linear(d_model, heads * d_k, bias=bias)

    def forward(self, x: torch.Tensor):
        # `x` is `[seq_len, batch_size, d_model]` or `[batch_size, d_model]`.
        # Every dimension except the last one is carried over unchanged.
        leading_dims = x.shape[:-1]

        # `[..., d_model]` -> `[..., heads * d_k]`
        projected = self.linear(x)

        # `[..., heads * d_k]` -> `[..., heads, d_k]`, i.e. the result is
        # `[seq_len, batch_size, heads, d_k]` or `[batch_size, heads, d_k]`
        return projected.view(*leading_dims, self.heads, self.d_k)


class MultiHeadAttention(nn.Module):
    r"""
    <a id="MHA"></a>

    ## Multi-Head Attention Module

    This computes scaled multi-headed attention for given `query`, `key` and `value` vectors.

    $$\mathop{Attention}(Q, K, V) = \underset{seq}{\mathop{softmax}}\Bigg(\frac{Q K^\top}{\sqrt{d_k}}\Bigg)V$$

    In simple terms, it finds keys that match the query, and gets the values of
    those keys.

    It uses the dot-product of query and key as the indicator of how well they match.
    Before taking the $softmax$ the dot-products are scaled by $\frac{1}{\sqrt{d_k}}$.
    This is done to avoid large dot-product values causing softmax to
    give very small gradients when $d_k$ is large.

    Softmax is calculated along the key axis of the sequence (or time).
    """

    def __init__(self, heads: int, d_model: int, dropout_prob: float = 0.1, bias: bool = True):
        """
        * `heads` is the number of heads.
        * `d_model` is the number of features in the `query`, `key` and `value` vectors.
        * `dropout_prob` is the dropout probability applied to the attention weights.
        * `bias` controls the bias of the `query` and `key` projections
          (the `value` projection always has a bias).

        Raises `ValueError` when `d_model` is not a multiple of `heads`.
        """

        super().__init__()

        # The heads are concatenated back into a `d_model`-sized vector before the
        # output layer, so `heads * d_k` has to equal `d_model` exactly. Without this
        # check the module would be constructed fine and only fail inside `forward`.
        if d_model % heads != 0:
            raise ValueError(
                f"d_model ({d_model}) must be divisible by the number of heads ({heads})"
            )

        # Number of features per head
        self.d_k = d_model // heads
        # Number of heads
        self.heads = heads

        # These transform the `query`, `key` and `value` vectors for multi-headed attention.
        self.query = PrepareForMultiHeadAttention(d_model, heads, self.d_k, bias=bias)
        self.key = PrepareForMultiHeadAttention(d_model, heads, self.d_k, bias=bias)
        self.value = PrepareForMultiHeadAttention(d_model, heads, self.d_k, bias=True)

        # Softmax for attention along the time dimension of `key`
        # (dimension 1 of the `[seq_len_q, seq_len_k, batch_size, heads]` scores)
        self.softmax = nn.Softmax(dim=1)

        # Output layer
        self.output = nn.Linear(d_model, d_model)
        # Dropout
        self.dropout = nn.Dropout(dropout_prob)
        # Scaling factor before the softmax
        self.scale = 1 / math.sqrt(self.d_k)

        # We store attentions so that it can be used for logging, or other computations if needed
        self.attn = None

    def get_scores(self, query: torch.Tensor, key: torch.Tensor):
        """
        ### Calculate scores between queries and keys

        `query` and `key` have shape `[seq_len, batch_size, heads, d_k]`;
        the result has shape `[seq_len_q, seq_len_k, batch_size, heads]`.

        This method can be overridden for other variations like relative attention.
        """

        # Calculate $Q K^\top$ or $S_{ijbh} = \sum_d Q_{ibhd} K_{jbhd}$
        return torch.einsum('ibhd,jbhd->ijbh', query, key)

    def prepare_mask(self, mask: torch.Tensor, query_shape: List[int], key_shape: List[int]):
        """
        `mask` has shape `[seq_len_q, seq_len_k, batch_size]`, where first dimension is the query dimension.
        If the query dimension is equal to $1$ it will be broadcasted.
        The batch dimension may likewise be $1$.

        Returns the mask with a trailing singleton head dimension added.
        """

        assert mask.shape[0] == 1 or mask.shape[0] == query_shape[0]
        assert mask.shape[1] == key_shape[0]
        assert mask.shape[2] == 1 or mask.shape[2] == query_shape[1]

        # Same mask applied to all heads.
        mask = mask.unsqueeze(-1)

        # resulting mask has shape `[seq_len_q, seq_len_k, batch_size, 1]`
        # and broadcasts over the head dimension
        return mask

    def forward(self, *,
                query: torch.Tensor,
                key: torch.Tensor,
                value: torch.Tensor,
                mask: Optional[torch.Tensor] = None):
        """
        `query`, `key` and `value` are the tensors that store
        collection of *query*, *key* and *value* vectors.
        They have shape `[seq_len, batch_size, d_model]`.

        `mask` has shape `[seq_len, seq_len, batch_size]` and
        `mask[i, j, b]` indicates whether for batch `b`,
        query at position `i` has access to key-value at position `j`.

        Returns a tensor of shape `[seq_len, batch_size, d_model]` and stores the
        (detached, post-dropout) attention weights in `self.attn`.
        """

        # `query`, `key` and `value`  have shape `[seq_len, batch_size, d_model]`
        seq_len, batch_size, _ = query.shape

        if mask is not None:
            mask = self.prepare_mask(mask, query.shape, key.shape)

        # Prepare `query`, `key` and `value` for attention computation.
        # These will then have shape `[seq_len, batch_size, heads, d_k]`.
        query = self.query(query)
        key = self.key(key)
        value = self.value(value)

        # Compute attention scores $Q K^\top$.
        # This gives a tensor of shape `[seq_len, seq_len, batch_size, heads]`.
        scores = self.get_scores(query, key)

        # Scale scores $\frac{Q K^\top}{\sqrt{d_k}}$
        scores *= self.scale

        # Apply mask: positions where the mask is zero get no attention
        if mask is not None:
            scores = scores.masked_fill(mask == 0, float('-inf'))

        # $softmax$ attention along the key sequence dimension
        # $\underset{seq}{softmax}\Bigg(\frac{Q K^\top}{\sqrt{d_k}}\Bigg)$
        attn = self.softmax(scores)

        # Apply dropout
        attn = self.dropout(attn)

        # Multiply by values
        # $$\underset{seq}{softmax}\Bigg(\frac{Q K^\top}{\sqrt{d_k}}\Bigg)V$$
        x = torch.einsum("ijbh,jbhd->ibhd", attn, value)

        # Save attentions for any other calculations
        self.attn = attn.detach()

        # Concatenate multiple heads
        x = x.reshape(seq_len, batch_size, -1)

        # Output layer
        return self.output(x)

def get_slopes(n_heads: int):
    """
    ## Head-specific slopes $m$

    * `n_heads` is the number of heads in the attention layer $n$

    Returns a 1-D tensor with one slope per head.

    The first slope is

    $$\frac{1}{2^{\frac{8}{n}}} = 2^{-\frac{8}{n}}$$

    and the following slopes form a geometric series whose ratio is that same value.
    With $8$ heads, for example, the slopes are
    $$\frac{1}{2^1}, \frac{1}{2^2}, \dots, \frac{1}{2^8}$$

    When `n_heads` is not a power of two, the series is built for the largest power
    of two below it and the missing slopes are taken from the series for twice that
    many heads, skipping the values already used.
    """

    # Largest power of 2 that does not exceed `n_heads`
    base_heads = 2 ** math.floor(math.log2(n_heads))
    # First term and common ratio: $2^{-\frac{8}{n}}$
    ratio = 2.0 ** (-8.0 / base_heads)
    # $2^{-1\frac{8}{n}}, 2^{-2 \frac{8}{n}}, \dots, 2^{-n \frac{8}{n}}$
    slopes = torch.pow(ratio, torch.arange(1, 1 + base_heads))

    # Power-of-two head counts are fully covered by the base series.
    if base_heads >= n_heads:
        return slopes

    # Number of heads still lacking a slope
    missing = n_heads - base_heads
    # Ratio of the series for $2n$ heads: $2^{-\frac{8}{2n}}$
    fine_ratio = 2.0 ** (-4.0 / base_heads)
    # Only odd powers are used; the even powers coincide with the base series.
    odd_powers = torch.arange(1, 1 + 2 * missing, 2)
    extra_slopes = torch.pow(fine_ratio, odd_powers)

    return torch.cat([slopes, extra_slopes])


@torch.no_grad()
def get_alibi_biases(n_heads: int, mask: torch.Tensor):
    """
    ## Build the ALiBi attention bias matrix

    * `n_heads` is the number of heads in the attention layer
    * `mask` is the attention mask of shape `[seq_len_q, seq_len_k]`

    Returns a tensor of shape `[seq_len_q, seq_len_k, n_heads]` holding, for every
    query/key pair, the running mask count along the key axis scaled by each head's slope.
    """

    # One slope $m$ per head, placed on the same device as the mask
    head_slopes = get_slopes(n_heads).to(mask.device)

    # Distances are derived from the mask itself: the cumulative sum along the
    # key axis counts the visible keys up to each position.
    # For a causal mask a plain $[0, 1, \dots, N]$ range would work as well:
    # `torch.arange(mask.shape[1], dtype=torch.long, device=mask.device)[None, :]`
    key_distance = mask.cumsum(dim=-1)

    # Broadcast `[seq_len_q, seq_len_k, 1]` against `[1, 1, n_heads]`
    return key_distance.unsqueeze(2) * head_slopes.unsqueeze(0).unsqueeze(0)


class AlibiMultiHeadAttention(MultiHeadAttention):
    """
    ## Attention with Linear Biases (ALiBi)

    We override [Multi-Head Attention](../mha.html) and add a per-head linear
    bias to the attention scores before the softmax.
    """

    def __init__(self, heads: int, d_model: int, dropout_prob: float = 0.1, bias: bool = True):
        """
        * `heads` is the number of heads.
        * `d_model` is the number of features in the `query`, `key` and `value` vectors.
        * `dropout_prob` is the dropout probability applied to the attention weights.
        * `bias` is forwarded to the parent class (bias of the `query`/`key` projections).
        """
        super().__init__(heads, d_model, dropout_prob, bias)

        # To cache the ALiBi biases
        self.alibi_biases = None

    def forward(self, *,
                query: torch.Tensor,
                key: torch.Tensor,
                value: torch.Tensor,
                mask: Optional[torch.Tensor] = None):
        """
        `query`, `key` and `value` are the tensors that store
        collection of *query*, *key* and *value* vectors.
        They have shape `[seq_len, batch_size, d_model]`.

        `mask` has shape `[seq_len, seq_len, 1]` and
        `mask[i, j, 0]` indicates whether the query at position `i`
        has access to key-value at position `j`. It is required.

        Returns a tensor of shape `[seq_len, batch_size, d_model]` and, like the
        parent class, stores the detached attention weights in `self.attn`.
        """

        # ALiBi only works with causal masks.
        assert mask is not None
        assert mask.shape[0] == mask.shape[1] and mask.shape[2] == 1

        # `query`, `key` and `value` have shape `[seq_len, batch_size, d_model]`
        seq_len, batch_size, _ = query.shape

        # Add head dimension to mask and check its shape.
        mask = self.prepare_mask(mask, query.shape, key.shape)

        # Prepare `query`, `key` and `value` for attention computation.
        # These will then have shape `[seq_len, batch_size, heads, d_k]`.
        query = self.query(query)
        key = self.key(key)
        value = self.value(value)

        # Compute attention scores $Q K^\top$.
        # This gives a tensor of shape `[seq_len, seq_len, batch_size, heads]`.
        scores = self.get_scores(query, key)

        # Scale scores $\frac{Q K^\top}{\sqrt{d_k}}$
        scores *= self.scale

        # (Re)create the ALiBi biases when nothing is cached, when the cache is too
        # small for this sequence, or when it lives on a different device than the
        # current inputs (e.g. after the module was moved with `.to(...)`).
        cache_is_stale = (
            self.alibi_biases is None
            or self.alibi_biases.shape[1] < seq_len
            or self.alibi_biases.device != mask.device
        )
        if cache_is_stale:
            # `mask` has shape [seq_len, seq_len, 1, 1]
            self.alibi_biases = get_alibi_biases(scores.shape[-1], mask[:, :, 0, 0])

        # Add ALiBi biases to attention scores.
        # ALiBi biases has shape `[seq_len, seq_len, n_heads]`
        # and `scores` has shape `[seq_len, seq_len, batch_size, n_heads]`
        scores += self.alibi_biases[:seq_len, :seq_len, None, :]

        # Apply mask
        scores = scores.masked_fill(mask == 0, float('-inf'))

        # $softmax$ attention along the key sequence dimension
        # $\underset{seq}{softmax}\Bigg(\frac{Q K^\top}{\sqrt{d_k}}\Bigg)$
        attn = self.softmax(scores)

        # Apply dropout
        attn = self.dropout(attn)

        # Multiply by values
        # $$\underset{seq}{softmax}\Bigg(\frac{Q K^\top}{\sqrt{d_k}}\Bigg)V$$
        x = torch.einsum("ijbh,jbhd->ibhd", attn, value)

        # Save attentions for any other calculations, exactly as the parent does;
        # otherwise `self.attn` would stay `None` for this subclass.
        self.attn = attn.detach()

        # Concatenate multiple heads
        x = x.reshape(seq_len, batch_size, -1)

        # Output layer
        return self.output(x)




In [33]:

# Hyperparameters
embed_dim = 32  # Embedding size for each token
num_heads = 2  # Number of attention heads
# Find the maximum word index in x_train and x_test
max_word_index = max(np.max(x_train), np.max(x_test))
# The embedding table needs one row per index in [0, max_word_index]
vocab_size = max_word_index + 1  # +1 so the largest index is still a valid row
max_sequence_length = x_train.shape[1]  # The length of each (padded) sequence

# Define the model: token ids -> embeddings -> attention -> mean over time -> sigmoid
inputs = Input(shape=(max_sequence_length,))
embedding_layer = layers.Embedding(vocab_size, embed_dim)(inputs)
# NOTE(review): this line raises the TypeError recorded below the cell. The module is
# invoked with no arguments, while `AlibiMultiHeadAttention.forward` requires the
# keyword-only arguments `query`, `key` and `value` and asserts that `mask` is given.
# NOTE(review): `AlibiMultiHeadAttention` is a torch `nn.Module`, whereas every other
# layer in this graph is a Keras layer. Presumably a Keras-compatible attention layer
# (or a wrapper around the torch module) is needed here -- confirm the intended framework.
attention_layer = AlibiMultiHeadAttention(num_heads, embed_dim)()
pooling_layer = layers.GlobalAveragePooling1D()(attention_layer)
outputs = layers.Dense(1, activation='sigmoid')(pooling_layer)

model = Model(inputs=inputs, outputs=outputs)

# Compile the model for binary sentiment classification
model.compile(loss='binary_crossentropy',
              optimizer=Adam(),
              metrics=['accuracy'])

# Train the model, validating on the test split after each epoch
model.fit(x_train, y_train, batch_size=32, epochs=2, validation_data=(x_test, y_test))


TypeError: AlibiMultiHeadAttention.forward() missing 3 required keyword-only arguments: 'query', 'key', and 'value'

In [None]:
# Evaluate the first model on the test split (loss plus the `accuracy` metric it was compiled with).
# NOTE(review): "loss of 0.0788" was noted here without context -- presumably the
# final training loss of an earlier run; confirm or remove.
model.evaluate(x_test, y_test)

# Recorded output:
# 782/782 [==============================] - 6s 7ms/step - loss: 0.3520 - accuracy: 0.8654

In [None]:
# model 2 with positional embedding

class PositionalEncoding(layers.Layer):
    """Adds fixed sinusoidal positional encodings to a sequence of embeddings.

    The encoding table has shape `[sequence_length, output_dim]`. Even feature
    indices hold `sin(pos * rate)` and odd feature indices hold `cos(pos * rate)`,
    where `rate = 1 / 10000 ** (2 * (i // 2) / output_dim)`. All values therefore
    stay within `[-1, 1]` instead of growing with the position.

    Args:
        sequence_length: number of positions to pre-compute encodings for.
        output_dim: number of features of each embedding vector.
        **kwargs: forwarded to the base layer constructor (e.g. `name`).
    """

    def __init__(self, sequence_length, output_dim, **kwargs):
        super(PositionalEncoding, self).__init__(**kwargs)
        angles = self.get_angles(np.arange(sequence_length)[:, np.newaxis],
                                 np.arange(output_dim)[np.newaxis, :],
                                 output_dim)
        # Turn the raw angles into the actual encoding: sine on the even feature
        # indices, cosine on the odd ones. Adding the raw angles instead would add
        # values as large as the sequence length to the embeddings.
        angles[:, 0::2] = np.sin(angles[:, 0::2])
        angles[:, 1::2] = np.cos(angles[:, 1::2])
        self.positional_encoding = tf.cast(angles, dtype=tf.float32)

    def get_angles(self, pos, i, d_model):
        """Return `pos * rate` for every position/feature pair.

        `pos` is a column of positions and `i` a row of feature indices, so the
        product broadcasts to a `[len(pos), len(i)]` array. Consecutive feature
        pairs `(2k, 2k + 1)` share the same rate.
        """
        angle_rates = 1 / np.power(10000, (2 * (i // 2)) / np.float32(d_model))
        return pos * angle_rates

    def call(self, inputs):
        """Add the encoding, cropped to the last two dimensions of `inputs`."""
        shape = tf.shape(inputs)
        return inputs + self.positional_encoding[:shape[-2], :shape[-1]]


# Hyperparameters (same values as for the first model)
embed_dim = 32  # Embedding size for each token
num_heads = 2  # Number of attention heads
# Find the maximum word index in x_train and x_test
max_word_index = max(np.max(x_train), np.max(x_test))
# The embedding table needs one row per index in [0, max_word_index]
vocab_size = max_word_index + 1  # +1 so the largest index is still a valid row
max_sequence_length = x_train.shape[1]  # The length of each (padded) sequence

# Define the model: like the first one, plus a positional encoding after the embedding
inputs = layers.Input(shape=(max_sequence_length,))
embedding_layer = layers.Embedding(vocab_size, embed_dim)(inputs)
position_embedding_layer = PositionalEncoding(max_sequence_length, embed_dim)(embedding_layer)
# NOTE(review): `AlibiMultiHeadAttention.forward` only accepts the keyword arguments
# `query`, `key`, `value` and `mask` (and asserts that `mask` is given), so this
# positional call cannot succeed as written.
# NOTE(review): `AlibiMultiHeadAttention` is a torch `nn.Module`, whereas every other
# layer in this graph is a Keras layer. Presumably a Keras-compatible attention layer
# (or a wrapper around the torch module) is needed here -- confirm the intended framework.
attention_layer = AlibiMultiHeadAttention(num_heads, embed_dim)(position_embedding_layer)
pooling_layer = layers.GlobalAveragePooling1D()(attention_layer)
outputs = layers.Dense(1, activation='sigmoid')(pooling_layer)

model2 = Model(inputs=inputs, outputs=outputs)

# Compile model2 for binary sentiment classification
model2.compile(loss='binary_crossentropy',
              optimizer=Adam(),
              metrics=['accuracy'])

# Train model2, validating on the test split after each epoch
model2.fit(x_train, y_train, batch_size=32, epochs=5, validation_data=(x_test, y_test))


In [None]:
# Evaluate model 2 (the variant with positional encoding) on the test split.
model2.evaluate(x_test, y_test)

# Recorded output after training for 2 epochs:
# 782/782 [==============================] - 6s 8ms/step - loss: 0.7052 - accuracy: 0.5057

# Recorded output after training for 5 epochs:
# 782/782 [==============================] - 6s 8ms/step - loss: 0.6328 - accuracy: 0.7213

In [None]:
# Render the layer graph of the first model (`model`, not `model2`), including
# tensor shapes and layer names, to model_plot.png.
tf.keras.utils.plot_model(model, to_file='model_plot.png', show_shapes=True, show_layer_names=True)

In [None]:
# Single prediction: the first model scores the whole test set, then only the
# predicted probability for the first test example is printed.
yhat_probs = model.predict(x_test, verbose=0)
print(yhat_probs[0])

In [None]:
# understand the dataset

# `imdb.load_data()` was called with its defaults, which reserve the ids 0 (padding),
# 1 (start of sequence) and 2 (out of vocabulary) and shift every real word id by
# `index_from=3`. The ids in `imdb.get_word_index()` are NOT shifted, so the offset
# has to be removed before looking a token up.
INDEX_FROM = 3

index = imdb.get_word_index()
# Unshifted word id -> word
reverse_index = {word_id: word for word, word_id in index.items()}


def decode_review(text, index_from=INDEX_FROM):
    """Turn a sequence of token ids back into a space-separated string.

    Args:
        text: iterable of integer token ids as produced by `imdb.load_data`.
        index_from: offset that `load_data` added to every word id
            (3 for the default settings).

    Returns:
        The decoded review. Ids without a word (padding, start and
        out-of-vocabulary markers) are rendered as '?'.
    """
    return ' '.join(reverse_index.get(i - index_from, '?') for i in text)


# Show the first five training reviews together with their labels.
for i in range(5):
    review = decode_review(x_train[i])
    print(review)
    print("Sentiment:")
    print(y_train[i])