In [None]:
import tensorflow as tf
from keras.models import *
from keras.layers import *
from keras.datasets import imdb
from keras.utils import pad_sequences

# Define the Transformer block

In [3]:
class TransformerBlock(Layer):
    """Transformer encoder block: self-attention followed by a feed-forward network.

    Each sub-layer's output goes through dropout, is added back to the
    sub-layer's input (residual connection) and is then layer-normalized.

    Args:
        embed_dim: Size of the last axis of the input. The feed-forward
            network projects back to this size so the residual additions
            line up. It is also used as ``key_dim`` for every attention head.
        num_heads: Number of attention heads.
        ff_dim: Hidden width of the feed-forward network.
        rate: Dropout rate applied after the attention and feed-forward
            sub-layers.
        **kwargs: Forwarded to the base ``Layer`` constructor (e.g. ``name``).
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1, **kwargs):
        super().__init__(**kwargs)
        # Keep the constructor arguments so get_config() can report them.
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.ff_dim = ff_dim
        self.rate = rate

        # Multi-head self-attention; note key_dim is embed_dim per head,
        # not embed_dim // num_heads.
        self.att = MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        # Position-wise feed-forward network: expand to ff_dim with ReLU,
        # then project back to embed_dim.
        self.ffn = Sequential(
            [Dense(ff_dim, activation='relu'),
             Dense(embed_dim)]
        )
        # One LayerNormalization per sub-layer (attention, feed-forward).
        self.layernorm1 = LayerNormalization(epsilon=1e-6)
        self.layernorm2 = LayerNormalization(epsilon=1e-6)
        # One Dropout per sub-layer; only active when training is truthy.
        self.dropout1 = Dropout(rate)
        self.dropout2 = Dropout(rate)

    def call(self, inputs, training=None):
        """Run the block on ``inputs``.

        Args:
            inputs: Tensor whose last axis has size ``embed_dim``.
            training: Forwarded to the dropout layers. Defaults to ``None``
                so the layer can be called without the argument and Keras
                can resolve the mode itself.

        Returns:
            Tensor with the same shape as ``inputs``.
        """
        # Self-attention: the sequence is both query and value.
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        # Residual connection around attention, then normalize.
        out1 = self.layernorm1(inputs + attn_output)

        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        # Residual connection around the feed-forward network, then normalize.
        return self.layernorm2(out1 + ffn_output)

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created."""
        config = super().get_config()
        config.update(
            {
                "embed_dim": self.embed_dim,
                "num_heads": self.num_heads,
                "ff_dim": self.ff_dim,
                "rate": self.rate,
            }
        )
        return config


## Token and Position Embedding

In [5]:
class TokenAndPositionEmbedding(Layer):
    """Sum of a learned token embedding and a learned position embedding.

    Args:
        maxlen: Number of rows in the position-embedding table, i.e. the
            longest sequence length the layer can embed.
        vocab_size: Number of rows in the token-embedding table (token ids
            must be smaller than this).
        embed_dim: Size of each embedding vector.
        **kwargs: Forwarded to the base ``Layer`` constructor (e.g. ``name``).
    """

    def __init__(self, maxlen, vocab_size, embed_dim, **kwargs):
        super().__init__(**kwargs)
        # Keep the constructor arguments so get_config() can report them.
        self.maxlen = maxlen
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim

        # Maps each token id to a dense vector of size embed_dim.
        self.token_emb = Embedding(input_dim=vocab_size, output_dim=embed_dim)
        # Maps each position index (0 .. maxlen-1) to a dense vector of size embed_dim.
        self.pos_emb = Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        """Embed token ids and add the embedding of each token's position.

        Args:
            x: Integer tensor of token ids; the last axis is the sequence axis.

        Returns:
            Token embeddings plus position embeddings (one extra trailing
            axis of size ``embed_dim``).
        """
        # Length of the sequence axis of the current input; it must not
        # exceed maxlen, the size of the position table.
        seq_len = tf.shape(x)[-1]
        # Position indices 0 .. seq_len-1, looked up in the position table.
        positions = tf.range(start=0, limit=seq_len, delta=1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        # The position embeddings have no leading axis, so the addition
        # broadcasts them over the leading axes of x.
        return x + positions

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created."""
        config = super().get_config()
        config.update(
            {
                "maxlen": self.maxlen,
                "vocab_size": self.vocab_size,
                "embed_dim": self.embed_dim,
            }
        )
        return config
    

In [6]:
vocab_size = 2000 # Only consider the top 2,000 words (passed as num_words when loading IMDB)
maxlen = 200 # Every review is padded/truncated to 200 tokens. NOTE(review): pad_sequences truncates from the front by default, so long reviews keep their LAST 200 tokens - confirm this is intended

In [9]:
# Load the IMDB reviews as integer-encoded sequences, with the vocabulary
# capped via num_words=vocab_size. The second split is used as validation data.
(x_train, y_train), (x_val, y_val) = imdb.load_data(num_words=vocab_size)
# Report the split sizes (the double space in the second message is kept
# so the printed output is unchanged).
print(f"{len(x_train)} Training Sequences")
print(f"{len(x_val)}  Validation Sequences")


25000 Training Sequences
25000  Validation Sequences


In [11]:
# Bring every review in both splits to exactly maxlen tokens.
# NOTE(review): relies on the pad_sequences defaults for padding/truncating
# side (both 'pre' per the Keras docs) - confirm that is what is wanted.
x_train, x_val = (
    pad_sequences(split, maxlen=maxlen) for split in (x_train, x_val)
)

In [12]:
# Sanity check: after padding, both splits should be (num_reviews, maxlen).
x_train.shape, x_val.shape

((25000, 200), (25000, 200))

In [13]:
# Look at one padded review (row 4000) as integer token ids.
# NOTE(review): the frequently repeated id 2 is presumably the
# out-of-vocabulary marker used by imdb.load_data - confirm.
x_train[4000]

array([  89,    8,  511,    2,   59,    2,   41,  523,  147, 1876,    5,
          2,  175,  347,   11,  618,    4,  172,   96,    2,    2,    9,
        862,    2,    8,   41,    5,   27,  532,    2,    9,    2,    4,
          2,  136,    2,    2,    5,    2,   19, 1456,  921,   42,    2,
       1488,   68,    2,  216,   17,    6,    2,   48,   13,   69,    6,
          2,   13,   62,   28,    2,   12,    8,   98,  634,  908,   10,
         10,    2,    2,    9,    2,   17,    2,    6,   87, 1465,   48,
         25,  377,   27,  478,  157,   11,    2,    2,   29,    2,    4,
          2,    7,    2,    2,   83,    6,    2,    2,    7,  107,   42,
        289,  715,  257,    5,   95,    2,    4,    2,   11,   17,    2,
          5,    2, 1377,   17,  614,   11,   14,  365, 1652,    2,    2,
        373,   10,   10,    4,  167,    2,    2,  287,   64,   35,    2,
          2,    7, 1489,    4,  370,  121,   12,   80,  123,  178,   51,
         75,  181,    8,   67,    4,  636,    2,   

### Combine everything together

In [14]:
# Model hyperparameters: embedding width, attention heads, feed-forward hidden width.
embed_dim, num_heads, ff_dim = 32, 2, 32

# Encoder: token ids -> token + position embeddings -> one Transformer block.
inputs = Input(shape=(maxlen,))
embedding_layer = TokenAndPositionEmbedding(
    maxlen=maxlen, vocab_size=vocab_size, embed_dim=embed_dim
)
x = embedding_layer(inputs)
transformer_block = TransformerBlock(
    embed_dim=embed_dim, num_heads=num_heads, ff_dim=ff_dim
)
x = transformer_block(x)

# Classification head: average over the sequence axis, then a small dense
# layer with dropout on either side, ending in a 2-way softmax.
x = GlobalAveragePooling1D()(x)
x = Dropout(rate=0.1)(x)
x = Dense(units=20, activation="relu")(x)
x = Dropout(rate=0.1)(x)
outputs = Dense(units=2, activation="softmax")(x)

model = Model(inputs=inputs, outputs=outputs)




In [15]:
# Print the layer-by-layer architecture with output shapes and parameter counts.
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 200)]             0         
                                                                 
 token_and_position_embeddi  (None, 200, 32)           70400     
 ng (TokenAndPositionEmbedd                                      
 ing)                                                            
                                                                 
 transformer_block (Transfo  (None, 200, 32)           10656     
 rmerBlock)                                                      
                                                                 
 global_average_pooling1d (  (None, 32)                0         
 GlobalAveragePooling1D)                                         
                                                                 
 dropout_2 (Dropout)         (None, 32)                0     

## Compile the model


In [16]:
# The model ends in a 2-unit softmax, so the sparse categorical
# cross-entropy loss is used together with an accuracy metric.
model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)




## Train the model

In [17]:
# Train for 10 epochs in mini-batches of 32, evaluating on the validation
# split after every epoch; the returned History object is kept in `history`.
history = model.fit(
    x_train,
    y_train,
    validation_data=(x_val, y_val),
    epochs=10,
    batch_size=32,
)

Epoch 1/10


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
