<a href="https://colab.research.google.com/github/Gowri-Rk/EmergingTechnologiesAssignments/blob/master/Assignment_5/Text_Classification_using_Transformers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Text classification of IMDB data using Transformers

Reference: https://keras.io/examples/nlp/text_classification_with_transformer/ 

In [1]:
import tensorflow as tf
from tensorflow import keras

## Creating multi-head self-attention as a Keras layer

In [2]:
class MHSAttention(keras.layers.Layer):
  """Multi-head self-attention implemented as a Keras layer.

  The input is projected to queries, keys and values with three Dense
  layers of width `e_dim`, split into `headcount` heads of width
  `e_dim // headcount`, attended with scaled dot-product attention, then
  merged back and passed through a final Dense layer of width `e_dim`.

  Args:
    e_dim: Embedding dimension; width of every projection in this layer.
    headcount: Number of attention heads. Must divide `e_dim` evenly.
    **kwargs: Standard `keras.layers.Layer` options (e.g. `name`, `dtype`).

  Raises:
    ValueError: If `e_dim` is not divisible by `headcount`.
  """

  def __init__(self, e_dim, headcount=8, **kwargs):
    # Forward the base-layer options so the layer can be named and can be
    # rebuilt from the config returned by `get_config`.
    super(MHSAttention, self).__init__(**kwargs)

    # embedded dimension
    self.e_dim = e_dim

    # number of heads
    self.headcount = headcount

    if e_dim % headcount != 0:
      raise ValueError(f"embedding dimension = {e_dim} should be divisible by number of heads = {headcount}")

    # projection dimension: width of each individual head
    self.p_dim = e_dim // headcount
    self.query_layer = keras.layers.Dense(e_dim)
    self.key_layer = keras.layers.Dense(e_dim)
    self.value_layer = keras.layers.Dense(e_dim)
    self.combined = keras.layers.Dense(e_dim)

  def get_config(self):
    """Return the constructor arguments so the layer can be serialized."""
    config = super(MHSAttention, self).get_config()
    config.update({"e_dim": self.e_dim, "headcount": self.headcount})
    return config

  def attention(self, query, key, value):
    """Scaled dot-product attention.

    Args:
      query, key, value: Tensors whose last two axes are matrix-multiplied;
        `call` passes tensors of shape (batch, headcount, seq_len, p_dim).

    Returns:
      A tuple `(output, weights)`: the attention-weighted values and the
      softmax attention weights.
    """
    score = tf.matmul(query, key, transpose_b=True)
    # Cast to the score's dtype instead of hard-coding float32, so the
    # division also works when a mixed-precision policy yields float16
    # activations. For float32 inputs this is identical to before.
    dim_key = tf.cast(tf.shape(key)[-1], score.dtype)
    scaled_score = score / tf.math.sqrt(dim_key)

    weights = tf.nn.softmax(scaled_score, axis=-1)
    output = tf.matmul(weights, value)
    return output, weights

  def separate_heads(self, batchsize, x):
    """Reshape `x` to (batchsize, headcount, seq_len, p_dim)."""
    x = tf.reshape(x, (batchsize, -1, self.headcount, self.p_dim))
    return tf.transpose(x, perm=[0, 2, 1, 3])

  def call(self, input):
    """Apply self-attention to `input`.

    Args:
      input: Tensor with the batch on axis 0; presumably
        (batch, seq_len, features) -- the model in this notebook passes
        the embedding output.

    Returns:
      Tensor reshaped to (batch, seq_len, e_dim) and passed through the
      final Dense layer.
    """
    batchsize = tf.shape(input)[0]

    # Linear projections of the same input: this is *self*-attention.
    query = self.separate_heads(batchsize, self.query_layer(input))
    key = self.separate_heads(batchsize, self.key_layer(input))
    value = self.separate_heads(batchsize, self.value_layer(input))

    # The attention weights are computed but not exposed by this layer.
    attention, _ = self.attention(query, key, value)

    # Undo the head split: back to (batch, seq_len, headcount, p_dim),
    # then merge the heads into a single e_dim-wide axis.
    attention = tf.transpose(attention, perm=[0, 2, 1, 3])
    concat_attention = tf.reshape(attention, (batchsize, -1, self.e_dim))
    return self.combined(concat_attention)

## Creating Transformer as a layer

In [3]:
class Transformer(keras.layers.Layer):
  """Transformer encoder block: self-attention plus a feed-forward network.

  Each sub-layer is followed by dropout, a residual connection and layer
  normalization.

  Args:
    e_dim: Embedding dimension of the block's input and output.
    headcount: Number of attention heads passed to `MHSAttention`.
    ff_dim: Width of the hidden Dense layer in the feed-forward network.
    rate: Dropout rate applied after each sub-layer.
    **kwargs: Standard `keras.layers.Layer` options (e.g. `name`, `dtype`).
  """

  def __init__(self, e_dim, headcount, ff_dim, rate=0.1, **kwargs):
    super(Transformer, self).__init__(**kwargs)
    # Keep the constructor arguments so `get_config` can report them.
    self.e_dim = e_dim
    self.headcount = headcount
    self.ff_dim = ff_dim
    self.rate = rate

    self.att = MHSAttention(e_dim, headcount)
    self.ffn = keras.Sequential(
        [keras.layers.Dense(ff_dim, activation="relu"), keras.layers.Dense(e_dim),]
    )
    self.layernorm1 = keras.layers.LayerNormalization(epsilon=1e-6)
    self.layernorm2 = keras.layers.LayerNormalization(epsilon=1e-6)
    self.dropout1 = keras.layers.Dropout(rate)
    self.dropout2 = keras.layers.Dropout(rate)

  def get_config(self):
    """Return the constructor arguments so the layer can be serialized."""
    config = super(Transformer, self).get_config()
    config.update({
        "e_dim": self.e_dim,
        "headcount": self.headcount,
        "ff_dim": self.ff_dim,
        "rate": self.rate,
    })
    return config

  def call(self, inputs, training=None):
    """Run the block on `inputs`.

    Args:
      inputs: Tensor fed to the attention layer; it is added to the
        attention output, so the two must have compatible shapes.
      training: Forwarded to the dropout layers. Defaults to None so the
        layer can be called as `layer(x)` (as this notebook does) even
        when Keras does not inject the argument itself.

    Returns:
      The layer-normalized output of the feed-forward sub-layer.
    """
    # Sub-layer 1: self-attention -> dropout -> residual + layer norm.
    attn_output = self.att(inputs)
    attn_output = self.dropout1(attn_output, training=training)
    out1 = self.layernorm1(inputs + attn_output)

    # Sub-layer 2: feed-forward -> dropout -> residual + layer norm.
    ffn_output = self.ffn(out1)
    ffn_output = self.dropout2(ffn_output, training=training)
    return self.layernorm2(out1 + ffn_output)

## Embedding layer

In [8]:
class TokenAndPositionEmbedding(keras.layers.Layer):
    """Sum of a learned token embedding and a learned position embedding.

    Args:
        maxLength: Number of rows in the position-embedding table, i.e.
            the longest sequence length the layer can embed.
        vocabulary: Number of rows in the token-embedding table.
        e_dim: Width of both embeddings.
        **kwargs: Standard `keras.layers.Layer` options (e.g. `name`).
    """

    def __init__(self, maxLength, vocabulary, e_dim, **kwargs):
        super(TokenAndPositionEmbedding, self).__init__(**kwargs)
        # Keep the constructor arguments so `get_config` can report them.
        self.maxLength = maxLength
        self.vocabulary = vocabulary
        self.e_dim = e_dim
        self.token_emb = keras.layers.Embedding(input_dim=vocabulary, output_dim=e_dim)
        self.pos_emb = keras.layers.Embedding(input_dim=maxLength, output_dim=e_dim)

    def get_config(self):
        """Return the constructor arguments so the layer can be serialized."""
        config = super(TokenAndPositionEmbedding, self).get_config()
        config.update({
            "maxLength": self.maxLength,
            "vocabulary": self.vocabulary,
            "e_dim": self.e_dim,
        })
        return config

    def call(self, x):
        """Embed token ids `x` and add the embedding of each position.

        Args:
            x: Tensor of token ids whose last axis is the sequence axis.

        Returns:
            Token embeddings plus position embeddings (broadcast over the
            leading axes).
        """
        # Runtime sequence length; named distinctly from the constructor's
        # `maxLength`, which is the size of the position table. Positions
        # run from 0 to seq_len - 1, so seq_len must not exceed that size.
        seq_len = tf.shape(x)[-1]
        positions = tf.range(start=0, limit=seq_len, delta=1)
        position_embeddings = self.pos_emb(positions)
        token_embeddings = self.token_emb(x)
        return token_embeddings + position_embeddings

In [9]:
# Dataset configuration: `vocabulary` is passed to the loader as `num_words`
# and `maxLength` is the length every review is padded/truncated to.
vocabulary = 20000
maxLength = 200

# Load the IMDB reviews as (sequences, labels) pairs for both splits.
(trainX, trainY), (valueX, valueY) = keras.datasets.imdb.load_data(
    num_words=vocabulary
)
print(f"{len(trainX)} Training sequences")
print(f"{len(valueX)} Validation sequences")

# Bring every sequence in both splits to exactly `maxLength` entries.
trainX, valueX = (
    keras.preprocessing.sequence.pad_sequences(sequences, maxlen=maxLength)
    for sequences in (trainX, valueX)
)

25000 Training sequences
25000 Validation sequences


## Classifier model using the Transformer layer

In [6]:
# Sanity check: echo the configured sequence length as the cell output.
maxLength

200

In [10]:
# Embedding size for each token
e_dim = 32

# Number of attention heads
headcount = 2

# Hidden layer size in feed forward network inside transformer
ff_dim = 32

# Named `inputs` rather than `input` so that Python's builtin `input()` is
# not shadowed for the rest of the kernel session.
inputs = keras.layers.Input(shape=(maxLength,))

# Token + position embeddings, followed by a single Transformer block.
embedding_layer = TokenAndPositionEmbedding(maxLength, vocabulary, e_dim)
x = embedding_layer(inputs)
transformerLayer = Transformer(e_dim, headcount, ff_dim)
x = transformerLayer(x)

# Average over the sequence axis, then a small dense classification head.
x = keras.layers.GlobalAveragePooling1D()(x)
x = keras.layers.Dropout(0.1)(x)
x = keras.layers.Dense(20, activation="relu")(x)
x = keras.layers.Dropout(0.1)(x)

# Two-way softmax: one probability per class.
outputs = keras.layers.Dense(2, activation="softmax")(x)

model = keras.Model(inputs=inputs, outputs=outputs)

## Training and evaluation

In [11]:
# Sparse categorical cross-entropy takes integer class labels directly,
# matching the two-unit softmax output of the model.
model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)

# Train for two epochs, evaluating on the held-out split after each one.
history = model.fit(
    x=trainX,
    y=trainY,
    batch_size=32,
    epochs=2,
    validation_data=(valueX, valueY),
)

Epoch 1/2
Epoch 2/2
