# NNDL - HW5
## Transformers
### Hesam Asadollahzadeh & Masoud Tahmasbi Fard

In [1]:
import numpy as np
import pandas as pd
from tensorflow.keras.initializers import TruncatedNormal
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import tensorflow_datasets as tfds
import math

In [2]:
! gdown 15UGzRAuH9ih_TJI-slSwIjjoNSoqSZvE
! unzip reviews.zip

Downloading...
From: https://drive.google.com/uc?id=15UGzRAuH9ih_TJI-slSwIjjoNSoqSZvE
To: /content/reviews.zip
100% 19.8M/19.8M [00:00<00:00, 65.3MB/s]
Archive:  reviews.zip
  inflating: train_reviews.csv       
  inflating: test_reviews.csv        


# Code

### Encoder

In [2]:
class MultiHeadAttention(layers.Layer):
    def __init__(self, hidden_size, num_heads):

        super(MultiHeadAttention, self).__init__()
        self.hidden_size = hidden_size
        self.num_heads = num_heads
        self.projection_dim = hidden_size // num_heads
        self.Q = layers.Dense(hidden_size)
        self.K = layers.Dense(hidden_size)
        self.V = layers.Dense(hidden_size)
        self.out = layers.Dense(hidden_size)

    def attention(self, query, key, value, mask):
        
        score = tf.matmul(query, key, transpose_b=True)
        dim_key = tf.cast(tf.shape(key)[-1], tf.float32)
        scaled_score = score / tf.math.sqrt(dim_key)
        weights = tf.nn.softmax(scaled_score, axis=-1)

        output = tf.matmul(weights, value)
        return output, weights

    def separate_heads(self, x, batch_size):
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.projection_dim))
        return tf.transpose(x, perm=[0, 2, 1, 3])

    def call(self, inputs, att_mask=None):
        batch_size = tf.shape(inputs)[0]
        query = self.separate_heads(self.Q(inputs)  , batch_size)  
        key = self.separate_heads(self.K(inputs), batch_size)  
        value = self.separate_heads(self.V(inputs) , batch_size) 
        attention, self.att_weights = self.attention(query, key, value, att_mask)
        attention = tf.transpose(attention, perm=[0, 2, 1, 3])
        concat_attention = tf.reshape(attention, (batch_size, -1, self.hidden_size))
        output = self.out(concat_attention)  
        return output

#### Feed-Forward Sub-Layer

Unlike the original transformer, BERT uses "GELU" activation function. In this part you should implement the GELU activation function based on the paper provided to you.

In [3]:
@tf.function
def GELU(x):
    cdf = 0.5 * (1.0 + tf.math.tanh(tf.math.sqrt(2.0/math.pi)*(x + 0.044715*x*x*x)))
    return x * cdf

In [4]:
class FFN(layers.Layer):
    def __init__(self, intermediate_size, hidden_size, drop_rate):
        super(FFN, self).__init__()
        self.intermediate = layers.Dense(intermediate_size, activation=GELU, kernel_initializer=TruncatedNormal(stddev=0.02))
        self.out = layers.Dense(hidden_size, kernel_initializer=TruncatedNormal(stddev=0.02))
        self.drop = layers.Dropout(drop_rate)

    def call(self, inputs):
        out = self.intermediate(inputs)
        out = self.drop(out)
        out = self.out(out)
        return out

#### Add & Norm

In this part implement the add & norm blocks

In [5]:
class AddNorm(layers.Layer):
    def __init__(self, LNepsilon, drop_rate): 
        super(AddNorm, self).__init__()
        self.LN = layers.LayerNormalization(epsilon=LNepsilon)
        self.dropout = layers.Dropout(drop_rate)

    def call(self, sub_layer_in, sub_layer_out):
        return self.LN(self.dropout(sub_layer_out) + sub_layer_in)

#### Residual connections

Now put together all parts and build the encoder with the residual connections




In [18]:
class Encoder(layers.Layer):
    def __init__(self, hidden_size, num_heads, intermediate_size, drop_rate=0.1, LNepsilon=1e-12):
        super(Encoder, self).__init__()
        self.attention = MultiHeadAttention(hidden_size, num_heads)
        self.addnorm1 = AddNorm(LNepsilon, drop_rate)
        self.ffn = FFN(intermediate_size, hidden_size, drop_rate)
        self.addnorm2 = AddNorm(LNepsilon, drop_rate)

    def call(self, inputs, mask=None):
        Y = self.addnorm1(inputs, self.attention(inputs, mask))
        return self.addnorm2(Y, self.ffn(Y))

    def compute_mask(self, x, mask=None):
        return tf.not_equal(x, 0)

### BERT

In [7]:
class BertEmbedding(layers.Layer):

    def __init__(self, vocab_size, maxlen, hidden_size):

      super(BertEmbedding, self).__init__()
      self.TokEmb = layers.Embedding(input_dim=vocab_size, output_dim=hidden_size, mask_zero=True)
      self.PosEmb = layers.Embedding(input_dim=maxlen, output_dim=hidden_size)
      self.LN = layers.LayerNormalization(epsilon=1e-12)
      self.dropout = layers.Dropout(0.1)

    def call(self, inputs):
        maxlen = tf.shape(inputs)[-1]
        positions = tf.range(start=0, limit=maxlen, delta=1)
        out = self.TokEmb(inputs)
        out = out + self.PosEmb(positions)
        return self.dropout(self.LN(out))

    def compute_mask(self, x, mask=None):
      m = 1-tf.cast(self.TokEmb.compute_mask(x), tf.float32)
      m = m[:, tf.newaxis, tf.newaxis, :]
      return m

The "pooler" is the last layer you need to put in place.
For each input sentence, the pooler changes the hidden states of the last encoder layer (which have the shape [batch size, sequence lenght, hidden size]) into a vector representation (which has the shape [batch size, hidden size]).
The pooler does this by giving a dense layer the hidden state that goes with the first token, which is a special token at the beginning of each sentence.

In [8]:
class Pooler(layers.Layer):
    def __init__(self, hidden_size):
        super(Pooler, self).__init__()
        self.dense = layers.Dense(hidden_size, activation='tanh')

    def call(self, encoder_out):
        # first_token_tensor = hidden_states[:, 0]
        return self.dense(inputs=encoder_out)

Now you should complete the **create_BERT** function in the cell below. This function gets BERT's hyper-parameters as its inputs and return a BERT model. 
Note that the returned model must have two outputs (just like the pre-trained BERTs): 
- The hidden states of the last encoder layer
- Output of the pooler

In [22]:
def create_BERT(vocab_size, maxlen, hidden_size, num_layers, num_att_heads, intermediate_size, drop_rate=0.1):
    """
    creates a BERT model based on the arguments provided

        Arguments:
        vocab_size: number of words in the vocabulary
        maxlen: maximum length of each sentence
        hidden_size: dimension of the hidden state of each encoder layer
        num_layers: number of encoder layers
        num_att_heads: number of attention heads in the multi-headed attention layer
        intermediate_size: dimension of the intermediate layer in the feed-forward sublayer of the encoders
        drop_rate: dropout rate of all the dropout layers used in the model
        returns: 
        model
    """
    inputs = tf.keras.Input(shape=(maxlen,))
    emb = BertEmbedding(vocab_size, maxlen, hidden_size)
    out = emb(inputs)
    for i in range(num_layers):
        enc = Encoder(hidden_size, num_att_heads, intermediate_size, drop_rate)
        out = enc(out)
    pooler = Pooler(hidden_size)
    out = keras.layers.Dense(1, activation='sigmoid')(keras.layers.Flatten()(pooler(out)))
    model = tf.keras.Model(inputs=inputs, outputs=out) 

    return model

We will use the Rotten tomatoes critic reviews dataset for this assignment. The zip file is provided to you. Unzip it and run the cells below to split the dataset in training and test sets and prepare it for feeding to the bert model.

In [10]:
train_reviews, test_reviews = pd.read_csv('train_reviews.csv').values[:, 1:], pd.read_csv('test_reviews.csv').values[:, 1:]
(train_texts, train_labels), (test_texts, test_labels)  = (train_reviews[:,0],train_reviews[:,1]), (test_reviews[:,0],test_reviews[:,1]) 
train_texts = [s.lower() for s in train_texts]
test_texts = [s.lower() for s in test_texts] 
aprx_vocab_size = 20000
cls_token = '[cls]'
tokenizer = tfds.deprecated.text.SubwordTextEncoder.build_from_corpus(corpus_generator=train_texts,
                                                        target_vocab_size=aprx_vocab_size,
                                                        reserved_tokens=[cls_token])

In the following cell, you need to complete the implementation of the encode_sentence function. This function takes as input a sentence and an integer representing the maximum length of the sentence and returns a list of token ids. To implement this function, follow these steps:

-Use the trained tokenizer to encode the input sentence and obtain a list of token ids.

-Pad the token id list with zeros to the maximum length specified.

-Prepend the id of the special token to the beginning of the token id list.

In [11]:
def encode_sentence(s, maxlen):
    tok_id_list = tokenizer.encode(cls_token + s)
    tok_id_list = tf.keras.utils.pad_sequences([tok_id_list], maxlen, padding='post')
    return tok_id_list[0]

print(encode_sentence('I liked this movie', 32))

[    1 19779 19738  2252    18    67     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0]


In [12]:
MAXLEN = 32
x_train = np.array([encode_sentence(x, MAXLEN) for x in train_texts], dtype=np.int64)
x_test = np.array([encode_sentence(x, MAXLEN) for x in test_texts], dtype=np.int64)
y_train = train_labels.astype(np.int64)
y_test = test_labels.astype(np.int64)

Now use the functional api and the **create_BERT** function you implemented earlier to create a classifier for the movie reviews dataset.
Note that the intermediate layer in the feed-forward sub-layer of the encoders is set to $4\times H$ in the original BERT implementation, where $H$ is the hidden layer size. 

In [23]:
hidden_size = 768
num_heads = 12
num_layers = 12
vocab_size = tokenizer.vocab_size  

model = create_BERT(vocab_size, MAXLEN, hidden_size, num_layers, num_heads, num_heads*4)

In [24]:
model.compile(tf.keras.optimizers.Adam(learning_rate=5e-5), "binary_crossentropy", metrics=["accuracy"])
model.summary()

Model: "model_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_5 (InputLayer)        [(None, 32)]              0         
                                                                 
 bert_embedding_4 (BertEmbed  (None, 32, 768)          15356928  
 ding)                                                           
                                                                 
 encoder_48 (Encoder)        (None, 32, 768)           2439984   
                                                                 
 encoder_49 (Encoder)        (None, 32, 768)           2439984   
                                                                 
 encoder_50 (Encoder)        (None, 32, 768)           2439984   
                                                                 
 encoder_51 (Encoder)        (None, 32, 768)           2439984   
                                                           

In [25]:
history = model.fit(
    x_train,
    y_train,
    batch_size=128,
    epochs=2,
    validation_data=(x_test, y_test)
)

Epoch 1/2
Epoch 2/2


In [35]:
model(np.array([encode_sentence("I liked this moive".lower(), MAXLEN)], dtype=np.int64))

<tf.Tensor: shape=(1, 1), dtype=float32, numpy=array([[0.95685554]], dtype=float32)>

### Attention Visualization

In [26]:
#@title Run this!
import sys

!test -d bertviz_repo && echo "FYI: bertviz_repo directory already exists, to pull latest version uncomment this line: !rm -r bertviz_repo"
# !rm -r bertviz_repo # Uncomment if you need a clean pull from repo
!test -d bertviz_repo || git clone https://github.com/jessevig/bertviz bertviz_repo
if not 'bertviz_repo' in sys.path:
  sys.path += ['bertviz_repo']

from bertviz import head_view

def call_html():
  import IPython
  display(IPython.core.display.HTML('''
        <script src="/static/components/requirejs/require.js"></script>
        <script>
          requirejs.config({
            paths: {
              base: '/static/base',
              "d3": "https://cdnjs.cloudflare.com/ajax/libs/d3/3.5.8/d3.min",
              jquery: '//ajax.googleapis.com/ajax/libs/jquery/2.0.0/jquery.min',
            },
          });
        </script>
        '''))

Cloning into 'bertviz_repo'...
remote: Enumerating objects: 1625, done.[K
remote: Counting objects: 100% (322/322), done.[K
remote: Compressing objects: 100% (113/113), done.[K
remote: Total 1625 (delta 227), reused 221 (delta 209), pack-reused 1303[K
Receiving objects: 100% (1625/1625), 198.36 MiB | 29.54 MiB/s, done.
Resolving deltas: 100% (1069/1069), done.


In order to use bertviz, we need to obtain the attention weights in the encoders of the BERT model implemented in the previous section. To do this, you need to complete the implementation of the get_att_weights function in the following cell. This function takes as input a model (the trained BERT-based model from the previous section) and a list of tokens (an encoded sentence). Here's what you need to do:

-Feed the input token list to the model to generate the attention weights for that input.

-Access the att_weights attribute of the MultiHeadAttention sub-layer of each encoder in the model and add them all to a list.

-Return the list (which should be a list of Tensors).

In [63]:
def get_att_weights(model, tok_id_list):
    att_weights = []
    out = model(tf.convert_to_tensor(np.array([tok_id_list], dtype=np.int64)))
    for i in range(2, 14):
        att_weights.append(model.layers[i].attention.att_weights)
    return att_weights

In [65]:
import torch
def get_att_tok(model, sent):
    maxlen = model.layers[0].input_shape[0][-1]
    encoded_toks = encode_sentence(sent, maxlen)
    att_weights = get_att_weights(model, encoded_toks)
    pad_start_idx = np.min(np.where(np.array(encoded_toks) == 0))
    toks = encoded_toks[:pad_start_idx]
    atts = []
    for att in att_weights:
        layer_att = torch.FloatTensor(att[:, :, :pad_start_idx, :pad_start_idx].numpy())
    atts.append(layer_att)
    toks = [tokenizer.decode([m]) for m in toks]
    return toks, atts

#### Attention visualization
now give a sample sentence in the context of giving your opinion about a movie and visualize the attention. for example "I liked that movie"

In [72]:
sentence = "I liked this movie because of its scenario"
toks, atts = get_att_tok(model, sentence.lower())
call_html()
head_view(atts, toks, layer=0)

<IPython.core.display.Javascript object>

In [69]:
model.save_weights('myBERT')