<a href="https://colab.research.google.com/github/Lee-Gunju/AI-paper-code-review-for-personal-project/blob/master/End_to_end_Masked_Language_Modeling_with_BERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from dataclasses import dataclass
import pandas as pd
import numpy as np
import glob
import re
from pprint import pprint

In [None]:
@dataclass
class Config:
    """Hyper-parameters shared by the vectorizer, the MLM and the classifier.

    Every attribute carries a type annotation so that ``@dataclass`` registers
    it as a real field. Without annotations the decorator ignores the
    attributes and generates an empty ``__init__``/``__repr__``/``__eq__``.
    Defaults are unchanged, and ``Config()`` / ``Config.NAME`` keep working.
    """

    MAX_LEN: int = 256  # length of every encoded sequence
    BATCH_SIZE: int = 32
    LR: float = 0.001  # learning rate passed to Adam
    VOCAB_SIZE: int = 30000
    EMBED_DIM: int = 128
    NUM_HEAD: int = 8  # used in bert model
    FF_DIM: int = 128  # used in bert model
    NUM_LAYERS: int = 1  # number of encoder blocks stacked in the bert model


config = Config()

In [None]:
# Fetch the IMDB review archive (about 80 MB) and unpack it in the working
# directory; the loading code further down reads review files from `aclImdb/`.
!curl -O https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
!tar -xf aclImdb_v1.tar.gz

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 80.2M  100 80.2M    0     0  26.4M      0  0:00:03  0:00:03 --:--:-- 26.4M


In [None]:
def get_text_list_from_files(files):
    """Read every line of every file in ``files`` into one flat list.

    Args:
      files (iterable of str): Paths of the text files to read.

    Returns:
      list of str: All lines, in file order; line endings are kept.
    """
    text_list = []
    for name in files:
        # Pin the encoding: the platform default is not UTF-8 everywhere
        # (e.g. cp1252 on Windows), which breaks on non-ASCII review text.
        with open(name, encoding="utf-8") as f:
            text_list.extend(f)
    return text_list


def get_data_from_text_files(folder_name):
    """Load the positive and negative reviews of one dataset split.

    Args:
      folder_name (str): Sub-folder of ``aclImdb/`` to read, e.g. ``'train'``.

    Returns:
      pd.DataFrame: One row per line read, in random order with a fresh
      0..n-1 index. Columns are ``review`` (the text) and ``sentiment``
      (0 for files under ``pos``, 1 for files under ``neg``).
    """
    split_dir = 'aclImdb/' + folder_name

    pos_texts = get_text_list_from_files(glob.glob(split_dir + '/pos/*.txt'))
    neg_texts = get_text_list_from_files(glob.glob(split_dir + '/neg/*.txt'))

    reviews = pos_texts + neg_texts
    sentiments = [0] * len(pos_texts) + [1] * len(neg_texts)
    frame = pd.DataFrame({"review": reviews, 'sentiment': sentiments})

    # Sampling as many rows as the frame holds is a full shuffle.
    shuffled = frame.sample(len(frame))
    return shuffled.reset_index(drop=True)


# Load both labelled splits.
train_df = get_data_from_text_files('train')
test_df = get_data_from_text_files('test')

# Train and test reviews stacked into one frame.
# `DataFrame.append` was removed in pandas 2.0; `pd.concat` is the drop-in
# replacement and, like `append`, keeps each frame's original row labels.
all_data = pd.concat([train_df, test_df])


In [None]:
# Peek at the text of the first review.
all_data['review'].head(1).values

array(['This an free adaptation of the novels of Clarence Mulford; fans of the Willaim Boyd films will probably feel a little at sea here (and the reviews here so far reflect that). But I knew of Hopalong from the novels first, and never cared much for the Boyd films once I got around to them.<br /><br />Christopher Coppola has made a wise choice - he has not made a nostalgic "Western"; instead, he has approached the Cassidy story as a slice of what we used to call \'Americana\'; or what older critics once called \'homespun\'. As the film unraveled, I found myself more and more reminded of the great "Hallmark Theater" version of Mark Twain\'s "Roughing It", with James Garner narrating.<br /><br />Both these films remind us that, although films about the \'old west\' are probably always to be mythic for Americans, they need not be \'westerns\'; they can very well be just films about what it meant to be American in that time, in that place.<br /><br />I never feel pandered to, watching t

In [None]:
def custom_standardization(input_data):
    """Lowercase the text, replace ``<br />`` with a space and strip punctuation.

    Square brackets and double quotes are not in the stripped character set,
    so a literal ``[mask]`` token passes through unchanged.

    Args:
      input_data: String tensor (anything ``tf.strings.lower`` accepts).

    Returns:
      The standardized string tensor.
    """
    lowercase = tf.strings.lower(input_data)
    stripped_html = tf.strings.regex_replace(lowercase, "<br />", " ")
    # Raw string: "\^" inside a normal literal is an invalid escape sequence
    # (a warning today, slated to become an error). The characters themselves
    # are unchanged, so the resulting regex is identical.
    punctuation = r"!#$%&'()*+,-./:;<=>?@\^_`{|}~"
    return tf.strings.regex_replace(
        stripped_html, "[%s]" % re.escape(punctuation), ""
    )


def get_vectorize_layer(texts, vocab_size, max_seq, special_tokens=["[MASK]"]):
    """Build Text vectorization layer

    Args:
      texts (list): List of string i.e input texts
      vocab_size (int): vocab size
      max_seq (int): Maximum sequence length.
      special_tokens (list, optional): Tokens appended at the end of the
        vocabulary. They are lower-cased before insertion because
        ``custom_standardization`` lower-cases every input, so an upper-case
        entry could never be matched. Defaults to ['[MASK]'].

    Returns:
        layers.Layer: Return TextVectorization Keras Layer
    """
    vectorize_layer = TextVectorization(
        max_tokens=vocab_size,
        output_mode="int",
        standardize=custom_standardization,
        output_sequence_length=max_seq,
    )
    vectorize_layer.adapt(texts)

    # Honour the caller's tokens instead of a hard-coded "[mask]". A new list
    # is built, so the (shared) default argument is never mutated.
    special_tokens = [token.lower() for token in special_tokens]

    # Insert special tokens in vocabulary: skip the first two entries of the
    # adapted vocabulary (presumably the layer's reserved padding/OOV tokens,
    # which `set_vocabulary` supplies itself) and trim the tail so the special
    # tokens fit inside `vocab_size`.
    vocab = vectorize_layer.get_vocabulary()
    vocab = vocab[2 : vocab_size - len(special_tokens)] + special_tokens
    vectorize_layer.set_vocabulary(vocab)
    return vectorize_layer


# Fit the vectorizer on the reviews of both splits.
vectorize_layer = get_vectorize_layer(
    texts=all_data["review"].tolist(),
    vocab_size=config.VOCAB_SIZE,
    max_seq=config.MAX_LEN,
    special_tokens=["[mask]"],
)

# Id the vectorizer assigns to "[mask]"; needed to build the MLM inputs.
mask_token_id = vectorize_layer(["[mask]"]).numpy()[0, 0]


def encode(texts):
    """Vectorize ``texts`` with the module-level ``vectorize_layer``.

    Returns:
      The layer's output converted to a NumPy array.
    """
    return vectorize_layer(texts).numpy()


def get_masked_input_and_labels(encoded_texts):
    """Create masked-language-model inputs, targets and loss weights.

    About 15% of the positions holding an id above 2 are selected. Of the
    selected positions ~80% are replaced by ``mask_token_id``, ~10% by a random
    id in ``[3, mask_token_id)`` and ~10% are left unchanged.

    Args:
      encoded_texts (np.ndarray): Integer token ids.

    Returns:
      tuple: ``(encoded_texts_masked, y_labels, sample_weights)`` where
      ``y_labels`` is a copy of the input and ``sample_weights`` is 1.0 at
      selected positions and 0.0 everywhere else.
    """
    shape = encoded_texts.shape

    # 15% BERT masking; ids <= 2 are never selected.
    # NOTE(review): ids 0 and 1 look like the reserved padding/OOV ids; id 2
    # appears to be an ordinary vocabulary entry - confirm excluding it is intended.
    selected = (np.random.rand(*shape) < 0.15) & ~(encoded_texts <= 2)

    # 90% of the selected positions receive the mask id (10% stay unchanged) ...
    to_mask = selected & (np.random.rand(*shape) < 0.90)
    # ... and 1/9 of those (10% of the selection) get a random token instead.
    to_random = to_mask & (np.random.rand(*shape) < 1 / 9)

    encoded_texts_masked = np.copy(encoded_texts)
    encoded_texts_masked[to_mask] = mask_token_id
    encoded_texts_masked[to_random] = np.random.randint(
        3, mask_token_id, to_random.sum()
    )

    # Weights for .fit(): only the selected positions contribute to the loss.
    sample_weights = np.where(selected, 1.0, 0.0)

    # Targets are simply the original, unmasked token ids.
    y_labels = np.copy(encoded_texts)

    return encoded_texts_masked, y_labels, sample_weights


# Classifier training set: encoded reviews paired with their sentiment labels
x_train = encode(train_df["review"].values)  # encode reviews with vectorizer
y_train = train_df["sentiment"].values
train_classifier_ds = tf.data.Dataset.from_tensor_slices((x_train, y_train))
train_classifier_ds = train_classifier_ds.shuffle(1000).batch(config.BATCH_SIZE)

# Classifier test set, encoded the same way (not shuffled)
x_test = encode(test_df["review"].values)
y_test = test_df["sentiment"].values
test_classifier_ds = tf.data.Dataset.from_tensor_slices((x_test, y_test))
test_classifier_ds = test_classifier_ds.batch(config.BATCH_SIZE)

# Raw-string test set for the end to end model (will be used at the end)
test_raw_classifier_ds = tf.data.Dataset.from_tensor_slices(
    (test_df["review"].values, y_test)
)
test_raw_classifier_ds = test_raw_classifier_ds.batch(config.BATCH_SIZE)

# Masked language model data: every review of both splits, masked once up front
x_all_review = encode(all_data["review"].values)
x_masked_train, y_masked_labels, sample_weights = get_masked_input_and_labels(
    x_all_review
)

mlm_ds = tf.data.Dataset.from_tensor_slices(
    (x_masked_train, y_masked_labels, sample_weights)
)
mlm_ds = mlm_ds.shuffle(1000).batch(config.BATCH_SIZE)

In [None]:
# Masked input ids of the first review in the MLM data.
x_masked_train[0]

array([   11, 29999,   958,  1269,     5,     2,  2704, 29999,  9263,
           1,   445,     5,     2,     1, 14271, 29999, 29999, 29999,
         233, 29999,   117,    30,  1729,   132,     3,     2,   806,
         132, 29999, 29999,  4327,    12,    18,    10,   649,     5,
       22209,    35,     2,  2704,    88, 29999,   109,  3665,    71,
          16,     2, 14271,    94,   282,    10,   183,   181,     6,
          90, 29999,  7220,    44,    93,     4,  2186,  1089,    27,
          44,    21,    93,     4,  4568, 27658,   296,    27,    44,
        6308,     2, 29999,    65,    14, 29999,  5471, 29999,    48,
          73, 29999,     6,   651, 15183,    39,    48,   940,  1343,
         282,   468, 24956, 29999,     2,    19, 29999,    10,   248,
       29999,    51,     3,    51,  1502,     5,     2, 29999,     1,
       29999,   310,     5,   950,     1,     1,  2303,    15,   587,
        6783, 12741, 29999,   129,    94,  3064,   169,    12,   253,
          94,    42,

In [None]:
# Target ids of the same review (the original, unmasked tokens).
y_masked_labels[0]

array([   11,    33,   958,  1269,     5,     2,  2704,     5,  9263,
           1,   445,     5,     2,     1, 14271,    94,    77,   235,
         233,     4,   117,    30,  1729,   132,     3,     2,   806,
         132,    37,   227,  4327,    12,    18,    10,   649,     5,
       22209,    35,     2,  2704,    88,     3,   109,  3665,    71,
          16,     2, 14271,    94,   282,    10,   183,   181,     6,
          90,  1411,  7220,    44,    93,     4,  2186,  1089,    27,
          44,    21,    93,     4,  4568, 27658,   296,    27,    44,
        6308,     2,  7047,    65,    14,     4,  5471,     5,    48,
          73,   321,     6,   651, 15183,    39,    48,   940,  1343,
         282,   468, 24956,    14,     2,    19, 22679,    10,   248,
         522,    51,     3,    51,  1502,     5,     2,    79,     1,
       26582,   310,     5,   950,     1,     1,  2303,    15,   587,
        6783, 12741,   190,   129,    94,  3064,   169,    12,   253,
          94,    42,

In [None]:
# Loss weights of the same review: 1.0 where a token was selected for prediction, 0.0 elsewhere.
sample_weights[0]

array([0., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 1., 1.,
       1., 0., 1., 0., 1., 0., 0., 0., 0., 0., 0., 1., 1., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 1., 0., 0., 1., 0., 1., 0., 0., 1., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 1., 0., 1., 1., 0., 0.,
       0., 0., 0., 0., 1., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 1., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 1.,
       0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 1.,
       1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 1., 1., 0., 0., 0., 0., 0., 0., 0., 1., 0., 1.,
       0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       1., 0., 0., 1., 0.

In [None]:
# Review strings of the first batch of the raw-text test dataset.
next(iter(test_raw_classifier_ds))[0]

<tf.Tensor: shape=(32,), dtype=string, numpy=
array([b"What is this!! its so bad. The animation looks so terrible , it looks like a ps1 type game. The actors are awful, they just cannot act to save their lives. I sat through all of this film an then at the end I was annoyed when I realised I had wasted 3 hours of my life. I've not heard of this film, did it ever actually come out in the cinema or did it go straight to DVD? A girl got shot?! What is up with that, it was just a stupid film. They totally copied 'The Day After Tomorrow'. Its got to be one of the worst films i have ever seen. I would definitely recommend to people to not waste their time with this. You could spend your time watching 'The Day After Tomorrow', its a lot better. Well thats what I think of the film. Actually why have I wasted my time writing about it, ah dam!! Its really annoying me, its wasted 3 hours and 10 minutes now.",
       b'To call this film a complete waste of celluloid would be an understatement.<br 

In [None]:
# Token ids of the first review in the first batch of the encoded test dataset.
next(iter(test_classifier_ds))[0][0]

<tf.Tensor: shape=(256,), dtype=int64, numpy=
array([   48,     7,    11,    29,    37,    82,     2,   818,   280,
          37,   373,     9,   280,    38,     4, 24672,   570,   475,
           2,   150,    23,   369,    34,    40,   556,   497,     6,
         571,    63,   465,    10,  1703,   138,    31,     5,    11,
          19,    33,    91,    30,     2,   125,    10,    13,  3225,
          50,    10,  5173,    10,    67,   974,   443,   616,     5,
          54,   119,   193,    21,   536,     5,    11,    19,   111,
           9,   120,   155,   212,    45,     8,     2,   439,    39,
         111,     9,   139,   805,     6,   266,     4,   255,   183,
         315,    48,     7,    55,    15,    12,     9,    13,    40,
           4,   359,    19,    34,   442,  6246,     2,   267,   101,
        5566,    29,   183,     6,    26,    28,     5,     2,   243,
          94,    10,    25,   120,   105,    10,    57,   391,   358,
           6,    84,     6,    21,   436,   

In [None]:
def bert_module(query, key, value, i):
    """Apply one encoder block.

    The block runs multi-head attention over ``(query, key, value)`` and then a
    two-layer feed-forward network. Each stage is followed by dropout, a
    residual addition and layer normalization.

    Args:
      query: Attention query; also the residual input of the attention stage.
      key: Attention key.
      value: Attention value.
      i (int): Block index, embedded in every layer name (``encoder_{i}/...``).

    Returns:
      Output of the second layer normalization.
    """
    prefix = "encoder_{}".format(i)

    # --- attention stage -------------------------------------------------
    attention = layers.MultiHeadAttention(
        num_heads=config.NUM_HEAD,
        key_dim=config.EMBED_DIM // config.NUM_HEAD,
        name=prefix + "/multiheadattention",
    )
    att_out = attention(query, key, value)
    att_out = layers.Dropout(0.1, name=prefix + "/att_dropout")(att_out)
    att_norm = layers.LayerNormalization(
        epsilon=1e-6, name=prefix + "/att_layernormalization"
    )
    att_out = att_norm(query + att_out)

    # --- feed-forward stage ----------------------------------------------
    feed_forward = keras.Sequential(
        [
            layers.Dense(config.FF_DIM, activation="relu"),
            layers.Dense(config.EMBED_DIM),
        ],
        name=prefix + "/ffn",
    )
    ffn_out = feed_forward(att_out)
    ffn_out = layers.Dropout(0.1, name=prefix + "/ffn_dropout")(ffn_out)
    ffn_norm = layers.LayerNormalization(
        epsilon=1e-6, name=prefix + "/ffn_layernormalization"
    )
    return ffn_norm(att_out + ffn_out)


def get_pos_encoding_matrix(max_len, d_emb):
    """Build the sinusoidal position-encoding table.

    Row 0 is all zeros. For every other position ``pos``, column ``j`` holds
    ``sin(pos / 10000 ** (2 * (j // 2) / d_emb))`` when ``j`` is even and the
    cosine of the same angle when ``j`` is odd.

    Args:
      max_len (int): Number of positions (rows).
      d_emb (int): Embedding width (columns).

    Returns:
      np.ndarray: Float array with one row per position and ``d_emb`` columns.
    """
    # The per-column divisor does not depend on the position: compute it once.
    divisors = [np.power(10000, 2 * (j // 2) / d_emb) for j in range(d_emb)]

    def angles_for(pos):
        # Position 0 is kept as an all-zero row.
        if pos == 0:
            return np.zeros(d_emb)
        return [pos / divisor for divisor in divisors]

    table = np.array([angles_for(pos) for pos in range(max_len)])
    table[1:, 0::2] = np.sin(table[1:, 0::2])  # dim 2i
    table[1:, 1::2] = np.cos(table[1:, 1::2])  # dim 2i+1
    return table


# Unreduced loss: `MaskedLanguageModel.train_step` passes sample weights to it.
loss_fn = keras.losses.SparseCategoricalCrossentropy(
    reduction=keras.losses.Reduction.NONE
)
# Running mean reported as "loss" during MLM training.
loss_tracker = keras.metrics.Mean(name="loss")


class MaskedLanguageModel(tf.keras.Model):
    """Keras model with a custom training step for masked language modelling.

    The step uses the module-level ``loss_fn`` and ``loss_tracker``.
    """

    def train_step(self, inputs):
        """Run one optimisation step on a batch.

        Args:
          inputs: Either ``(features, labels, sample_weight)`` or
            ``(features, labels)``.

        Returns:
          dict: ``{"loss": <current value of the loss tracker>}``.
        """
        sample_weight = None
        if len(inputs) == 3:
            features, labels, sample_weight = inputs
        else:
            features, labels = inputs

        # Forward pass, recorded for differentiation.
        with tf.GradientTape() as tape:
            predictions = self(features, training=True)
            loss = loss_fn(labels, predictions, sample_weight=sample_weight)

        # Backward pass and weight update.
        variables = self.trainable_variables
        grads = tape.gradient(loss, variables)
        self.optimizer.apply_gradients(zip(grads, variables))

        # Track the loss ourselves and report its running value.
        loss_tracker.update_state(loss, sample_weight=sample_weight)
        return {"loss": loss_tracker.result()}

    @property
    def metrics(self):
        """Metrics whose state Keras resets for us.

        Listing the tracker here lets `reset_states()` be called
        automatically at the start of each epoch or of `evaluate()`;
        otherwise it would have to be called by hand.
        """
        return [loss_tracker]


def create_masked_language_bert_model():
    """Assemble and compile the masked language model.

    Word embeddings plus position embeddings (initialised from
    ``get_pos_encoding_matrix``) go through ``config.NUM_LAYERS`` encoder
    blocks. A softmax layer of width ``config.VOCAB_SIZE`` then produces the
    prediction for every position.

    Returns:
      MaskedLanguageModel: Model named ``masked_bert_model``, compiled with
      Adam at learning rate ``config.LR``.
    """
    token_ids = layers.Input((config.MAX_LEN,), dtype=tf.int64)

    word_embedding_layer = layers.Embedding(
        input_dim=config.VOCAB_SIZE,
        output_dim=config.EMBED_DIM,
        name="word_embedding",
    )
    word_embeddings = word_embedding_layer(token_ids)

    position_embedding_layer = layers.Embedding(
        input_dim=config.MAX_LEN,
        output_dim=config.EMBED_DIM,
        weights=[get_pos_encoding_matrix(config.MAX_LEN, config.EMBED_DIM)],
        name="position_embedding",
    )
    positions = tf.range(start=0, limit=config.MAX_LEN, delta=1)
    position_embeddings = position_embedding_layer(positions)

    hidden = word_embeddings + position_embeddings

    # Stack the encoder blocks; each one attends over its own input.
    for layer_index in range(config.NUM_LAYERS):
        hidden = bert_module(hidden, hidden, hidden, layer_index)

    mlm_head = layers.Dense(config.VOCAB_SIZE, name="mlm_cls", activation="softmax")
    mlm_model = MaskedLanguageModel(
        token_ids, mlm_head(hidden), name="masked_bert_model"
    )

    mlm_model.compile(optimizer=keras.optimizers.Adam(learning_rate=config.LR))
    return mlm_model


# Lookup tables between token ids and token strings, built from the vectorizer's vocabulary.
id2token = {
    index: token for index, token in enumerate(vectorize_layer.get_vocabulary())
}
token2id = {token: index for index, token in id2token.items()}


class MaskedTextGenerator(keras.callbacks.Callback):
    """Print the top-k predictions for a masked token after every epoch.

    Relies on the module-level ``id2token`` and ``mask_token_id``.
    """

    def __init__(self, sample_tokens, top_k=5):
        """Store the sample to predict on.

        Args:
          sample_tokens: Encoded sample batch; only its first row is inspected
            (assumes a 2-D array of token ids - TODO confirm for other callers).
          top_k (int): Number of candidate tokens to print.
        """
        # Let the Keras base class set up its own attributes.
        super().__init__()
        self.sample_tokens = sample_tokens
        self.k = top_k

    def decode(self, tokens):
        """Join the token strings of ``tokens`` with spaces, skipping id 0."""
        return " ".join([id2token[t] for t in tokens if t != 0])

    def convert_ids_to_tokens(self, id):
        """Return the token string stored under ``id`` in ``id2token``."""
        return id2token[id]

    def on_epoch_end(self, epoch, logs=None):
        """Predict the first masked position of the sample and pretty-print
        the ``k`` most probable tokens, best first."""
        prediction = self.model.predict(self.sample_tokens)

        # Use the sample handed to the constructor rather than a global
        # `sample_tokens`, and only its first row, the row whose prediction
        # (`prediction[0]`) is read below.
        sample = np.asarray(self.sample_tokens)[0]
        masked_positions = np.where(sample == mask_token_id)[0]
        if masked_positions.size == 0:
            # No mask token in the sample: nothing to report.
            return

        first_mask = masked_positions[0]
        mask_probabilities = prediction[0][first_mask]
        top_indices = mask_probabilities.argsort()[-self.k :][::-1]

        # The input text is the same for every candidate: decode it once.
        input_text = self.decode(sample)
        for p in top_indices:
            tokens = np.copy(sample)
            tokens[first_mask] = p
            result = {
                "input_text": input_text,
                "prediction": self.decode(tokens),
                "probability": mask_probabilities[p],
                "predicted mask token": self.convert_ids_to_tokens(p),
            }
            pprint(result)


# Encode a sample sentence that contains the mask token and hand it to the
# callback that prints predictions at the end of each epoch.
sample_tokens = vectorize_layer(["I have watched this [mask] and it was awesome"])
generator_callback = MaskedTextGenerator(sample_tokens.numpy())

# Build the masked language model and print its layer summary.
bert_masked_model = create_masked_language_bert_model()
bert_masked_model.summary()

Model: "masked_bert_model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 256)]        0                                            
__________________________________________________________________________________________________
word_embedding (Embedding)      (None, 256, 128)     3840000     input_1[0][0]                    
__________________________________________________________________________________________________
tf.__operators__.add (TFOpLambd (None, 256, 128)     0           word_embedding[0][0]             
__________________________________________________________________________________________________
encoder_0/multiheadattention (M (None, 256, 128)     66048       tf.__operators__.add[0][0]       
                                                                 tf.__operators__.

In [None]:
# Display the MLM dataset (shapes and dtypes of its three components).
mlm_ds

<BatchDataset shapes: ((None, 256), (None, 256), (None, 256)), types: (tf.int64, tf.int64, tf.float64)>

<bound method MaskedTextGenerator.on_epoch_end of <__main__.MaskedTextGenerator object at 0x7f4ba4939690>>

In [None]:
# Train the masked language model for 5 epochs, printing sample predictions
# after each one, then save it to "bert_mlm_imdb.h5" for the fine-tuning stage.
bert_masked_model.fit(mlm_ds, epochs=5, callbacks=[generator_callback])
bert_masked_model.save("bert_mlm_imdb.h5")

Epoch 1/5
{'input_text': 'i have watched this [mask] and it was awesome',
 'predicted mask token': 'this',
 'prediction': 'i have watched this this and it was awesome',
 'probability': 0.06871921}
{'input_text': 'i have watched this [mask] and it was awesome',
 'predicted mask token': 'a',
 'prediction': 'i have watched this a and it was awesome',
 'probability': 0.05148873}
{'input_text': 'i have watched this [mask] and it was awesome',
 'predicted mask token': 'movie',
 'prediction': 'i have watched this movie and it was awesome',
 'probability': 0.03854019}
{'input_text': 'i have watched this [mask] and it was awesome',
 'predicted mask token': 'of',
 'prediction': 'i have watched this of and it was awesome',
 'probability': 0.034013562}
{'input_text': 'i have watched this [mask] and it was awesome',
 'predicted mask token': 'i',
 'prediction': 'i have watched this i and it was awesome',
 'probability': 0.02866348}
Epoch 2/5
{'input_text': 'i have watched this [mask] and it was awes

In [None]:
# Load pretrained bert model
mlm_model = keras.models.load_model(
    "bert_mlm_imdb.h5", custom_objects={"MaskedLanguageModel": MaskedLanguageModel}
)

# Cut the network at the output of the LAST encoder block. The layer name is
# derived from config.NUM_LAYERS: a hard-coded "encoder_0" would silently pick
# the first block whenever more than one layer is configured.
pretrained_bert_model = tf.keras.Model(
    mlm_model.input,
    mlm_model.get_layer(
        "encoder_{}/ffn_layernormalization".format(config.NUM_LAYERS - 1)
    ).output,
)

# Freeze it
pretrained_bert_model.trainable = False

def create_classifier_bert_model():
    """Build and compile the sentiment classifier on top of ``pretrained_bert_model``.

    The encoder output is globally max-pooled, passed through a 64-unit ReLU
    layer and finished with a single sigmoid unit.

    Returns:
      keras.Model: Model named ``classification``, compiled with a default
      Adam optimizer, binary cross-entropy loss and the accuracy metric.
    """
    token_ids = layers.Input((config.MAX_LEN,), dtype=tf.int64)

    features = pretrained_bert_model(token_ids)
    features = layers.GlobalMaxPooling1D()(features)
    features = layers.Dense(64, activation="relu")(features)
    probabilities = layers.Dense(1, activation="sigmoid")(features)

    classifier = keras.Model(token_ids, probabilities, name="classification")
    classifier.compile(
        optimizer=keras.optimizers.Adam(),
        loss="binary_crossentropy",
        metrics=["accuracy"],
    )
    return classifier




In [None]:
# Build the classifier on top of the frozen encoder and print its layer summary.
classifer_model = create_classifier_bert_model()
classifer_model.summary()

Model: "classification"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, 256)]             0         
_________________________________________________________________
model (Functional)           (None, 256, 128)          3939584   
_________________________________________________________________
global_max_pooling1d (Global (None, 128)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 64)                8256      
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 65        
Total params: 3,947,905
Trainable params: 8,321
Non-trainable params: 3,939,584
_________________________________________________________________


In [None]:
# Stage 1: train the classifier while the pretrained BERT encoder is frozen,
# so only the layers added on top of it are updated.
classifer_model.fit(
    train_classifier_ds,
    epochs=5,
    validation_data=test_classifier_ds,
)

# Stage 2: unfreeze the BERT model for fine-tuning. The model is compiled
# again with a fresh optimizer after `trainable` is flipped, then trained
# for another 5 epochs.
pretrained_bert_model.trainable = True
optimizer = keras.optimizers.Adam()
classifer_model.compile(
    optimizer=optimizer, loss="binary_crossentropy", metrics=["accuracy"]
)
classifer_model.fit(
    train_classifier_ds,
    epochs=5,
    validation_data=test_classifier_ds,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7f4bc6448090>

In [None]:
def get_end_to_end(model):
    """Wrap ``model`` so that it accepts raw strings.

    The returned model runs the module-level ``vectorize_layer`` on its string
    input and feeds the resulting token ids to ``model``.

    Args:
      model: Model that takes the token ids produced by ``vectorize_layer``.

    Returns:
      keras.Model: Model named ``end_to_end_model``, compiled with Adam at
      ``config.LR``, binary cross-entropy loss and the accuracy metric.
    """
    raw_text = keras.Input(shape=(1,), dtype="string")
    token_ids = vectorize_layer(raw_text)
    predictions = model(token_ids)

    wrapped = keras.Model(raw_text, predictions, name="end_to_end_model")
    wrapped.compile(
        optimizer=keras.optimizers.Adam(learning_rate=config.LR),
        loss="binary_crossentropy",
        metrics=["accuracy"],
    )
    return wrapped


# Wrap the fine-tuned classifier so it takes raw review strings, then
# evaluate it on the raw-text test dataset (reports loss and accuracy).
end_to_end_classification_model = get_end_to_end(classifer_model)
end_to_end_classification_model.evaluate(test_raw_classifier_ds)



[0.5624432563781738, 0.8510000109672546]