<a href="https://colab.research.google.com/github/NITHISHM2410/Text_Processing/blob/NLP/Text%20Encoding/encoding_for_bert%20/TextPreprocessing_For_Bert_CustomVocabulary.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import tensorflow as tf
import tensorflow_hub as hub
!pip install tensorflow-text
import tensorflow_text as text

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tensorflow-text
  Downloading tensorflow_text-2.12.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.0/6.0 MB[0m [31m51.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tensorflow-text
Successfully installed tensorflow-text-2.12.1


In [3]:
# Small BERT encoder loaded from TF Hub; it consumes the dicts produced by the
# preprocessing layer defined below.
BERT_ENCODER_URL = "https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/1"
model = hub.KerasLayer(BERT_ENCODER_URL)

This module can be used when we need to train BERT models on a custom vocabulary instead of BERT's built-in vocabulary.

In [86]:
class BertED(tf.keras.layers.Layer):
    """Encode sentences into BERT-style input dicts using a custom vocabulary.

    The layer owns two lookup layers built from the same vocabulary file:
    ``encode`` (text -> token ids) and ``decode`` (token ids -> text). Both
    reserve id 0 for padding ('') and id 1 for out-of-vocabulary words
    ('[UNK]'), so ids produced by ``encode`` map back to the same words.

    Args:
        vocab: Path to a text file holding one vocabulary token per line.
        max_len: Length every encoded sentence is padded or truncated to.
    """

    def __init__(self, vocab, max_len):
        super().__init__()
        self.vocab = self.get_vocab(vocab)
        self.maxlen = max_len
        # No max_tokens cap: the vocabulary is supplied explicitly, and a fixed
        # cap would only make the layers reject larger custom vocabularies.
        self.encode = tf.keras.layers.TextVectorization(
            output_mode='int',
            vocabulary=self.vocab,
            standardize='lower_and_strip_punctuation'
        )
        # mask_token='' reserves id 0 for padding, exactly like the encoder.
        # Without it the inverse lookup starts at '[UNK]' and every decoded id
        # is shifted by one position relative to the encoder.
        self.decode = tf.keras.layers.StringLookup(
            output_mode='int',
            vocabulary=self.vocab,
            mask_token='',
            oov_token='[UNK]',
            invert=True
        )

    def get_vocab(self, vocab):
        """Read the vocabulary file at path ``vocab`` and return its tokens.

        Blank lines (including the one produced by a trailing newline) and
        surrounding whitespace such as '\\r' are dropped, because the lookup
        layers reserve the empty string for padding.
        """
        with open(vocab, encoding='utf-8') as f:
            stripped_lines = (line.strip() for line in f)
            return [token for token in stripped_lines if token]

    def pad(self, inputs):
        """Pad or truncate every id sequence to ``self.maxlen`` and return a tensor.

        Padding and truncation both happen at the end of the sequence and the
        pad value is 0.
        NOTE(review): pad_sequences is a NumPy-based utility, so this presumably
        requires concrete (eager) values - confirm before using inside tf.function.
        """
        inputs = tf.keras.preprocessing.sequence.pad_sequences(
            inputs,
            maxlen=self.maxlen,
            dtype='int32',
            padding='post',
            truncating='post',
            value=0
        )
        return tf.convert_to_tensor(inputs)

    def mask(self, input_tensor):
        """Return a tensor with 1 where ``input_tensor`` is non-zero and 0 elsewhere.

        The result has the same shape and dtype as the input ids.
        """
        input_tensor = tf.convert_to_tensor(input_tensor)
        return tf.cast(tf.not_equal(input_tensor, 0), input_tensor.dtype)

    def typeids(self, input):
        """Return an all-zero int32 tensor shaped like ``input``."""
        return tf.zeros_like(input, dtype=tf.int32)

    def create_dict(self, input):
        """Encode one batch of sentences into the dict of BERT inputs.

        Returns a dict with the keys 'input_word_ids', 'input_mask' and
        'input_type_ids', all derived from the padded word ids.
        """
        word_ids = tf.cast(self.pad(self.encode(input)), tf.int32)
        sample = dict()
        sample['input_word_ids'] = word_ids
        sample['input_mask'] = self.mask(word_ids)
        sample['input_type_ids'] = self.typeids(word_ids)
        return sample

    def call(self, inputs):
        """Encode every batch in ``inputs`` and return a list with one dict per batch."""
        return [self.create_dict(input) for input in inputs]

    def decoder(self, input):
        """Map token ids back to words, dropping padding and unknown tokens."""
        output = self.decode(input)
        # Id 0 decodes to '' (padding) and id 1 to '[UNK]'; neither is a real word.
        is_word = tf.math.logical_and(
            tf.not_equal(output, ''),
            tf.not_equal(output, '[UNK]')
        )
        return tf.boolean_mask(output, is_word)

    def back_to_string(self, inputs):
        """Decode every id sequence in ``inputs`` and return a list of word tensors."""
        return [self.decoder(input) for input in inputs]

    def return_vocab(self):
        """Return the vocabulary as stored by the encoder layer."""
        return self.encode.get_vocabulary()


PASS THE PATH TO YOUR CUSTOM VOCABULARY FILE (ONE TOKEN PER LINE) IN THE 'vocab' PARAMETER AND THE MAXIMUM LENGTH OF AN INPUT SENTENCE IN THE 'max_len' PARAMETER.


In [88]:
# Build the preprocessing layer from the vocabulary file; sentences are padded to 10 ids.
preprocess = BertED(
    vocab="vocab.txt",  # Add your vocab
    max_len=10,
)

PASSING A FEW INPUT SENTENCES TO ENCODE BASED ON OUR CUSTOM VOCABULARY

In [89]:
# Two batches, each holding two single-sentence rows -> string tensor of shape (2, 2, 1).
# Plain nested lists are sufficient; wrapping each row in tf.Variable only created
# throwaway variables before the conversion.
inputs = tf.convert_to_tensor(
        [[['day and night'], ['car and bike']],
         [['day and night'], ['car and bike']]]
        )


In [90]:
# Display the raw string batch; the repr below shows a tensor of shape (2, 2, 1).
inputs

<tf.Tensor: shape=(2, 2, 1), dtype=string, numpy=
array([[[b'day and night'],
        [b'car and bike']],

       [[b'day and night'],
        [b'car and bike']]], dtype=object)>

In [91]:
# Encode every batch; the layer returns a list with one dict of BERT inputs per batch.
results = preprocess(inputs)
results

[{'input_word_ids': <tf.Tensor: shape=(2, 10), dtype=int32, numpy=
  array([[ 2,  1,  3,  0,  0,  0,  0,  0,  0,  0],
         [10,  1, 13,  0,  0,  0,  0,  0,  0,  0]], dtype=int32)>,
  'input_mask': <tf.Tensor: shape=(2, 10), dtype=int32, numpy=
  array([[1, 1, 1, 0, 0, 0, 0, 0, 0, 0],
         [1, 1, 1, 0, 0, 0, 0, 0, 0, 0]], dtype=int32)>,
  'input_type_ids': <tf.Tensor: shape=(2, 10), dtype=int32, numpy=
  array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], dtype=int32)>},
 {'input_word_ids': <tf.Tensor: shape=(2, 10), dtype=int32, numpy=
  array([[ 2,  1,  3,  0,  0,  0,  0,  0,  0,  0],
         [10,  1, 13,  0,  0,  0,  0,  0,  0,  0]], dtype=int32)>,
  'input_mask': <tf.Tensor: shape=(2, 10), dtype=int32, numpy=
  array([[1, 1, 1, 0, 0, 0, 0, 0, 0, 0],
         [1, 1, 1, 0, 0, 0, 0, 0, 0, 0]], dtype=int32)>,
  'input_type_ids': <tf.Tensor: shape=(2, 10), dtype=int32, numpy=
  array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], dtype=int32)>}]

VOCABULARY SET

In [92]:
# Vocabulary as stored by the encoder layer; the output below starts with the
# reserved '' and '[UNK]' entries.
preprocess.return_vocab()

['',
 '[UNK]',
 'day',
 'night',
 'father',
 'mother',
 'like',
 'love',
 'today',
 'after',
 'car',
 'tomorrow',
 'how',
 'bike',
 'are',
 'you',
 'i',
 'call',
 'me',
 'afternoon']

INSPECTING THE ENCODED BATCHES (USE `preprocess.back_to_string` TO DECODE IDS BACK TO WORDS WITH THE SAME MODULE)

In [93]:
# Print each encoded batch: the dict holding word ids, input mask and type ids.
for batch in results:
    print(batch)

{'input_word_ids': <tf.Tensor: shape=(2, 10), dtype=int32, numpy=
array([[ 2,  1,  3,  0,  0,  0,  0,  0,  0,  0],
       [10,  1, 13,  0,  0,  0,  0,  0,  0,  0]], dtype=int32)>, 'input_mask': <tf.Tensor: shape=(2, 10), dtype=int32, numpy=
array([[1, 1, 1, 0, 0, 0, 0, 0, 0, 0],
       [1, 1, 1, 0, 0, 0, 0, 0, 0, 0]], dtype=int32)>, 'input_type_ids': <tf.Tensor: shape=(2, 10), dtype=int32, numpy=
array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], dtype=int32)>}
{'input_word_ids': <tf.Tensor: shape=(2, 10), dtype=int32, numpy=
array([[ 2,  1,  3,  0,  0,  0,  0,  0,  0,  0],
       [10,  1, 13,  0,  0,  0,  0,  0,  0,  0]], dtype=int32)>, 'input_mask': <tf.Tensor: shape=(2, 10), dtype=int32, numpy=
array([[1, 1, 1, 0, 0, 0, 0, 0, 0, 0],
       [1, 1, 1, 0, 0, 0, 0, 0, 0, 0]], dtype=int32)>, 'input_type_ids': <tf.Tensor: shape=(2, 10), dtype=int32, numpy=
array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], dtype=int32)>}


PASSING THE ENCODED SENTENCES TO THE MODEL

In [94]:
# Feed each encoded batch to the BERT encoder and report the shape of its pooled output.
for batch in results:
    pooled_output = model(batch)['pooled_output']
    print(pooled_output.shape)

(2, 512)
(2, 512)
