In [8]:
#importing libraries
import tensorflow as tf

#### Vocabulary

In [19]:
# Default call: get_vocabulary() keeps the special tokens, so index 0 is the
# padding token ('') and index 1 is the out-of-vocabulary token ('[UNK]').
sentences = [
    'I love my dog',
    'I love my cat',
    'you love my dog!',
]

vectorize_layer = tf.keras.layers.TextVectorization()
vectorize_layer.adapt(sentences)

vocab = vectorize_layer.get_vocabulary()
for position in range(len(vocab)):
    print(position, vocab[position])

0 
1 [UNK]
2 my
3 love
4 i
5 dog
6 you
7 cat


In [7]:
# Passing include_special_tokens=False drops the padding ('') and '[UNK]'
# entries, so the list positions printed here are shifted by two relative to
# the token ids the layer actually emits.

vectorize_layer = tf.keras.layers.TextVectorization()
vectorize_layer.adapt(sentences)

vocabulary = vectorize_layer.get_vocabulary(include_special_tokens=False)
print("\n".join(f"{inx} {word}" for inx, word in enumerate(vocabulary)))

0 my
1 love
2 i
3 dog
4 you
5 cat


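The resulting vocabulary is a list ordered by frequency: the more often a word appears in the adapted text, the lower its index. When the special tokens are included, index 0 is reserved for padding and index 1 for out-of-vocabulary words.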

#### Text to Sequence

In [11]:
# Calling the adapted layer on a single string returns a 1-D tensor of token ids.
# The ids follow the full vocabulary (special tokens included): i=4, love=3, my=2, dog=5.
sequence = vectorize_layer("I love my dog")
sequence

<tf.Tensor: shape=(4,), dtype=int64, numpy=array([4, 3, 2, 5])>

In [20]:
# Add a longer sentence containing words the adapted layer has not seen yet.
# The list is mutated in place and later cells re-adapt on it, so the append
# is guarded: re-running this cell must not keep adding duplicate entries.
extra_sentence = "I'm in love my your dog, its amazing"
if extra_sentence not in sentences:
    sentences.append(extra_sentence)
sentences

['I love my dog',
 'I love my cat',
 'you love my dog!',
 "I'm in love my your dog, its amazing"]

In [None]:
# Apply the layer to the whole list of sentences and inspect the resulting
# batch of sequences.
sequences = vectorize_layer(sentences)
sequences

# Id 0 is padding and id 1 is the out-of-vocabulary token ([UNK]).
# By default the layer post-pads every row with 0 up to the length of the
# longest sentence in the batch; words that were not in the vocabulary when
# the layer was adapted are mapped to 1.

<tf.Tensor: shape=(4, 8), dtype=int64, numpy=
array([[4, 3, 2, 5, 0, 0, 0, 0],
       [4, 3, 2, 7, 0, 0, 0, 0],
       [6, 3, 2, 5, 0, 0, 0, 0],
       [1, 1, 3, 2, 1, 5, 1, 1]])>

#### Padding

Sequences of varying lengths can be brought to a uniform size by padding the shorter ones or by truncating tokens from the longer ones. Padding is more common because it preserves information.

In [25]:
# Show the current sentence list (it now includes the longer fourth sentence).
sentences 

['I love my dog',
 'I love my cat',
 'you love my dog!',
 "I'm in love my your dog, its amazing"]

In [30]:
# Re-adapt a fresh layer so the vocabulary also covers the longer sentence.
vectorize_layer = tf.keras.layers.TextVectorization()
vectorize_layer.adapt(sentences)

# Vectorizing through a tf.data pipeline processes one sentence at a time, so
# each resulting sequence keeps its own length (no padding is applied).
sentence_dataset = tf.data.Dataset.from_tensor_slices(sentences)
sent_sequences = sentence_dataset.map(vectorize_layer)

# Print every sentence next to its token ids.
for sent, seq in zip(sentences, sent_sequences):
    print(f"{sent}-----{seq}")



I love my dog-----[5 3 2 4]
I love my cat-----[ 5  3  2 11]
you love my dog!-----[7 3 2 4]
I'm in love my your dog, its amazing-----[10  9  3  2  6  4  8 12]


In [None]:
# Calling the layer on the whole batch at once (instead of mapping it over a
# dataset) returns a dense tensor, post-padded with 0 to the longest sentence.
vectorize_layer(sentences)

<tf.Tensor: shape=(4, 8), dtype=int64, numpy=
array([[ 5,  3,  2,  4,  0,  0,  0,  0],
       [ 5,  3,  2, 11,  0,  0,  0,  0],
       [ 7,  3,  2,  4,  0,  0,  0,  0],
       [10,  9,  3,  2,  6,  4,  8, 12]])>

In [32]:
# Pad the variable-length sequences to a common length.
# padding="pre" inserts the zeros at the start of each shorter sequence.

pad_sequences = tf.keras.utils.pad_sequences(sent_sequences, padding="pre")
pad_sequences

array([[ 0,  0,  0,  0,  5,  3,  2,  4],
       [ 0,  0,  0,  0,  5,  3,  2, 11],
       [ 0,  0,  0,  0,  7,  3,  2,  4],
       [10,  9,  3,  2,  6,  4,  8, 12]], dtype=int32)

#### Ragged Tensors

Another way to prepare your sequences for pre-padding is to configure the TextVectorization layer to output a ragged tensor. The output is then not automatically post-padded, so each sequence keeps its original length.

In [None]:

# ragged=True makes the layer return a tf.RaggedTensor: every row keeps its
# own length instead of being post-padded with zeros.
vectorize_layer = tf.keras.layers.TextVectorization(ragged=True)
vectorize_layer.adapt(sentences)
seq = vectorize_layer(sentences)
seq

<tf.RaggedTensor [[5, 3, 2, 4], [5, 3, 2, 11], [7, 3, 2, 4], [10, 9, 3, 2, 6, 4, 8, 12]]>

In [None]:
# Pad the ragged rows into a rectangular array; with no extra arguments,
# pad_sequences adds the zeros at the front, up to the longest row.
pad_seq = tf.keras.utils.pad_sequences(seq.numpy()) # default padding is pre
pad_seq

array([[ 0,  0,  0,  0,  5,  3,  2,  4],
       [ 0,  0,  0,  0,  5,  3,  2, 11],
       [ 0,  0,  0,  0,  7,  3,  2,  4],
       [10,  9,  3,  2,  6,  4,  8, 12]], dtype=int32)

In [47]:
# maxlen=5 fixes the output width: shorter rows are pre-padded with 0 and
# longer rows are cut; truncating="pre" drops tokens from the beginning.
pad_seq = tf.keras.utils.pad_sequences(seq.numpy(), maxlen=5, truncating="pre") # limiting size to 5, removing from beginning
pad_seq

array([[ 0,  5,  3,  2,  4],
       [ 0,  5,  3,  2, 11],
       [ 0,  7,  3,  2,  4],
       [ 2,  6,  4,  8, 12]], dtype=int32)

#### Out of vocabulary tokens

In [45]:
# Adapt on the training sentences, then vectorize unseen test sentences:
# words missing from the vocabulary ("loves", "friend", "bird") map to the
# out-of-vocabulary id 1, and the shorter row is post-padded with 0.
vectorize_layer = tf.keras.layers.TextVectorization()
vectorize_layer.adapt(sentences)

test_sentences = [
    'I love my cat',
    'My dog loves my friend bird',
]
test_sequence = vectorize_layer(test_sentences)
test_sequence

<tf.Tensor: shape=(2, 6), dtype=int64, numpy=
array([[ 5,  3,  2, 11,  0,  0],
       [ 2,  4,  1,  2,  1,  1]])>