### Tokenization

In [27]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [28]:
# Toy corpus for fitting the tokenizer: four short sentences that share
# most of their words but differ in casing and trailing punctuation.
sentences = [
    "I love my dog.",
    "I love my cat.",
    "You love my dog!",
    "Do you think my dog is amazing?",
]

#### Indexing words 

In [33]:
# Build the word -> integer vocabulary from the corpus.
# The printed index shows lower-cased words with punctuation removed, and
# index 1 taken by the '<OOV>' placeholder used for unknown words.
# NOTE(review): num_words=100 is far above the 11 entries indexed here, so it
# presumably has no effect on this toy corpus — confirm against the Keras docs.
tokenizer = Tokenizer(
    num_words=100,
    oov_token='<OOV>',
)
tokenizer.fit_on_texts(sentences)

word_index = tokenizer.word_index
print(word_index)

{'<OOV>': 1, 'my': 2, 'love': 3, 'dog': 4, 'i': 5, 'you': 6, 'cat': 7, 'do': 8, 'think': 9, 'is': 10, 'amazing': 11}


#### Using the dictionary to form sequences

In [34]:
# Encode each training sentence as a list of integer ids taken from the
# vocabulary fitted above (one inner list per sentence).
sequence = tokenizer.texts_to_sequences(texts=sentences)
print(sequence)

[[5, 3, 2, 4], [5, 3, 2, 7], [6, 3, 2, 4], [8, 6, 9, 2, 4, 10, 11]]


#### Out-of-vocabulary (OOV) words: words the tokenizer has not seen

In [37]:
# Sentences containing words the tokenizer was never fitted on
# ('really', 'loves', 'manatee'); each of those is encoded as 1,
# the id of the '<OOV>' token, as the printed result shows.
test_data = [
    "I really love my dog",
    "my dog loves my manatee",
]

test_seq = tokenizer.texts_to_sequences(texts=test_data)
print(test_seq)

[[5, 1, 3, 2, 4], [2, 4, 1, 2, 1]]


#### Padding to ensure all sequences have the same length

In [44]:
# padding: where the zeros are placed, at the beginning or at the end.
# With 'post' they are appended after the tokens, so every row is as long
# as the longest training sequence (7 ids here).
pad = pad_sequences(sequence, padding='post')

# maxlen: length every returned sequence is forced to.
# truncating: which side to cut when a sequence is longer than maxlen;
# 'post' drops ids from the end.
test_pad = pad_sequences(test_seq, maxlen=4, truncating='post')

print(pad)
print(test_pad)

[[ 5  3  2  4  0  0  0]
 [ 5  3  2  7  0  0  0]
 [ 6  3  2  4  0  0  0]
 [ 8  6  9  2  4 10 11]]
[[5 1 3 2]
 [2 4 1 2]]
