In [1]:
import tensorflow as tf

In [2]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [3]:
# Toy corpus: three short English sentences used to demonstrate
# tokenization, padding and truncation.
sentences = [
    "I like eggs and ham.",
    "You love chocolate and bunnies.",
    "I hate onions.",
]

In [4]:
# Vocabulary cap handed to the Tokenizer through num_words. The toy corpus only
# yields 11 distinct words (see tokenizer.word_index), so the cap is never reached here.
MAX_VOCAB_SIZE = 20000
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE)
# Learn the word -> integer id vocabulary from the corpus.
tokenizer.fit_on_texts(sentences)
# Encode every sentence as a list of word ids (one list per sentence).
sequences = tokenizer.texts_to_sequences(sentences)

In [5]:
# Ragged list of integer ids: one inner list per sentence, with lengths 5, 5 and 3.
print(sequences)

[[1, 3, 4, 2, 5], [6, 7, 8, 2, 9], [1, 10, 11]]


In [6]:
# Word -> id lookup learned by fit_on_texts. Keys are lowercased with punctuation
# removed, ids start at 1, and the two words that occur twice ("i", "and") hold
# the lowest ids.
tokenizer.word_index

{'and': 2,
 'bunnies': 9,
 'chocolate': 8,
 'eggs': 4,
 'ham': 5,
 'hate': 10,
 'i': 1,
 'like': 3,
 'love': 7,
 'onions': 11,
 'you': 6}

In [8]:
# Padding with the default settings: every row is brought to the length of the
# longest sequence (5 here) by inserting zeros at the front of the shorter rows.
data = pad_sequences(sequences)
print(data)

[[ 1  3  4  2  5]
 [ 6  7  8  2  9]
 [ 0  0  1 10 11]]


In [9]:
# Padding with non-default settings: fix the row length explicitly with maxlen and
# pass padding="post" so the zeros are appended after the tokens instead of before.
MAX_SEQ_LENGTH = 5
data = pad_sequences(sequences, maxlen=MAX_SEQ_LENGTH, padding="post")
print(data)

[[ 1  3  4  2  5]
 [ 6  7  8  2  9]
 [ 1 10 11  0  0]]


In [10]:
# Truncation with the default settings. It is pad_sequences (not the RNN) that
# truncates: a sequence longer than maxlen loses tokens from its beginning and
# keeps its last maxlen tokens (default truncating="pre").
# Rationale usually given for this default: an RNN's final state tends to be
# influenced most by the last tokens it reads, so the end of the sequence is kept.
data = pad_sequences(sequences, maxlen=4)
print(data)

[[ 3  4  2  5]
 [ 7  8  2  9]
 [ 0  1 10 11]]


In [11]:
# Truncation with a non-default setting: truncating="post" drops tokens from the
# end of over-long sequences instead of the beginning. Padding is still the
# default, so the short third sequence keeps its leading zero.
data = pad_sequences(sequences, maxlen=4, truncating="post")
print(data)

[[ 1  3  4  2]
 [ 6  7  8  2]
 [ 0  1 10 11]]
