In [1]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer

In [38]:
# Small training corpus used to build the vocabulary.
sentences = [
    'I love my dog',
    'I love my cat',
    'You love my dog!',  # The '!' is dropped: this still encodes 'dog' (see the printed output)
    'Do you think my dog is amazing?'
]

# Maximum number of words to keep
tokenizer = Tokenizer(num_words=100)

# Go through all the texts and assign an integer index to every word.
# (The original cell referenced the undefined name `senteces`, which raises
# NameError on a fresh kernel.)
tokenizer.fit_on_texts(sentences)

# Dictionary mapping each word to its integer index
word_index = tokenizer.word_index

# One list of integer tokens per sentence
sequences = tokenizer.texts_to_sequences(sentences)

In [39]:
# Show the word -> index mapping first, then the encoded sentences.
for shown in (word_index, sequences):
    print(shown)

{'my': 1, 'love': 2, 'dog': 3, 'i': 4, 'you': 5, 'cat': 6, 'do': 7, 'think': 8, 'is': 9, 'amazing': 10}
[[4, 2, 1, 3], [4, 2, 1, 6], [5, 2, 1, 3], [7, 5, 8, 1, 3, 9, 10]]


### What happens when we try to make sequences from words that are not in the word index?

In [40]:
# Sentences containing words ('really', 'loves', 'manatee') that the
# tokenizer never saw while fitting.
test_data = ['I really love my dog', 'My dog loves my manatee']

# Encode them with the already-fitted tokenizer and show the result next to
# the word index for comparison.
test_sq = tokenizer.texts_to_sequences(test_data)
print(word_index, test_sq, sep='\n')

{'my': 1, 'love': 2, 'dog': 3, 'i': 4, 'you': 5, 'cat': 6, 'do': 7, 'think': 8, 'is': 9, 'amazing': 10}
[[4, 2, 1, 3], [1, 3, 1]]


#### What we've got is: "I love my dog" and "My dog my" — the unknown words were dropped
#### What we can do about that is replace unknown words with an out-of-vocabulary (OOV) index

In [41]:
# Re-create the tokenizer with an out-of-vocabulary token so that unseen
# words are encoded as "<OOV>" instead of being dropped.
tokenizer = Tokenizer(num_words=100, oov_token="<OOV>")

# Fit on the training corpus. (The original cell referenced the undefined
# name `senteces`, which raises NameError on a fresh kernel.)
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index

# Same probe sentences as before; they contain words outside the vocabulary.
test_data = [
    'I really love my dog',
    'My dog loves my manatee'
]

test_sq = tokenizer.texts_to_sequences(test_data)
print(word_index)
print(test_sq)

{'<OOV>': 1, 'my': 2, 'love': 3, 'dog': 4, 'i': 5, 'you': 6, 'cat': 7, 'do': 8, 'think': 9, 'is': 10, 'amazing': 11}
[[5, 1, 3, 2, 4], [2, 4, 1, 2, 1]]


#### We've still lost some meaning, but the sentences at least keep their correct length
#### Now let's make all sequences the same size

In [42]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Pad every sequence with zeros so they all have the length of the longest
# one; with the default settings the zeros go in front.
# NOTE: `sequences` was encoded by the earlier tokenizer (without the OOV
# token), while `word_index` now comes from the re-fitted tokenizer, so the
# indices printed below do not line up with the printed word index.
padded = pad_sequences(sequences)
for shown in (word_index, sequences, padded):
    print(shown)

{'<OOV>': 1, 'my': 2, 'love': 3, 'dog': 4, 'i': 5, 'you': 6, 'cat': 7, 'do': 8, 'think': 9, 'is': 10, 'amazing': 11}
[[4, 2, 1, 3], [4, 2, 1, 6], [5, 2, 1, 3], [7, 5, 8, 1, 3, 9, 10]]
[[ 0  0  0  4  2  1  3]
 [ 0  0  0  4  2  1  6]
 [ 0  0  0  5  2  1  3]
 [ 7  5  8  1  3  9 10]]


#### What we did here is make all the sequences the same size
#### But what if we would like to add the '0's after the sentence instead of in front?

In [43]:
# Same padding as before, but the zeros are appended after each sequence
# instead of being placed in front of it.
padded = pad_sequences(
    sequences,
    padding='post',
)
print(padded)

[[ 4  2  1  3  0  0  0]
 [ 4  2  1  6  0  0  0]
 [ 5  2  1  3  0  0  0]
 [ 7  5  8  1  3  9 10]]
