In [1]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer

In [2]:
# Training corpus: short sentences with mixed casing and punctuation.
sentences = [
    "i love my dog!",
    "I love my cat",
    "my dog loves me",
    "my cat does not love me",
    "You love my dog!",
    "Do you not think my dog is amazing?",
]

## Create Tokenizer instance
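`num_words=100` caps the vocabulary size: when text is converted to sequences, only the most frequent words (indices below `num_words`) are kept. The corpus here, `sentences`, has far fewer distinct words than that limit, so every word is retained.
`oov_token` is the placeholder substituted for any word that was not seen while fitting the tokenizer (an "out-of-vocabulary" word).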

In [3]:
# "<OOV>" stands in for words the tokenizer has not seen; num_words bounds
# how many of the most frequent words are used when encoding text.
tokenizer = Tokenizer(oov_token="<OOV>", num_words=100)

# Build the vocabulary from the training corpus.
tokenizer.fit_on_texts(sentences)

In [4]:
# Word -> integer index mapping built by fit_on_texts.
word_index = tokenizer.word_index

In [5]:
print(word_index) # Keys are lowercased and punctuation-free: "I"/"i" share one entry, "dog!" is stored as "dog"

{'<OOV>': 1, 'my': 2, 'love': 3, 'dog': 4, 'i': 5, 'cat': 6, 'me': 7, 'not': 8, 'you': 9, 'loves': 10, 'does': 11, 'do': 12, 'think': 13, 'is': 14, 'amazing': 15}


In [6]:
# Encode every training sentence as a list of integer word indices.
sequences = tokenizer.texts_to_sequences(sentences)

In [7]:
print(sequences) # One list per sentence; each word is replaced by its index from word_index

[[5, 3, 2, 4], [5, 3, 2, 6], [2, 4, 10, 7], [2, 6, 11, 8, 3, 7], [9, 3, 2, 4], [12, 9, 8, 13, 2, 4, 14, 15]]


In [8]:
# Sentences not used for fitting; they contain words missing from the
# training corpus ("really", "sandals").
test_data = [
    "I really love my dog",
    "my dog really loves my sandals",
]

In [9]:
# Encode the test sentences with the already-fitted tokenizer. Words it never
# saw during fitting come out as index 1, the "<OOV>" entry.
test_seq = tokenizer.texts_to_sequences(test_data)
print(test_seq)

[[5, 1, 3, 2, 4], [2, 4, 1, 10, 2, 1]]
