### Tokenizing Text

##### Set up

In [55]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

##### Tokenize the sentences

In [56]:
# Two-sentence toy corpus the tokenizer is fitted on.
sentences = [
    "My favourite food is ice cream",
    "do you like ice cream too?",
]

# oov_token is the placeholder that stands in for words the tokenizer has
# not seen; num_words is the vocabulary-size limit passed to Keras
# (20 here, more than the number of distinct words in this corpus).
tokenizer = Tokenizer(num_words=20, oov_token="<OOV>")
tokenizer.fit_on_texts(sentences)

##### View the word index

In [57]:
# Display the fitted tokenizer's word -> integer index mapping.
tokenizer.word_index

{'<OOV>': 1,
 'ice': 2,
 'cream': 3,
 'my': 4,
 'favourite': 5,
 'food': 6,
 'is': 7,
 'do': 8,
 'you': 9,
 'like': 10,
 'too': 11}

##### Sequencing

In [58]:
# Convert each training sentence into a list of word indices
# (one inner list per sentence, in the sentence's word order).
sequences = tokenizer.texts_to_sequences(sentences)
# Bare last expression: the notebook renders the value as the cell output.
sequences

[[4, 5, 6, 7, 2, 3], [8, 9, 10, 2, 3, 11]]


In [59]:
# A sentence containing words the tokenizer was not fitted on
# ("Your", "strawberries", "and"); those come back as the <OOV> index.
new_sentence = [
    "Your favourite food is strawberries and cream",
]
new_sequences = tokenizer.texts_to_sequences(new_sentence)
# Bare last expression: the notebook renders the value as the cell output.
new_sequences

[[1, 5, 6, 7, 1, 1, 3]]


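The 1s represent out-of-vocabulary (OOV) words: "Your", "strawberries" and "and" did not appear in the sentences the tokenizer was fitted on, so each one is mapped to the `<OOV>` index, 1.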

### Padding and sequencing

In [49]:
# Fresh corpus for the padding demo. Note that this rebinds `sentences`
# and `tokenizer`, replacing the objects created in the previous section.
sentences = [
    "My favourite food is ice cream",
    "I love dogs",
    "It's raining cats and dogs",
]

# No oov_token is set this time, so index 1 belongs to a real word.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences)

# Show the resulting word -> integer index mapping.
tokenizer.word_index

{'dogs': 1,
 'my': 2,
 'favourite': 3,
 'food': 4,
 'is': 5,
 'ice': 6,
 'cream': 7,
 'i': 8,
 'love': 9,
 "it's": 10,
 'raining': 11,
 'cats': 12,
 'and': 13}

In [50]:
# Encode every sentence as a list of word indices; the lists differ in
# length because the sentences have different word counts.
sequences = tokenizer.texts_to_sequences(sentences)
sequences

[[2, 3, 4, 5, 6, 7], [8, 9, 1], [10, 11, 12, 13, 1]]

In [54]:
# Pad every sequence to a fixed length of 10; padding="post" places the
# zeros after the word indices rather than before them.
padded_seq = pad_sequences(
    sequences,
    maxlen=10,
    padding="post",
)
padded_seq

array([[ 2,  3,  4,  5,  6,  7,  0,  0,  0,  0],
       [ 8,  9,  1,  0,  0,  0,  0,  0,  0,  0],
       [10, 11, 12, 13,  1,  0,  0,  0,  0,  0]], dtype=int32)