## Preparing text for use with TensorFlow models

### Import the classes and functions you need

In [41]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

### Write some sentences

In [42]:
# Toy corpus: six short sentences with overlapping vocabulary. The cells
# below fit the tokenizer on these sentences and encode them.
sentences = [
    "My favourite food is ice cream",
    "do you like ice cream too?",
    "My dog likes ice cream too!",
    "your favourite flavour of ice cream is chocolate",
    "chocolate isn't good for dogs",
    "your dog your cat, and your parrot prefer brocolli",
]

print(sentences)

['My favourite food is ice cream', 'do you like ice cream too?', 'My dog likes ice cream too!', 'your favourite flavour of ice cream is chocolate', "chocolate isn't good for dogs", 'your dog your cat, and your parrot prefer brocolli']


### Create a tokenizer and define an out of vocabulary token

In [43]:
# num_words caps how many of the most frequent words are kept when texts are
# encoded (the corpus here is far smaller than 100 words, so nothing is cut).
# oov_token is the placeholder that stands in for any word that is not in the
# fitted vocabulary.
tokenizer = Tokenizer(
    num_words=100,
    oov_token="<OOV>",
)
tokenizer  # bare last expression so the notebook displays the object's repr

<keras.src.preprocessing.text.Tokenizer at 0x7f6f6e10fb50>

### Tokenize the words

In [44]:
# Build the vocabulary from the example sentences; the word index printed
# below is the result of this fit.
tokenizer.fit_on_texts(sentences)
# word_index maps each word to its integer id. As the printed output shows,
# words are lower-cased, punctuation such as "?" and "!" is dropped, the OOV
# token gets id 1, and the most frequent words get the smallest ids.
word_index = tokenizer.word_index
print(word_index)

{'<OOV>': 1, 'ice': 2, 'cream': 3, 'your': 4, 'my': 5, 'favourite': 6, 'is': 7, 'too': 8, 'dog': 9, 'chocolate': 10, 'food': 11, 'do': 12, 'you': 13, 'like': 14, 'likes': 15, 'flavour': 16, 'of': 17, "isn't": 18, 'good': 19, 'for': 20, 'dogs': 21, 'cat': 22, 'and': 23, 'parrot': 24, 'prefer': 25, 'brocolli': 26}


### Turn sentences into sequences

In [45]:
# Replace every word of each sentence with its id from the word index. The
# result is one list of integers per sentence, and the lists differ in length
# because the sentences do.
sequences = tokenizer.texts_to_sequences(sentences)
print(sequences)

[[5, 6, 11, 7, 2, 3], [12, 13, 14, 2, 3, 8], [5, 9, 15, 2, 3, 8], [4, 6, 16, 17, 2, 3, 7, 10], [10, 18, 19, 20, 21], [4, 9, 4, 22, 23, 4, 24, 25, 26]]


### Pad the sequences so they all have the same length

In [46]:
# No maxlen is given, so every row ends up as long as the longest sequence
# (9 ids in the output below). Shorter rows are filled with 0 at the front;
# padding="pre" is the default and is written out here to make that explicit.
padded = pad_sequences(sequences, padding="pre")
print("Padded sequence: \n", padded)

Padded sequence: 
 [[ 0  0  0  5  6 11  7  2  3]
 [ 0  0  0 12 13 14  2  3  8]
 [ 0  0  0  5  9 15  2  3  8]
 [ 0  4  6 16 17  2  3  7 10]
 [ 0  0  0  0 10 18 19 20 21]
 [ 4  9  4 22 23  4 24 25 26]]


In [47]:
# Specify the maximum length: every row is now 15 ids wide, with the zeros
# added at the front (padding="pre" is the default, spelled out here).
padded = pad_sequences(sequences, maxlen=15, padding="pre")
print(padded)

[[ 0  0  0  0  0  0  0  0  0  5  6 11  7  2  3]
 [ 0  0  0  0  0  0  0  0  0 12 13 14  2  3  8]
 [ 0  0  0  0  0  0  0  0  0  5  9 15  2  3  8]
 [ 0  0  0  0  0  0  0  4  6 16 17  2  3  7 10]
 [ 0  0  0  0  0  0  0  0  0  0 10 18 19 20 21]
 [ 0  0  0  0  0  0  4  9  4 22 23  4 24 25 26]]


In [48]:
# Padding at the end of sentences: padding="post" appends the zeros after the
# word ids instead of placing them in front.
padded = pad_sequences(
    sequences,
    maxlen=15,
    padding="post",
)
print(padded)

[[ 5  6 11  7  2  3  0  0  0  0  0  0  0  0  0]
 [12 13 14  2  3  8  0  0  0  0  0  0  0  0  0]
 [ 5  9 15  2  3  8  0  0  0  0  0  0  0  0  0]
 [ 4  6 16 17  2  3  7 10  0  0  0  0  0  0  0]
 [10 18 19 20 21  0  0  0  0  0  0  0  0  0  0]
 [ 4  9  4 22 23  4 24 25 26  0  0  0  0  0  0]]


In [49]:
# Limit length: sequences longer than maxlen are cut down to 3 ids.
# truncating="pre" is the default (written out here to make it explicit): ids
# are dropped from the front, so only the last three ids of each sentence
# remain, as the output below shows.
padded = pad_sequences(sequences, maxlen=3, truncating="pre")
print(padded)

[[ 7  2  3]
 [ 2  3  8]
 [ 2  3  8]
 [ 3  7 10]
 [19 20 21]
 [24 25 26]]


##### What happens if some of the sentences contain words that are not in the word index?

In [50]:
# New sentences that contain words the tokenizer was not fitted on
# (for example "best", "strawberry" and "manatee").
test_data = [
    "my best friend's favourite ice cream flavour is strawberry",
    "my dog's best friend is a manatee",
]
print(test_data, end="\n\n")

# Show which id the out-of-vocabulary placeholder has in the word index.
print(
    "<OOV> has the number ",
    word_index["<OOV>"],
    " in the word index",
)

["my best friend's favourite ice cream flavour is strawberry", "my dog's best friend is a manatee"]

<OOV> has the number  1  in the word index


In [51]:
# Convert the test_data to sequences. Words that are missing from the word
# index come out as 1, the id of the OOV token (see the output below).
test_sequences = tokenizer.texts_to_sequences(test_data)
print("Test sequence: \n", test_sequences)

# Pad the test sequences to a fixed width of 10 ids; with the default
# settings the zeros are placed at the front of each row.
padded_test_sequences = pad_sequences(test_sequences, maxlen=10)
print("\n Padded test sequences: \n", padded_test_sequences)

Test sequence: 
 [[5, 1, 1, 6, 2, 3, 16, 7, 1], [5, 1, 1, 1, 7, 1, 1]]

 Padded test sequences: 
 [[ 0  5  1  1  6  2  3 16  7  1]
 [ 0  0  0  5  1  1  1  7  1  1]]
