## Preparing text for use with TensorFlow models

### Import the classes and functions you need

In [1]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

2024-03-10 22:37:22.998845: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-03-10 22:37:23.689809: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-03-10 22:37:23.691062: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


### Write some sentences

In [2]:
# Toy corpus of six short sentences. The vocabulary overlaps on purpose
# ("ice cream", "your", "dog"), and the text mixes capitalisation,
# punctuation and an apostrophe.
sentences = [
    "My favourite food is ice cream",
    "do you like ice cream too?",
    "My dog likes ice cream too!",
    "your favourite flavour of ice cream is chocolate",
    "chocolate isn't good for dogs",
    "your dog your cat, and your parrot prefer brocolli",
]
print(sentences)

['My favourite food is ice cream', 'do you like ice cream too?', 'My dog likes ice cream too!', 'your favourite flavour of ice cream is chocolate', "chocolate isn't good for dogs", 'your dog your cat, and your parrot prefer brocolli']


### Create a tokenizer and define an out-of-vocabulary (OOV) token

In [3]:
# num_words limits how many of the most frequent words are used when text is
# later converted to sequences (per the Keras Tokenizer docs); oov_token is
# the placeholder emitted for any word that is not in the vocabulary.
tokenizer = Tokenizer(
    num_words=100,
    oov_token="<OOV>",
)
tokenizer

<keras.src.preprocessing.text.Tokenizer at 0x7f6f6e77e110>

### Tokenize the words

In [4]:
# Build the tokenizer's vocabulary from the corpus.
tokenizer.fit_on_texts(sentences)
# word_index maps each word to an integer. As the printed mapping shows, the
# OOV token takes index 1, real words start at 2 with the most frequent words
# first, and words are lower-cased with surrounding punctuation removed.
word_index = tokenizer.word_index
print(word_index)

{'<OOV>': 1, 'ice': 2, 'cream': 3, 'your': 4, 'my': 5, 'favourite': 6, 'is': 7, 'too': 8, 'dog': 9, 'chocolate': 10, 'food': 11, 'do': 12, 'you': 13, 'like': 14, 'likes': 15, 'flavour': 16, 'of': 17, "isn't": 18, 'good': 19, 'for': 20, 'dogs': 21, 'cat': 22, 'and': 23, 'parrot': 24, 'prefer': 25, 'brocolli': 26}


### Turn sentences into sequences

In [17]:
# Replace every word of each sentence with its integer from word_index,
# giving one list of ints per sentence. The lists have different lengths
# because the sentences do.
sequences = tokenizer.texts_to_sequences(sentences)
print(sequences)

[[5, 6, 11, 7, 2, 3], [12, 13, 14, 2, 3, 8], [5, 9, 15, 2, 3, 8], [4, 6, 16, 17, 2, 3, 7, 10], [10, 18, 19, 20, 21], [4, 9, 4, 22, 23, 4, 24, 25, 26]]


### Pad the sequences to the same length

In [18]:
# With the default settings every sequence is padded with zeros on the left
# until it is as long as the longest sequence (9 tokens for this corpus).
padded = pad_sequences(sequences)
print("Padded sequence: \n", padded)

Padded sequence: 
 [[ 0  0  0  5  6 11  7  2  3]
 [ 0  0  0 12 13 14  2  3  8]
 [ 0  0  0  5  9 15  2  3  8]
 [ 0  4  6 16 17  2  3  7 10]
 [ 0  0  0  0 10 18 19 20 21]
 [ 4  9  4 22 23  4 24 25 26]]


In [19]:
# Fix the output width at 15 columns instead of the longest sequence's length;
# the extra room is still filled with zeros on the left.
padded = pad_sequences(sequences, maxlen=15)
print(padded)

[[ 0  0  0  0  0  0  0  0  0  5  6 11  7  2  3]
 [ 0  0  0  0  0  0  0  0  0 12 13 14  2  3  8]
 [ 0  0  0  0  0  0  0  0  0  5  9 15  2  3  8]
 [ 0  0  0  0  0  0  0  4  6 16 17  2  3  7 10]
 [ 0  0  0  0  0  0  0  0  0  0 10 18 19 20 21]
 [ 0  0  0  0  0  0  4  9  4 22 23  4 24 25 26]]


In [20]:
# Same 15-column width, but the zeros are appended after the tokens
# rather than placed in front of them.
padded = pad_sequences(sequences, maxlen=15, padding="post")
print(padded)

[[ 5  6 11  7  2  3  0  0  0  0  0  0  0  0  0]
 [12 13 14  2  3  8  0  0  0  0  0  0  0  0  0]
 [ 5  9 15  2  3  8  0  0  0  0  0  0  0  0  0]
 [ 4  6 16 17  2  3  7 10  0  0  0  0  0  0  0]
 [10 18 19 20 21  0  0  0  0  0  0  0  0  0  0]
 [ 4  9  4 22 23  4 24 25 26  0  0  0  0  0  0]]


In [21]:
# A maxlen shorter than the sequences truncates them. truncating="pre" is the
# default and is only spelled out here for clarity: tokens are dropped from
# the front, so just the last 3 tokens of each sentence remain.
padded = pad_sequences(sequences, maxlen=3, truncating="pre")
print(padded)

[[ 7  2  3]
 [ 2  3  8]
 [ 2  3  8]
 [ 3  7 10]
 [19 20 21]
 [24 25 26]]
