In [2]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer

In [3]:
# Tiny corpus used to build the vocabulary.
sentences = [
    "I love my dog",
    "I love my cat",
    "You love my dog!",
]

# num_words caps how many of the most frequent words are used when encoding.
tokenizer = Tokenizer(num_words=100)
tokenizer.fit_on_texts(sentences)

# word_index maps every word seen during fitting to an integer id; the printed
# result shows words lower-cased and with punctuation removed.
word_index = tokenizer.word_index
print(word_index)

{'love': 1, 'my': 2, 'i': 3, 'dog': 4, 'cat': 5, 'you': 6}


# Text to Sequence

In [4]:
# Same corpus as before plus one longer sentence, so the encoded
# sequences end up with different lengths.
sentences = [
    "I love my dog",
    "I love my cat",
    "You love my dog!",
    "Do you think my dog is amazing?",
]

# Refit from scratch: the word ids are reassigned for the new corpus.
tokenizer = Tokenizer(num_words=100)
tokenizer.fit_on_texts(sentences)

# Replace each sentence by the list of integer ids of its words.
word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(sentences)

# One object per output line: vocabulary first, encoded sentences second.
print(word_index, sequences, sep="\n")

{'my': 1, 'love': 2, 'dog': 3, 'i': 4, 'you': 5, 'cat': 6, 'do': 7, 'think': 8, 'is': 9, 'amazing': 10}
[[4, 2, 1, 3], [4, 2, 1, 6], [5, 2, 1, 3], [7, 5, 8, 1, 3, 9, 10]]


In [5]:
# 'really', 'loves' and 'manatee' were not in the fitted corpus, so they have
# no index; without an OOV token they are dropped from the encoded sequences.
test_data = ["I really love my dog", "My dog loves my manatee"]
test_seq = tokenizer.texts_to_sequences(test_data)
print(test_seq)

[[4, 2, 1, 3], [1, 3, 1]]


# Introduction to Padding

In [6]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
# Corpus with sentences of different lengths.
sentences = [
    "I love my dog",
    "I love my cat",
    "You love my dog!",
    "Do you think my dog is amazing?",
]

# oov_token reserves a vocabulary entry for words that were not seen during
# fitting (it shows up as '<OOV>' in the printed word index).
tokenizer = Tokenizer(num_words=100, oov_token="<OOV>")
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index

# Encode the sentences, then pad them into a rectangular array: with the
# default settings zeros are added in front, up to the longest sequence.
sequences = tokenizer.texts_to_sequences(sentences)
padded = pad_sequences(sequences)

# One object per output line: vocabulary, ragged sequences, padded array.
print(word_index, sequences, padded, sep="\n")

{'<OOV>': 1, 'my': 2, 'love': 3, 'dog': 4, 'i': 5, 'you': 6, 'cat': 7, 'do': 8, 'think': 9, 'is': 10, 'amazing': 11}
[[5, 3, 2, 4], [5, 3, 2, 7], [6, 3, 2, 4], [8, 6, 9, 2, 4, 10, 11]]
[[ 0  0  0  5  3  2  4]
 [ 0  0  0  5  3  2  7]
 [ 0  0  0  6  3  2  4]
 [ 8  6  9  2  4 10 11]]


In [7]:
# With an OOV token configured, unseen words ('really', 'loves', 'manatee')
# are encoded with the '<OOV>' id instead of being dropped.
test_data = ["I really love my dog", "My dog loves my manatee"]
test_seq = tokenizer.texts_to_sequences(test_data)
print(test_seq)


[[5, 1, 3, 2, 4], [2, 4, 1, 2, 1]]


In [12]:
# padding='post' places the zeros after each sentence rather than before it.
padded = pad_sequences(sequences, padding="post")
print(padded)

[[ 5  3  2  4  0  0  0]
 [ 5  3  2  7  0  0  0]
 [ 6  3  2  4  0  0  0]
 [ 8  6  9  2  4 10 11]]


In [8]:
# maxlen forces every row to a fixed length. Choosing a value smaller than the
# longest sequence truncates that sequence; with the default settings the
# leading tokens are the ones that get cut off.
padded = pad_sequences(sequences, maxlen=5)
print(padded)

[[ 0  5  3  2  4]
 [ 0  5  3  2  7]
 [ 0  6  3  2  4]
 [ 9  2  4 10 11]]


In [9]:
# A maxlen larger than the longest sequence truncates nothing: every row,
# including the longest one, is zero-padded up to that fixed length.
padded = pad_sequences(sequences, maxlen=10)
print(padded)

[[ 0  0  0  0  0  0  5  3  2  4]
 [ 0  0  0  0  0  0  5  3  2  7]
 [ 0  0  0  0  0  0  6  3  2  4]
 [ 0  0  0  8  6  9  2  4 10 11]]


In [None]:
import os
import zipfile
# Dataset archive, expected in the current working directory.
local_zip = 'news-headlines-dataset-for-sarcasm-detection.zip'

# Extract everything into the current working directory. The context manager
# closes the archive handle even if extraction raises, so no file descriptor
# is leaked on a failed or repeated run of this cell.
with zipfile.ZipFile(local_zip, 'r') as zip_ref:
    zip_ref.extractall()