# Split words with text_to_word_sequence

In [1]:
# Tokenization is the process of splitting text into individual words,
# which are called tokens.
import keras
from keras.preprocessing.text import text_to_word_sequence
# Define the sample document; `text` is read again by the one_hot and
# hashing_trick cells further down.
text = 'The quick brown fox jumped over the lazy dog.'
# Tokenize the document. In the recorded output the tokens come back
# lower-cased and without the trailing period. `result` is read by the
# vocabulary-size cell that follows.
result = text_to_word_sequence(text)
print(result)

Using TensorFlow backend.


['the', 'quick', 'brown', 'fox', 'jumped', 'over', 'the', 'lazy', 'dog']


# Encoding with one_hot

In [2]:
# Estimate the size of the vocabulary as the number of distinct tokens.
# `result` is expected to be the token list produced by text_to_word_sequence.
words = set(result)
vocab_size = len(words)
print(vocab_size)

8


In [3]:
# Putting it together with the one_hot() function
from keras.preprocessing.text import one_hot
# Integer-encode the document. The second argument is the vocabulary estimate
# scaled by 1.3 and rounded (presumably head-room to reduce hash collisions --
# TODO confirm the rationale for 1.3).
# NOTE(review): the recorded output maps several different words to the same
# integer, and the Keras docs describe one_hot as hash-based, so the mapping is
# not guaranteed unique and may differ between runs -- confirm.
encoding = one_hot(text, round(vocab_size*1.3))
print(encoding)

[7, 1, 9, 4, 2, 7, 7, 2, 5]


# Hash Encoding with hashing_trick

In [4]:
from keras.preprocessing.text import hashing_trick
# Hash every word of `text` into the same enlarged index space used for the
# one_hot cell (vocabulary estimate * 1.3, rounded), this time with the md5
# hash function, which the Keras docs describe as stable across runs.
# The hashes get their own name instead of rebinding `result`: `result` holds
# the token list that the vocabulary-size cell reads, and overwriting it here
# would make a re-run of that cell count distinct hash values, not words.
hashed_doc = hashing_trick(text, round(vocab_size*1.3), hash_function='md5')
print(hashed_doc)

[6, 4, 1, 2, 7, 5, 6, 2, 6]


# Tokenizer API

In [5]:
from keras.preprocessing.text import Tokenizer
# define 5 short documents used to fit the tokenizer
docs = ['Well done!',
       'Good work',
       'Great effort',
       'nice work',
       'Excellent!']
# create the tokenizer with its default settings
tokenizer = Tokenizer()
# fit the tokenizer on the documents; this fills in the attributes that the
# summary cell prints (word_counts, document_count, word_index, word_docs)
tokenizer.fit_on_texts(docs)

In [6]:
# Summarize what the fitted tokenizer learned, one attribute per output line:
# word_counts, document_count, word_index and word_docs, in that order.
print(
    tokenizer.word_counts,
    tokenizer.document_count,
    tokenizer.word_index,
    tokenizer.word_docs,
    sep='\n',
)

OrderedDict([('well', 1), ('done', 1), ('good', 1), ('work', 2), ('great', 1), ('effort', 1), ('nice', 1), ('excellent', 1)])
5
{'work': 1, 'well': 2, 'done': 3, 'good': 4, 'great': 5, 'effort': 6, 'nice': 7, 'excellent': 8}
defaultdict(<class 'int'>, {'well': 1, 'done': 1, 'work': 2, 'good': 1, 'great': 1, 'effort': 1, 'nice': 1, 'excellent': 1})


In [8]:
# Encode each document as one row of a word-count matrix (mode='count').
# In the recorded output, column i holds the count of the word whose
# word_index is i, and column 0 is never set, so there are
# len(word_index) + 1 columns.
encoded_docs = tokenizer.texts_to_matrix(docs, mode='count')
print(encoded_docs)

[[0. 0. 1. 1. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 1. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1.]]
