# text_to_word_sequence(), aka tokenizing a sentence into words

In [2]:

from keras.preprocessing.text import text_to_word_sequence

# Sample document: mixed case, punctuation, an apostrophe and a hyphen.
text = (
    "The quick brown fox jumped over the lazy dog. "
    "He couldn't drive green-blueish!"
)
# Split the document into a list of word tokens and display it.
result = text_to_word_sequence(text)
print(result)

Using Theano backend.


['the', 'quick', 'brown', 'fox', 'jumped', 'over', 'the', 'lazy', 'dog', 'he', "couldn't", 'drive', 'green', 'blueish']


# one_hot encoding  (converts to integer equiv.)
* The name suggests that it creates a one-hot encoding of the document, WHICH IS NOT THE CASE; it returns an integer-encoded version of the document.
* It uses a HASH function, so there may be collisions and tokens are not guaranteed to be unique (e.g. 2 different words in the vocabulary may be assigned the same integer token). Ideally the vocabulary size passed in is therefore somewhat larger (perhaps 25% more) than the actual vocabulary, to minimize the number of collisions.

In [3]:
# The text is lower-cased, stripped of punctuation and split on whitespace.
# one_hot() also needs a size argument: it defines the hashing space from
# which the words are hashed. Making it somewhat larger than the vocabulary
# (the notes suggest perhaps 25%; 30% is used below) reduces collisions.
from keras.preprocessing.text import one_hot, text_to_word_sequence

# Sample document.
text = (
    "The quick brown fox jumped over the lazy dog. "
    "He couldn't drive green-blueish!"
)

# Tokenize the document and use the token count as the vocabulary size.
# NOTE(review): this counts every token, so a repeated word (e.g. "the")
# is counted more than once; it is an upper bound on the unique-word count.
tokensList = text_to_word_sequence(text)
vocab_size = len(tokensList)
print("vocab_size: ", vocab_size)

# Hash each word to an integer, leaving 30% headroom over vocab_size
# (a fixed extra amount could be added instead).
hash_space = round(vocab_size * 1.3)
encoded = one_hot(text, hash_space)
print(encoded)

vocab_size:  14
[1, 7, 2, 5, 17, 12, 1, 3, 11, 7, 13, 5, 5, 8]


# Tokenizer API
* The Tokenizer must be constructed and then fit on either raw text documents or integer-encoded text documents.

In [4]:

from keras.preprocessing.text import Tokenizer

# Five short example documents.
docs = [
    'Well done!',
    'Good work',
    'Great effort',
    'nice work',
    'Excellent!',
]
# Build the tokenizer. A vocabulary limit can be passed here (Tokenizer's
# num_words argument); when converting texts to sequences, words outside
# that limit are then ignored.
tokenizer = Tokenizer()
# Learn the vocabulary from the documents.
tokenizer.fit_on_texts(docs)

# Once fit, the Tokenizer exposes 4 attributes
* word_counts: A dictionary of words and their counts.
* word_docs: A dictionary of words and how many documents each appeared in.
* word_index: A dictionary of words and their uniquely assigned integers.
* document_count: An integer count of the total number of documents that were used to fit the Tokenizer.

In [5]:
# Total number of occurrences of each word across the fitted documents.
tokenizer.word_counts

OrderedDict([('well', 1),
             ('done', 1),
             ('good', 1),
             ('work', 2),
             ('great', 1),
             ('effort', 1),
             ('nice', 1),
             ('excellent', 1)])

In [6]:
# Number of documents in which each word appears.
tokenizer.word_docs

defaultdict(int,
            {'done': 1,
             'well': 1,
             'work': 2,
             'good': 1,
             'great': 1,
             'effort': 1,
             'nice': 1,
             'excellent': 1})

In [11]:
tokenizer.word_index
# Ordered by frequency: the most frequent word gets the lowest index.

{'work': 1,
 'well': 2,
 'done': 3,
 'good': 4,
 'great': 5,
 'effort': 6,
 'nice': 7,
 'excellent': 8}

In [8]:
# Number of documents the tokenizer was fitted on.
tokenizer.document_count

5

In [9]:
# Convert each document into a list of the integer indices of its words.
seq = tokenizer.texts_to_sequences(docs)
print(seq)

[[2, 3], [4, 1], [5, 6], [7, 1], [8]]
