Original Article : http://www.orbifold.net/default/2017/01/10/embedding-and-tokenizer-in-keras/

# Practice Tokenizer Class

In [1]:
from keras.preprocessing.text import Tokenizer

Using TensorFlow backend.


In [2]:
# Create a tokenizer with the default settings (no arguments are passed).
tokenizer = Tokenizer()

In [3]:
# Toy corpus: three short documents used throughout the notebook.
text = [
    "The sun is shining in june!",
    "I like disney cartoons.",
    "Lelouch of the rebellion is in Code Geass",
]

In [4]:
# Fit the tokenizer on the corpus; this populates the statistics inspected in the cells below.
tokenizer.fit_on_texts(text)

In [5]:
# Number of times each word appears across all fitted texts
# (words are lower-cased and punctuation is removed, as the output shows).
tokenizer.word_counts

OrderedDict([('the', 2),
             ('sun', 1),
             ('is', 2),
             ('shining', 1),
             ('in', 2),
             ('june', 1),
             ('i', 1),
             ('like', 1),
             ('disney', 1),
             ('cartoons', 1),
             ('lelouch', 1),
             ('of', 1),
             ('rebellion', 1),
             ('code', 1),
             ('geass', 1)])

In [6]:
# Maps each word to an integer id. Ids start at 1 (no word maps to 0), and in the
# output below the most frequent words ('the', 'is', 'in') receive the lowest ids.
tokenizer.word_index

{'cartoons': 10,
 'code': 14,
 'disney': 9,
 'geass': 15,
 'i': 7,
 'in': 3,
 'is': 2,
 'june': 6,
 'lelouch': 11,
 'like': 8,
 'of': 12,
 'rebellion': 13,
 'shining': 5,
 'sun': 4,
 'the': 1}

In [7]:
# Replace each word with its integer id from word_index; returns one list of ids per document.
tokenizer.texts_to_sequences(text)

[[1, 4, 2, 5, 3, 6], [7, 8, 9, 10], [11, 12, 1, 13, 2, 3, 14, 15]]

In [8]:
# Number of documents the tokenizer was fitted on.
tokenizer.document_count

3

In [9]:
# Whether the text is converted to lower case before tokenizing.
tokenizer.lower

True

In [10]:
# The set of characters filtered out of the text (punctuation, tab and newline).
tokenizer.filters

'!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'

In [11]:
# Whether the tokenizer works at character level (True) or word level (False).
tokenizer.char_level

False

In [12]:
# Number of documents in which each word appears (its document frequency),
# not the index of a document: 'the', 'is' and 'in' each occur in 2 of the 3 texts.
tokenizer.word_docs

{'cartoons': 1,
 'code': 1,
 'disney': 1,
 'geass': 1,
 'i': 1,
 'in': 2,
 'is': 2,
 'june': 1,
 'lelouch': 1,
 'like': 1,
 'of': 1,
 'rebellion': 1,
 'shining': 1,
 'sun': 1,
 'the': 2}

In [13]:
# tokenizer.texts_to_sequences maps the text to ids but returns variable-length sequences.
# tokenizer.texts_to_matrix instead returns a fixed-size n x m matrix, where n = number of
# documents and m = vocabulary size + 1 (column 0 is never set, since word ids start at 1).
# With the default mode, entry [i, j] is 1 if the word with id j occurs in document i, else 0.
print(tokenizer.texts_to_matrix(text))

[[ 0.  1.  1.  1.  1.  1.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  1.  1.  1.  1.  0.  0.  0.  0.  0.]
 [ 0.  1.  1.  1.  0.  0.  0.  0.  0.  0.  0.  1.  1.  1.  1.  1.]]


In [14]:
# Choosing the vectorization mode. mode='count' stores how many times each word occurs
# in the document; the result equals the default output here because no word is
# repeated within a single document.
print(tokenizer.texts_to_matrix(text,mode='count'))

[[ 0.  1.  1.  1.  1.  1.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  1.  1.  1.  1.  0.  0.  0.  0.  0.]
 [ 0.  1.  1.  1.  0.  0.  0.  0.  0.  0.  0.  1.  1.  1.  1.  1.]]


In [15]:
# mode='tfidf': TF-IDF weighting. Words found in fewer documents get a larger weight
# (0.916 for words in one document vs. 0.693 for words in two, in the output below).
print(tokenizer.texts_to_matrix(text,mode='tfidf'))

[[ 0.          0.69314718  0.69314718  0.69314718  0.91629073  0.91629073
   0.91629073  0.          0.          0.          0.          0.          0.
   0.          0.          0.        ]
 [ 0.          0.          0.          0.          0.          0.          0.
   0.91629073  0.91629073  0.91629073  0.91629073  0.          0.          0.
   0.          0.        ]
 [ 0.          0.69314718  0.69314718  0.69314718  0.          0.          0.
   0.          0.          0.          0.          0.91629073  0.91629073
   0.91629073  0.91629073  0.91629073]]


In [16]:
# mode='freq': each word's count divided by the number of words in its document
# (1/6, 1/4 and 1/8 for the three documents below).
print(tokenizer.texts_to_matrix(text,mode='freq'))

[[ 0.          0.16666667  0.16666667  0.16666667  0.16666667  0.16666667
   0.16666667  0.          0.          0.          0.          0.          0.
   0.          0.          0.        ]
 [ 0.          0.          0.          0.          0.          0.          0.
   0.25        0.25        0.25        0.25        0.          0.          0.
   0.          0.        ]
 [ 0.          0.125       0.125       0.125       0.          0.          0.
   0.          0.          0.          0.          0.125       0.125       0.125
   0.125       0.125     ]]


# Fit a Basic Model on text data:

In [17]:
from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten

In [18]:
import numpy as np

# Same three documents as in the tokenizer section, repeated so this section reads on its own.
text = ["The sun is shining in june!", "I like disney cartoons.", "Lelouch of the rebellion is in Code Geass"]
# Binary bag-of-words features: one row per document, one column per word id.
X = tokenizer.texts_to_matrix(text)
# One binary label per document. Stored as a NumPy array rather than a plain list:
# Keras expects array-like targets, and newer versions reject a Python list of ints.
y = np.array([0, 1, 0])

In [19]:
# Start an empty Sequential model; layers are added one at a time below.
model = Sequential()

In [20]:
# Hidden layer: 8 units fed by the bag-of-words columns of X (no activation is passed).
model.add(Dense(units=8, input_dim=X.shape[1]))
# Output layer: a single sigmoid unit for the binary label.
model.add(Dense(units=1, activation='sigmoid'))

In [21]:
# Print each layer with its output shape and parameter count.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 8)                 136       
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 9         
Total params: 145
Trainable params: 145
Non-trainable params: 0
_________________________________________________________________


In [22]:
# Binary classification: cross-entropy loss, optimized with RMSprop.
model.compile(loss='binary_crossentropy', optimizer='rmsprop')

In [23]:
# Train silently for 1000 epochs. batch_size is larger than the 3 samples,
# and validation_split holds part of the data out for validation.
model.fit(
    X,
    y=y,
    batch_size=200,
    epochs=1000,
    verbose=0,
    validation_split=0.2,
    shuffle=True,
)

<keras.callbacks.History at 0x7dcdeef60>

In [24]:
# Import NumPy directly. Pulling `np` out of keras.utils.np_utils only works because
# that module happens to import NumPy internally, and breaks if the module changes.
import numpy as np

# Round the sigmoid outputs to hard 0/1 class predictions.
np.round(model.predict(X))

array([[ 0.],
       [ 1.],
       [ 0.]], dtype=float32)

# Create embeddings with keras 

When the vocabulary is large, text sequences turn into sparse vectors, because a single sentence only ever uses a small fraction of the vocabulary. We map those sparse vectors to a dense, lower-dimensional representation with an embedding layer.

In [25]:
# This is the sparse vector: a single row of 0/1 entries.
# Note: it has to be a NumPy array; plain lists don't work.
sparse = np.array([[0,1,0,1,1,0,0]])

In [26]:
# One row (sample) with 7 entries.
sparse.shape

(1, 7)

In [30]:
# getting the embedding layer
from keras.layers import Embedding

In [31]:
# NOTE: this rebinds `model`; the Dense classifier trained earlier is no longer
# reachable under that name after this cell runs.
model = Sequential()

In [32]:
# input_dim    = vocabulary size, i.e. the largest integer index + 1.
#                `sparse` only contains the indices 0 and 1, so input_dim = 1 + 1 = 2.
# output_dim   = length of the embedding vector produced for each index.
# input_length = length of every input sequence when it is constant (7 = sparse.shape[1]).
model.add(Embedding(input_dim=2,output_dim=2,input_length=7))

In [33]:
# Compile with RMSprop and mean-squared-error loss; this model is never trained in the notebook.
model.compile(optimizer='rmsprop', loss='mse')

In [34]:
# predict returns the embedding vector looked up for each input index. The model has
# not been fitted, so these are the layer's initial weights.
output_embedding = model.predict(sparse)

In [35]:
# Equal input indices map to the same 2-d vector (one vector for 0, another for 1).
output_embedding

array([[[-0.00041743, -0.03563239],
        [ 0.0299042 , -0.01917176],
        [-0.00041743, -0.03563239],
        [ 0.0299042 , -0.01917176],
        [ 0.0299042 , -0.01917176],
        [-0.00041743, -0.03563239],
        [-0.00041743, -0.03563239]]], dtype=float32)

In [36]:
# The embedding layer holds 4 parameters: 2 indices x 2 embedding dimensions.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 7, 2)              4         
Total params: 4
Trainable params: 4
Non-trainable params: 0
_________________________________________________________________


In [37]:
# (samples, sequence length, embedding dimension)
output_embedding.shape

(1, 7, 2)

In [38]:
# The binary bag-of-words matrix built earlier with tokenizer.texts_to_matrix(text).
X

array([[ 0.,  1.,  1.,  1.,  1.,  1.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  1.,  1.,  1.,  0.,  0.,
         0.,  0.,  0.],
       [ 0.,  1.,  1.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  1.,
         1.,  1.,  1.]])

In [39]:
# 3 documents x 16 columns (15 words plus the unused column 0).
X.shape

(3, 16)

In [40]:
# Second model: an embedding layer followed by a flattened dense classifier.
model2 = Sequential()

In [41]:
# Embed each of the 16 positions of a row of X into a 10-d vector.
# NOTE(review): X holds only the values 0 and 1, so only embedding rows 0 and 1 are
# ever looked up even though input_dim is 16. To embed actual word ids, the input
# would have to be (padded) texts_to_sequences output instead.
model2.add(Embedding(input_dim=X.shape[1], output_dim=10, input_length=X.shape[1]))

In [42]:
# Flatten the per-sample (16, 10) embedding output into a single vector of length 160.
model2.add(Flatten())

In [43]:
# Output layer: a single sigmoid unit for the binary label.
model2.add(Dense(units=1, activation='sigmoid'))

In [44]:
# Print each layer of model2 with its output shape and parameter count.
model2.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 16, 10)            160       
_________________________________________________________________
flatten_1 (Flatten)          (None, 160)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 161       
Total params: 321
Trainable params: 321
Non-trainable params: 0
_________________________________________________________________


In [45]:
# Same training setup as the first classifier, run for 700 epochs.
model2.compile(optimizer='rmsprop', loss='binary_crossentropy')
model2.fit(
    X,
    y=y,
    batch_size=200,
    epochs=700,
    verbose=0,
    validation_split=0.2,
    shuffle=True,
)


<keras.callbacks.History at 0x7e93a0f60>

In [46]:
# Round model2's sigmoid outputs to hard 0/1 class predictions.
np.round(model2.predict(X))

array([[ 0.],
       [ 1.],
       [ 0.]], dtype=float32)

In [47]:
from keras.layers import LSTM

# Third model: same embedding front-end as model2, but an LSTM with 3 units reads the
# embedded positions as a sequence instead of flattening them.
model3 = Sequential()
model3.add(Embedding(input_dim=X.shape[1], output_dim=10, input_length=X.shape[1]))
model3.add(LSTM(units=3))
model3.add(Dense(units=1, activation='sigmoid'))
model3.compile(optimizer='rmsprop', loss='binary_crossentropy')
# No batch_size is passed here, so Keras uses its default.
model3.fit(
    X,
    y=y,
    epochs=500,
    verbose=0,
    validation_split=0.2,
    shuffle=True,
)

<keras.callbacks.History at 0x7eaae0160>

In [60]:
# Round model3's sigmoid outputs to hard 0/1 class predictions.
np.round(model3.predict(X))

array([[ 0.],
       [ 1.],
       [ 0.]], dtype=float32)