In [1]:
import numpy as np
import pandas as pd
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import LSTM, Dense, GRU, Embedding, Flatten
from keras.callbacks import EarlyStopping, ModelCheckpoint
import re

Using TensorFlow backend.


In [2]:
def tokenize(text):
    # obtains tokens with a least 1 alphabet
    pattern = re.compile(r'[A-Za-z]+[\w^\']*|[\w^\']*[A-Za-z]+[\w^\']*')
    return pattern.findall(text.lower())
def mapping(tokens):
    word_to_id = dict()
    id_to_word = dict()
    for i,word in enumerate(set(tokens)):
        word_to_id[word] = i
        id_to_word[i] = word
    return word_to_id, id_to_word

In [3]:
doc = "After the deduction of the costs of investing, " \
      "beating the stock market is a loser's game."
tokens = tokenize(doc)
print(tokens)

['after', 'the', 'deduction', 'of', 'the', 'costs', 'of', 'investing', 'beating', 'the', 'stock', 'market', 'is', 'a', "loser's", 'game']


In [4]:
word_to_id, id_to_word = mapping(tokens)
print(word_to_id)
print(id_to_word)

{'the': 0, 'after': 1, 'market': 2, 'game': 3, 'beating': 4, 'a': 5, 'is': 6, 'costs': 7, 'stock': 8, 'investing': 9, "loser's": 10, 'deduction': 11, 'of': 12}
{0: 'the', 1: 'after', 2: 'market', 3: 'game', 4: 'beating', 5: 'a', 6: 'is', 7: 'costs', 8: 'stock', 9: 'investing', 10: "loser's", 11: 'deduction', 12: 'of'}


In [5]:
#real code
def generate_training_data(tokens, word_to_id, window_size):
    L = len(tokens)
    X, Y = [], []
    tempX, tempY = [], []
    for i in range(L):
        index_before_after = list(range(max(0, i - window_size), i)) + \
                             list(range(i + 1, min(i + window_size + 1,L)))
        #print(index_before_after)
        for j in index_before_after:
            tempX.append(word_to_id[tokens[j]])
            #tempY.append(word_to_id[tokens[i]])
        print(tempX)
        X.append(tempX)
        Y.append(word_to_id[tokens[i]])
        tempX = []
    #X = np.array(X)
    #Y = np.array(Y)    
    return X,Y

In [6]:
X,Y = generate_training_data(tokens, word_to_id, 3)
print(X[:10])
print(type(X))
print("------------------------------------------------------")
print(Y[:10])
#print(Y.shape)

[0, 11, 12]
[1, 11, 12, 0]
[1, 0, 12, 0, 7]
[1, 0, 11, 0, 7, 12]
[0, 11, 12, 7, 12, 9]
[11, 12, 0, 12, 9, 4]
[12, 0, 7, 9, 4, 0]
[0, 7, 12, 4, 0, 8]
[7, 12, 9, 0, 8, 2]
[12, 9, 4, 8, 2, 6]
[9, 4, 0, 2, 6, 5]
[4, 0, 8, 6, 5, 10]
[0, 8, 2, 5, 10, 3]
[8, 2, 6, 10, 3]
[2, 6, 5, 3]
[6, 5, 10]
[[0, 11, 12], [1, 11, 12, 0], [1, 0, 12, 0, 7], [1, 0, 11, 0, 7, 12], [0, 11, 12, 7, 12, 9], [11, 12, 0, 12, 9, 4], [12, 0, 7, 9, 4, 0], [0, 7, 12, 4, 0, 8], [7, 12, 9, 0, 8, 2], [12, 9, 4, 8, 2, 6]]
<class 'list'>
------------------------------------------------------
[1, 0, 11, 12, 0, 7, 12, 9, 4, 0]


In [7]:
MAX_LENGTH = 6

X = pad_sequences(X, maxlen=MAX_LENGTH, padding='post')
print(X)

[[ 0 11 12  0  0  0]
 [ 1 11 12  0  0  0]
 [ 1  0 12  0  7  0]
 [ 1  0 11  0  7 12]
 [ 0 11 12  7 12  9]
 [11 12  0 12  9  4]
 [12  0  7  9  4  0]
 [ 0  7 12  4  0  8]
 [ 7 12  9  0  8  2]
 [12  9  4  8  2  6]
 [ 9  4  0  2  6  5]
 [ 4  0  8  6  5 10]
 [ 0  8  2  5 10  3]
 [ 8  2  6 10  3  0]
 [ 2  6  5  3  0  0]
 [ 6  5 10  0  0  0]]


In [8]:
Y = to_categorical(Y, num_classes=13)
print(Y.shape)

(16, 13)


In [9]:
model = Sequential()
model.add(Embedding(13, 50, input_length=6))
model.add(GRU(150, recurrent_dropout=0.1, dropout=0.1))
model.add(Dense(13, activation='softmax'))
model.compile(loss='categorical_crossentropy', metrics=['acc'], optimizer='adam')
print(model.summary())

Instructions for updating:
Colocations handled automatically by placer.
Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 6, 50)             650       
_________________________________________________________________
gru_1 (GRU)                  (None, 150)               90450     
_________________________________________________________________
dense_1 (Dense)              (None, 13)                1963      
Total params: 93,063
Trainable params: 93,063
Non-trainable params: 0
_________________________________________________________________
None


In [10]:

model.fit(X, Y, epochs=100, verbose=2)


Instructions for updating:
Use tf.cast instead.
Epoch 1/100
 - 0s - loss: 2.5657 - acc: 0.0625
Epoch 2/100
 - 0s - loss: 2.5577 - acc: 0.1875
Epoch 3/100
 - 0s - loss: 2.5510 - acc: 0.1875
Epoch 4/100
 - 0s - loss: 2.5429 - acc: 0.1875
Epoch 5/100
 - 0s - loss: 2.5340 - acc: 0.2500
Epoch 6/100
 - 0s - loss: 2.5258 - acc: 0.2500
Epoch 7/100
 - 0s - loss: 2.5209 - acc: 0.1875
Epoch 8/100
 - 0s - loss: 2.5135 - acc: 0.1875
Epoch 9/100
 - 0s - loss: 2.5004 - acc: 0.1875
Epoch 10/100
 - 0s - loss: 2.4937 - acc: 0.1875
Epoch 11/100
 - 0s - loss: 2.4839 - acc: 0.1875
Epoch 12/100
 - 0s - loss: 2.4708 - acc: 0.1875
Epoch 13/100
 - 0s - loss: 2.4537 - acc: 0.1875
Epoch 14/100
 - 0s - loss: 2.4413 - acc: 0.1875
Epoch 15/100
 - 0s - loss: 2.4285 - acc: 0.1875
Epoch 16/100
 - 0s - loss: 2.4153 - acc: 0.1875
Epoch 17/100
 - 0s - loss: 2.4074 - acc: 0.1875
Epoch 18/100
 - 0s - loss: 2.3768 - acc: 0.1875
Epoch 19/100
 - 0s - loss: 2.3492 - acc: 0.1875
Epoch 20/100
 - 0s - loss: 2.3470 - acc: 0.1875
E

<keras.callbacks.callbacks.History at 0x7f69253ab490>

In [32]:
print(word_to_id['beating'])
print(word_to_id['the'])
print(word_to_id['stock'])
print(word_to_id['is'])
print(word_to_id['a'])
print(word_to_id["loser's"])

4
0
8
6
5
10


In [33]:
temp = np.array([[ 4,0,8,6,5,10]])
print(temp.shape)

(1, 6)


In [34]:
yhat = model.predict_proba(temp, verbose=0)
print(yhat)

[[2.6379066e-04 1.0716057e-06 8.5258102e-01 1.3444260e-05 1.2245412e-03
  2.3611935e-02 2.6541259e-02 2.9621115e-05 9.4571628e-02 3.1810878e-06
  1.1495572e-03 2.9548980e-06 5.9930489e-06]]


In [35]:
yhat = model.predict_classes(temp, verbose=0)
print(yhat)

[2]


In [36]:
print(id_to_word[2])

market
