In [1]:
import numpy as np
import pandas as pd
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import LSTM, Dense, GRU, Embedding, Flatten
from keras.callbacks import EarlyStopping, ModelCheckpoint
import re
from numpy import asarray

Using TensorFlow backend.


In [2]:
# # load the whole embedding into memory
# embeddings_index = dict()
# f = open('/media/hieu/CA48F77D48F7669B/WordVectorPretrained/model.txt', encoding = "ISO-8859-1")
# for line in f:
#     values = line.split()
#     if len(values) == 101:
#         word = values[0]
#         coefs = asarray(values[1:], dtype='float32')
#     elif len(values) == 102:
#         word = values[0] + values[1]
#         coefs = asarray(values[2:], dtype='float32')
#     try:
#         embeddings_index[word] = coefs
#     except:
#         print(word)
# f.close()
# print('Loaded %s word vectors.' % len(embeddings_index))

In [3]:
def tokenize(text):
    """Lower-case ``text`` and return its tokens in order of appearance.

    A token is a maximal run of word characters, carets (``^``) and
    apostrophes that contains at least one ASCII letter; runs without any
    ASCII letter (for example plain numbers) are dropped.
    """
    token_regex = r'[A-Za-z]+[\w^\']*|[\w^\']*[A-Za-z]+[\w^\']*'
    lowered = text.lower()
    return re.findall(token_regex, lowered)
def mapping(tokens):
    """Build the vocabulary lookup tables for ``tokens``.

    Ids are assigned in order of first appearance, so the result is
    deterministic across kernel restarts (iterating a ``set`` of strings is
    not) and the first token always receives id 0.

    Parameters
    ----------
    tokens : iterable of hashable
        Token stream; duplicates are allowed.

    Returns
    -------
    (dict, dict)
        ``word_to_id`` mapping each distinct token to a consecutive integer
        starting at 0, and ``id_to_word`` holding the inverse mapping.
    """
    word_to_id = dict()
    id_to_word = dict()
    for word in tokens:
        if word in word_to_id:
            continue
        # The next free id equals the number of distinct tokens seen so far.
        new_id = len(word_to_id)
        word_to_id[word] = new_id
        id_to_word[new_id] = word
    return word_to_id, id_to_word

In [5]:
import os

# Directory holding the plain-text corpus files.
DATA_DIR = "./mysmalltest"

print("Loading data from file .........")
# os.listdir() returns names in arbitrary order; sort them so the
# concatenated corpus is the same on every run.
entries = sorted(os.listdir(DATA_DIR))
#print(entries)
chunks = []
for entry in entries:
    path = os.path.join(DATA_DIR, entry)
    if not os.path.isfile(path):
        # Skip sub-directories (e.g. .ipynb_checkpoints), which open() cannot read.
        continue
    # Explicit encoding: the platform default may not decode Vietnamese text.
    with open(path, encoding="utf-8") as f:
        chunks.append(f.read())
# Join once instead of growing a string inside the loop.
data = "".join(chunks)

Loading data from file .........


In [6]:
# The empty string is prepended as an extra entry so that it becomes part of
# the vocabulary (presumably intended as the padding token -- confirm).
tokenszero = [""]
tokens = [*tokenszero, *tokenize(data)]
vocab = set(tokens)
print("token length", len(tokens))
print("vocab length", len(vocab))
#print(tokens)

token length 62487
vocab length 4881


In [7]:
# Build the forward (word -> id) and reverse (id -> word) lookup tables
# from the full token list.
word_to_id, id_to_word = mapping(tokens)
#print(word_to_id)
#print(id_to_word)

In [8]:
def generate_training_data(sentence, word_to_id, window_size):
    """Build (context window, centre word) pairs for every position.

    For position ``i`` the context consists of the ids of up to
    ``window_size`` words before it and up to ``window_size`` words after it.
    Near the edges of ``sentence`` the missing slots are filled with the id
    0: on the left when words before ``i`` are missing, on the right when
    words after ``i`` are missing. Every row therefore has exactly
    ``2 * window_size`` entries.

    Parameters
    ----------
    sentence : list of str
        Token sequence.
    word_to_id : mapping
        Lookup from token to integer id.
    window_size : int
        Number of context words taken on each side of the centre word.

    Returns
    -------
    (list of list of int, list of int)
        ``X`` holds one context row per position, ``Y`` the id of the word at
        that position.

    Raises
    ------
    KeyError
        If a token of ``sentence`` is missing from ``word_to_id``.
    """
    X, Y = [], []
    for i in range(len(sentence)):
        left = [word_to_id[w] for w in sentence[max(0, i - window_size):i]]
        right = [word_to_id[w] for w in sentence[i + 1:i + window_size + 1]]
        # Pad each side up to window_size (previously a hard-coded 3).
        left_pad = [0] * (window_size - len(left))
        right_pad = [0] * (window_size - len(right))
        X.append(left_pad + left + right + right_pad)
        Y.append(word_to_id[sentence[i]])
    return X, Y

In [9]:
# Turn the token stream into (context window, centre word) training pairs,
# using 3 words of context on each side of every position.
X, Y = generate_training_data(tokens, word_to_id, window_size=3)
print(len(X), len(Y), sep="\n")

62487
62487


In [10]:
# Length of one model input row: 3 context ids on each side of the centre word.
MAX_LENGTH = 6

# Convert the list of context rows into a 2-D array with MAX_LENGTH columns
# (the recorded shape is (n_samples, 6)).
X = pad_sequences(X, maxlen=MAX_LENGTH, padding='post')
print(X.shape)

(62487, 6)


In [11]:
# One-hot encode the target ids; Y becomes (n_samples, vocab_size), as the
# printed shape shows.
# NOTE(review): this dense matrix grows as n_samples * vocab_size, and Y is
# overwritten in place -- presumably re-running this cell without re-running
# the cell that builds Y would encode the already-encoded matrix again; confirm.
vocab_size = len(vocab)
Y = to_categorical(Y, num_classes=vocab_size)
print(Y.shape)

(62487, 4881)


In [12]:
# Context-window classifier: embed the MAX_LENGTH context ids, summarise them
# with a GRU, and predict a probability for every vocabulary entry.
model = Sequential()
# input_length follows MAX_LENGTH so the model cannot drift from the padded input width.
model.add(Embedding(vocab_size, 100, input_length=MAX_LENGTH))
model.add(GRU(150, recurrent_dropout=0.1, dropout=0.1))
model.add(Dense(vocab_size, activation='softmax'))
model.compile(loss='categorical_crossentropy', metrics=['acc'], optimizer='adam')
# summary() prints the table itself and returns None, so wrapping it in
# print() only added a stray "None" line to the output.
model.summary()

Instructions for updating:
Colocations handled automatically by placer.
Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 6, 100)            488100    
_________________________________________________________________
gru_1 (GRU)                  (None, 150)               112950    
_________________________________________________________________
dense_1 (Dense)              (None, 4881)              737031    
Total params: 1,338,081
Trainable params: 1,338,081
Non-trainable params: 0
_________________________________________________________________
None


In [13]:
# Train on the full data set for 100 epochs, logging one line per epoch
# (verbose=2), then write the trained model to model.h5.
# NOTE(review): no validation data or callbacks are passed, and the model is
# saved only after fit() returns, so an interrupted run leaves nothing on
# disk. EarlyStopping and ModelCheckpoint are imported at the top of the
# notebook but never used; consider wiring them in here.
model.fit(X, Y, epochs=100, verbose=2)
model.save("model.h5")
print("Saved model to disk")

Instructions for updating:
Use tf.cast instead.
Epoch 1/100
 - 25s - loss: 6.9431 - acc: 0.0220
Epoch 2/100
 - 25s - loss: 6.4565 - acc: 0.0403
Epoch 3/100
 - 24s - loss: 5.8162 - acc: 0.0888
Epoch 4/100
 - 25s - loss: 4.9968 - acc: 0.1791
Epoch 5/100
 - 25s - loss: 4.2938 - acc: 0.2609
Epoch 6/100
 - 23s - loss: 3.7901 - acc: 0.3154
Epoch 7/100
 - 23s - loss: 3.4042 - acc: 0.3616
Epoch 8/100
 - 23s - loss: 3.0799 - acc: 0.4062
Epoch 9/100
 - 23s - loss: 2.8137 - acc: 0.4450
Epoch 10/100
 - 24s - loss: 2.5914 - acc: 0.4790
Epoch 11/100
 - 23s - loss: 2.3967 - acc: 0.5078
Epoch 12/100
 - 23s - loss: 2.2424 - acc: 0.5312
Epoch 13/100
 - 23s - loss: 2.1014 - acc: 0.5523
Epoch 14/100
 - 24s - loss: 1.9815 - acc: 0.5738
Epoch 15/100
 - 24s - loss: 1.8817 - acc: 0.5866
Epoch 16/100
 - 24s - loss: 1.7881 - acc: 0.6047
Epoch 17/100
 - 24s - loss: 1.7106 - acc: 0.6152
Epoch 18/100
 - 24s - loss: 1.6316 - acc: 0.6308
Epoch 19/100
 - 24s - loss: 1.5674 - acc: 0.6399
Epoch 20/100
 - 25s - loss: 1.

In [14]:
def encode_input(input_string):
    """Split a raw sentence into lower-case, whitespace-separated words.

    The text is lower-cased because the vocabulary is built from lower-cased
    text. Splitting on any run of whitespace means repeated, leading or
    trailing blanks (and tabs or newlines) never produce empty-string tokens.

    Parameters
    ----------
    input_string : str
        Sentence to split.

    Returns
    -------
    list of str
        The words in order; an empty list for empty or blank input.
    """
    return input_string.lower().split()

In [15]:
def spell_checking(sentence, threshold=0.1):
    """Flag the words of ``sentence`` that the model finds unlikely in context.

    Each word is scored with the probability the model assigns to it given
    its surrounding words. A word is reported when that probability is below
    ``threshold`` or when the word is not in the vocabulary at all.
    Diagnostic lines are printed for every word.

    Parameters
    ----------
    sentence : str
        Sentence to check.
    threshold : float, optional
        Probability below which a word is reported (default 0.1).

    Returns
    -------
    list of str
        The flagged words, in sentence order.
    """
    encode_sentence = encode_input(sentence)
    if not encode_sentence:
        return []

    # Words missing from the vocabulary used to raise KeyError while the
    # context windows were built. Give them id 0 (the id used for padding)
    # inside contexts and report them directly below.
    lookup = {word: word_to_id.get(word, 0) for word in encode_sentence}
    # 3 words of context per side, matching the 6-id rows the model is built for.
    X, _ = generate_training_data(encode_sentence, lookup, 3)

    inputs = np.array(X)
    # predict() returns the softmax probabilities; predict_proba() is not
    # available in newer Keras releases. One batched call scores every position.
    yhat = model.predict(inputs)

    error_words = []
    for i, word in enumerate(encode_sentence):
        print("input:", inputs[i:i + 1])
        print("index:", i)
        if word not in word_to_id:
            print("---- ", word, "---- is not in the vocabulary")
            error_words.append(word)
        else:
            word_probability = yhat[i:i + 1, word_to_id[word]]
            print("probability of ---- ", word, "----given surrouding is:", word_probability)
            # Compare the scalar, not the one-element array.
            if word_probability[0] < threshold:
                error_words.append(word)
        print("__________________________________________________________________________")
    return error_words

In [33]:
# Correctly spelled phrase: in the recorded run no word scores below the
# 0.1 threshold, so the returned list is empty.
spell_checking("chiến tranh thế giới")

input: [[   0    0    0  199 3767 2306]]
index: 0
probability of ----  chiến ----given surrouding is: [0.11843435]
__________________________________________________________________________
input: [[   0    0 1292 3767 2306    0]]
index: 1
probability of ----  tranh ----given surrouding is: [0.32584605]
__________________________________________________________________________
input: [[   0 1292  199 2306    0    0]]
index: 2
probability of ----  thế ----given surrouding is: [0.9970222]
__________________________________________________________________________
input: [[1292  199 3767    0    0    0]]
index: 3
probability of ----  giới ----given surrouding is: [0.86991745]
__________________________________________________________________________


[]

In [30]:
# Same phrase with "tranh" replaced by "tránh": in the recorded run the
# changed word and two of its neighbours are flagged.
spell_checking("chiến tránh thế giới")

input: [[   0    0    0 2038 3767 2306]]
index: 0
probability of ----  chiến ----given surrouding is: [1.3140418e-06]
__________________________________________________________________________
input: [[   0    0 1292 3767 2306    0]]
index: 1
probability of ----  tránh ----given surrouding is: [1.7283796e-07]
__________________________________________________________________________
input: [[   0 1292 2038 2306    0    0]]
index: 2
probability of ----  thế ----given surrouding is: [0.9638856]
__________________________________________________________________________
input: [[1292 2038 3767    0    0    0]]
index: 3
probability of ----  giới ----given surrouding is: [0.01197058]
__________________________________________________________________________


['chiến', 'tránh', 'giới']