In [157]:
import numpy as np
import pandas as pd
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import LSTM, Dense, GRU, Embedding, Flatten
from keras.callbacks import EarlyStopping, ModelCheckpoint
import re
from numpy import asarray

In [158]:
import os
entries = os.listdir("./mysmalltest")
#print(entries)
data = ""
for entry in entries:
    path = "./mysmalltest/" + entry
    with open(path) as f:
        currentdata = f.read()
    data = data + currentdata
print("Loading data from file .........")

Loading data from file .........


In [159]:
def tokenize(text):
    # obtains tokens with a least 1 alphabet
    pattern = re.compile(r'[A-Za-z]+[\w^\']*|[\w^\']*[A-Za-z]+[\w^\']*')
    return pattern.findall(text.lower())
def mapping(tokens):
    word_to_id = dict()
    id_to_word = dict()
    for i,word in enumerate(set(tokens)):
        word_to_id[word] = i
        id_to_word[i] = word
    return word_to_id, id_to_word

In [160]:
tokenszero = [""]
tokens = tokenize(data)
tokens = tokenszero + tokens
vocab = set(tokens)
vocab_size = len(vocab)
print("token length",len(tokens))
print("vocab length",len(vocab))
print(vocab_size)

token length 62487
vocab length 4881
4881


In [161]:
word_to_id, id_to_word = mapping(tokens)
print(word_to_id[""])
#print(id_to_word)

0


In [190]:
word_to_id

{'': 0,
 'löwenthal': 1,
 'drifting': 2,
 'trứ': 3,
 'cọc': 4,
 'tắm': 5,
 'bomb': 6,
 'triều': 7,
 'climate': 8,
 'kịp': 9,
 'quật': 10,
 'images': 11,
 'ngưng': 12,
 'gắn': 13,
 'hamilton': 14,
 'yourself': 15,
 'kỹ': 16,
 'mauritanie': 17,
 'wonderland': 18,
 'freedom': 19,
 'thinker': 20,
 'kastriot': 21,
 'rey': 22,
 'lorentz': 23,
 'giới': 24,
 'dang': 25,
 'baker': 26,
 'antonio': 27,
 'chốt': 28,
 'schiff': 29,
 'danes': 30,
 'downs': 31,
 'leon': 32,
 'chim': 33,
 'lập': 34,
 'alia': 35,
 'kỷ': 36,
 'ldzæjər': 37,
 'pharos': 38,
 'cậy': 39,
 'quartiers': 40,
 'muối': 41,
 'verus': 42,
 'dõi': 43,
 'va': 44,
 'christof': 45,
 'signalling': 46,
 'widespread': 47,
 'bennabi': 48,
 'mốc': 49,
 'cong': 50,
 'từ': 51,
 'globalguide': 52,
 'khán': 53,
 'xuất': 54,
 'stalin': 55,
 'those': 56,
 'database': 57,
 'could': 58,
 'ipu': 59,
 'già': 60,
 'placed': 61,
 'kapillarität': 62,
 'biographie': 63,
 'phách': 64,
 'kim': 65,
 'genius': 66,
 'au': 67,
 'lệ': 68,
 'colloquium': 69,
 '

In [195]:
import json
with open('word_to_id.json', 'w') as f:
    json.dump(word_to_id, f, ensure_ascii=False)

In [196]:
with open('id_to_word.json', 'w') as f:
    json.dump(id_to_word, f, ensure_ascii=False)

In [162]:
def generate_training_data(sentence, word_to_id, window_size):
    L = len(sentence)
    X, Y = [], []
    tempX= []
    for i in range(L):
        index_before_target = list(range(max(0, i - window_size), i))           
        index_after_target = list(range(i + 1, min(i + window_size + 1,L)))           
        index_before_after_target = index_before_target + index_after_target
        #print(index_before_after_target)                     
        for j in index_before_after_target:
            tempX.append(word_to_id[sentence[j]])
        #print(tempX)
        filling_missing_left = len(index_before_target)
        filling_missing_right = len(index_after_target)
        while(filling_missing_left < 3):
            tempX.insert(0,0)
            filling_missing_left +=1
        while(filling_missing_right < 3):
            tempX.append(0)
            filling_missing_right +=1
        X.append(tempX)
        Y.append(word_to_id[sentence[i]])
        tempX = []
    return X,Y

In [163]:
X,Y = generate_training_data(tokens, word_to_id, 3)
print(len(X))
print(len(Y))
print(X[:10])

62487
62487
[[0, 0, 0, 4690, 2528, 1076], [0, 0, 0, 2528, 1076, 4205], [0, 0, 4690, 1076, 4205, 2796], [0, 4690, 2528, 4205, 2796, 4808], [4690, 2528, 1076, 2796, 4808, 679], [2528, 1076, 4205, 4808, 679, 1327], [1076, 4205, 2796, 679, 1327, 4783], [4205, 2796, 4808, 1327, 4783, 3797], [2796, 4808, 679, 4783, 3797, 3017], [4808, 679, 1327, 3797, 3017, 4288]]


In [173]:
MAX_LENGTH = 6

X = pad_sequences(X, maxlen=MAX_LENGTH, padding='post')
print(X.shape)
print(X[:10,:])

(62487, 6)
[[   0    0    0 4690 2528 1076]
 [   0    0    0 2528 1076 4205]
 [   0    0 4690 1076 4205 2796]
 [   0 4690 2528 4205 2796 4808]
 [4690 2528 1076 2796 4808  679]
 [2528 1076 4205 4808  679 1327]
 [1076 4205 2796  679 1327 4783]
 [4205 2796 4808 1327 4783 3797]
 [2796 4808  679 4783 3797 3017]
 [4808  679 1327 3797 3017 4288]]


In [164]:
Y = to_categorical(Y, num_classes=vocab_size)
print(Y.shape)

(62487, 4881)


In [165]:
def handle_key_range(values):
    l = len(values)
    char_key = l - 100
    word = ''
    for i in range(char_key):
        word += " "+ values[i]
    coefs = asarray(values[char_key:], dtype='float32')
    return word.strip(), coefs

In [166]:
embeddings_index = dict()
count = 0
with open('/media/hieu/CA48F77D48F7669B/SpellChecking/model.txt',"r", encoding='utf-8') as f:
    f.readline() #skip first row   
    while True:
        try:
            line = next(f)
            values = line.split()
            word, coefs = handle_key_range(values)
            embeddings_index[word] = coefs
            count += 1
            if count == 1000000:
                break
        except:
            print("Line is broken")
print('Loaded %s word vectors.' % len(embeddings_index))


Line is broken
Line is broken
Loaded 999999 word vectors.


In [169]:
# create a weight matrix for words in training docs
embedding_matrix = np.zeros((vocab_size, 100))
for word, i in word_to_id.items():
    #print(word, i)
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        try:
            embedding_matrix[i] = embedding_vector
        except:
            embedding_matrix[i] = np.zeros(100)


In [170]:
model = Sequential()
model.add(Embedding(vocab_size, 100, input_length=6, weights=[embedding_matrix], trainable=False))
model.add(GRU(150, recurrent_dropout=0.1, dropout=0.1))
model.add(Dense(vocab_size, activation='softmax'))
model.compile(loss='categorical_crossentropy', metrics=['acc'], optimizer='adam')
print(model.summary())

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 6, 100)            488100    
_________________________________________________________________
gru_2 (GRU)                  (None, 150)               112950    
_________________________________________________________________
dense_2 (Dense)              (None, 4881)              737031    
Total params: 1,338,081
Trainable params: 849,981
Non-trainable params: 488,100
_________________________________________________________________
None


In [174]:
model.fit(X, Y, epochs=100, verbose=2)
model.save("modelpretrained.h5")
print("Saved model to disk")

Instructions for updating:
Use tf.cast instead.
Epoch 1/100
 - 23s - loss: 6.7310 - acc: 0.0310
Epoch 2/100
 - 21s - loss: 5.7507 - acc: 0.0940
Epoch 3/100
 - 21s - loss: 4.9113 - acc: 0.1794
Epoch 4/100
 - 22s - loss: 4.3751 - acc: 0.2325
Epoch 5/100
 - 22s - loss: 3.9898 - acc: 0.2702
Epoch 6/100
 - 22s - loss: 3.7084 - acc: 0.3007
Epoch 7/100
 - 22s - loss: 3.4782 - acc: 0.3249
Epoch 8/100
 - 23s - loss: 3.2928 - acc: 0.3467
Epoch 9/100
 - 23s - loss: 3.1470 - acc: 0.3714
Epoch 10/100
 - 24s - loss: 3.0175 - acc: 0.3896
Epoch 11/100
 - 24s - loss: 2.9064 - acc: 0.4050
Epoch 12/100
 - 23s - loss: 2.8148 - acc: 0.4186
Epoch 13/100
 - 24s - loss: 2.7349 - acc: 0.4321
Epoch 14/100
 - 24s - loss: 2.6695 - acc: 0.4401
Epoch 15/100
 - 25s - loss: 2.6053 - acc: 0.4514
Epoch 16/100
 - 25s - loss: 2.5582 - acc: 0.4576
Epoch 17/100
 - 25s - loss: 2.5132 - acc: 0.4620
Epoch 18/100
 - 25s - loss: 2.4711 - acc: 0.4694
Epoch 19/100
 - 25s - loss: 2.4327 - acc: 0.4724
Epoch 20/100
 - 25s - loss: 2.

In [93]:
def generate_training_data(document, window_size):
    L = len(document)
    docs, tempX, label = [], [], []
    for i in range(L):
        index_before_target = list(range(max(0, i - window_size), i))           
        index_after_target = list(range(i + 1, min(i + window_size + 1,L)))           
        index_before_after_target = index_before_target + index_after_target
        for j in index_before_after_target:
            tempX.append(document[j])
            
        filling_missing_left = len(index_before_target)
        filling_missing_right = len(index_after_target)
        while(filling_missing_left < 3):
            tempX.insert(0,"")
            filling_missing_left +=1
        while(filling_missing_right < 3):
            tempX.append("")
            filling_missing_right +=1
            
        docs.append(tempX)
        tempX = []
        label.append(document[i])
    return docs, label

In [14]:
tokens = tokenize(data)
vocab = set(tokens)
print("token length",len(tokens))
print("vocab length",len(vocab))

token length 62486
vocab length 4880


In [117]:
encoded_X = t.texts_to_sequences(docs)
encoded_Y = t.texts_to_sequences(label)
                                 
print(len(encoded_X))
print(len(encoded_Y))
print(type(encoded_Y))

66708
66708
<class 'list'>


In [126]:
print(t.word_index['non-steroid.'])

3337


In [131]:
result = []
for i in label:
    #print(i)
    result.append(t.word_index.get(i))

In [132]:
print(result)

[None, 4, 3, 1355, 476, 1841, 3337, None, 3338, None, None, None, None, None, 311, 126, 3343, 280, 3344, 454, 2318, 2, 2319, 3345, None, 525, 3347, 2319, 3348, 455, 3349, 3350, 476, 3351, 37, 270, 204, 3352, 3353, 20, None, 428, 20, None, 476, 1841, 3354, None, 20, None, 411, 20, 20, None, 108, 235, None, None, None, 3357, 31, 312, 7, 2320, 477, 334, 31, 312, 7, 3358, 4, 3, 41, 271, 465, 1843, 23, 2, 478, 292, 23, 11, None, 359, 8, 402, 4, 351, 1213, 1, 702, 184, 23, 133, 1568, None, 1039, None, None, 3359, 4, 3, 6, 14, 1214, 814, 1, 12, 6, 702, 293, 1569, 73, 3360, 1039, 1844, 30, 294, 27, 403, 775, 61, 133, 1570, 10, 25, 545, 617, 8, 124, 173, 2, 10, 34, 509, 564, 8, 371, 1215, None, 9, 116, 173, 217, 900, 526, 456, 271, 2, 113, 271, 13, 133, None, 90, 43, 304, 27, 1356, 120, 1, None, 70, 37, 546, 8, 236, 641, 26, 250, 584, 1845, 18, 259, 110, None, 3, 259, 110, 205, 61, 429, 145, 14, 392, 113, 8, 80, 456, 271, 375, 10, 34, 113, 8, 80, 133, None, None, None, 149, 57, 157, 127, 1217, 

In [110]:
result = []
for i in encoded_Y:
    result = result + i
print(result)


[8179, 4, 3, 1355, 476, 1841, 279, 5095, 196, 311, 126, 3343, 280, 3344, 454, 2318, 2, 2319, 3348, 3346, 525, 3421, 2319, 3348, 455, 346, 312, 481, 476, 1841, 37, 270, 204, 3352, 402, 428, 1355, 476, 1841, 3355, 281, 411, 95, 108, 235, 547, 1842, 177, 1127, 31, 312, 7, 2320, 477, 334, 31, 312, 7, 4, 3, 41, 271, 23, 1843, 23, 2, 478, 292, 23, 11, 122, 359, 8, 402, 4, 351, 1213, 1, 702, 184, 23, 133, 113, 466, 1039, 177, 177, 4, 3, 6, 14, 1214, 814, 1, 12, 6, 702, 293, 1569, 73, 321, 1039, 1844, 30, 294, 27, 403, 775, 61, 133, 1570, 10, 25, 545, 617, 8, 124, 173, 2, 10, 34, 509, 564, 8, 371, 22, 12, 9, 116, 173, 217, 900, 526, 456, 271, 2, 113, 271, 13, 133, 177, 90, 43, 304, 27, 1356, 120, 1, 177, 70, 37, 546, 8, 236, 641, 26, 250, 584, 92, 18, 259, 110, 3450, 177, 3, 259, 110, 205, 61, 429, 145, 14, 392, 113, 8, 80, 456, 271, 375, 10, 34, 113, 8, 80, 133, 177, 6, 81, 149, 57, 157, 127, 105, 177, 9, 335, 65, 56, 16, 1123, 2366, 88, 232, 87, 478, 292, 1, 122, 2, 3, 43, 4, 11, 77, 585, 1,

In [134]:
docs, label = generate_training_data(data.split(),3)
#print(docs)
from keras.preprocessing.text import Tokenizer
t = Tokenizer()
t.fit_on_texts(docs)
vocab_size = len(t.word_index) + 1
print(vocab_size)
print(len(docs))
print(len(label))
#print(label)

8180
66708
66708


In [111]:
print(type(result))
print(len(result))

<class 'list'>
64966


In [56]:
print(embedding_matrix.shape)

(8180, 100)


In [10]:
print(id_to_word[0])




In [20]:
vocab_size = len(vocab)
Y = to_categorical(Y, num_classes=vocab_size)
print(Y.shape)

(62486, 4880)


In [175]:
def encode_input(input_string):
    return input_string.split(" ")

In [178]:
def spell_checking(sentence):
    encode_sentence = encode_input(sentence)
    X, y = generate_training_data(encode_sentence, word_to_id, 3)
    X = pad_sequences(X, maxlen=MAX_LENGTH, padding='post')
    error_words = []
    for i in range(len(X)):
        word = encode_sentence[i]
        word_index = word_to_id[word]
        yhat = model.predict_proba(np.array([X[i]]))
        if yhat[:,word_index] < 0.1:
            error_words.append(encode_sentence[i])
        print("input:",np.array([X[i]]))
        print("index:",i)
        print("probability of ---- ", encode_sentence[i], "----given surrouding is:" ,yhat[:,word_index])
        print("__________________________________________________________________________")
    return error_words

In [179]:
spell_checking("chiến tranh thế giới")

input: [[  0   0   0 940 671  24]]
index: 0
probability of ----  chiến ----given surrouding is: [0.9501532]
__________________________________________________________________________
input: [[   0    0 2812  671   24    0]]
index: 1
probability of ----  tranh ----given surrouding is: [0.83102125]
__________________________________________________________________________
input: [[   0 2812  940   24    0    0]]
index: 2
probability of ----  thế ----given surrouding is: [0.996897]
__________________________________________________________________________
input: [[2812  940  671    0    0    0]]
index: 3
probability of ----  giới ----given surrouding is: [0.93344676]
__________________________________________________________________________


[]

In [181]:
spell_checking("chiến tránh thế giới")

input: [[   0    0    0 4530  671   24]]
index: 0
probability of ----  chiến ----given surrouding is: [0.00266537]
__________________________________________________________________________
input: [[   0    0 2812  671   24    0]]
index: 1
probability of ----  tránh ----given surrouding is: [7.4834925e-06]
__________________________________________________________________________
input: [[   0 2812 4530   24    0    0]]
index: 2
probability of ----  thế ----given surrouding is: [0.9553061]
__________________________________________________________________________
input: [[2812 4530  671    0    0    0]]
index: 3
probability of ----  giới ----given surrouding is: [0.240608]
__________________________________________________________________________


['chiến', 'tránh']