In [41]:
import numpy as np
import pandas as pd
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import LSTM, Dense, GRU, Embedding, Flatten
from keras.callbacks import EarlyStopping, ModelCheckpoint
import re

In [42]:
def tokenize(text):
    # obtains tokens with a least 1 alphabet
    pattern = re.compile(r'[A-Za-z]+[\w^\']*|[\w^\']*[A-Za-z]+[\w^\']*')
    return pattern.findall(text.lower())
def mapping(tokens):
    word_to_id = dict()
    id_to_word = dict()
    for i,word in enumerate(set(tokens)):
        word_to_id[word] = i
        id_to_word[i] = word
    return word_to_id, id_to_word

In [48]:
import os
entries = os.listdir("/media/hieu/CA48F77D48F7669B/TextCorrection/Character-basedTextCorrection/corpus.viwiki/mysmalltest")
#print(entries)
data = ""
for entry in entries:
    path = "/media/hieu/CA48F77D48F7669B/TextCorrection/Character-basedTextCorrection/corpus.viwiki/mysmalltest/" + entry
    with open(path) as f:
        currentdata = f.read()
    data = data + currentdata
print("Loading data from file .........")

Loading data from file .........


In [52]:
# doc = """
# Mới đây, VFF và VPF dự kiến tổ chức cuộc họp vào ngày 31/3 với đại diện các CLB để thống nhất phương án tổ chức V.League ngay sau khi dịch bệnh lắng xuống. Họ đảm bảo chỉ tổ chức giải khi Chính phủ cho phép, trong điều kiện an toàn và trước mắt vẫn thi đấu ở trên sân không khán giả cho đến khi dịch bệnh hết hẳn. 

# Một động thái được cho là bình thường nhưng bầu Đức lại thể hiện sự tức giận. Ông tiếp tục đưa ra tuyên bố "cấm HAGL tham gia bất kỳ hoạt động thể thao nào trong giai đoạn này", thậm chí, không tham gia họp kể cả trực tiếp hay trực tuyến của VFF và VPF
# """
tokenszero = [""]
tokens = tokenize(data)
tokens = tokenszero + tokens
vocab = set(tokens)
print("token length",len(tokens))
print("vocab length",len(vocab))
#print(tokens)

token length 62487
vocab length 4880


In [53]:
word_to_id, id_to_word = mapping(tokens)
#print(word_to_id)
#print(id_to_word)

In [54]:
def generate_training_data(sentence, word_to_id, window_size):
    L = len(sentence)
    X, Y = [], []
    tempX, tempY = [], []
    for i in range(L):
        index_before_target = list(range(max(0, i - window_size), i))           
        index_after_target = list(range(i + 1, min(i + window_size + 1,L)))           
        index_before_after_target = index_before_target + index_after_target
        #print(index_before_after_target)                     
        for j in index_before_after_target:
            tempX.append(word_to_id[sentence[j]])
        #print(tempX)
        filling_missing_left = len(index_before_target)
        filling_missing_right = len(index_after_target)
        while(filling_missing_left < 3):
            tempX.insert(0,0)
            filling_missing_left +=1
        while(filling_missing_right < 3):
            tempX.append(0)
            filling_missing_right +=1
        X.append(tempX)
        Y.append(word_to_id[sentence[i]])
        tempX = []
    return X,Y

In [58]:
X,Y = generate_training_data(tokens, word_to_id, 3)
print(len(X))
print(len(Y))
#print(X[:10])
#print(type(X))
#print("------------------------------------------------------")
#print(Y[:10])
#print(Y.shape)

62487
62487


In [59]:
MAX_LENGTH = 6

X = pad_sequences(X, maxlen=MAX_LENGTH, padding='post')
print(X.shape)

(62487, 6)


In [60]:
vocab_size = len(vocab)
Y = to_categorical(Y, num_classes=vocab_size)
print(Y.shape)

(62487, 4880)


In [61]:
model = Sequential()
model.add(Embedding(vocab_size, 100, input_length=6))
model.add(GRU(150, recurrent_dropout=0.1, dropout=0.1))
model.add(Dense(vocab_size, activation='softmax'))
model.compile(loss='categorical_crossentropy', metrics=['acc'], optimizer='adam')
print(model.summary())

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 6, 100)            488000    
_________________________________________________________________
gru_2 (GRU)                  (None, 150)               112950    
_________________________________________________________________
dense_2 (Dense)              (None, 4880)              736880    
Total params: 1,337,830
Trainable params: 1,337,830
Non-trainable params: 0
_________________________________________________________________
None


In [62]:

model.fit(X, Y, epochs=100, verbose=2)


Epoch 1/100
 - 24s - loss: 6.9450 - acc: 0.0222
Epoch 2/100
 - 23s - loss: 6.4883 - acc: 0.0370
Epoch 3/100
 - 23s - loss: 5.8418 - acc: 0.0870
Epoch 4/100
 - 23s - loss: 4.9657 - acc: 0.1849
Epoch 5/100
 - 23s - loss: 4.2674 - acc: 0.2654
Epoch 6/100
 - 23s - loss: 3.7750 - acc: 0.3193
Epoch 7/100
 - 24s - loss: 3.3900 - acc: 0.3670
Epoch 8/100
 - 28s - loss: 3.0745 - acc: 0.4080
Epoch 9/100
 - 28s - loss: 2.8130 - acc: 0.4447
Epoch 10/100
 - 26s - loss: 2.5928 - acc: 0.4774
Epoch 11/100
 - 25s - loss: 2.4041 - acc: 0.5065
Epoch 12/100
 - 25s - loss: 2.2416 - acc: 0.5293
Epoch 13/100
 - 26s - loss: 2.1020 - acc: 0.5545
Epoch 14/100
 - 27s - loss: 1.9861 - acc: 0.5702
Epoch 15/100
 - 25s - loss: 1.8741 - acc: 0.5895
Epoch 16/100
 - 29s - loss: 1.7877 - acc: 0.6027
Epoch 17/100
 - 30s - loss: 1.7035 - acc: 0.6170
Epoch 18/100
 - 31s - loss: 1.6304 - acc: 0.6303
Epoch 19/100
 - 30s - loss: 1.5619 - acc: 0.6400
Epoch 20/100
 - 30s - loss: 1.4984 - acc: 0.6523
Epoch 21/100
 - 30s - loss: 1

<keras.callbacks.callbacks.History at 0x7f4d28212f10>

In [63]:
def encode_input(input_string):
    return input_string.split(" ")

In [82]:
def spell_checking(sentence):
    encode_sentence = encode_input(sentence)
    X, y = generate_training_data(encode_sentence, word_to_id, 3)
    error_words = []
    for i in range(len(X)):
        word = encode_sentence[i]
        word_index = word_to_id[word]
        yhat = model.predict_proba(np.array([X[i]]))
        if yhat[:,word_index] < 0.1:
            error_words.append(encode_sentence[i])
        print("input:",np.array([X[i]]))
        print("index:",i)
        print("probability of ---- ", encode_sentence[i], "----given surrouding is:" ,yhat[:,word_index])
        print("__________________________________________________________________________")
    return error_words

In [89]:
spell_checking("chiến tranh thế giới")

input: [[   0    0    0 4587 1611 1047]]
index: 0
probability of ----  chiến ----given surrouding is: [0.98789483]
__________________________________________________________________________
input: [[   0    0   50 1611 1047    0]]
index: 1
probability of ----  tranh ----given surrouding is: [0.7466447]
__________________________________________________________________________
input: [[   0   50 4587 1047    0    0]]
index: 2
probability of ----  thế ----given surrouding is: [0.6067198]
__________________________________________________________________________
input: [[  50 4587 1611    0    0    0]]
index: 3
probability of ----  giới ----given surrouding is: [0.9147856]
__________________________________________________________________________


[]

In [None]:
temp = np.array([[ 0,  0,  3, 20, 90, 44]])
yhat = model.predict_proba(temp, verbose = 0)
yhat2 = model.predict_classes(temp)
print(yhat)
print(yhat2)

In [None]:
print(np.max(yhat))
result = np.where(yhat[0,:] == 0.3688755)
print(result)
print(yhat[:,72])
print(id_to_word[72])

In [None]:
def generate_input(left, right):
    result = []
    for i in left:
        result.append(word_to_id[i])
    if len(left) < 3:
        result.insert(0,0)
    for i in right:
        result.append(word_to_id[i])
    if len(right) < 3:
        result.append(0)
    return np.array([result])