In [4]:
from collections import Counter
from keras.models import load_model
import nltk
import numpy as np
import re

# Load the saved Keras model from disk; guess() calls model.predict() on it.
model = load_model('./spell.h5')

# Number of words per n-gram window requested by correctText().
NGRAM = 5
# Number of character positions in one encoded input; encoderData() truncates
# longer text to this length and pads shorter text up to it.
MAXLEN = 40

# Characters that encoderData()/decoderData() can represent; a character's
# position in this list is its one-hot column index. '\x00' (index 0) is the
# padding symbol that encoderData() writes after the end of the text.
alphabet = ['\x00',' ','_'] + list('0123456789abcdefghijklmnopqrstuvwxyzáàảãạâấầẩẫậăắằẳẵặóòỏõọôốồổỗộơớờởỡợéèẻẽẹêếềểễệúùủũụưứừửữựíìỉĩịýỳỷỹỵđABCDEFGHIJKLMNOPQRSTUVWXYZÁÀẢÃẠÂẤẦẨẪẬĂẮẰẲẴẶÓÒỎÕỌÔỐỒỔỖỘƠỚỜỞỠỢÉÈẺẼẸÊẾỀỂỄỆÚÙỦŨỤƯỨỪỬỮỰÍÌỈĨỊÝỲỶỸỴĐ')

In [5]:
def encoderData(text):
    """One-hot encode ``text`` into a fixed-size character matrix.

    Args:
        text: String to encode. Only the first ``MAXLEN`` characters are used.

    Returns:
        A ``(MAXLEN, len(alphabet))`` float array. Row ``i`` has a single 1 in
        the column of ``text[i]``; rows past the end of the text have their 1
        in column 0, the ``'\\x00'`` padding symbol.

    Raises:
        ValueError: If one of the encoded characters is not in ``alphabet``.
    """
    x = np.zeros((MAXLEN, len(alphabet)))
    clipped = text[:MAXLEN]
    for i, c in enumerate(clipped):
        x[i, alphabet.index(c)] = 1
    # Pad every remaining row with the '\x00' symbol. Slicing from
    # len(clipped) also covers empty text, where the previous loop-variable
    # based padding raised UnboundLocalError because the loop never ran.
    x[len(clipped):, 0] = 1
    return x

def decoderData(x):
    """Turn a matrix of per-position character scores back into a string.

    For every position along the last axis the highest-scoring column is
    selected and mapped to the character at that index in ``alphabet``.
    Padding symbols are not removed here.
    """
    best_columns = x.argmax(axis=-1)
    chars = [alphabet[column] for column in best_columns]
    return ''.join(chars)

In [6]:
# Round-trip check: show the shape of an encoded sample sentence, then decode
# the encoding back to text (padding symbols included).
print(encoderData('Tôi yêu bạn').shape)
print(decoderData(encoderData('Tôi yêu bạn')))

(40, 199)
Tôi yêu bạn                             


In [7]:
def nltkNGRAMS(words, n):
    """Split ``words`` on whitespace and return its sliding word n-grams.

    Args:
        words: Text to split into words.
        n: Number of words per n-gram.

    Returns:
        The iterable produced by ``nltk.ngrams`` over the word list (tuples of
        ``n`` consecutive words; per the NLTK docs nothing is yielded when
        there are fewer than ``n`` words and no padding is requested).
    """
    # Only `import nltk` is in scope in this notebook, so the bare name
    # `ngrams` raised NameError; reach the function through the module.
    return nltk.ngrams(words.split(), n)

In [8]:
def guess(ngram):
    """Run the model on one n-gram and return its decoded prediction.

    The words of ``ngram`` are joined with single spaces, encoded as a batch
    of one, passed to ``model.predict``, and the first prediction is decoded
    with leading/trailing ``'\\x00'`` padding stripped.
    """
    sentence = ' '.join(ngram)
    batch = np.array([encoderData(sentence)])
    first_prediction = model.predict(batch)[0]
    decoded = decoderData(first_prediction)
    return decoded.strip('\x00')

In [9]:
# Try guess() on a single hand-written 4-word tuple and display the result.
guess(('Xuwr','ný','ngoon','ngữ'))



'Xuwr ný nkoon nữữ'

In [7]:
def addPunctation(text, correctedText):
    """Re-attach the punctuation of ``text`` to the words of ``correctedText``.

    Any character that is not in ``alphabet`` counts as punctuation. Each
    whitespace-separated word of ``text`` that contains at least one alphabet
    character is paired, in order, with the next word of ``correctedText``;
    the leading and trailing punctuation of the original word is wrapped
    around the corrected word. Punctuation inside a word is not restored.

    Args:
        text: Original text, possibly containing punctuation.
        correctedText: Corrected words, separated by whitespace.

    Returns:
        The corrected words joined by single spaces, with punctuation restored.
        If ``correctedText`` has fewer words than ``text``, output stops at the
        first original word left without a counterpart; surplus corrected
        words are appended unchanged.
    """
    known = set(alphabet)  # O(1) membership instead of scanning the list
    corrected = iter(correctedText.split())
    pieces = []

    for word in text.split():
        # Length of the leading run of punctuation.
        start = 0
        while start < len(word) and word[start] not in known:
            start += 1

        if start == len(word):
            # The token is punctuation only (e.g. "-"). It has no counterpart
            # among the corrected words, so keep it verbatim and do not
            # consume a corrected word; otherwise every later word would be
            # paired with the wrong punctuation.
            pieces.append(word)
            continue

        # Start of the trailing run of punctuation. This scans from the END of
        # the word; scanning from the front again would merely rediscover the
        # leading punctuation and drop the real trailing punctuation.
        end = len(word)
        while word[end - 1] not in known:
            end -= 1

        replacement = next(corrected, None)
        if replacement is None:
            break
        pieces.append(word[:start] + replacement + word[end:])

    # Corrected words beyond the end of the original text are kept as-is.
    pieces.extend(corrected)
    return ' '.join(pieces)

In [8]:
def correctText(text):
    """Spell-correct ``text`` by majority vote over overlapping word n-grams.

    The text is reduced to alphabet characters, cut into ``NGRAM``-word
    windows, and each window is passed to ``guess``. Every word position then
    takes the most frequent prediction among the windows covering it, and the
    punctuation of the original text is re-attached with ``addPunctation``.

    Args:
        text: Raw input text.

    Returns:
        The corrected text with the original punctuation restored.
    """
    # Collapse every whitespace run (tabs, newlines, ...) to one space first:
    # only ' ' is in the alphabet, so other separators would be deleted by the
    # filter below and glue neighbouring words together.
    normalized = re.sub(r'\s+', ' ', text)
    # re.escape keeps the character class valid even if the alphabet ever
    # gains a regex metacharacter such as '-', ']' or '^'.
    newText = re.sub('[^' + re.escape(''.join(alphabet)) + ']', '', normalized)

    # nltkNGRAMS takes (words, n); passing MAXLEN as a third argument raised
    # TypeError.
    ngrams = list(nltkNGRAMS(newText, NGRAM))
    words = newText.split()
    if not ngrams and words:
        # Fewer than NGRAM words yields no window at all, which used to turn
        # short inputs into an empty result. Treat the whole text as a single
        # shorter window instead.
        ngrams = [tuple(words)]

    guessedNGRAMS = [guess(ngram) for ngram in ngrams]
    candidates = [Counter() for _ in range(len(guessedNGRAMS) + NGRAM - 1)]

    for nid, ngram in enumerate(guessedNGRAMS):
        for wid, word in enumerate(re.split(r'\s', ngram)):
            position = nid + wid
            # A prediction with more whitespace-separated tokens than the
            # window had words would index past the end; ignore the surplus.
            if position < len(candidates):
                candidates[position].update([word])

    correctedText = ' '.join(c.most_common(1)[0][0] for c in candidates if c)
    return addPunctation(text, correctedText)

In [9]:
# Read one line from the user and run it through the corrector.
text = input()

result = correctText(text)
print(result)

# Drop every character outside the alphabet from both strings so that they
# can be compared word by word.
text = re.sub(r'[^' + ''.join(alphabet) + ']', '', text)
listText = text.split()

result = re.sub(r'[^' + ''.join(alphabet) + ']', '', result)
listResult = result.split()

# Pairs of (input word, corrected word) that differ. zip() stops at the
# shorter list, so a result with fewer words than the input no longer raises
# IndexError the way indexing listResult[i] did.
correctWord = [(original, corrected) for original, corrected in zip(listText, listResult) if original != corrected]
correctWord

ỜửÈ vÉ Ễbạ


[('tôi', 'ỜửÈ'), ('yêu', 'vÉ'), ('bạn', 'Ễbạ')]

In [10]:
from tkinter import *
import tkinter



# Create the main Tk window, titled "Spell Correction", 800x600 pixels.
window = Tk()
window.title("Spell Correction")
window.geometry("800x600")
# Text entry widget placed in the top-left grid cell of the window.
txt = Entry(window, width=100, font=('Times New Roman',12))
txt.grid(column=0,row=0)