In [80]:
from difflib import get_close_matches, SequenceMatcher
import re
from collections import Counter
from pathlib import Path
from enchant import Dict, DictWithPWL

load_clean_dataset()
dataset = 'deep_clean_mobicontrol_data.txt'
word_count_dict, all_letters = load_words_letters(dataset)
vocab = load_vocabulary()
dictionary = DictWithPWL("en_US", 'vocab.txt')


def load_words_letters(dataset):
    hira_kata = 'かめふアうチパズヅさモぴグゆごヒサもシマりはゲひべヘイヤづペユへぽのほけエこツぺぢだをデどヨギぜミキリるろヌばむょラゴにウずしてすぬつスコネせムロたちゾぎゃおピぶンねガヲぱらカダメュョぷそナみノなんクホハニぞトでげワいャぐとザビやソプれぼきバベブジゅじゼまレセルびポがくわタドオケヂフボテえざあよ'
    eng_letters = 'abcdefghijklmnopqrstuvwxyz'
    all_letters = hira_kata + eng_letters
    word_count_dict = Counter(words(open(dataset).read()))
    return word_count_dict, all_letters

def load_vocabulary():
    corpus_vocab_file = '/home/iftekhar/amiebot/exp_amiecore/amieCore/amie_core/core/retriever/Page_Ranking_Experiment/pipelines/vocabulary.txt'
    with open(corpus_vocab_file) as f:
        vocabulary = f.read().splitlines()
    lines = [line.lower() for line in corpus_vocab_file]
    with open('vocab.txt', 'w') as out:
         out.writelines(lines)
    return [vocab.lower() for vocab in vocabulary]

def load_clean_dataset():
    data_file = Path("/home/iftekhar/AI-system/Helpers/Mixed/POL_workshop/processed_texts.txt")
    with open(data_file, encoding='utf-8') as f:
        data_list = f.read().splitlines()

    f = open('deep_clean_mobicontrol_data.txt', 'w')
    f.write(single_character_remover(" ".join(data_list)))
    f.close()

def single_character_remover(text):
    collector = []
    for items in text.split():
        if len(items) < 2:
            replaced = re.sub(r'[ぁ-んァ-ン]', '', items)
            replaced = re.sub(r'[A-Za-z]', '', replaced)
            replaced = re.sub(r'[0-9]', '', replaced)
            collector.append(replaced)
        else:
            collector.append(items)
    return ' '.join([temp.strip(' ') for temp in collector])

def all_substrings(string):
    n = len(string)
    return {string[i:j+1] for i in range(n) for j in range(i,n)}

def longest_match(best_matches, items):
    longest_content = []
    for content in best_matches: 
        longest_content.append(max(all_substrings(content) & all_substrings(items), key=len))
    return max(longest_content, key=len)

def handling_spelling_mistakes(misspelled_word, vocabulary):
    max_term = []
    best_matches = get_close_matches(misspelled_word, vocabulary, n = 5, cutoff = 0.6)
    if best_matches:
        max_term = longest_match(best_matches, misspelled_word)
    return max_term

def probability(word): 
    return word_count_dict[word]/sum(word_count_dict.values())

def words(text): 
    return re.findall(r'\w+', text.lower())

def correction(word, letters): 
    "Most probable spelling correction for word."
    return max(candidates(word, letters), key=probability)

def candidates(word, letters): 
    "Generate possible spelling corrections for word."
    return (known([word]) or known(edits1(word, letters)) or known(edits2(word, letters)) or [word])

def known(words): 
    return set(w for w in words if w in word_count_dict)

def edits1(word, letters):
    splits     = [(word[:i], word[i:])    for i in range(len(word) + 1)]
    deletes    = [L + R[1:]               for L, R in splits if R]
    transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R)>1]
    replaces   = [L + c + R[1:]           for L, R in splits if R for c in letters]
    inserts    = [L + c + R               for L, R in splits for c in letters]
    return set(deletes + transposes + replaces + inserts)

def edits2(word, letters): 
    return (e2 for e1 in edits1(word, letters) for e2 in edits1(e1, letters))

def spelling_checker_suggester(sent, vocab, dictionary, all_letters):
    sent_list = sent.split()
    correct_sentence = {}
    for word in sent_list:
        word = word.lower()
        if word in vocab:
            correct_sentence[word] = word
        else:
            # First find words suggestion from library
            suggested_words = dictionary.suggest(word)
            if suggested_words:
                correct_sentence[word] = suggested_words
            # find longest chunk match using difflib library
            else:            
                matches = handling_spelling_mistakes(word, vocab)
                match_len = len(matches)/len(word)
                # Macthed chunk length is <25% distance away
                if match_len > 0.75 and SequenceMatcher(None, word, matches).ratio() > 0.75:
                    suggested_words = dictionary.suggest(matches)
                    correct_sentence[word] = suggested_words
                # Macthed chunk length is <35% distance away
                elif SequenceMatcher(None, word, matches).ratio() > 0.65 and not suggested_words:
                    suggested_words = correction(word, all_letters).split() + list(known(edits2(word, all_letters)))
                    correct_sentence[word] = list(set(suggested_words))
                else:
                    word = '*' + word + '*'
                    correct_sentence[word] = word
    return correct_sentence

In [81]:
sentence = 1
while sentence:
    sentence = input("Provide a input: ")
    print(spelling_checker_suggester(sentence, vocab, dictionary, all_letters))

Provide a input: 管理 機能 構成 プロフイル 規定 wifi ssid wifie 接続 続禁止 bluetoothe
{'管理': '管理', '機能': '機能', '構成': '構成', 'プロフイル': ['プロパイダ', 'プロトコル', 'プロバイダ'], '規定': '規定', 'wifi': 'wifi', 'ssid': 'ssid', 'wifie': ['wife', 'WiFi'], '接続': '接続', '続禁止': ['禁止'], 'bluetoothe': ['Bluetooth']}
Provide a input: 
{}
