In [128]:
import re
from collections import Counter

def words(text): return re.findall('[a-z]+', text.lower())

def words_and_bigrams(text):
  all_words = words(text)
  bigrams = [ ' '.join(all_words[i:i+2]) for i in range(len(all_words)-1) ]
  return all_words + bigrams

NWORDS = Counter(words_and_bigrams(open('big.txt').read()))

alphabet = 'abcdefghijklmnopqrstuvwxyz'

def edits1(word):
   splits     = [(word[:i], word[i:]) for i in range(len(word) + 1)]
   deletes    = [a + b[1:] for a, b in splits if b]
   transposes = [a + b[1] + b[0] + b[2:] for a, b in splits if len(b)>1]
   replaces   = [a + c + b[1:] for a, b in splits for c in alphabet if b]
   inserts    = [a + c + b     for a, b in splits for c in alphabet+' ' ]
   return set(deletes + transposes + replaces + inserts)

def known_edits2(word):
    return set(e2 for e1 in edits1(word) for e2 in edits1(e1) if e2 in NWORDS)

def known(words): return set(w for w in words if w in NWORDS)

# list all possible fusion error candidate
def fusions_edit(word):
   word        = word.replace(' ', '')
   splits     = [(word[:i], word[i:]) for i in range(len(word) + 1)]
   fusions    = [(a+' '+b) for a, b in splits if a in NWORDS and b in NWORDS]
   if word in NWORDS:
      fusions += [word]
   return fusions

def correct(word):
    candidates = known([word]) or known(edits1(word)) or known_edits2(word) or [word]
    fusions = fusions_edit(word)
    # correct with fusion error first
    if fusions:
        return max(fusions, key=lambda w: NWORDS[w]) #key=lambda p: p.totalScore)
    # correct with edit candidate if no fusion candidate found
    return max(candidates, key=lambda w: NWORDS[w])

def get_all_candidates(word):
    return known([word]) | known(edits1(word)) | known_edits2(word) or [word]

print(correct('gardon'))

garden


In [129]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import requests
API_URL = "http://linggle.com/query/"

class Linggle:
    def __getitem__(self, query):
        return self.search(query)

    def search(self, query):
        req = requests.get('http://linggle.com/query/' + query)
        results = req.json()
        results = [item['phrase'] for item in results]
        return results

SE = Linggle()


In [141]:
import Levenshtein
from collections import defaultdict

# ['w1','w2','w3','w4'] : gram_len = 4, 分組公式：len - (num - 1)
def get_groups(word_list, window_size):
    groups = [word_list[i: i + window_size] for i in range(len(word_list) - (window_size - 1))]
#     print(groups)
    return groups

def LCS(X, Y): # longest common subsequence
    m = len(X)
    n = len(Y)
    # An (m+1) times (n+1) matrix
    C = [[0] * (n + 1) for _ in range(m + 1)]
    for i in range(1, m+1):
        for j in range(1, n+1):
            if X[i-1] == Y[j-1]: 
                C[i][j] = C[i-1][j-1] + 1
            else:
                C[i][j] = max(C[i][j-1], C[i-1][j])
    return C

WINDOW_SIZE = 3
def get_correct(query):
    words = query.split()
    groups = get_groups(words, WINDOW_SIZE)
    
    for i, group in enumerate(groups):
        diff_words = [(group[i], i) for i in range(WINDOW_SIZE) if not group[i] == correct(group[i])] 
        
        # 可能需要補充句子全正確但是用錯單字的
        if not diff_words: continue
            
        # 篩選出含有 wrong word 機率的 gram
        for wrong_word, index in diff_words:
            # mask sentence
            possible_patterns = SE[' '.join(group[0:index] + ['_'] + group[index+1:])] 
            
            # get all possible combination
            for candidate in list(get_all_candidates(wrong_word)):
                possible_patterns += SE[' '.join(group[0:index] + [candidate] + group[index+1:])]
            
            if possible_patterns: # 應該要有 patterns
                dict_similarity = defaultdict(int)
                
                for pattern in possible_patterns:
                    dict_similarity[pattern[index]] = Levenshtein.distance(pattern[index], wrong_word)

                words[i+index] = min(dict_similarity, key=lambda word: dict_similarity[word])
                return ' '.join(words)
            else:
                print('???')
            
if __name__ == "__main__":
#     with open('test.txt', 'r') as input:
    with open('lab2.test.1.txt', 'r') as input:
        score = 0
        for line in input:
            [raw, answer] = line.strip().split('\t')
            
            correction = get_correct(raw)
            
            score += 1 if correction == answer else -1
            
            if correction == answer:
                print('Right: {}'.format(correction))
            else:
                print('Wrong: {} / {}'.format(correction, answer))
#                 print(correction)
#                 print(line)
        print(score)
                

Wrong: I felt very strang / I felt very strange
Wrong: None / at break time
Wrong: when the brick was finished / when the break was finished
Wrong: in the center when it was snowing / in the winter when it was snowing
Wrong: I thought it was a gost / I thought it was a ghost
Wrong: None / everything except the houses
Wrong: when I first steped / when I first stepped
Wrong: and saw streaks colow people / and saw strange colow people
Wrong: and saw streaks colow people / and saw streagh coloured people
Wrong: I was on an exclation / I was on an escalator
???
Right: I noticed that I was on this thing
Wrong: through the lance / through the fence
Wrong: the hunters killed them / the hunters kill them
Wrong: they kill birds with their narrow / they kill birds with their arrow
Wrong: make a dupe hole / make a deep hole
Right: to tidy up his garden
Wrong: the wind belt the leaves / the wind blew the leaves
???
Wrong: Mr J was very angray / Mr J. was very angry
Wrong: garden full of leads / gar

In [121]:
print(get_all_candidates('nerrow'))

{'harrow', 'neuro', 'sorrow', 'borrow', 'error', 'terror', 'furrow', 'barrow', 'narrow', 'narrows', 'arrow', 'negro', 'neuron', 'burrow', 'marrow', 'morrow'}
