In [1]:
from collections import defaultdict, Counter
from nltk.metrics.distance import edit_distance
from operator import itemgetter
from utils.syntax import *
import numpy as np
import spacy
import json

In [2]:
nlp = spacy.load('en_core_web_lg') # ('en')

In [3]:
# Read patterns/sents json file
with open('coca.json', 'r', encoding='utf8') as fs:
    BNC = json.load(fs)
    patterns, sents, ngrams = BNC['patterns'], BNC['sents'], BNC['ngrams']

In [8]:
prepositions = ['about', 'across', 'against', 'along', 'among', 'around', 'as', 'at',
                'beside', 'besides', 'between', 'by', 'down', 'during', 
                'except', 'for', 'from', 'in', 'inside', 'into', 'of', 'off', 
                'on', 'onto', 'outside', 'over', 'through', 'to', 'toward', 'towards', 
                'under', 'underneath', 'until', 'up', 'upon', 'with', 'within', 'without']
# not inlcuded: above / behind / beneath /beyond / below ... 

def normalize(ptn):
    if 'be V-ed' in ptn: print(ptn) # 先不管被動用法
            
    ptn = 'V' + ptn.split('V')[1] # 去頭 (headword 之前的)
    ptn = ' '.join(ptn.split(' ')[:4]) # max lenght: 4-gram
    ptn = ptn.replace('V-ing', 'V').replace('V-ed', 'V') # 除了被動外，完成式和進行式改成原 V
    # ptn = ptn.replace('wh-cl', 'O').replace('cl', 'O') # cl / wh-cl -> O
    # ptn = ptn.replace('to-v', 'ADJ').replace('v-ing', 'ADJ') # v-ing / to-v -> ?
        
    # if / which / who / whom
    # TODO: 還要修改條件？
    ptn = ptn.split(' ')
    if len(ptn) > 2:
        if ptn[1] in prepositions: # V prep. _
            ptn = ptn[:3]
        elif ptn[1] != 'O': # V before O
            ptn = ptn[:1]
        elif ptn[2] in prepositions: # V O prep. O
            ptn = ptn[:4]
        else: # V O O / V O not_prep
            ptn = ptn[:2]
    return ' '.join(ptn)


norm_patterns = defaultdict(lambda: defaultdict(Counter))
norm_ngrams = defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: [])))
norm_sents = defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: [])))

for headword in patterns:
    for dep in patterns[headword]:
        for ptn in patterns[headword][dep]:
            norm_patterns[headword][dep][normalize(ptn)] += patterns[headword][dep][ptn]
            norm_ngrams[headword][dep][normalize(ptn)].extend(ngrams[headword][dep][ptn])
            norm_sents[headword][dep][normalize(ptn)].extend(sents[headword][dep][ptn])

In [9]:
def get_high_freq(counts):
    values = list(counts.values())
    total, avg, std = np.sum(values), np.mean(values), np.std(values)
    # print("Total: {}, Avg: {}, Std: {}".format(total, avg, std))

    return dict([(ptn, count) for ptn, count in counts.items() if count > avg + std])


def truncate_k(counts, k=10):
    return dict([(ptn, count) for ptn, count in counts.items() if count > k])


def sort_dict(counts):
    return sorted(counts.items(), key=itemgetter(1), reverse=True)

In [10]:
def predict_ratio(ptn, patterns):
    if ptn not in patterns: 
        return 0
    
    # 使用 ptn / first_ptn 百分比
    return patterns[ptn] / patterns[max(patterns, key=patterns.get)]
    
    # if ptn == max(patterns, key=patterns.get): # 保證對
    #    return 1
    # return patterns[ptn] / sum(patterns.values())

    
CONFIDENT, UNCONFIDENT = 0.2, 0.1

def categorize(ratio):
    if ratio > CONFIDENT:     return 'right'
    elif ratio < UNCONFIDENT: return 'wrong'
    else:                     return 'not_sure'
    
    
def get_template(ratio):
    if ratio > CONFIDENT:     return '{{+{}+}}'
    elif ratio < UNCONFIDENT: return '[-{}-]'
    else:                     return '\\*{}*\\'
    
    
def suggest_ptn(bad_ptn, ptns):
    ptns = truncate_k(ptns, ptns[bad_ptn]) if bad_ptn in ptns else ptns # Optimize if exist

    sim_ptns = sorted(ptns, key=ptns.get, reverse=True)
    sim_ptns = sorted(sim_ptns, key=lambda ptn: edit_distance(bad_ptn.split(' '), ptn.split(' ')))
    
    print(sim_ptns[:5])
    
    return sim_ptns[0]


def suggest_ngram(ngram, ngrams):
    ngrams = filter(lambda ng: '@@@' not in ng, set(ngrams)) # workaround
    ngram = ngram.lower()

    sim_ngrams = sorted(ngrams, key=lambda ng: edit_distance(ngram.split(' '), ng.split(' ')))
    
    print(sim_ngrams[:5])
    
    return sim_ngrams[0]


def correct(line):
    edits, suggestions = [], []
    for tk in nlp(line):
        if tk.tag_ in VERBS:
            # 以下拆 def ?
            ptn, ngram = dep_to_pattern(tk)

            ptn = normalize(ptn)
            ptns = norm_patterns[tk.lemma_][tk.dep_]
            # high_ptns  = get_high_freq(ptns)
            
            ratio = predict_ratio(ptn, ptns)
            print(tk.text, tk.dep_, ptn, ratio)
        
            if ratio < CONFIDENT:
                top_ptn = suggest_ptn(ptn, ptns)
                top_ngram = suggest_ngram(ngram, norm_ngrams[tk.lemma_][tk.dep_][top_ptn])
                suggestions.append({
                    'category': categorize(ratio),
                    'tk': tk.text,
                    'bef': ptn,
                    'aft': top_ptn,
                    'ngram': top_ngram
                })
                
            edits.append(get_template(ratio).format(tk.text))
        else:
            edits.append(tk.text)
   
    return ' '.join(edits), suggestions

def main_process(content):
    edit_lines, suggestions = [], []

    for line in content.split('\n'):
        edit, sug = correct(line)
        
        edit_lines.append(edit)
        suggestions.extend(sug)

    return edit_lines, suggestions
 

In [11]:
if __name__ == '__main__':
    from pprint import pprint
    user_input = '''I want to discuss about my life. I rely my ability.'''
    user_input = 'can you speak something or do anything to response me?'
    pprint(main_process(user_input))

speak ROOT V O 0.45176897490172363
do conj V O 1.0
response advcl V O 0
[]


IndexError: list index out of range

In [None]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from flask import Flask, request, jsonify
from flask_cors import CORS, cross_origin

app = Flask(__name__)

app.config['CORS_HEADERS'] = 'Content-Type'
CORS(app)

@app.route("/")
def hello():
    return 'Hello World'

# post /correct data: {content :}
@app.route('/correct' , methods=['POST'])
def start_correct():
    request_data = request.get_json()
    if not request_data: return jsonify({'edit': 'Should not be empty'})
    
    content = request_data['content']
    print(content)
    
    edits, suggestions = main_process(content)
    
    return jsonify({
        'edits': edits,
        'suggestions': suggestions
    })

if __name__ == "__main__":
    app.run(host='0.0.0.0', port=1314)

 * Running on http://0.0.0.0:1314/ (Press CTRL+C to quit)
140.114.86.43 - - [12/Jun/2018 17:26:07] "[37mOPTIONS /correct HTTP/1.1[0m" 200 -
140.114.86.43 - - [12/Jun/2018 17:26:07] "[37mPOST /correct HTTP/1.1[0m" 200 -


d
I want to discuss about my life. I relied my ability. I am able to do something. I love to apple. I love to do something. I discussed about my life.
I am able to do to eat .
want ROOT V to-v 1.0
discuss xcomp V about O 0
['V O', 'V with O', 'V in O', 'V at O', 'V for O']
['to discuss life', 'to discuss stereotypes', 'to discuss project', 'to discuss mission', 'to discuss testimony']
relied ROOT V O 0.00807899461400359
['V on O', 'V upon O', 'V', 'V on cl']
['i relied on people', 'i relied on systems', 'i relied on sense', 'i relied on wife', 'i relied on that']
am ROOT V 1.0
do xcomp V O 1.0
love ROOT V to O 0.0020557097337855893
['V O', 'V O to O', 'V to-v', 'V cl', 'V']
['i love blender', 'i love designers', 'i love buble', 'i love recognition', 'i love redskins']
love ROOT V to-v 0.1444821324562305
['V O']
['i love blender', 'i love designers', 'i love buble', 'i love recognition', 'i love redskins']
do xcomp V O 1.0
discussed ROOT V about O 0.0007380073800738007
['V O', 'V with O

140.114.86.43 - - [12/Jun/2018 17:26:09] "[37mPOST /correct HTTP/1.1[0m" 200 -


['to do jazz', 'to do arithmetic', 'to do checking', 'to do alteration', 'to do sweeping']
eat xcomp V 1.0


140.114.86.43 - - [12/Jun/2018 17:52:10] "[37mOPTIONS /correct HTTP/1.1[0m" 200 -
140.114.86.43 - - [12/Jun/2018 17:52:10] "[37mPOST /correct HTTP/1.1[0m" 200 -


Sometimes I clean about my desk because I do n't like a dirty desk .
I like my job , because I contacted with people from several countries .
I learn about it .
clean ROOT V about O 0
['V O', 'V in O', 'V with O', 'V for O', 'V from O']
['i clean paragraphs', 'i clean place', 'i clean kitchen', 'you clean desk', 'i clean house']
do aux V 1.0
like advcl V O 1.0
like ROOT V O 0.9445143797811562
contacted advcl V with O 0.00904977375565611
['V O', 'V O at O', 'V', 'V O for O', 'V O about O']
['i contacted her', 'i contacted agency', 'i contacted him', 'i contacted them', 'i contacted police']
learn ROOT V about O 0.24664932986597318


In [29]:
sort_dict(norm_patterns['rely']['ROOT'])

[('V on O', 3342),
 ('V upon O', 138),
 ('V on cl', 91),
 ('V', 37),
 ('V O', 27),
 ('V on wh-cl', 26),
 ('V on v-ing', 12),
 ('V to O', 11),
 ('V for O', 10),
 ('V on', 9),
 ('V cl', 8),
 ('V O on O', 6),
 ('V in O', 5),
 ('V with O', 2),
 ('V upon', 2),
 ('V on v', 2),
 ('V at O', 2),
 ('V upon cl', 2),
 ('V to-v', 1),
 ('V in cl', 1),
 ('V by O', 1),
 ('V v', 1),
 ('V upon v', 1),
 ('V upon wh-cl', 1),
 ('V upon v-ing', 1),
 ('V on to-v', 1),
 ('V in on', 1),
 ('V on v-ed', 1),
 ('V of O', 1),
 ('V as O', 1)]