In [1]:
from collections import defaultdict, Counter
from nltk.metrics.distance import edit_distance
from operator import itemgetter
from utils.syntax import *
import numpy as np
import spacy
import json

In [2]:
# Load the spaCy English pipeline once; `correct` calls it on every input line
# and reads each token's tag_, lemma_ and dep_.
# NOTE(review): the trailing ('en') looks like an alternative model name that
# was used earlier — confirm before switching models.
nlp = spacy.load('en_core_web_lg') # ('en')

In [3]:
# Load the pre-extracted corpus tables (patterns / example sentences / n-grams)
# from the JSON dump.
# NOTE(review): the variable is called BNC while the file name says "coca" —
# confirm which corpus this actually is.
with open('coca_sm.json', 'r', encoding='utf8') as fs:
    BNC = json.load(fs)

# The lookups below only touch the already-parsed dict, so they do not need
# the file handle to stay open.
patterns = BNC['patterns']
sents = BNC['sents']
ngrams = BNC['ngrams']

In [None]:
prepositions = ['about', 'across', 'against', 'along', 'among', 'around', 'as', 'at',
                'beside', 'besides', 'between', 'by', 'down', 'during',
                'except', 'for', 'from', 'in', 'inside', 'into', 'of', 'off',
                'on', 'onto', 'outside', 'over', 'through', 'to', 'toward', 'towards',
                'under', 'underneath', 'until', 'up', 'upon', 'with', 'within', 'without']
# Not included: above / behind / beneath / beyond / below ...


def normalize(ptn):
    """Reduce a raw verb pattern to its canonical short form.

    Steps, in order:
      1. Drop everything before the headword ``V`` (so ``'be V-ed by O'``
         starts at ``V-ed``).
      2. Keep at most the first four space-separated tokens.
      3. Collapse the inflected head forms ``V-ing`` / ``V-ed`` to ``V``.
      4. Treat clauses, gerunds and to-infinitives (``wh-cl``, ``cl``,
         ``to-v``, ``v-ing``) as a plain object ``O``.
      5. If more than two tokens remain, trim by shape:
         ``V prep X``   -> kept as three tokens,
         ``V other ...``-> ``V`` (second token is neither ``O`` nor a listed
                           preposition, e.g. ``V before O``),
         ``V O prep X`` -> kept as (up to) four tokens,
         ``V O other``  -> ``V O``.

    Parameters
    ----------
    ptn : str
        Space-separated pattern containing the headword marker ``V``.

    Returns
    -------
    str
        The normalized pattern, e.g. ``'be V-ed by O'`` -> ``'V by O'``.

    Raises
    ------
    IndexError
        If ``ptn`` contains no ``'V'`` at all.
    """
    # Strip whatever precedes the headword.
    ptn = 'V' + ptn.split('V')[1]
    # Maximum length: a 4-gram.
    ptn = ' '.join(ptn.split(' ')[:4])
    # Progressive / participle head forms are folded back into the bare V.
    ptn = ptn.replace('V-ing', 'V').replace('V-ed', 'V')
    # Clauses, gerunds and to-infinitives count as an object.  Order matters:
    # 'wh-cl' has to be replaced before the shorter 'cl'.
    for clause_marker in ('wh-cl', 'cl', 'to-v', 'v-ing'):
        ptn = ptn.replace(clause_marker, 'O')

    tokens = ptn.split(' ')
    if len(tokens) > 2:
        if tokens[1] in prepositions:    # V prep. _
            tokens = tokens[:3]
        elif tokens[1] != 'O':           # V before/during O
            tokens = tokens[:1]
        elif tokens[2] in prepositions:  # V O prep. O
            tokens = tokens[:4]
        else:                            # V O O / V O not_prep
            tokens = tokens[:2]
    return ' '.join(tokens)


# Re-key the raw corpus tables by normalized pattern:
#   norm_patterns[headword][dep][normalized pattern] -> summed count
#   norm_ngrams  [headword][dep][normalized pattern] -> concatenated n-gram examples
#   norm_sents   [headword][dep][normalized pattern] -> concatenated sentence examples
norm_patterns = defaultdict(lambda: defaultdict(Counter))
norm_ngrams = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))
norm_sents = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))

for headword, deps in patterns.items():
    for dep, ptn_counts in deps.items():
        for ptn, count in ptn_counts.items():
            # Normalize once per raw pattern instead of once per table.
            norm_ptn = normalize(ptn)
            norm_patterns[headword][dep][norm_ptn] += count
            norm_ngrams[headword][dep][norm_ptn].extend(ngrams[headword][dep][ptn])
            norm_sents[headword][dep][norm_ptn].extend(sents[headword][dep][ptn])

In [39]:
def get_high_freq(counts):
    """Return the entries whose count is more than one std above the mean.

    Parameters
    ----------
    counts : mapping of pattern -> numeric count

    Returns
    -------
    dict
        ``{pattern: count}`` for every entry with
        ``count > mean(counts) + std(counts)``; ``np.std`` is called with its
        defaults.  An empty input yields an empty dict.
    """
    values = list(counts.values())
    if not values:
        # np.mean([]) would emit a RuntimeWarning and produce NaN; the result
        # is empty either way, so short-circuit.
        return {}

    threshold = np.mean(values) + np.std(values)
    return {ptn: count for ptn, count in counts.items() if count > threshold}


def truncate_k(counts, k=10):
    """Keep only the entries whose count is strictly greater than ``k``.

    Returns a new dict; the relative order of the surviving entries follows
    the iteration order of ``counts``.
    """
    kept = {}
    for ptn, count in counts.items():
        if count > k:
            kept[ptn] = count
    return kept


def sort_dict(counts):
    """Return the ``(key, value)`` pairs of ``counts`` ordered by value, largest first.

    Pairs with equal values keep the order in which ``counts`` yields them.
    """
    pairs = list(counts.items())
    pairs.sort(key=lambda pair: pair[1], reverse=True)
    return pairs

In [131]:
def predict_ratio(ptn, patterns):
    """Score ``ptn`` relative to the most frequent pattern in ``patterns``.

    Returns ``count(ptn) / count(top pattern)``, so the top pattern itself
    scores 1.0.  A pattern that does not occur in ``patterns`` scores 0.

    An alternative that was tried and left out: returning 1 for the top
    pattern and ``count(ptn) / sum(all counts)`` otherwise.
    """
    if ptn in patterns:
        top_count = max(patterns.values())
        return patterns[ptn] / top_count
    return 0

    
# Ratio thresholds: above CONFIDENT a pattern is accepted, below UNCONFIDENT
# it is rejected, anything in between (bounds included) is undecided.
CONFIDENT = 0.2
UNCONFIDENT = 0.1


def categorize(ratio):
    """Map a pattern ratio to 'right', 'wrong' or 'not_sure'."""
    if ratio > CONFIDENT:
        return 'right'
    if ratio < UNCONFIDENT:
        return 'wrong'
    return 'not_sure'
    
    
def get_template(ratio):
    r"""Pick the ``str.format`` template used to mark up a verb in the output.

    The thresholds are the same ones ``categorize`` uses:
      ratio > CONFIDENT    -> formats ``x`` as ``{+x+}``
      ratio < UNCONFIDENT  -> formats ``x`` as ``[-x-]``
      otherwise            -> formats ``x`` as ``\*x*\``
    """
    if ratio > CONFIDENT:
        return '{{+{}+}}'
    if ratio < UNCONFIDENT:
        return '[-{}-]'
    return '\\*{}*\\'
    
    

def suggest_ptn(bad_ptn, ptns):
    """Return the known pattern that is closest to ``bad_ptn``.

    Candidates are ranked by token-level edit distance to ``bad_ptn``; ties
    are broken by higher count.  When ``bad_ptn`` itself is a known pattern,
    only patterns with a strictly higher count are considered.

    Prints the ten best candidates as a trace and raises ``IndexError`` when
    there is no candidate at all.
    """
    if bad_ptn in ptns:
        # Only patterns more frequent than the offending one can improve on it.
        ptns = truncate_k(ptns, ptns[bad_ptn])

    bad_tokens = bad_ptn.split(' ')

    def distance_to_bad(candidate):
        return edit_distance(bad_tokens, candidate.split(' '))

    # Two stable sorts: the frequency order survives among equally distant
    # candidates.
    candidates = list(ptns)
    candidates.sort(key=ptns.get, reverse=True)
    candidates.sort(key=distance_to_bad)

    print(candidates[:10])

    return candidates[0]


def correct(line):
    """Check the verb patterns of one line of text.

    Every token whose tag is in ``VERBS`` is turned into a normalized pattern
    and scored against the corpus counts for its lemma and dependency label.
    Tokens that are not categorized as 'right' get a suggestion: the closest
    more plausible pattern plus one example n-gram for it.

    Parameters
    ----------
    line : str
        Text to analyse.

    Returns
    -------
    (str, list of dict)
        The space-joined tokens, with each scored verb wrapped in the
        template chosen by ``get_template``, and the list of suggestions.
        Each suggestion has the keys ``category``, ``tk``, ``bef``, ``aft``
        and ``ngram``.

    Notes
    -----
    * Verbs for which the corpus has no pattern at all (for that lemma and
      dependency label) are left unmarked and get no suggestion: there is
      nothing to compare against and nothing to suggest.
    * The example n-gram is looked up in ``norm_ngrams`` because the
      suggested pattern is a normalized one; the raw ``ngrams`` table is
      keyed by un-normalized patterns.  ``ngram`` is ``''`` when no example
      is stored.
    """
    edits, suggestions = [], []
    for tk in nlp(line):
        if tk.tag_ not in VERBS:
            edits.append(tk.text)
            continue

        ptn, _ngram = dep_to_pattern(tk)
        ptn = normalize(ptn)

        # Use .get(): indexing the defaultdicts would insert an empty entry
        # for every unseen lemma / dependency label.
        ptns = norm_patterns.get(tk.lemma_, {}).get(tk.dep_) or {}

        ratio = predict_ratio(ptn, ptns)
        print(tk.text, tk.dep_, ptn, ratio)

        if not ptns:
            # No corpus evidence: do not judge the verb, and do not call
            # suggest_ptn, which needs at least one candidate.
            edits.append(tk.text)
            continue

        # Decide with categorize() so that every token that is not marked as
        # 'right' (including ratio == CONFIDENT) also receives a suggestion.
        category = categorize(ratio)
        if category != 'right':
            top_ptn = suggest_ptn(ptn, ptns)
            examples = norm_ngrams.get(tk.lemma_, {}).get(tk.dep_, {}).get(top_ptn)
            suggestions.append({
                'category': category,
                'tk': tk.text,
                'bef': ptn,
                'aft': top_ptn,
                'ngram': examples[0] if examples else ''
            })

        edits.append(get_template(ratio).format(tk.text))

    return ' '.join(edits), suggestions

def main_process(content):
    """Run ``correct`` on every newline-separated line of ``content``.

    Returns ``(edit_lines, suggestions)``: one marked-up string per input
    line, and the suggestions of all lines concatenated in line order.
    """
    per_line = [correct(line) for line in content.split('\n')]

    edit_lines = [edit for edit, _ in per_line]
    suggestions = [sug for _, line_sugs in per_line for sug in line_sugs]

    return edit_lines, suggestions
 

In [132]:
# Demo: run the checker on a two-sentence sample and pretty-print the
# (edit_lines, suggestions) tuple returned by main_process.
if __name__ == '__main__':
    from pprint import pprint
    user_input = '''I want to discuss about my life. I rely my ability.'''
    pprint(main_process(user_input))

want ROOT V O 1.0
discuss xcomp V about O 0
['V O', 'V with O', 'V in O', 'V for O', 'V during O', 'V among O', 'V to O', 'V at O', 'V', 'V O with O']
rely ROOT V O 0.01032448377581121
['V on O', 'V upon O']
(['I {+want+} to [-discuss-] about my life . I [-rely-] my ability .'],
 [{'aft': 'V O',
   'bef': 'V about O',
   'category': 'wrong',
   'ngram': 'discuss pros',
   'tk': 'discuss'},
  {'aft': 'V on O',
   'bef': 'V O',
   'category': 'wrong',
   'ngram': 'rely on telescopes',
   'tk': 'rely'}])


In [None]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from flask import Flask, request, jsonify
from flask_cors import CORS, cross_origin

# Flask application that exposes the checker over HTTP.
app = Flask(__name__)

# CORS setup for the whole app.
# NOTE(review): presumably needed because the front end is served from a
# different origin (the recorded log shows OPTIONS requests to /correct) —
# confirm against the flask_cors documentation that 'CORS_HEADERS' is still
# the intended setting name.
app.config['CORS_HEADERS'] = 'Content-Type'
CORS(app)

@app.route("/")
def hello():
    """Handler for the root URL; returns a fixed greeting string."""
    return 'Hello World'

# POST /correct   JSON body: {"content": "<text to check>"}
@app.route('/correct' , methods=['POST'])
def start_correct():
    """Check the posted text and return the marked-up lines and suggestions.

    Expects a JSON object with a string field ``content``.  On success the
    response is ``{"edits": [...], "suggestions": [...]}`` as produced by
    ``main_process``.

    If the body is empty, is not a JSON object, or has no string ``content``
    field, the response is ``{"edit": "Should not be empty"}``.  This is the
    same payload (and the same default status code) the endpoint already
    returned for an empty body, so existing clients keep working; the
    malformed cases used to fail with an unhandled exception instead.
    """
    request_data = request.get_json()

    content = request_data.get('content') if isinstance(request_data, dict) else None
    if not isinstance(content, str):
        return jsonify({'edit': 'Should not be empty'})

    print(content)

    edits, suggestions = main_process(content)

    return jsonify({
        'edits': edits,
        'suggestions': suggestions
    })

# Start the server on port 1314, listening on every network interface
# (host 0.0.0.0), so it can be reached from other machines.
# NOTE(review): app.run blocks the cell until interrupted, and binding to
# 0.0.0.0 exposes the service to the network — confirm this is intended
# outside of a demo.
if __name__ == "__main__":
    app.run(host='0.0.0.0', port=1314)

 * Running on http://0.0.0.0:1314/ (Press CTRL+C to quit)
INFO:werkzeug: * Running on http://0.0.0.0:1314/ (Press CTRL+C to quit)
140.114.77.132 - - [10/Jun/2018 23:03:29] "[37mOPTIONS /correct HTTP/1.1[0m" 200 -
INFO:werkzeug:140.114.77.132 - - [10/Jun/2018 23:03:29] "[37mOPTIONS /correct HTTP/1.1[0m" 200 -
140.114.77.132 - - [10/Jun/2018 23:03:29] "[37mPOST /correct HTTP/1.1[0m" 200 -
INFO:werkzeug:140.114.77.132 - - [10/Jun/2018 23:03:29] "[37mPOST /correct HTTP/1.1[0m" 200 -


I want to discuss about my life. I rely my ability. I am able to do something. I love to apple. I love to do something.
want ROOT V O 1.0
discuss xcomp V about O 0
['V O', 'V with O', 'V in O', 'V for O', 'V during O', 'V among O', 'V to O', 'V at O', 'V', 'V O with O']
rely ROOT V O 0.01032448377581121
['V on O', 'V upon O']
am ROOT V 1.0
do xcomp V O 1.0
love ROOT V to O 0.003084763948497854
['V O', 'V O to O', 'V', 'V O for O', 'V O with O', 'V O in O']
love ROOT V O 1.0
do xcomp V O 1.0
