In [1]:
from collections import defaultdict, Counter
from nltk.metrics.distance import edit_distance
from operator import itemgetter
from utils.syntax import *
import numpy as np
import spacy
import json

In [2]:
# Load the spaCy pipeline once per kernel; `correct` calls `nlp(line)` on every input line.
nlp = spacy.load('en_core_web_lg') # alternative model name: 'en'

In [3]:
# Load the pre-computed corpus statistics (patterns / sentences / n-grams) from JSON.
# NOTE(review): the variable is called BNC but the file is 'coca_sm.json' -- confirm which corpus this is.
with open('coca_sm.json', 'r', encoding='utf8') as fs:
    BNC = json.load(fs)

patterns = BNC['patterns']
sents = BNC['sents']
ngrams = BNC['ngrams']

In [39]:
def get_high_freq(counts):
    """Keep only the patterns whose count is unusually high.

    A pattern is kept when its count is strictly greater than the mean of all
    counts plus one standard deviation (as computed by ``np.mean`` / ``np.std``).

    Args:
        counts: Mapping of pattern -> occurrence count.

    Returns:
        dict: The entries of ``counts`` above the threshold, in their original
        order. Empty when ``counts`` is empty or nothing exceeds the threshold.
    """
    if not counts:
        # np.mean / np.std on an empty list emit RuntimeWarnings and yield nan;
        # there is nothing to keep anyway.
        return {}

    values = list(counts.values())
    threshold = np.mean(values) + np.std(values)

    return {ptn: count for ptn, count in counts.items() if count > threshold}


def normalize_patterns(counts):
    """Placeholder for pattern normalisation; not implemented yet.

    The ``counts`` argument is accepted but ignored, and ``None`` is returned.
    """
    return None


def truncate_k(counts, k=10):
    """Return the entries of ``counts`` whose count is strictly greater than ``k``.

    Args:
        counts: Mapping of pattern -> occurrence count.
        k: Exclusive lower bound on the count (default 10).

    Returns:
        dict: Surviving entries, in the iteration order of ``counts``.
    """
    kept = {}
    for ptn, count in counts.items():
        if count > k:
            kept[ptn] = count
    return kept


def sort_dict(counts):
    """Return the ``(key, count)`` pairs of ``counts`` ordered by count, highest first.

    Pairs with equal counts keep the relative order they have in ``counts``.
    """
    pairs = list(counts.items())
    pairs.sort(key=lambda pair: pair[1], reverse=True)
    return pairs

In [105]:
def predict_ratio(ptn, patterns):
    """Score a pattern relative to the most frequent pattern.

    Args:
        ptn: The pattern to score.
        patterns: Mapping of pattern -> occurrence count.

    Returns:
        ``0`` when ``ptn`` is not in ``patterns`` (or when the highest count is
        not positive); otherwise ``patterns[ptn]`` divided by the highest count,
        so the most frequent pattern scores ``1.0``.
    """
    if ptn not in patterns:
        return 0

    # Ratio of this pattern to the top pattern. An alternative that was
    # considered: share of the total count, with the top pattern forced to 1.
    top_count = max(patterns.values())
    if top_count <= 0:
        # Avoid ZeroDivisionError when every recorded count is zero.
        return 0

    return patterns[ptn] / top_count

    
# Thresholds applied to the ratio returned by predict_ratio: a ratio above
# CONFIDENT is treated as 'right', below UNCONFIDENT as 'wrong', and anything
# in between as 'not_sure' (see categorize / get_template).
CONFIDENT, UNCONFIDENT = 0.2, 0.1

def categorize(ratio):
    """Map a pattern ratio to a verdict label.

    Returns:
        'right' when ``ratio`` is above CONFIDENT, 'wrong' when it is below
        UNCONFIDENT, and 'not_sure' otherwise.
    """
    if ratio > CONFIDENT:
        return 'right'
    if ratio < UNCONFIDENT:
        return 'wrong'
    return 'not_sure'
    
    
def get_template(ratio):
    """Pick the ``str.format`` markup template used to wrap a verb.

    Returns:
        A template with one ``{}`` slot: ``{+...+}`` markup when ``ratio`` is
        above CONFIDENT, ``[-...-]`` when it is below UNCONFIDENT, and
        ``\\*...*\\`` otherwise.
    """
    if ratio > CONFIDENT:
        return '{{+{}+}}'
    if ratio < UNCONFIDENT:
        return '[-{}-]'
    return '\\*{}*\\'
    
    

def suggest_ptn(bad_ptn, ptns):
    """Suggest the known pattern that is closest to ``bad_ptn``.

    Candidates are ranked by token-level edit distance to ``bad_ptn`` (smaller
    is better); ties are broken by count (higher is better).

    Args:
        bad_ptn: The space-separated pattern to replace.
        ptns: Mapping of known pattern -> occurrence count.

    Returns:
        The best-ranked pattern, or ``None`` when there is no candidate.
    """
    if bad_ptn in ptns:
        # Optimisation: only patterns more frequent than the bad one can be
        # an improvement, so drop the rest before computing distances.
        ptns = truncate_k(ptns, ptns[bad_ptn])

    bad_tokens = bad_ptn.split(' ')  # hoisted: invariant across candidates

    # Single sort on (distance, -count); equivalent to sorting by count
    # descending and then stably by distance.
    sim_ptns = sorted(
        ptns,
        key=lambda ptn: (edit_distance(bad_tokens, ptn.split(' ')), -ptns[ptn]),
    )

    print(sim_ptns[:10])

    # An empty candidate set used to raise IndexError here.
    return sim_ptns[0] if sim_ptns else None


def correct(line):
    """Mark up the verbs of one line and collect pattern suggestions.

    Every token whose ``tag_`` is in VERBS is turned into a pattern via
    ``dep_to_pattern`` and scored with ``predict_ratio`` against the patterns
    recorded for its lemma and dependency label. The token is then wrapped in
    the markup chosen by ``get_template``. For every verb that is not
    confidently right (ratio <= CONFIDENT, matching ``categorize``), a
    suggestion is recorded.

    Verbs with no recorded patterns for their lemma/dependency are left
    unmarked: with no data there is no basis for a verdict (previously this
    raised KeyError).

    Args:
        line: The text to check.

    Returns:
        tuple: ``(edited_line, suggestions)`` where ``edited_line`` is the
        token texts joined by single spaces and ``suggestions`` is a list of
        dicts with keys 'category', 'tk', 'bef', 'aft' and 'ngram'.
    """
    edits, suggestions = [], []
    for tk in nlp(line):
        if tk.tag_ not in VERBS:
            edits.append(tk.text)
            continue

        # TODO: consider extracting the per-verb logic below into its own function.
        ptn, _ = dep_to_pattern(tk)

        ptns = patterns.get(tk.lemma_, {}).get(tk.dep_, {})
        if not ptns:
            # Unknown verb / dependency: nothing to compare against.
            edits.append(tk.text)
            continue

        ratio = predict_ratio(ptn, ptns)
        print(tk.text, tk.dep_, ptn, ratio)

        if ratio <= CONFIDENT:
            top_ptn = suggest_ptn(ptn, ptns)
            # Example n-grams may be missing for the suggested pattern;
            # fall back to an empty example instead of raising.
            examples = ngrams.get(tk.lemma_, {}).get(tk.dep_, {}).get(top_ptn) or ['']
            suggestions.append({
                'category': categorize(ratio),
                'tk': tk.text,
                'bef': ptn,
                'aft': top_ptn,
                'ngram': examples[0]
            })

        edits.append(get_template(ratio).format(tk.text))

    return ' '.join(edits), suggestions

def main_process(content):
    """Run ``correct`` on every line of ``content``.

    Args:
        content: Text to check; it is split on '\n'.

    Returns:
        tuple: ``(edit_lines, suggestions)`` -- the edited text of each line in
        order, and the suggestions of all lines concatenated in order.
    """
    results = [correct(line) for line in content.split('\n')]

    edit_lines = [edit for edit, _ in results]
    suggestions = [item for _, line_suggestions in results for item in line_suggestions]

    return edit_lines, suggestions
 

In [106]:
# Smoke test: run the checker on two sample sentences and pretty-print the
# (edit_lines, suggestions) pair returned by main_process.
if __name__ == '__main__':
    # pprint is imported here rather than in the import cell, so the name only
    # exists in the kernel once this cell has been run.
    from pprint import pprint
    user_input = '''I want to discuss about my life. I rely my ability.'''
    pprint(main_process(user_input))

want ROOT S V to-v 1.0
discuss xcomp to V about O 0
['to V O', 'to V with O', 'to V in O', 'to V O O', 'to V during O', 'to V to O', 'to V at O', 'to V', 'to V O with O', 'to V wh-cl']
rely ROOT S V O 0.00856898029134533
['S V on O', 'S V upon O', 'S V on O to-v', 'S V on cl', 'cl S V on O', 'V on O', 'S v V on O', 'v V on O', 'S V on O O', 'S S V on O']
(['I {+want+} to [-discuss-] about my life . I [-rely-] my ability .'],
 [{'aft': 'to V O',
   'bef': 'to V about O',
   'category': 'wrong',
   'ngram': 'to discuss concerns',
   'tk': 'discuss'},
  {'aft': 'S V on O',
   'bef': 'S V O',
   'category': 'wrong',
   'ngram': 'court relies on definition',
   'tk': 'rely'}])


In [None]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

# Flask application that exposes the checker over HTTP.
from flask import Flask, request, jsonify
from flask_cors import CORS, cross_origin

app = Flask(__name__)

# Enable Flask-CORS for the whole app. The server log shows OPTIONS /correct
# requests, presumably browser preflights from a separate front-end -- confirm.
app.config['CORS_HEADERS'] = 'Content-Type'
CORS(app)

@app.route("/")
def hello():
    """Root route; responds with a fixed 'Hello World' string."""
    return 'Hello World'

# POST /correct   JSON body: {"content": "<text to check>"}
@app.route('/correct' , methods=['POST'])
def start_correct():
    """Check the posted text and return the edits and suggestions as JSON.

    Returns:
        - ``{'edits': [...], 'suggestions': [...]}`` on success.
        - ``{'edit': 'Should not be empty'}`` when the JSON body is empty
          (kept as-is for existing clients).
        - ``{'error': ...}`` with HTTP 400 when the body is not a JSON object
          or 'content' is missing or not a string (previously an unhandled
          exception, i.e. a 500).
    """
    request_data = request.get_json()
    if not request_data:
        return jsonify({'edit': 'Should not be empty'})

    content = request_data.get('content') if isinstance(request_data, dict) else None
    if not isinstance(content, str):
        return jsonify({'error': "'content' must be a string"}), 400

    print(content)

    edits, suggestions = main_process(content)

    return jsonify({
        'edits': edits,
        'suggestions': suggestions
    })

# Start the Flask server from this cell; the cell keeps running until the
# server is interrupted (the banner says "Press CTRL+C to quit").
# NOTE(review): host='0.0.0.0' accepts connections on every network interface --
# confirm that exposing this server beyond localhost is intended.
if __name__ == "__main__":
    app.run(host='0.0.0.0', port=1314)

 * Running on http://0.0.0.0:1314/ (Press CTRL+C to quit)
140.114.77.132 - - [10/Jun/2018 22:15:10] "[37mOPTIONS /correct HTTP/1.1[0m" 200 -
140.114.77.132 - - [10/Jun/2018 22:15:10] "[37mPOST /correct HTTP/1.1[0m" 200 -


I want to discuss about my life. I rely my ability. I am able to do something. I love to apple. I love to do something.
want ROOT S V to-v 1.0
discuss xcomp to V about O 0
['to V O', 'to V with O', 'to V in O', 'to V O O', 'to V during O', 'to V to O', 'to V at O', 'to V', 'to V O with O', 'to V wh-cl']
rely ROOT S V O 0.00856898029134533
['S V on O', 'S V upon O', 'S V on O to-v', 'S V on cl', 'cl S V on O', 'V on O', 'S v V on O', 'v V on O', 'S V on O O', 'S S V on O']
am ROOT S V 1.0
do xcomp to V O 1.0
love ROOT S V to O 0.003532736693358455
['S V O', 'S V O O', 'S V O to O', 'S V to-v', 'S V cl', 'cl S V O', 'V O', 'S V', 'S V wh-cl', 'v S V O']
love ROOT S V to-v 0.22774375883184172
do xcomp to V O 1.0


140.114.77.132 - - [10/Jun/2018 22:15:25] "[37mOPTIONS /correct HTTP/1.1[0m" 200 -
140.114.77.132 - - [10/Jun/2018 22:15:25] "[37mPOST /correct HTTP/1.1[0m" 200 -


I want to discuss about my life. I rely my ability. I am able to do something. I love to apple. I love to do something.
want ROOT S V to-v 1.0
discuss xcomp to V about O 0
['to V O', 'to V with O', 'to V in O', 'to V O O', 'to V during O', 'to V to O', 'to V at O', 'to V', 'to V O with O', 'to V wh-cl']
rely ROOT S V O 0.00856898029134533
['S V on O', 'S V upon O', 'S V on O to-v', 'S V on cl', 'cl S V on O', 'V on O', 'S v V on O', 'v V on O', 'S V on O O', 'S S V on O']
am ROOT S V 1.0
do xcomp to V O 1.0
love ROOT S V to O 0.003532736693358455
['S V O', 'S V O O', 'S V O to O', 'S V to-v', 'S V cl', 'cl S V O', 'V O', 'S V', 'S V wh-cl', 'v S V O']
love ROOT S V to-v 0.22774375883184172
do xcomp to V O 1.0


140.114.77.132 - - [10/Jun/2018 22:15:34] "[37mOPTIONS /correct HTTP/1.1[0m" 200 -
140.114.77.132 - - [10/Jun/2018 22:15:34] "[37mPOST /correct HTTP/1.1[0m" 200 -


I want to discuss about my life. I rely my ability. I am able to do something. I love to apple. I love to do something.
asdf
want ROOT S V to-v 1.0
discuss xcomp to V about O 0
['to V O', 'to V with O', 'to V in O', 'to V O O', 'to V during O', 'to V to O', 'to V at O', 'to V', 'to V O with O', 'to V wh-cl']
rely ROOT S V O 0.00856898029134533
['S V on O', 'S V upon O', 'S V on O to-v', 'S V on cl', 'cl S V on O', 'V on O', 'S v V on O', 'v V on O', 'S V on O O', 'S S V on O']
am ROOT S V 1.0
do xcomp to V O 1.0
love ROOT S V to O 0.003532736693358455
['S V O', 'S V O O', 'S V O to O', 'S V to-v', 'S V cl', 'cl S V O', 'V O', 'S V', 'S V wh-cl', 'v S V O']
love ROOT S V to-v 0.22774375883184172
do xcomp to V O 1.0


In [103]:
prepositions = ['about', 'along', 'among', 'around', 'as', 'at', 'beside', 'besides', 'between', 'by', 'down', 'during', 'except', 'for', 'from', 'in', 'inside', 'into', 'of', 'off', 'on', 'onto', 'outside', 'over', 'through', 'to', 'toward', 'towards', 'under', 'underneath', 'until', 'up', 'upon', 'with', 'within', 'without']

def norm_ptn(counts):
    """Collapse raw patterns into normalised ``V ...`` patterns and print the totals.

    Each pattern is normalised as follows:
      1. Everything before the first 'V' (the headword) is dropped. Patterns
         without a 'V' cannot be normalised and are skipped (previously they
         raised IndexError).
      2. 'V-ing' / 'V-ed' become 'V'; 'wh-cl', 'cl', 'to-v' and 'v-ing' become
         'O' (clauses, gerunds and to-infinitives are treated as objects).
      3. At most four tokens are kept, then shortened further:
         'V prep X ...' -> three tokens; 'V X Y ...' where Y is not a
         preposition -> two tokens; 'V X prep Y' is kept whole.

    Counts of patterns that normalise to the same string are summed, and the
    resulting Counter is pretty-printed.

    Args:
        counts: Mapping of raw pattern -> occurrence count.

    Returns:
        None. The result is only printed.
    """
    # Imported locally so the function does not depend on another cell
    # having already put pprint into the kernel namespace.
    from pprint import pprint

    temp = Counter()
    for ptn, ctn in counts.items():
        # Debug aid: show passive patterns as they are encountered.
        # NOTE(review): the original comment said passives are exempt from the
        # V-ed -> V rewrite, but the code rewrites them too -- confirm intent.
        if 'be V-ed' in ptn: print(ptn)

        head = ptn.find('V')
        if head == -1:
            continue  # no headword marker: nothing to normalise
        ptn = ptn[head:]  # strip whatever precedes the headword

        ptn = ptn.replace('V-ing', 'V').replace('V-ed', 'V')
        ptn = ptn.replace('wh-cl', 'O').replace('cl', 'O').replace('to-v', 'O').replace('v-ing', 'O')

        tokens = ptn.split(' ')[:4]  # max length: 4-gram
        if len(tokens) > 2:
            if tokens[1] in prepositions:      # V prep _
                tokens = tokens[:3]
            elif tokens[2] not in prepositions:  # V O O
                tokens = tokens[:2]
            # else: V O prep O -- keep all (up to four) tokens

        temp[' '.join(tokens)] += ctn
    pprint(temp)

# Example: normalise the patterns recorded for the lemma 'discuss' under the 'xcomp' dependency.
norm_ptn(patterns['discuss']['xcomp'])

Counter({'V O': 649,
         'V': 86,
         'V O with O': 52,
         'V O in O': 36,
         'V with O': 25,
         'V in O': 13,
         'V O at O': 10,
         'V O without O': 6,
         'V O on O': 4,
         'V O as O': 4,
         'V O for O': 4,
         'V for O': 2,
         'V O along': 2,
         'V O by O': 2,
         'V v': 2,
         'V O for': 1,
         'V O from O': 1,
         'V during O': 1,
         'V before': 1,
         'V O to v': 1,
         'V O among O': 1,
         'V among O': 1,
         'V O under O': 1,
         'V O during O': 1,
         'V to O': 1,
         'V at O': 1,
         'V O as': 1})


In [56]:
# sort_dict(patterns['remind']['ROOT'])

IndexError: string index out of range