In [1]:
from collections import defaultdict, Counter
from operator import itemgetter
from utils.syntax import *
import numpy as np
import spacy
import json

In [2]:
# Load the large English spaCy model; `nlp` is the pipeline object used to parse text below.
# NOTE(review): the trailing "('en')" looks like a previously used model name — confirm.
nlp = spacy.load('en_core_web_lg') # ('en')

In [3]:
# Load the pattern / sentence / n-gram tables stored in bnc.json
with open('bnc.json', mode='r', encoding='utf8') as fs:
    BNC = json.load(fs)

# Expose each top-level table under its own name for the cells below
patterns = BNC['patterns']
sents = BNC['sents']
ngrams = BNC['ngrams']

In [35]:
def get_high_freq(counts):
    """Keep only the patterns whose count is unusually high.

    A pattern is kept when its count is strictly greater than the mean plus
    one standard deviation (NumPy's default, population form) of all counts.

    Args:
        counts: mapping of pattern -> numeric count.

    Returns:
        dict: the entries of ``counts`` above the threshold, in their original
        iteration order. Empty when ``counts`` is empty. Note that when every
        count is identical (including a single entry) nothing exceeds the
        threshold, so the result is empty as well.
    """
    # np.mean / np.std on an empty sequence emit RuntimeWarnings and yield NaN;
    # short-circuit so callers simply get "no high-frequency patterns".
    if not counts:
        return {}

    values = list(counts.values())
    threshold = np.mean(values) + np.std(values)

    return {ptn: count for ptn, count in counts.items() if count > threshold}

def truncate_k(counts, k=10):
    """Return the entries of ``counts`` whose count is at least ``k``.

    Args:
        counts: mapping of pattern -> count.
        k: minimum count (inclusive) an entry needs to be kept.

    Returns:
        dict: the surviving entries, in their original iteration order.
    """
    kept = {}
    for pattern, freq in counts.items():
        if freq >= k:
            kept[pattern] = freq
    return kept

def sort_dict(counts):
    """Return the (pattern, count) pairs of ``counts``, highest count first.

    Pairs with equal counts keep the order they had in ``counts``.
    """
    pairs = list(counts.items())
    pairs.sort(key=lambda pair: pair[1], reverse=True)
    return pairs

In [36]:
def predict_ratio(ptn, patterns):
    """Score how typical ``ptn`` is among the counted ``patterns``.

    Args:
        ptn: the pattern to score.
        patterns: mapping of pattern -> count.

    Returns:
        0 when ``ptn`` is not a key of ``patterns``; 1 when ``ptn`` is the
        most frequent pattern; otherwise its count divided by the sum of all
        counts.
    """
    if ptn in patterns:
        most_frequent = max(patterns, key=patterns.get)
        if ptn == most_frequent:
            # The top pattern is always treated as guaranteed correct
            return 1
        total = sum(patterns.values())
        return patterns[ptn] / total
    return 0


def categorize(ratio):
    """Map a pattern ratio to a category label and its markup template.

    Returns:
        tuple: ``(label, template)`` where ``template`` has one ``{}``
        placeholder for the token text:
        - ratio above 0.5  -> 'right'
        - ratio below 0.1  -> 'wrong'
        - anything else    -> 'unsure'
    """
    label, template = 'unsure', '\\*{}*\\'
    if ratio > 0.5:
        label, template = 'right', '{{+{}+}}'
    elif ratio < 0.1:
        label, template = 'wrong', '[-{}-]'
    return label, template
    
    
def correct(content):
    """Mark every verb in ``content`` by how typical its usage pattern is.

    Each token whose tag is in ``VERBS`` is turned into a grammar pattern with
    ``dep_to_pattern`` and scored (``predict_ratio``) against the
    high-frequency patterns recorded in ``patterns`` for its lemma and
    dependency label. The verb is then wrapped in the template chosen by
    ``categorize``; for 'wrong' and 'unsure' verbs the most frequent pattern
    is recorded as a suggestion together with an example n-gram.

    Verbs with no usable statistics (lemma or dependency label missing from
    ``patterns``, or no pattern standing out as high-frequency) are emitted
    unmarked instead of raising KeyError / ValueError.

    Args:
        content (str): text to check.

    Returns:
        tuple: ``(edit, suggestions)``. ``edit`` is the token texts joined by
        single spaces, with judged verbs wrapped in their category template.
        ``suggestions`` is a list of dicts with the keys 'category', 'tk',
        'bef' (pattern found), 'aft' (most frequent pattern) and 'ngram'
        (first example n-gram for 'aft', or '' when none is stored).
    """
    edits, suggestions = [], []
    for tk in nlp(content):
        if tk.tag_ not in VERBS:
            edits.append(tk.text)
            continue

        # Only the pattern is needed; the second element of the result is unused
        ptn, _ = dep_to_pattern(tk)

        # Lemmas / dependency labels absent from the corpus have no statistics
        verb_counts = patterns.get(tk.lemma_, {}).get(tk.dep_, {})
        high_ptns = get_high_freq(verb_counts)
        if not high_ptns:
            # Nothing to compare against: leave the verb unmarked rather than
            # calling max() on an empty mapping further down.
            edits.append(tk.text)
            continue

        ratio = predict_ratio(ptn, high_ptns)
        category, template = categorize(ratio)

        if category in ('wrong', 'unsure'):
            top_ptn = max(high_ptns, key=high_ptns.get)
            # Fall back to an empty example when no n-gram is stored for top_ptn
            examples = ngrams.get(tk.lemma_, {}).get(tk.dep_, {}).get(top_ptn) or ['']
            suggestions.append({
                'category': category,
                'tk': tk.text,
                'bef': ptn,
                'aft': top_ptn,
                'ngram': examples[0]
            })

        edits.append(template.format(tk.text))

    return ' '.join(edits), suggestions

In [37]:
if __name__ == '__main__':
    # Quick demo on two sentences whose verbs use questionable patterns
    user_input = 'I want to discuss about my life. I rely my ability.'
    print(correct(user_input))

('I {+want+} to [-discuss-] about my life . I \\*rely*\\ my ability .', [{'ngram': 'to discuss difficulties', 'tk': 'discuss', 'bef': 'to V about O', 'category': 'wrong', 'aft': 'to V O'}, {'ngram': 'organisations rely on volunteers', 'tk': 'rely', 'bef': 'S V O', 'category': 'unsure', 'aft': 'S V on O'}])


In [None]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from flask import Flask, request, jsonify
from flask_cors import CORS, cross_origin

# Create the Flask application and wrap it with flask-cors.
app = Flask(__name__)
CORS(app)
# NOTE(review): this config key is set after CORS(app) was constructed — confirm
# that flask-cors still picks it up (and that 'CORS_HEADERS' is a recognised key).
app.config['CORS_HEADERS'] = 'Content-Type'


# POST /correct   JSON body: {"content": "<text to check>"}
@app.route('/correct', methods=['POST'])
def start_correct():
    """Run ``correct`` on the posted text and return the result as JSON.

    Expects a JSON object with a string field ``content``.

    Returns:
        A JSON response with the keys 'edit' and 'suggestions'. When the body
        is empty, is not a JSON object, or has no string ``content``, 'edit'
        carries the message 'Should not be empty' and 'suggestions' is an
        empty list, so both outcomes share the same response shape.
    """
    request_data = request.get_json()

    # Guard against a non-object body, a missing key and a non-string value:
    # each of those used to surface as an unhandled exception (HTTP 500).
    content = request_data.get('content') if isinstance(request_data, dict) else None
    if not isinstance(content, str):
        return jsonify({'edit': 'Should not be empty', 'suggestions': []})
    print(content)

    edit, suggestions = correct(content)

    return jsonify({
        'edit': edit,
        'suggestions': suggestions
    })

if __name__ == "__main__":
    # Start Flask's built-in server on port 1314, bound to 0.0.0.0 (all interfaces).
    # NOTE(review): this call blocks the cell until the server is interrupted, and
    # the service is reachable from other hosts — confirm that is intended.
    app.run(host='0.0.0.0', port=1314)

 * Running on http://0.0.0.0:1314/ (Press CTRL+C to quit)
140.114.77.132 - - [07/Jun/2018 17:54:36] "[37mOPTIONS /correct HTTP/1.1[0m" 200 -
140.114.77.132 - - [07/Jun/2018 17:54:36] "[37mPOST /correct HTTP/1.1[0m" 200 -


I want to discuss about my life. I rely my ability.


In [27]:
# Inspect the sentences stored for 'rely' as ROOT under the 'S v V O' pattern
sents['rely']['ROOT']['S v V O']

["people who were able to make their own fun , did n't rely store-bought fun .",
 'they did not rely mighty sinews and glittering steel as a true warrior should .',
 'this mechanism does not rely a disciplinary code nor on a court-like body .']

In [20]:
# Rank every pattern counted for 'rely' as ROOT, most frequent first
a = Counter(patterns['rely']['ROOT'])
a.most_common()

[('S V on O', 214),
 ('S V O', 96),
 ('S V upon O', 37),
 ('S be V-ing on O', 26),
 ('S V on O to-v', 18),
 ('S V on cl', 12),
 ('S V cl', 11),
 ('S V O to-v', 11),
 ('v V on O', 9),
 ('cl S V on O', 8),
 ('V on O', 8),
 ('S V for O', 7),
 ('in O S V on O', 6),
 ('S V', 6),
 ('S v V on O', 6),
 ('S V on O for O', 6),
 ('like O S V on O', 5),
 ('V O', 5),
 ('S V on O O', 5),
 ('v V O', 4),
 ('S V O for O', 4),
 ('S V upon O to-v', 3),
 ('cl S V upon O', 3),
 ('for O S V on O', 3),
 ('S v V O', 3),
 ('S S V on O', 3),
 ('cl S V O', 3),
 ('in O S V O', 3),
 ('be S V-ing on O', 3),
 ('S V on O cl', 3),
 ('v S V O', 3),
 ('S for O V on O', 3),
 ('of O S V on O', 2),
 ('S V on O with O', 2),
 ('S V to-v', 2),
 ('S V O in O', 2),
 ('S V on', 2),
 ('S in O V on O', 2),
 ('S V upon cl', 2),
 ('at O S V on O', 2),
 ('S V on for O', 2),
 ('S V O cl', 2),
 ('cl V on O', 2),
 ('S V for O on O', 2),
 ('cl S V O to-v', 2),
 ('in O V on O', 2),
 ('like O S V on O for O', 1),
 ('S V on O for', 1),
 ('V

In [13]:
# Inspect the sentences stored for 'rely' under the 'dobj' dependency label, keyed by pattern
sents['rely']['dobj']

{'V on O': ['the picture formed relies on the large barbs to catch the eye , and move it on to the smaller tetras .',
  "one of the major differences between speaking and writing is speaking relies more on the hearer 's memory than writing does ."]}