In [1]:
from collections import defaultdict, Counter
from operator import itemgetter
from utils.syntax import *
import numpy as np
import spacy
import json

In [2]:
nlp = spacy.load('en_core_web_lg') # ('en')

In [3]:
# Read patterns/sents json file
with open('bnc.json', 'r', encoding='utf8') as fs:
    BNC = json.load(fs)
    patterns, sents, ngrams = BNC['patterns'], BNC['sents'], BNC['ngrams']

In [35]:
def get_high_freq(counts):
    values = list(counts.values())
    total, avg, std = np.sum(values), np.mean(values), np.std(values)
    # print("Total: {}, Avg: {}, Std: {}".format(total, avg, std))

    return dict([(ptn, count) for ptn, count in counts.items() if count > avg + std])

def truncate_k(counts, k=10):
    return dict([(ptn, count) for ptn, count in counts.items() if count >= k])

def sort_dict(counts):
    return sorted(counts.items(), key=itemgetter(1), reverse=True)

In [36]:
def predict_ratio(ptn, patterns):
    if ptn not in patterns: 
        return 0
    if ptn == max(patterns, key=patterns.get): # 保證對
        return 1
    return patterns[ptn] / sum(patterns.values())


def categorize(ratio):
    if ratio > 0.5:
        return 'right', '{{+{}+}}'
    elif ratio < 0.1:
        return 'wrong', '[-{}-]'
    else:
        return 'unsure', '\\*{}*\\'
    
    
def correct(content):
    edits, suggestions = [], []
    for tk in nlp(content):
        if tk.tag_ in VERBS:
            ptn, ngram = dep_to_pattern(tk)
    
            high_ptns = get_high_freq(patterns[tk.lemma_][tk.dep_])
            
            ratio = predict_ratio(ptn, high_ptns)
            
            category, template = categorize(ratio)
            
            if category in ['wrong', 'unsure']:
                top_ptn = max(high_ptns, key=high_ptns.get)
                ngram = ngrams[tk.lemma_][tk.dep_][top_ptn][0]
                suggestions.append({
                    'category': category,
                    'tk': tk.text,
                    'bef': ptn,
                    'aft': top_ptn,
                    'ngram': ngram
                })
                
            edits.append(template.format(tk.text))
        else:
            edits.append(tk.text)
   
    return ' '.join(edits), suggestions

In [37]:
if __name__ == '__main__':
    user_input = '''I want to discuss about my life. I rely my ability.'''
    print(correct(user_input))

('I {+want+} to [-discuss-] about my life . I \\*rely*\\ my ability .', [{'ngram': 'to discuss difficulties', 'tk': 'discuss', 'bef': 'to V about O', 'category': 'wrong', 'aft': 'to V O'}, {'ngram': 'organisations rely on volunteers', 'tk': 'rely', 'bef': 'S V O', 'category': 'unsure', 'aft': 'S V on O'}])


In [None]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from flask import Flask, request, jsonify
from flask_cors import CORS, cross_origin

app = Flask(__name__)
CORS(app)
app.config['CORS_HEADERS'] = 'Content-Type'


# post /correct data: {content :}
@app.route('/correct' , methods=['POST'])
def start_correct():
    request_data = request.get_json()
    
    if not request_data: return jsonify({'edit': 'Should not be empty'})
    content = request_data['content']
    print(content)
    
    edit, suggestions = correct(content)
    
    return jsonify({
        'edit': edit,
        'suggestions': suggestions
    })

if __name__ == "__main__":
    app.run(host='0.0.0.0', port=1314)

 * Running on http://0.0.0.0:1314/ (Press CTRL+C to quit)
140.114.77.132 - - [07/Jun/2018 17:54:36] "[37mOPTIONS /correct HTTP/1.1[0m" 200 -
140.114.77.132 - - [07/Jun/2018 17:54:36] "[37mPOST /correct HTTP/1.1[0m" 200 -


I want to discuss about my life. I rely my ability.
