In [1]:
#!/usr/bin/env python
import sys, os, re
from math import log
from itertools import product
from collections import defaultdict, Counter
from nltk.corpus import stopwords # might use spacy stopword

import spacy
from spacy.tokenizer import Tokenizer
from spacy.util import compile_prefix_regex, compile_infix_regex, compile_suffix_regex

def custom_tokenizer(nlp):
    infix_re = re.compile(r'''[.\,\?\:\;\...\‘\’\`\“\”\"\'~]''')
    prefix_re = compile_prefix_regex(nlp.Defaults.prefixes)
    suffix_re = compile_suffix_regex(nlp.Defaults.suffixes)

    return Tokenizer(nlp.vocab, prefix_search=prefix_re.search,
                                suffix_search=suffix_re.search,
                                infix_finditer=infix_re.finditer,
                                token_match=None)

nlp = spacy.load('en_core_web_lg') 
nlp.tokenizer = custom_tokenizer(nlp)


In [2]:
def normalize(sent):
    # strip and replace multiple spaces
    return ' '.join(sent.split())


def parse_sent(sent):
    return [ (token.text, token.lemma_, token.tag_) for token in nlp(sent) ]


In [3]:
def read_pats(filename):
    pat_dict = {}
    for line in open(filename, 'r', encoding='utf8'):
        try:
            nos, pat = line.strip().split('\t')
        except:
            # print("Exception:", line)
            pass
            
        for no in nos.split(','):
            if no.startswith('#'): continue

            no = int(no)
            if no in pat_dict:
                pass
                # print("REPEATED:", no)

            pat_dict[no] = pat

    return pat_dict


re_parentheses = re.compile('\((?P<info>.*)\)$')
re_level =  re.compile('([ABC][12])')
# re_level =  re.compile('(; | |;|^)([ABC][12])( |;|$)') # stricter

def read_sents(filename):
    sent_dict = {}
    
    for line in open(filename, 'r', encoding='utf8'):
        no, level, sents = line.strip().split('\t')
        no = int(no)
        
        new_sents = []
        for sent in sents.split('|||'):
            match = re.search(re_parentheses, sent)
            if not match: continue
                
            info   = match.groupdict()['info']
            origin = re.findall(re_level, info)
            origin = origin[0] if origin else None
            sent   = sent[:match.start()]
            
            new_sents.append((origin, normalize(sent)))

        sent_dict[no] = {'level': level, 'sents': new_sents}

    return sent_dict

In [4]:
def group_patterns(pat_dict, sent_dict):
    prev = 0
    pat_groups = [[]]
    
    for no, pat in pat_dict.items():
        level = sent_dict[no]['level']
        
        # create new group
        if level_table[level] < prev:
            pat_groups.append([])
            
        # update old group
        else:
            pat_groups[-1].append((no, level, pat))
        
        prev = level_table[level]  
        
    return pat_groups

In [5]:
# only used for testing
def is_match(sent, pat):
    parse = parse_sent(sent)
    
    words =  ' '.join([x for x, y, z in parse])
    lemmas = ' '.join([y.lower() for x, y, z in parse])
    tags =   ' '.join([z for x, y, z in parse])

    ### rule to catch
    stopwords = re.findall('[a-z]+', pat)
    lemma_tags = ' '.join([z if y not in stopwords else y for _, y, z in parse])
    origin_tags = ' '.join([z if x not in stopwords else x for x, _, z in parse])

    return re.search(pat, lemma_tags) or re.search(pat, origin_tags)
    

In [6]:
# return the index of start token and end token
def align(re_match, tags):
    start, end = re_match.span()

    length = 0
    for i, token in enumerate(tags.split(' ')):
        if length >= start: break
            
        length += len(token) + 1 # space len
            
    match_len = len(re_match.group().split(' '))
    return (i, i+match_len)
    

def match_pats(sent, pat_groups):
    parse = parse_sent(sent)
            
    words =  ' '.join([x for x, y, z in parse])
    lemmas = ' '.join([y.lower() for x, y, z in parse])
    tags =   ' '.join([z for x, y, z in parse])

    ### rule to catch
    group_gets = defaultdict(lambda: [])
    for i, group in enumerate(pat_groups):
        for each in group:
            no, level, pat = each

            stopwords = re.findall('[a-z]+', pat)
            lemma_tags = ' '.join([z if y not in stopwords else y for _, y, z in parse])
            origin_tags = ' '.join([z if x not in stopwords else x for x, _, z in parse])
            
            lemma_match  = re.search(pat, lemma_tags)
            origin_match = re.search(pat, origin_tags)
            
            if not lemma_match and not origin_match:
                continue
            elif lemma_match:
                start, end = align(lemma_match, lemma_tags)
            elif origin_match:
                start, end = align(origin_match, origin_tags)
            else:
                pass
            
            ngram = ' '.join([el[0] for el in parse[start:end]])
            group_gets[i].append((no, level, pat, ngram))
            
    return group_gets


def recommend_pats(group_gets, pat_groups):
    group_recs = {}
    for i, gets in group_gets.items():
        top_level = max(gets, key=lambda x: level_table[x[1]])[1]
        recommend = list(filter(lambda el: level_table[top_level] < level_table[el[1]], pat_groups[i]))
        group_recs[i] = recommend
        
    return group_recs

In [7]:
level_table = {"A1": 1, "A2": 2, "B1": 3, "B2": 4, "C1": 5, "C2": 6 }

In [45]:
pat_dict  = read_pats('egp.pattern.txt')
sent_dict = read_sents('egp.train.txt')

### TEMP
delete = [no for no in pat_dict if no > 148 ]
for no in delete: del pat_dict[no]
delete = [no for no in sent_dict if no not in pat_dict]
for no in delete: del sent_dict[no]
###

pat_groups = group_patterns(pat_dict, sent_dict)

In [34]:
def main(content):
    content = normalize(content)
    
    sent_profiles = []
    for sent in nlp(content).sents:
        sent = sent.text
        
        group_gets = match_pats(sent, pat_groups) # match patterns in groups

        if not group_gets: continue # non-match
        
        group_recs  = recommend_pats(group_gets, pat_groups) # recommend patterns in same group
        
        sent_profiles.append((sent, group_gets, group_recs))
    
    return sent_profiles

In [None]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from flask import Flask, render_template, request, jsonify
from flask_cors import CORS, cross_origin

app = Flask(__name__)

app.config['CORS_HEADERS'] = 'Content-Type'
CORS(app)


@app.route('/')
def index():
    pass


# post /correct data: { content: str }
@app.route('/profiling', methods=['POST'])
def profiling():
    request_data = request.get_json()
    if not request_data: return jsonify({'edit': 'Should not be empty'})
    
    content = request_data['content']
    print(content)
    
    sent_profiles = main(content)

    return jsonify({'result': sent_profiles})


if __name__ == "__main__":
    app.run(host='0.0.0.0', port=1315)

 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
   Use a production WSGI server instead.
 * Debug mode: off


 * Running on http://0.0.0.0:1315/ (Press CTRL+C to quit)
123.195.194.140 - - [19/Feb/2019 00:32:59] "[37mOPTIONS /profiling HTTP/1.1[0m" 200 -
123.195.194.140 - - [19/Feb/2019 00:32:59] "[37mPOST /profiling HTTP/1.1[0m" 200 -


I eat an apple. However, it is soar.


123.195.194.140 - - [19/Feb/2019 00:36:46] "[37mOPTIONS /profiling HTTP/1.1[0m" 200 -
123.195.194.140 - - [19/Feb/2019 00:36:46] "[37mPOST /profiling HTTP/1.1[0m" 200 -


I eat an apple. However, it is soar.


123.195.194.140 - - [19/Feb/2019 00:37:02] "[37mOPTIONS /profiling HTTP/1.1[0m" 200 -
123.195.194.140 - - [19/Feb/2019 00:37:02] "[37mPOST /profiling HTTP/1.1[0m" 200 -


I eat an apple. However, it is soar.


123.195.194.140 - - [19/Feb/2019 00:37:17] "[37mOPTIONS /profiling HTTP/1.1[0m" 200 -
123.195.194.140 - - [19/Feb/2019 00:37:17] "[37mPOST /profiling HTTP/1.1[0m" 200 -


I eat an apple. However, it is soar.


123.195.194.140 - - [19/Feb/2019 00:38:13] "[37mOPTIONS /profiling HTTP/1.1[0m" 200 -
123.195.194.140 - - [19/Feb/2019 00:38:13] "[37mPOST /profiling HTTP/1.1[0m" 200 -


I eat an apple. However, it is soar.
I am really a good man.


In [35]:
main('I eat an apple. However, it is soar.')

[('However, it is soar.',
  defaultdict(<function __main__.match_pats.<locals>.<lambda>()>,
              {5: [(90, 'A2', '(also|however|so)', 'However'),
                (102, 'B1', '(RB|RBR|RBS) ,', 'However ,')],
               6: [(116, 'A2', '(so|really|too) VB.*|RB', 'However'),
                (119,
                 'B1',
                 '(VB.* .* RB)|(RB .* VB.*)',
                 'However , it is soar .')],
               7: [(128, 'A1', 'RB', 'However'),
                (133, 'A2', '^RB', 'However'),
                (135, 'A2', 'RB', 'However'),
                (137, 'A2', '^RB', 'However')],
               8: [(142, 'A2', 'RB', 'However')]}),
  {5: [(105,
     'B2',
     '(recently|instantly|shortly|permanently|simultaneously|nowadays)'),
    (106, 'B2', '(seriously|urgently|illegally)'),
    (107,
     'C1',
     '(almost|absolutely|awfully|badly|barely|completely|decidedly|deeply|enough|enormously|entirely|extremely|fairly|far|fully|greatly|hardly|highly|how|incredibly|i

In [42]:
%%time

if __name__ == '__main__':
    for no, entry in sent_dict.items():
        level = entry['level']
        sents = entry['sents']
        
        # if no not in patterns_number: continue

        for origin_level, sent in sents:
            if is_match(sent, pat_dict[no]):
                pass
            else:
                print(no, sent)
                
            # main process
#             print(sent)
#             group_gets = match_pats(sent, pat_groups) # match patterns in groups
#             print(group_gets)
#             group_recs  = recommend_pats(group_gets, pat_groups) # recommend patterns in same group
#             print(group_recs)

12 The latter ĄV fat, ugly and sick ĄV blows his top when Ralph tells the others about the fat boy's nice name, as he wanted to keep it secret.
19 For further information, contact Joey Hung.
47 I love her because she is friendly.
51 Maria realised that being kind and trying to make other people happy is always the best way!
94 I'm sorry you can't find it.
123 There are probably very few of us who have never been to a zoo.
143 [about a film] It came out just yesterday.
CPU times: user 11.7 s, sys: 23.7 s, total: 35.3 s
Wall time: 1.78 s


In [None]:
79  3.93 s
148 7.63 s
prog = re.compile(pattern)
result = prog.match(string)


In [None]:
parse_sent("The cars will fly around the buildings like planes, and they will be faster and cheaper.")
