In [None]:
#!/usr/bin/env python
%load_ext autoreload
%autoreload 2

import spacy
from spacy.tokenizer import Tokenizer
from spacy.util import compile_prefix_regex, compile_infix_regex, compile_suffix_regex

def custom_tokenizer(nlp):
    """Build a spaCy ``Tokenizer`` for ``nlp`` with a hand-written infix pattern.

    Prefix and suffix patterns are compiled from ``nlp.Defaults``; the infix
    pattern is the punctuation/quote character class defined below. No
    ``rules`` are passed and ``token_match`` is explicitly ``None``.

    Args:
        nlp: a loaded spaCy pipeline; its ``vocab`` and ``Defaults`` are read.

    Returns:
        A ``Tokenizer`` bound to ``nlp.vocab``.
    """
    import re

    # Character class handed to the tokenizer as its infix finder.
    infix_pattern = re.compile(r'''[.\,\?\:\;\...\‘\’\`\“\”\"\'~]''')

    # Prefix/suffix handling is taken unchanged from the pipeline defaults.
    find_prefix = compile_prefix_regex(nlp.Defaults.prefixes).search
    find_suffix = compile_suffix_regex(nlp.Defaults.suffixes).search

    return Tokenizer(
        nlp.vocab,
        prefix_search=find_prefix,
        suffix_search=find_suffix,
        infix_finditer=infix_pattern.finditer,
        token_match=None,
    )

# Load the 'en_core_web_lg' spaCy pipeline once for the whole notebook, then
# replace its tokenizer with the one built by custom_tokenizer (order matters:
# the tokenizer is constructed from this pipeline's vocab and defaults).
nlp = spacy.load('en_core_web_lg') 
nlp.tokenizer = custom_tokenizer(nlp)

In [None]:
from nltk.tokenize.treebank import TreebankWordDetokenizer

from utils.preprocess import normalize
from utils.grammar import iterate_all_patterns, iterate_all_gets
from utils.vocabulary import level_vocab

In [None]:
def main_profiling(content):
    """Build one profile dict per sentence of ``content``.

    The raw text is segmented into sentences by ``nlp`` (with 'ner' disabled);
    each sentence's text is then normalized and parsed again before matching.

    Args:
        content: raw text to profile.

    Returns:
        A list of dicts with the keys 'sent' (original sentence text),
        'parse' (space-joined tokens of the normalized parse), 'scratches',
        'gets' and 'recs'.
    """
    def profile_sentence(span):
        normalized_doc = nlp(normalize(span.text), disable=['ner'])

        # Step 1: match patterns in groups (less-overlapped patterns, plus
        # patterns for non-matching words).
        matched, unmatched = iterate_all_patterns(normalized_doc)

        # Step 2: recommend related higher patterns from the same group.
        recommended = iterate_all_gets(matched)

        return {
            'sent': span.text,
            'parse': ' '.join(token.text for token in normalized_doc),
            'scratches': unmatched,
            'gets': matched,
            'recs': recommended,
        }

    segmented = nlp(content, disable=['ner'])
    return [profile_sentence(span) for span in segmented.sents]


def main_vocabuing(sentence):
    """Return the vocabulary levels for a single sentence.

    The sentence is normalized, parsed with ``nlp``, and the resulting parse
    is handed to ``level_vocab``, whose result is returned unchanged.
    """
    return level_vocab(nlp(normalize(sentence)))
        

In [None]:
# NOTE(review): `Egp` is not imported or defined in any cell visible in this
# notebook, so on a fresh kernel this cell raises NameError. TODO: import it
# explicitly in the imports cell (source module needs to be confirmed).
#
# Scratch lookup: resolve a sample phrase to a key (`no`), then print the
# pattern stored under that key plus its statement and highlight.
no = Egp.get_possible("state-of-the-art")
print(no)
print(Egp.get_patterns()[no].pattern)
print(Egp.get_statement(no))
print(Egp.get_highlight(no))

In [None]:
# Smoke test: run the pattern matcher on one short sentence. The call returns
# a (gets, scratches) pair, the same way it is unpacked in main_profiling.
# Unlike main_profiling, the sentence is neither normalized nor parsed with
# 'ner' disabled here.
group_gets, scratches = iterate_all_patterns(nlp("He is nice and friendly."))

# # group_recs  = iterate_all_gets(group_gets) # recommend patterns in same group

In [None]:
# %%time

# if __name__ == '__main__':
#     for no, entry in Egp.get_examples().items():
#         level = entry['level']
#         sents = entry['sents']
        
#         # if no not in patterns_number: continue

#         for origin_level, sent in sents:
# #             parse = nlp(sent)
# #             if is_match(parse, Egp.get_patterns()[no]):
# #                 pass
# #             else:
# #                 print(no, Egp.get_patterns()[no].pattern, sent)
                
#             # main process
#             print(sent)
#             group_gets = iterate_all_patterns(parse, pat_groups) # match patterns in groups
#             print(group_gets)
#             group_recs  = iterate_all_gets(group_gets, pat_groups) # recommend patterns in same group
#             print(group_recs)

In [None]:
# doc = nlp("it is the biggest and oldest museum in libya . … it is the biggest and <w> oldest museum </w> in libya .")
# for a in doc:
#     print(a.text, a.lemma_, a.norm_, a.tag_, a.pos_, a.i)
# doc.text

In [None]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from flask import Flask, render_template, request, jsonify
from flask_cors import CORS, cross_origin

# Flask application object; the route handlers below are registered on it.
app = Flask(__name__)

# Enable cross-origin requests on the app through flask_cors, with the
# CORS_HEADERS setting restricted to 'Content-Type'.
app.config['CORS_HEADERS'] = 'Content-Type'
CORS(app)


@app.route('/')
def index():
    """Landing route: return a short plain-text description of the service.

    A Flask view must return a response value; returning ``None`` makes Flask
    raise a ``TypeError`` and answer the request with HTTP 500.
    """
    return 'POST JSON to /profiling ({"content": str}) or /vocabuing ({"sentence": str}).'


# post /profiling data: { content: str }
@app.route('/profiling', methods=['POST'])
def profiling():
    """Profile the posted text sentence by sentence.

    Expects a JSON object body of the form ``{"content": str}``.

    Returns:
        ``{"profiles": [...]}`` holding the list built by ``main_profiling``.
        ``{"result": "Should not be empty"}`` when the JSON body is missing or
        empty (status code left as it was for existing clients).
        HTTP 400 with a ``result`` message when the body is not an object or
        its ``content`` value is missing or not a string.
    """
    request_data = request.get_json()
    if not request_data:
        return jsonify({'result': 'Should not be empty'})

    # A JSON array/scalar body, or a missing/non-string 'content', would
    # otherwise escape as an unhandled TypeError/KeyError and produce HTTP 500.
    content = request_data.get('content') if isinstance(request_data, dict) else None
    if not isinstance(content, str):
        return jsonify({'result': "'content' must be a string"}), 400

    return jsonify({'profiles': main_profiling(content)})


# post /vocabuing data: { sentence: str }
@app.route('/vocabuing', methods=['POST'])
def vocabuing():
    """Return the vocabulary levels for the posted sentence.

    Expects a JSON object body of the form ``{"sentence": str}``.

    Returns:
        ``{"vocabs": ...}`` holding the result of ``main_vocabuing``.
        ``{"result": "Should not be empty"}`` when the JSON body is missing or
        empty (status code left as it was for existing clients).
        HTTP 400 with a ``result`` message when the body is not an object or
        its ``sentence`` value is missing or not a string.
    """
    request_data = request.get_json()
    if not request_data:
        return jsonify({'result': 'Should not be empty'})

    # A JSON array/scalar body, or a missing/non-string 'sentence', would
    # otherwise escape as an unhandled TypeError/KeyError and produce HTTP 500.
    sentence = request_data.get('sentence') if isinstance(request_data, dict) else None
    if not isinstance(sentence, str):
        return jsonify({'result': "'sentence' must be a string"}), 400

    return jsonify({'vocabs': main_vocabuing(sentence)})


# Start the Flask server on port 1316. The guard is also true when this cell
# is executed inside a notebook kernel, so running the cell starts the server.
# NOTE(review): host='0.0.0.0' listens on every network interface — confirm
# this exposure is intended outside local development.
if __name__ == "__main__":
    app.run(host='0.0.0.0', port=1316)

In [None]:
# egs = []

# for index, entry in Egp.get_examples().items():
#     if index not in Egp.get_patterns(): continue
        
#     eg = []
#     for sent in entry['sents']:
#         level, sent = sent
#         parse = nlp(normalize(sent))
    
#         matches = match_pat(parse, index, Egp.get_patterns()[index])
#         if not matches: continue
            
#         sent = []
#         for tk in parse:
#             starts = [match[0] for match in matches]
#             ends = [match[1] for match in matches]                

#             if tk.i in starts:
#                 sent.extend(['<w>', tk.text])
#             elif tk.i in ends:
#                 sent.extend(['</w>', tk.text])
#             else:
#                 sent.append(tk.text)
        
#         sent = ['I' if tk == 'i' else tk for tk in sent]
#         sent = ' '.join(sent)
#         eg.append(sent)

#     if not eg: egs.append((index, entry['sents'][0][1]))
#     else:      egs.append((index, eg[0]))
        
# with open('egp.highlights.txt', 'w', encoding='utf8') as ws:
#     for line in egs:
#         print(*line, sep='\t', file=ws)