In [1]:
#!/usr/bin/env python
import sys, os, re
from math import log
from itertools import product
from collections import defaultdict, Counter
from nltk.corpus import stopwords # might use spacy stopword

import spacy
from spacy.tokenizer import Tokenizer
from spacy.util import compile_prefix_regex, compile_infix_regex, compile_suffix_regex

def custom_tokenizer(nlp):
    """Build a Tokenizer for ``nlp`` with a custom infix rule.

    Prefix and suffix matching reuse the rules found on ``nlp.Defaults``;
    infix splitting uses a single character class of punctuation and quote
    characters defined here.

    Args:
        nlp: a loaded pipeline exposing ``vocab``, ``Defaults.prefixes`` and
            ``Defaults.suffixes``.

    Returns:
        A ``Tokenizer`` bound to ``nlp.vocab``.
    """
    infix_pattern = re.compile(r'''[.\,\?\:\;\...\‘\’\`\“\”\"\'~]''')
    prefix_pattern = compile_prefix_regex(nlp.Defaults.prefixes)
    suffix_pattern = compile_suffix_regex(nlp.Defaults.suffixes)

    tokenizer_options = {
        'prefix_search': prefix_pattern.search,
        'suffix_search': suffix_pattern.search,
        'infix_finditer': infix_pattern.finditer,
        'token_match': None,
    }
    return Tokenizer(nlp.vocab, **tokenizer_options)

# Load the 'en_core_web_lg' spaCy pipeline once for the whole notebook, then
# swap its tokenizer for the one built by custom_tokenizer(). The order
# matters: custom_tokenizer() reads nlp.vocab and nlp.Defaults.
nlp = spacy.load('en_core_web_lg') 
nlp.tokenizer = custom_tokenizer(nlp)


In [2]:
def norm_sent(sent):
    """Return ``sent`` with leading/trailing whitespace removed and every
    internal run of whitespace collapsed to a single space."""
    pieces = sent.split()
    return ' '.join(pieces)


def parse_sent(text):
    """Parse ``text`` with the module-level ``nlp`` pipeline.

    The stripped text is split into sentences, and each sentence's text is
    run through ``nlp`` again on its own.

    Returns:
        A flat list of ``(token text, lemma, tag)`` tuples, in order.
    """
    triples = []
    for sentence in nlp(text.strip()).sents:
        # TODO: why nlp sent.text again?
        for token in nlp(sentence.text):
            triples.append((token.text, token.lemma_, token.tag_))
    return triples


In [3]:
def read_pats(filename):
    """Load numbered regex patterns from a tab-separated text file.

    A usable line holds exactly two tab-separated fields: a comma-separated
    list of pattern numbers and the regex source. Numbers prefixed with ``#``
    are skipped. Lines that do not split into exactly two fields (blank
    lines, headers, notes) are ignored instead of falling through to the
    number loop with undefined or stale values.

    Args:
        filename: path of the UTF-8 pattern file.

    Returns:
        dict mapping each int pattern number to its compiled regex. When a
        number occurs more than once, the last occurrence wins.

    Raises:
        ValueError: a non-commented number is not an integer.
        re.error: a pattern used by at least one number is not a valid regex.
    """
    pat_dict = {}
    # Context manager so the file handle is closed even if parsing fails.
    with open(filename, 'r', encoding='utf8') as pat_file:
        for line in pat_file:
            fields = line.strip().split('\t')
            if len(fields) != 2:
                # Malformed line: skip it explicitly.
                continue
            nos, pat = fields

            for no in nos.split(','):
                if no.startswith('#'):
                    continue
                # Compiled here, not before the loop, so a line whose numbers
                # are all commented out never has its regex compiled.
                pat_dict[int(no)] = re.compile(pat)

    return pat_dict


# A "(...)" annotation that ends the string; the text between the parentheses
# is captured as the named group "info". Raw string literals are used so that
# "\(" and "\)" are not treated as (invalid) string escape sequences.
# NOTE(review): ".*" is greedy and the search starts at the leftmost "(", so a
# sentence containing an earlier "(" is cut at that first parenthesis; confirm
# this is acceptable for the data.
re_parentheses = re.compile(r'\((?P<info>.*)\)$')
# A level label: one of A, B, C followed by 1 or 2, matched anywhere.
re_level =  re.compile(r'([ABC][12])')
# re_level =  re.compile('(; | |;|^)([ABC][12])( |;|$)') # stricter

def read_sents(filename):
    """Load annotated example sentences from a tab-separated text file.

    Each non-blank line holds three tab-separated fields: an integer number,
    a level label, and one or more sentences separated by ``|||``. A sentence
    is kept only if it ends with a parenthesised annotation (matched by the
    module-level ``re_parentheses``). The first level label found inside that
    annotation (via ``re_level``) becomes the sentence's origin, or ``None``
    if there is none. The annotation is removed and the remaining text is
    normalised with ``norm_sent``.

    Args:
        filename: path of the UTF-8 sentence file.

    Returns:
        dict mapping number -> ``{'level': str, 'sents': [(origin, sentence), ...]}``.

    Raises:
        ValueError: a non-blank line does not have exactly three fields, or
            its number is not an integer.
    """
    sent_dict = {}

    # Context manager so the file handle is closed even if parsing fails.
    with open(filename, 'r', encoding='utf8') as sent_file:
        for line in sent_file:
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline) carry no entry.
                continue

            no, level, sents = line.split('\t')
            no = int(no)

            new_sents = []
            for sent in sents.split('|||'):
                # Whitespace next to the "|||" separator would otherwise stop
                # the end-anchored annotation regex from matching.
                sent = sent.strip()
                match = re_parentheses.search(sent)
                if not match:
                    continue

                levels = re_level.findall(match.group('info'))
                origin = levels[0] if levels else None

                new_sents.append((origin, norm_sent(sent[:match.start()])))

            sent_dict[no] = {'level': level, 'sents': new_sents}

    return sent_dict

In [4]:
def group_patterns(pat_dict, sent_dict):
    """Split the patterns into consecutive groups of non-decreasing level.

    Patterns are visited in ``pat_dict`` iteration order. Each pattern's
    level label is read from ``sent_dict[no]['level']`` and ranked with the
    module-level ``level_table``. A new group is opened whenever the rank
    drops below that of the previous pattern. The pattern that triggers the
    new group is stored as that group's first member.

    Args:
        pat_dict: mapping of pattern number -> pattern.
        sent_dict: mapping of pattern number -> dict with a ``'level'`` key.

    Returns:
        list of groups, each a list of ``(no, level, pat)`` tuples. With an
        empty ``pat_dict`` the result is ``[[]]``.

    Raises:
        KeyError: a pattern number is missing from ``sent_dict`` or its level
            label is missing from ``level_table``.
    """
    prev = 0
    pat_groups = [[]]

    for no, pat in pat_dict.items():
        level = sent_dict[no]['level']
        rank = level_table[level]

        # A drop in rank opens a new group.
        if rank < prev:
            pat_groups.append([])

        # Always record the pattern, including the one that opened a group.
        pat_groups[-1].append((no, level, pat))

        prev = rank

    return pat_groups

In [12]:
# only used for testing
def is_match(sent, pat):
    """Check whether ``pat`` matches the tag/word rendering of ``sent``.

    The sentence is parsed with ``parse_sent`` and rendered twice as a
    space-separated string. Each token appears as its tag, unless its lemma
    (first rendering) or its surface form (second rendering) equals one of
    the lowercase letter runs found in the pattern source; in that case the
    lemma or surface form is kept literally.

    Args:
        sent: sentence text.
        pat: regex as a string or a compiled pattern (``read_pats`` stores
            compiled patterns).

    Returns:
        The match object from the first rendering that matches, else ``None``
        (use the result for its truthiness).
    """
    parse = parse_sent(sent)

    # re.findall needs a string, so read the source text of a compiled
    # pattern; a plain string is used as-is.
    pat_source = getattr(pat, 'pattern', pat)
    # Every lowercase letter run in the pattern is treated as a literal word.
    literal_words = set(re.findall('[a-z]+', pat_source))

    lemma_tags = ' '.join(
        [lemma if lemma in literal_words else tag for _, lemma, tag in parse])
    origin_tags = ' '.join(
        [word if word in literal_words else tag for word, _, tag in parse])

    return re.search(pat, lemma_tags) or re.search(pat, origin_tags)
    

In [13]:
def recommend(sent, pat_groups):
    """Suggest higher-level patterns from groups the sentence already matches.

    The sentence is parsed once with ``parse_sent``. Every pattern in a group
    is tested against the sentence using the same tag/word renderings as
    ``is_match``. If at least one pattern in the group matches, every entry
    of that group whose level ranks strictly above the highest matched level
    (per the module-level ``level_table``) is recommended.

    Args:
        sent: sentence text.
        pat_groups: list of groups as produced by ``group_patterns``; each
            group is a list of ``(no, level, pat)`` tuples where ``pat`` is a
            regex string or a compiled pattern.

    Returns:
        list of the recommended ``(no, level, pat)`` tuples, in group order.
        Empty when nothing matches or nothing ranks higher.
    """
    parse = parse_sent(sent)

    recommendations = []
    for group in pat_groups:
        matched_ranks = []
        for no, level, pat in group:
            # re.findall needs a string, so read the source text of a
            # compiled pattern; a plain string is used as-is.
            pat_source = getattr(pat, 'pattern', pat)
            # Lowercase letter runs in the pattern are treated as literal words.
            literal_words = set(re.findall('[a-z]+', pat_source))

            lemma_tags = ' '.join(
                [lemma if lemma in literal_words else tag for _, lemma, tag in parse])
            origin_tags = ' '.join(
                [word if word in literal_words else tag for word, _, tag in parse])

            if re.search(pat, lemma_tags) or re.search(pat, origin_tags):
                matched_ranks.append(level_table[level])

        if matched_ranks:
            top_rank = max(matched_ranks)
            recommendations.extend(
                entry for entry in group if top_rank < level_table[entry[1]])

    return recommendations

In [14]:
# Level labels mapped to integer ranks 1-6, in the order listed here.
level_table = {
    label: rank
    for rank, label in enumerate(("A1", "A2", "B1", "B2", "C1", "C2"), start=1)
}

In [15]:
# Load the patterns and the annotated example sentences. Both files are
# given as relative paths, and both dicts are keyed by integer number.
pat_dict  = read_pats('egp.pattern.txt')
sent_dict = read_sents('egp.train.txt')

In [17]:
%%time

if __name__ == '__main__':

    ### TEMP
    # Keep only patterns numbered up to 148, then drop every sentence entry
    # whose number no longer has a pattern. Keys are collected first because
    # a dict cannot be resized while it is being iterated.
    delete = [no for no in pat_dict if no > 148 ]
    for no in delete:
        del pat_dict[no]
    delete = [no for no in sent_dict if no not in pat_dict]
    for no in delete:
        del sent_dict[no]
    ###

    pat_groups = group_patterns(pat_dict, sent_dict)

    for no, entry in sent_dict.items():
        level, sents = entry['level'], entry['sents']

        # if no not in patterns_number: continue

        for origin_level, sent in sents:
            # Print the sentences that their own pattern fails to match.
            if not is_match(sent, pat_dict[no]):
                print(no, sent)
            recommend(sent, pat_groups)

12 The latter ĄV fat, ugly and sick ĄV blows his top when Ralph tells the others about the fat boy's nice name, as he wanted to keep it secret.
19 For further information, contact Joey Hung.
47 I love her because she is friendly.
51 Maria realised that being kind and trying to make other people happy is always the best way!
94 I'm sorry you can't find it.
143 [about a film] It came out just yesterday.
CPU times: user 50 s, sys: 1min 42s, total: 2min 32s
Wall time: 7.67 s


In [None]:
79  3.93 s
148 7.63 s


In [None]:
# Ad-hoc check: show the (text, lemma, tag) triples for the sentence that was
# printed as unmatched for pattern 143 in the run above.
parse_sent("It came out just yesterday.")
