In [8]:
#!/usr/bin/env python
%load_ext autoreload
%autoreload 2

import sys, os, re
from math import log
from itertools import product
from collections import defaultdict, Counter
from nltk.corpus import stopwords # might use spacy stopword

import spacy
from spacy.tokenizer import Tokenizer
from spacy.util import compile_prefix_regex, compile_infix_regex, compile_suffix_regex

def custom_tokenizer(nlp):
    """Build a spaCy ``Tokenizer`` with a custom infix rule.

    Prefix and suffix rules are compiled from ``nlp.Defaults``; only the infix
    rule is replaced, by a single character class of punctuation and quote
    characters.

    Args:
        nlp: a loaded spaCy pipeline; its ``vocab`` and ``Defaults`` are read.

    Returns:
        A ``Tokenizer`` bound to ``nlp.vocab`` with ``token_match`` disabled.
    """
    find_prefix = compile_prefix_regex(nlp.Defaults.prefixes).search
    find_suffix = compile_suffix_regex(nlp.Defaults.suffixes).search
    # Characters handed to the tokenizer as infix matches.
    iter_infixes = re.compile(r'''[.\,\?\:\;\...\‘\’\`\“\”\"\'~]''').finditer

    return Tokenizer(
        nlp.vocab,
        prefix_search=find_prefix,
        suffix_search=find_suffix,
        infix_finditer=iter_infixes,
        token_match=None,
    )

# Load the `en_core_web_lg` spaCy pipeline, then replace its tokenizer with the
# one built by custom_tokenizer so every later nlp(...) call uses the custom infix rule.
nlp = spacy.load('en_core_web_lg') 
nlp.tokenizer = custom_tokenizer(nlp)


In [9]:
from utils.config import *
from utils.io import *
from utils.preprocess import *

In [10]:
def group_patterns(pat_dict, sent_dict, levels=None):
    """Split the patterns into consecutive groups wherever the level drops.

    Patterns are visited in ``pat_dict`` iteration order. A pattern whose level
    ranks lower than the previous pattern's level opens a new group. Every
    pattern, including the one that opens a new group, is stored in exactly
    one group.

    Args:
        pat_dict: mapping of pattern number -> pattern object.
        sent_dict: mapping of pattern number -> entry with a ``'level'`` key.
        levels: optional mapping of level name -> comparable rank. When
            omitted, the module-level ``level_table`` is used.

    Returns:
        A list of groups, each a list of ``(no, level, pat)`` tuples. An empty
        ``pat_dict`` yields ``[[]]``.
    """
    if levels is None:
        levels = level_table

    prev_rank = 0
    pat_groups = [[]]

    for no, pat in pat_dict.items():
        level = sent_dict[no]['level']
        rank = levels[level]

        # A drop in level marks the start of the next group.
        if rank < prev_rank:
            pat_groups.append([])

        # Always record the pattern in the current (possibly just opened)
        # group; otherwise the pattern that triggers the split is lost.
        pat_groups[-1].append((no, level, pat))
        prev_rank = rank

    return pat_groups

In [11]:
# only used for testing
# No capture group: with one, `findall` returns '' for the punctuation
# alternative and the punctuation literals of the pattern would be lost.
re_token = re.compile(r'[a-z-]+|[,\.:;!\?]')
def is_match(sent, pat):
    """Check whether a sentence matches a tag/literal pattern.

    The sentence is parsed with ``nlp`` and rendered twice as a space-joined
    string: each token is written as its tag, unless its lemma (first
    rendering) or its surface text (second rendering) is one of the lowercase
    words or punctuation marks spelled out in ``pat.pattern``.

    Args:
        sent: sentence text.
        pat: compiled regular expression.

    Returns:
        The match object from the lemma rendering if it matches, otherwise the
        result of searching the surface-text rendering (``None`` if neither
        matches).
    """
    parse = nlp(sent)

    ### rule to catch
    # Literal words/punctuation that the pattern itself spells out.
    literals = re_token.findall(pat.pattern)
    lemma_tags = ' '.join(tk.lemma_ if tk.lemma_ in literals else tk.tag_ for tk in parse)
    origin_tags = ' '.join(tk.text if tk.text in literals else tk.tag_ for tk in parse)

    return pat.search(lemma_tags) or pat.search(origin_tags)
    

In [12]:
# return the index of start token and end token
def align(re_match, tags):
    """Convert a regex match on a space-joined tag string into token indices.

    Args:
        re_match: match object obtained by searching ``tags``.
        tags: the searched string, tokens joined by single spaces.

    Returns:
        ``(start, end)`` token indices with ``end`` exclusive. ``start`` is the
        token holding the first matched non-space character and ``end - 1`` the
        token holding the last one. An empty match maps to the single token at
        its position.
    """
    char_start, char_end = re_match.span()

    # Spaces at the edges of the match belong to no token; ignore them so they
    # neither shift the start nor widen the span.
    while char_start < char_end and tags[char_start] == ' ':
        char_start += 1
    while char_end > char_start and tags[char_end - 1] == ' ':
        char_end -= 1

    # Each space before an offset closes one token, so the number of spaces is
    # the index of the token containing that offset. This stays correct when
    # the match begins or ends in the middle of a token.
    first = tags.count(' ', 0, char_start)
    last = tags.count(' ', 0, char_end)

    return (first, last + 1)
    

re_token = re.compile('[a-z-]+|[,\.:;!\?]')
def match_pat(parse, pat):
    """Search a parsed sentence for a tag/literal pattern and locate the hit.

    The parse is rendered as two space-joined strings in which a token appears
    as its tag unless its lemma (first string) or surface text (second string)
    is a literal spelled out in ``pat.pattern``. The lemma rendering takes
    precedence when both match.

    Args:
        parse: iterable of tokens exposing ``tag_``, ``lemma_`` and ``text``.
        pat: compiled regular expression.

    Returns:
        ``(True, (start, end))`` with the token span reported by ``align`` on a
        match, otherwise ``(False, (0, 0))``.
    """
    literals = re_token.findall(pat.pattern)

    def render(attr):
        # Show the token's `attr` value when the pattern names it, else its tag.
        pieces = []
        for tk in parse:
            value = getattr(tk, attr)
            pieces.append(value if value in literals else tk.tag_)
        return ' '.join(pieces)

    # Lemma-based rendering first, surface-text rendering as the fallback.
    for rendered in (render('lemma_'), render('text')):
        found = pat.search(rendered)
        if found:
            start, end = align(found, rendered)
            return True, (start, end)

    return False, (0, 0)


def iterate_pats(sent, pat_groups, verbose=False):
    """Find, for each pattern group, the best pattern matched by a text.

    The text is parsed once with ``nlp`` and every pattern of every group is
    tried via ``match_pat``. Within a group the winner is the match with the
    highest level as ranked by ``level_table``; among equal levels the match
    covering the fewest tokens wins.

    Args:
        sent: text to analyse (passed to ``nlp`` as a whole).
        pat_groups: list of groups, each a list of ``(no, level, pat)`` tuples.
        verbose: when true, print every match found in a group (debug aid).

    Returns:
        Dict mapping group index -> ``(no, level, pattern_string, ngram)`` for
        each group with at least one match; groups without a match are absent.
    """
    parse = nlp(sent)

    ### rule to catch
    group_gets = {}
    for i, group in enumerate(pat_groups):
        gets = []
        for no, level, pat in group:
            # `matched` rather than `is_match`, so the helper of that name is not shadowed.
            matched, (start, end) = match_pat(parse, pat)
            if not matched:
                continue

            ngram = ' '.join(el.text for el in parse[start:end])
            gets.append((no, level, pat.pattern, ngram))

        if not gets:
            continue

        if verbose:
            print(gets)

        # Shortest ngram first; `max` keeps the first of equal-level entries,
        # so ties on level resolve to the shortest match.
        gets.sort(key=lambda el: len(el[3].split()))
        # Rank by level_table, as the grouping/recommendation code does,
        # instead of comparing the raw level strings.
        group_gets[i] = max(gets, key=lambda el: level_table[el[1]])

    return group_gets


def recommend_pats(group_gets, pat_groups):
    """List, per matched group, the patterns of a higher level than the match.

    Args:
        group_gets: dict of group index -> ``(no, level, pattern, ngram)``.
        pat_groups: list of groups, each a list of ``(no, level, pat)`` tuples.

    Returns:
        Dict of group index -> list of ``(no, level, pattern_string)`` for the
        group members whose ``level_table`` rank exceeds the matched level.
    """
    group_recs = {}
    for i, (no, level, pat, ngram) in group_gets.items():
        # Keep only the members ranked strictly above the level already matched.
        group_recs[i] = [
            (el[0], el[1], el[2].pattern)
            for el in pat_groups[i]
            if level_table[level] < level_table[el[1]]
        ]

    return group_recs

In [13]:
# Load the patterns and the example sentences; both dicts are indexed by the
# same pattern number (presumably the pattern's id in the source files — confirm).
pat_dict  = read_pats('egp.regex.pattern.txt')
sent_dict = read_sents('egp.train.txt')

### TEMP
# Temporarily keep only pattern numbers <= 149. Keys are collected into a list
# first because a dict cannot be resized while it is being iterated.
delete = [no for no in pat_dict if no > 149 ]
for no in delete: del pat_dict[no]
# Drop the sentence entries that no longer have a pattern.
delete = [no for no in sent_dict if no not in pat_dict]
for no in delete: del sent_dict[no]
###

# Group the remaining patterns; main() and iterate_pats() read this global.
pat_groups = group_patterns(pat_dict, sent_dict)

In [None]:
def main(content):
    """Profile every sentence of a text against the pattern groups.

    The text is normalized, split into sentences by ``nlp``, and each sentence
    is matched against the global ``pat_groups``.

    Args:
        content: raw input text.

    Returns:
        List of ``(sentence_text, group_gets, group_recs)`` tuples, one per
        sentence that matched at least one group; ``group_recs`` holds the
        recommendations from the same groups.
    """
    doc = nlp(normalize(content))

    sent_profiles = []
    for span in doc.sents:
        text = span.text

        # match patterns in groups
        matched = iterate_pats(text, pat_groups)
        if matched:
            # recommend patterns from the same groups
            recommended = recommend_pats(matched, pat_groups)
            sent_profiles.append((text, matched, recommended))

    return sent_profiles

In [None]:
# Smoke test: run the group matcher on a short two-sentence text. iterate_pats
# parses the string as a whole (no sentence splitting); the cell shows the returned dict.
iterate_pats("There have been so many embarrassing moments in my life. It's very difficult to pick the most embarrassing.", pat_groups)


In [None]:
%%time

if __name__ == '__main__':
    # Regression check: every example sentence should match the pattern it is
    # filed under; print the (number, pattern, sentence) of each one that does not.
    for no, entry in sent_dict.items():
        level = entry['level']
        sents = entry['sents']

        # Optional filter, currently disabled:
        #   if no not in patterns_number: continue

        for origin_level, sent in sents:
            if not is_match(sent, pat_dict[no]):
                print(no, pat_dict[no].pattern, sent)

            # Full pipeline, currently disabled — to inspect it per sentence:
            #   print(sent)
            #   group_gets = iterate_pats(sent, pat_groups)          # match patterns in groups
            #   print(group_gets)
            #   group_recs = recommend_pats(group_gets, pat_groups)  # recommend patterns in same group
            #   print(group_recs)

In [None]:
# Ad-hoc check of match_pat on one parsed sentence with an anchored pattern.
# NOTE(review): parse_sent is not defined in this notebook — presumably it comes
# from one of the utils star imports; confirm it exists on a fresh kernel.
match_pat(parse_sent("Who cares?"), re.compile('^who VB.? \?$'))


In [77]:
from utils.explacy import *

# Print the parse table (dependency tree, dep type, lemma, norm, POS, tag) for one sentence.
# NOTE(review): a star import binds a module's public names, not the module name
# itself — confirm `explacy` is actually in scope after Restart & Run All.
explacy.print_parse_info(nlp, "The house is beautiful, and so much bigger than the previous one.")

Dep tree       Token     Dep type Lemma     Norm      Part of Sp tag
────────────── ───────── ──────── ───────── ───────── ────────── ───
           ┌─► The       det      the       The       DET        DT 
        ┌─►└── house     nsubj    house     house     NOUN       NN 
┌┬──────┴┬┬┬── is        ROOT     be        is        VERB       VBZ
││       ││└─► beautiful acomp    beautiful beautiful ADJ        JJ 
││       │└──► ,         punct    ,         ,         PUNCT      ,  
││       └───► and       cc       and       and       CCONJ      CC 
││         ┌─► so        advmod   so        so        ADV        RB 
││      ┌─►└── much      advmod   much      much      ADV        RB 
│└─►┌───┴───── bigger    conj     big       bigger    ADJ        JJR
│   └─►┌────── than      prep     than      than      ADP        IN 
│      │  ┌──► the       det      the       the       DET        DT 
│      │  │┌─► previous  amod     previous  previous  ADJ        JJ 
│      └─►└┴── one       pobj     

In [65]:
# Dump per-token attributes for one sentence: surface text, lemma, norm,
# the token's ancestors in the dependency tree, and its n_lefts count.
for a in nlp("It was the biggest TV in the bazaar, with its huge, black screen."):
    print(a.text, a.lemma_, a.norm_, list(a.ancestors), a.n_lefts)

It -PRON- it [was] 0
was be was [] 1
the the the [TV, was] 0
biggest big biggest [TV, was] 0
TV tv TV [was] 2
in in in [TV, was] 0
the the the [bazaar, in, TV, was] 0
bazaar bazaar bazaar [in, TV, was] 1
, , , [TV, was] 0
with with with [TV, was] 0
its -PRON- its [screen, with, TV, was] 0
huge huge huge [screen, with, TV, was] 0
, , , [screen, with, TV, was] 0
black black black [screen, with, TV, was] 0
screen screen screen [with, TV, was] 4
. . . [was] 0
