In [9]:
#!/usr/bin/env python
%load_ext autoreload
%autoreload 2

import spacy
from spacy.tokenizer import Tokenizer
from spacy.util import compile_prefix_regex, compile_infix_regex, compile_suffix_regex

def custom_tokenizer(nlp):
    """Build a spaCy ``Tokenizer`` for ``nlp`` with a custom infix pattern.

    Prefix and suffix handling reuse the pipeline's default rules
    (``nlp.Defaults.prefixes`` / ``nlp.Defaults.suffixes``); only the infix
    rule is replaced, by a character class of punctuation and quote
    characters. ``token_match`` is explicitly disabled.
    """
    import re

    # One character class: any single listed punctuation/quote char is an infix.
    infix_pattern = re.compile(r'''[.\,\?\:\;\...\‘\’\`\“\”\"\'~]''')

    tokenizer_options = {
        'prefix_search': compile_prefix_regex(nlp.Defaults.prefixes).search,
        'suffix_search': compile_suffix_regex(nlp.Defaults.suffixes).search,
        'infix_finditer': infix_pattern.finditer,
        'token_match': None,
    }
    return Tokenizer(nlp.vocab, **tokenizer_options)

# Load the 'en_core_web_lg' pipeline, then swap in the tokenizer built by
# custom_tokenizer() so the custom infix pattern is used.
nlp = spacy.load('en_core_web_lg') 
nlp.tokenizer = custom_tokenizer(nlp)


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [10]:
import gzip, json
from nltk.tokenize.treebank import TreebankWordDetokenizer

from collections import Counter, defaultdict
from utils.preprocess import normalize
from utils.grammar import generate_candidates, iterate_all_patterns, iterate_all_gets
from utils.vocabulary import level_vocab

In [11]:
class DotDict(dict):
    """A ``dict`` whose keys can also be read as attributes (``d.key`` is ``d['key']``)."""

    def __getattr__(self, name):
        """Return ``self[name]``; raise ``AttributeError`` when the key is absent.

        ``__getattr__`` is only consulted after normal attribute lookup fails,
        so regular ``dict`` methods are unaffected.
        """
        try:
            return self[name]
        except KeyError:
            # The attribute protocol expects AttributeError; letting KeyError
            # escape breaks hasattr() and getattr(obj, name, default).
            raise AttributeError(name) from None
    
    
def construct(block):
    """Rebuild a list of token records from one tab-separated parse block.

    The first non-empty line of ``block`` is treated as the sentence header
    (``# text = ...``) and skipped. Every following non-empty line must hold
    nine tab-separated fields: index, token, lemma, norm, pos, tag, dep,
    head index and a comma-separated list of child indices.

    Returns:
        A list of ``DotDict`` records, one per token line, exposing ``i``,
        ``text``, ``norm_``, ``lemma_``, ``dep_``, ``pos_``, ``tag_``,
        ``head``, ``children`` and ``doc`` (the list itself). An empty list
        is returned when the block has fewer than two non-empty lines.

    Raises:
        ValueError: a token line does not have exactly nine fields, or an
            index field is not an integer.
        IndexError: a head or child index points outside the sentence.
    """
    lines = [line for line in block.split('\n') if line]
    if len(lines) < 2:
        return []

    token_lines = lines[1:]  # drop the '# text = ...' header line

    # Create every record up front so head/children can reference records
    # that appear later in the sentence.
    parses = [DotDict() for _ in token_lines]

    for i, line in enumerate(token_lines):
        index, token, lemma, norm, pos, tag, dep, head, children = line.split('\t')
        children = children.strip()
        head = int(head)

        parses[i].update({
            'i': int(index),
            'text': token,
            'norm_': norm,
            'lemma_': lemma,
            'dep_': dep,
            'pos_': pos,
            'tag_': tag,
            # A negative head (-1) marks the root. Indexing with it would wrap
            # around to the last token, so the root points to itself instead,
            # which is how spaCy represents a root token's head.
            'head': parses[head] if head >= 0 else parses[i],
            'children': [parses[int(e)] for e in children.split(',')] if children else [],
            'doc': parses
        })

    return parses

In [13]:
# Read the whole gzipped parse dump into memory and split it on blank lines,
# giving one text block per entry. The context manager closes the file handle
# as soon as the text has been read.
# For an uncompressed dump use: open('bnc.parse.txt', 'r', encoding='utf8')
with gzip.open('bnc.parse.txt.gz', 'rt', encoding='utf8') as fs:
    contents = fs.read().split('\n\n')

In [14]:
# counters[no][key]  -> number of times `key` was seen for pattern `no`
# sentences[no][key] -> list of example sentences collected for that key
counters = defaultdict(Counter)
sentences = defaultdict(lambda: defaultdict(list))

In [16]:
total = len(contents)


def _save_progress():
    """Write the current `counters` and `sentences` tables to their JSON files."""
    with open('counters.json', 'w') as f:
        json.dump(counters, f)

    with open('sentences.json', 'w') as f:
        json.dump(sentences, f)


for i, entry in enumerate(contents):
    parse = construct(entry)

    # 1. generate possible sentences
    parses = generate_candidates(parse)

    # 2. find patterns for each candidate
    found = [get for candidate in parses for get in iterate_all_patterns(candidate)]

    # 3. remove duplicates (by equality), keeping first-seen order
    gets = []
    for get in found:
        if get not in gets:
            gets.append(get)

    for get in gets:
        key = get['match'] + '|' + get['ngram']
        counters[get['no']][key] += 1
        # Store the sentence with the matched tokens wrapped in <w>...</w>.
        sentences[get['no']][key].append(
            ' '.join('<w>' + tk.text + '</w>' if tk.i in get['indices'] else tk.text for tk in parse)
        )

    # Periodic checkpoint so an interrupted run keeps most of its results.
    if i % 10000 == 0:
        print('{} / {}'.format(i, total))
        _save_progress()

# Final save: without it, entries processed after the last checkpoint would
# never reach the JSON files.
print('{} / {}'.format(total, total))
_save_progress()

0 / 5190445


KeyboardInterrupt: 

In [None]:
# with open('counters.json', 'r') as f:
#     counters = json.load(f)

# with open('sentences.json', 'r') as f:
#     sentences = json.load(f)