In [None]:
from collections import defaultdict, Counter
from tqdm import tqdm

from utils.syntax import *
from utils.counts import *
# from utils.parse import parse

In [None]:
class DotDict(dict):
    """Dictionary whose keys can also be read as attributes (``d.key``).

    Only reading is supported through attribute syntax; items are still
    written with normal dict operations (``d['key'] = ...`` / ``update``).
    """

    def __getattr__(self, name):
        """Return ``self[name]``.

        Python calls this only after regular attribute lookup has failed, so
        real dict methods and attributes are never shadowed by keys.

        Raises:
            AttributeError: if ``name`` is not a key of the dictionary.
        """
        try:
            return self[name]
        except KeyError:
            # hasattr() and getattr(obj, name, default) detect a missing
            # attribute through AttributeError; a KeyError would escape them.
            raise AttributeError(name) from None
    
def construct(block):
    """Turn one tab-separated dependency block into a list of linked tokens.

    Every non-empty line of ``block`` must hold exactly eight tab-separated
    fields: index, token, lemma, dep, tag, pos, head, children. ``children``
    is a comma-separated list of positions and may be empty.

    Returns a list with one ``DotDict`` per line, in line order. ``head`` and
    ``children`` reference other entries of that same list by position, and
    ``doc`` is the list itself.
    """
    rows = [row for row in block.split('\n') if row]
    # Allocate every node up front so that head/children can point at nodes
    # that are only filled in by a later iteration.
    parsed = [DotDict() for _ in rows]

    for node, row in zip(parsed, rows):
        index, token, lemma, dep, tag, pos, head, children = row.split('\t')
        child_ids = [ch for ch in children.split(',') if ch]

        node.update(
            i=int(index),
            text=token,
            lemma_=lemma,
            dep_=dep,
            pos_=pos,
            tag_=tag,
            head=parsed[int(head)],
            children=[parsed[int(ch)] for ch in child_ids],
            doc=parsed,
        )

    return parsed

In [None]:
# block = '''0	we	-PRON-	nsubj	PRP	PRON	1	
# 1	want	want	ROOT	VBP	VERB	1	0,3
# 2	to	to	aux	TO	PART	3	
# 3	discuss	discuss	xcomp	VB	VERB	1	2,4
# 4	something	something	dobj	NN	NOUN	3	'''

# entry = construct(block)

In [None]:
import gzip

# Read the whole gzip-compressed corpus as UTF-8 text and split it on blank
# lines; each element of `contents` is one block to be passed to construct().
# The `with` block closes the file handle even if reading fails.
with gzip.open('../coca.spacy.dep.txt.gz', 'rt', encoding='utf8') as fs:
    contents = fs.read().split('\n\n')

In [None]:
# Accumulators filled by retrieve_dep, all keyed
# lemma -> dependency label -> pattern string:
#   patterns: how many times the pattern was seen
#   ngrams:   the n-gram strings seen for the pattern
#   sents:    (sentence, score) pairs in which the pattern was seen
patterns = defaultdict(lambda: defaultdict(Counter))
ngrams = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))
sents = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))
    
def retrieve_dep(entry):
    """Collect pattern, n-gram and sentence evidence for every verb in ``entry``.

    ``entry`` is a sequence of token objects exposing ``text``, ``tag_``,
    ``lemma_`` and ``dep_`` (for example the list returned by ``construct``).
    Results are accumulated in the module-level ``patterns``, ``ngrams`` and
    ``sents`` tables, keyed by lemma, dependency label and pattern string.

    Entries that are empty, longer than 30 tokens, or whose joined text
    contains '@@@' are skipped. Always returns ``None``.
    """
    # Cheap rejections first: an empty entry (e.g. the blank block left after
    # the corpus' final separator) has nothing to score, and over-long
    # sentences are skipped without building the sentence string.
    if not entry or len(entry) > 30:
        return

    sent = ' '.join(tk.text for tk in entry)
    if '@@@' in sent:
        return

    # score / POS / dep_to_ptns_ngrams are not defined in this notebook;
    # presumably they come from the utils star-imports — confirm there.
    sent_score = score(sent)

    for token in entry:
        # Only verb heads are collected. Adjective and noun heads were no-op
        # placeholders and are intentionally not handled yet.
        if token.tag_ not in POS['VERB']:
            continue

        ptn_tks, ngram_tks = dep_to_ptns_ngrams(token)
        ptn, ngram = ' '.join(ptn_tks), ' '.join(ngram_tks)

        patterns[token.lemma_][token.dep_][ptn] += 1
        ngrams[token.lemma_][token.dep_][ptn].append(ngram)
        sents[token.lemma_][token.dep_][ptn].append((sent, sent_score))

In [None]:
# Parse each corpus block into linked tokens and feed it to the pattern
# collector; tqdm only wraps the iterable to show progress.
for entry in tqdm(contents):
    tokens = construct(entry)
    retrieve_dep(tokens)

### Minimize Ngrams and Sentences
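* Patterns are filtered with `truncate_k(..., 1)`. The n-gram threshold noted here (count < 10) is not applied by the code below, which keeps every n-gram of a surviving pattern.
* Only the top 100 sentences per pattern are kept.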

In [None]:
# Reduced copies of patterns / ngrams / sents with the same
# word -> dependency label -> pattern nesting.
slim_patterns = defaultdict(lambda: defaultdict(Counter))
slim_ngrams = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))
slim_sents = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))

for word, deps in patterns.items():
    for dep, ptn_counts in deps.items():
        # truncate_k is defined outside this notebook; presumably it drops
        # patterns whose count does not exceed the threshold (1) — confirm.
        truncated_ptns = truncate_k(ptn_counts, 1)

        for ptn, cnt in truncated_ptns.items():
            slim_patterns[word][dep][ptn] = cnt

            # Every n-gram of a surviving pattern is kept.
            slim_ngrams[word][dep][ptn].extend(ngrams[word][dep][ptn])

            # Keep only the sentence text of the first 100 entries in
            # sort_dict order; the score is dropped.
            top_sents = sort_dict(sents[word][dep][ptn])[:100]
            slim_sents[word][dep][ptn].extend(s for s, _ in top_sents)

### Store in SQLite or JSON

In [None]:
import json, sqlite3

In [None]:
# with open('static/data/coca.patterns.slim.json', 'w', encoding='utf8') as ws:
#     json.dump({ 'patterns': slim_patterns, 'ngrams': slim_ngrams, 'sents': slim_sents }, ws)

In [None]:
# Rebuild the `rules` table from scratch: one row per (word, dep, ptn) holding
# the normalized pattern, its count, and the JSON-encoded n-gram and sentence
# lists.
conn = sqlite3.connect('static/data/rules.db')
try:
    # `with conn` commits the pending transaction when the block succeeds and
    # rolls it back if anything raises. It does not close the connection,
    # hence the surrounding try/finally.
    with conn:
        cursor = conn.cursor()

        cursor.execute('DROP TABLE IF EXISTS rules;')

        cursor.execute('''CREATE TABLE rules 
(word NCHAR, dep CHARACTER, ptn NCHAR, norm_ptn NCHAR, count INTEGER, ngrams TEXT, sentences TEXT, 
PRIMARY KEY(word, dep, ptn));''')

        for headword, deps in slim_patterns.items():
            for dep, ptn_counts in deps.items():
                for ptn, count in ptn_counts.items():
                    # Values are bound as parameters, never formatted into
                    # the SQL text.
                    cursor.execute("INSERT INTO rules VALUES (?, ?, ?, ?, ?, ?, ?);",
                                   (headword, dep, ptn, normalize(ptn), count,
                                    json.dumps(slim_ngrams[headword][dep][ptn]),
                                    json.dumps(slim_sents[headword][dep][ptn])))
finally:
    # Release the database file even when an insert fails part-way.
    conn.close()