# Extract Triples from Parsed Documents

### Imports

In [None]:
from multiprocessing import Pool
from pathlib import Path
import unicodedata
import shutil
import json
import sys
import os

from spacy.tokens import Doc, Span, DocBin, Token
import en_core_web_lg
import pandas as pd

sys.path.insert(0, '../')

import pathvecs.matchers as matchers

### Definitions

In [None]:
def clean_triples_df(df):
    """Drop triple rows whose terms carry no usable text or look like markup.

    Every check below is evaluated on each column of ``df``; a row is kept
    only when all of its columns pass. The checks run one after another, each
    on the rows that survived the previous one.

    Args:
        df: DataFrame whose columns all hold strings (the ``.str`` accessor
            is used on every column).

    Returns:
        The filtered DataFrame. Surviving rows keep their index labels.
    """

    def ascii_remainder(column):
        # What is left of each value once non-ASCII characters are dropped.
        # An empty remainder is falsy, so the row-wise ``all`` rejects values
        # that are empty or consist solely of non-ASCII characters.
        # NOTE(review): values that merely contain some non-ASCII characters
        # are kept -- confirm this is the intended rule.
        as_bytes = column.str.encode('ascii', errors='ignore')
        return as_bytes.str.decode('utf-8')

    def not_only_whitespace(column):
        # Reject values made up entirely of whitespace characters.
        return ~column.str.isspace()

    def without_newline(column):
        # Reject values that contain a line break.
        return ~column.str.contains('\n')

    def without_equals_sign(column):
        # Equals signs tend to be parts of wikipedia markdown / css elements.
        return ~column.str.contains('=')

    checks = (
        ascii_remainder,
        not_only_whitespace,
        without_newline,
        without_equals_sign,
    )
    for check in checks:
        row_passes = df.apply(check).all(axis=1)
        df = df[row_passes]

    return df

In [None]:
def normalize_triples_df(df):
    """Return a copy of ``df`` with spaces in ``src`` and ``dst`` turned into underscores.

    The input frame is left untouched. Assigning columns in place would
    mutate the caller's data and, when ``df`` is a filtered slice of another
    frame, trigger pandas' SettingWithCopyWarning.

    Args:
        df: DataFrame with string columns ``src`` and ``dst``. Any other
            columns are passed through unchanged.

    Returns:
        A new DataFrame with the same columns, column order and index.
    """
    # regex=False: the space is a literal character, not a pattern.
    return df.assign(
        src=df['src'].str.replace(' ', '_', regex=False),
        dst=df['dst'].str.replace(' ', '_', regex=False),
    )

In [None]:
def docs_to_triples(fp):
    """Extract (src, path, dst) triples from one serialized DocBin file and save them.

    Two kinds of triples are collected for every document in the file:

    * one per dependency arc ``head -> token``, skipping arcs whose label is
      in ``IGNORED_DEPS``;
    * those derived from the matches in ``doc._.triples``: ``prep_*`` edges
      are kept as they are, any other edge is treated as a verb and emitted
      as an ``nsubj`` / ``dobj`` pair.

    Tokens that have an ``antecedent`` extension value are replaced by that
    antecedent before their lemma is read.

    Relies on the module-level ``nlp``, ``pronouns_matcher`` and
    ``triple_matcher`` objects.

    Args:
        fp: Path of the ``.spacy`` DocBin file to process.

    Returns:
        A one-element list holding the path of the written parquet file.
    """

    IGNORED_DEPS = {'dep', 'det', 'punct', 'pobj', 'ROOT', 'prep', 'cc'}

    def resolve(token):
        # Substitute the antecedent for the token when one has been mapped.
        antecedent = token._.antecedent
        return token if antecedent is None else antecedent

    dep_triples = []

    loaded_doc_bin = DocBin().from_disk(str(fp))
    for doc in loaded_doc_bin.get_docs(nlp.vocab):
        doc = pronouns_matcher(doc)
        doc = triple_matcher(doc)

        for sent in doc.sents:
            for token in sent:

                # The label must come from the arc token -> token.head, i.e.
                # from the token itself. Reading it from the substituted
                # antecedent would attach the antecedent's own relation to a
                # different head onto this arc.
                dep = token.dep_
                if dep in IGNORED_DEPS:
                    continue

                dst = resolve(token)
                src = resolve(token.head)

                # edge cases
                if src == dst:
                    continue

                dep_triples.append((src.lemma_.lower(), dep, dst.lemma_.lower()))

        for triple in doc._.triples:

            src = resolve(triple.src)
            dst = resolve(triple.dst)

            # Dont create / add frames that are internal to an entity name
            if src == dst:
                continue

            src_text = src.lemma_.lower()
            dst_text = dst.lemma_.lower()

            if triple.edge.startswith('prep_'):
                dep_triples.append((src_text, triple.edge, dst_text))

            else:
                # Treat the edge as if it were an active transitive verb
                dep_triples.append((triple.edge, 'nsubj', src_text))
                dep_triples.append((triple.edge, 'dobj', dst_text))

    df = pd.DataFrame(dep_triples, columns=['src', 'path', 'dst'])
    df = clean_triples_df(df)
    df = normalize_triples_df(df)

    # df['forward_context'] = df['dst'] + '/' + df['dep']
    # df['reverse_context'] = df['src'] + '/' + df['dep'] + '-1'

    # Save triples
    outfp = str(fp).replace('parses', 'triples')
    outfp = outfp.replace('.spacy', '.df')
    # Input files may sit in sub-folders of the parse directory; make sure the
    # mirrored output folder exists (safe when several workers race on it).
    Path(outfp).parent.mkdir(parents=True, exist_ok=True)
    df.to_parquet(outfp)

    return [outfp]

### Config

In [None]:
# Root folder for every pipeline artifact: the `data` directory next to the
# parent of the current working directory
data_path = Path('../').resolve() / 'data'

# Sub-folder of data/parses that holds the input parses
dataset = 'wikipedia_20220101'

# Upper bound on the number of DocBin files to process (1000 articles / file)
N = 1000

# Names of the triple patterns handed to the triple matcher
triple_patterns = [
    'prep',
    'intransitive_verb_prep',
    'appos_noun_prep',
    'be_noun_prep',
    'poss_noun_appos',
    'poss_noun_prep',
]

### Load Model and Pipeline Extensions

In [None]:
# Load the large English spaCy model.
nlp = en_core_web_lg.load()
# Add the triple matcher to the pipeline, configured with
# use_patterns=triple_patterns, and keep the returned component so it can be
# called directly on documents.
# NOTE(review): the 'triple_matcher' and 'map_relative_pronouns' factories are
# presumably registered by importing pathvecs.matchers -- confirm.
triple_matcher = nlp.add_pipe('triple_matcher', config={'use_patterns': triple_patterns})
# Add the relative-pronoun mapping component (after the triple matcher in the
# pipeline) and keep a handle on it as well.
pronouns_matcher = nlp.add_pipe('map_relative_pronouns')

### Prepare Output Folder
**Warning: deletes the existing output folder for this dataset, including all triples written by previous runs!**

In [None]:
# Start from an empty output folder for this dataset: whatever a previous run
# left behind is removed first.
output_folder = data_path.joinpath('triples', dataset)
if output_folder.is_dir():
    shutil.rmtree(output_folder)
# parents=True also creates data/triples on a first run, where a plain
# os.mkdir would fail with FileNotFoundError.
output_folder.mkdir(parents=True)

### Run

In [None]:
# rglob yields paths in no particular order; sort them so that the first N
# files are the same subset on every run and every machine.
parse_files = sorted(data_path.joinpath('parses', dataset).rglob('*.spacy'))[:N]

In [None]:
# Print progress roughly every 10% of the files. With fewer than 10 files the
# integer division yields 0, which would make the modulo below raise
# ZeroDivisionError, so report after every file in that case.
report_every = max(1, len(parse_files) // 10)

with Pool(processes=4) as pool:

    num_done = 0
    # imap_unordered hands results back as soon as any worker finishes a file,
    # so the counter tracks completed files rather than their input order.
    for num_done, output in enumerate(
            pool.imap_unordered(docs_to_triples, parse_files), start=1):
        if num_done % report_every == 0:
            print('{}/{}'.format(num_done, len(parse_files)))