In [1]:
import pandas as pd

#train_df = pd.read_csv('data/classifier_input_train.csv', index_col=0)
train_df = pd.read_csv('data/classifier_input_train3.csv', index_col=0)
#val_df = pd.read_csv('data/classifier_input_val.csv', index_col=0)
val_df = pd.read_csv('data/classifier_input_val3.csv', index_col=0)

## Create pipeline components to use in SpaCy NLP pipeline

In [2]:
import spacy
from spacy.matcher import Matcher, PhraseMatcher
from spacy.tokens import Doc, Span, Token

Class to tag employee nouns as entities

In [3]:
def get_case_combos(str_list, fast=False):
    """Return a list with original, lower, upper, and title case."""
    
    if not fast: # Preserve some rational ordering
        case_combos = [s.lower() for s in str_list] + [s.upper() for s in str_list] 
        case_combos = case_combos + [s.title() for s in str_list if s.title() not in case_combos] 
        case_combos = case_combos + [s for s in str_list if s not in case_combos]
        return case_combos
    
    case_combos = str_list + [s.lower() for s in str_list] + [s.upper() for s in str_list] + [s.title() for s in str_list]
    return list(set(case_combos))

# Templated from: https://spacy.io/usage/processing-pipelines#custom-components 
class EmpNounRecognizer(object):
    """A spaCy v2.0 pipeline component that sets entity annotations
    based on list of terms. Terms are labelled as EMP_NOUN. Additionally,
    ._.has_emp_noun and ._.is_emp_noun is set on the Doc/Span and Token
    respectively."""
    name = 'employee_nouns'  # component name, will show up in the pipeline

    def __init__(self, nlp, terms=tuple(), label='EMP_NOUN'):
        """Initialise the pipeline component. The shared nlp instance is used
        to initialise the matcher with the shared vocab, get the label ID and
        generate Doc objects as phrase match patterns.
        """
        self.label = nlp.vocab.strings[label]  # get entity label ID

        # Set up the PhraseMatcher – it can now take Doc objects as patterns,
        # so even if the list of terms is long, it's very efficient
        patterns = [nlp(term) for term in terms]
        self.matcher = PhraseMatcher(nlp.vocab)
        self.matcher.add('EMP_NOUN', None, *patterns)

        # Register attribute on the Token. We'll be overwriting this based on
        # the matches, so we're only setting a default value, not a getter.
        Token.set_extension('is_emp_noun', default=False, force=True)

        # Register attributes on Doc and Span via a getter that checks if one of
        # the contained tokens is set to is_emp_noun == True.
        Doc.set_extension('has_emp_noun', getter=self.has_emp_noun, force=True)
        Span.set_extension('has_emp_noun', getter=self.has_emp_noun, force=True)

    def __call__(self, doc):
        """Apply the pipeline component on a Doc object and modify it if matches
        are found. Return the Doc, so it can be processed by the next component
        in the pipeline, if available.
        """
        matches = self.matcher(doc)
        spans = []  # keep the spans for later so we can merge them afterwards
        for _, start, end in matches:
            # Generate Span representing the entity & set label
            entity = Span(doc, start, end, label=self.label)
            spans.append(entity)
            # Set custom attribute on each token of the entity
            for token in entity:
                token._.set('is_emp_noun', True)
            # Overwrite doc.ents and add entity – be careful not to replace!
            doc.ents = list(doc.ents) + [entity]
        for span in spans:
            # Iterate over all spans and merge them into one token. This is done
            # after setting the entities – otherwise, it would cause mismatched
            # indices!
            span.merge()
        return doc  # don't forget to return the Doc!

    def has_emp_noun(self, tokens):
        """Getter for Doc and Span attributes. Returns True if one of the tokens
        is an employee noun. Since the getter is only called when we access the
        attribute, we can refer to the Token's 'is_emp_noun' attribute here,
        which is already set in the processing step."""
        return any([t._.get('is_emp_noun') for t in tokens])

emp_terms_list = ["associates", "employees", "equivalents", "FTEs", "FTE's", "headcount", "individuals", 
                  "people", "persons", "team members", "workers", "workforce"]
emp_terms_list = get_case_combos(emp_terms_list)

Create entities that mark employee types (e.g., full-time, temporary, etc.)

In [4]:
class EmpTypeRecognizer(object):
    """A spaCy v2.0 pipeline component that sets entity annotations
    based on list of terms. Terms are labelled as EMP_TYPE. Additionally,
    ._.has_emp_type and ._.is_emp_type is set on the Doc/Span and Token
    respectively."""
    name = 'employee_types'  # component name, will show up in the pipeline
    
    def __init__(self, nlp, terms_dict, label='EMP_TYPE'):
        """Initialise the pipeline component. The shared nlp instance is used
        to initialise the matcher with the shared vocab, get the label ID and
        generate Doc objects as phrase match patterns.
        """
        self.label = nlp.vocab.strings[label]  # get entity label ID

        # Set up the PhraseMatcher – it can now take Doc objects as patterns,
        # so even if the list of terms is long, it's very efficient
        self.matcher = PhraseMatcher(nlp.vocab)
        for match_label in terms_dict.keys():
            patterns = [nlp(term) for term in terms_dict[match_label]]
            #patterns = [nlp(term) for term in terms]
            
            self.matcher.add(match_label, None, *patterns)

        # Register attribute on the Token. We'll be overwriting this based on
        # the matches, so we're only setting a default value, not a getter.
        Token.set_extension('is_emp_type', default=False, force=True)
        Token.set_extension('is_part_time', default=False, force=True)
        Token.set_extension('is_full_time', default=False, force=True)

        # Register attributes on Doc and Span via a getter that checks if one of
        # the contained tokens is set to is_emp_type == True.
        Doc.set_extension('has_emp_type', getter=self.has_emp_type, force=True)
        Span.set_extension('has_emp_type', getter=self.has_emp_type, force=True)
        Doc.set_extension('has_part_time', getter=self.has_part_time, force=True)
        Span.set_extension('has_part_time', getter=self.has_part_time, force=True)
        Doc.set_extension('has_full_time', getter=self.has_full_time, force=True)
        Span.set_extension('has_full_time', getter=self.has_full_time, force=True)

    def __call__(self, doc):
        """Apply the pipeline component on a Doc object and modify it if matches
        are found. Return the Doc, so it can be processed by the next component
        in the pipeline, if available.
        """
        matches = self.matcher(doc)
        spans = []  # keep the spans for later so we can merge them afterwards
        for match_id, start, end in matches:
            # Generate Span representing the entity & set label
            entity = Span(doc, start, end, label=match_id)
            spans.append(entity)
            # Set custom attribute on each token of the entity
            for token in entity:
                token._.set('is_emp_type', True)
                if doc.vocab.strings[match_id] == 'PART_TIME':
                    token._.set('is_part_time', True)
                elif doc.vocab.strings[match_id] == 'FULL_TIME':
                    token._.set('is_full_time', True)
                    
            # Overwrite doc.ents and add entity – be careful not to replace!
            doc.ents = list(doc.ents) + [entity]
        for span in spans:
            # Iterate over all spans and merge them into one token. This is done
            # after setting the entities – otherwise, it would cause mismatched
            # indices!
            span.merge()
        return doc  # don't forget to return the Doc!

    def has_emp_type(self, tokens):
        """Getter for Doc and Span attributes. Returns True if one of the tokens
        is an employee type. Since the getter is only called when we access the
        attribute, we can refer to the Token's 'is_emp_type' attribute here,
        which is already set in the processing step."""
        return any([t._.get('is_emp_type') for t in tokens])
    
    def has_part_time(self, tokens):
        """Getter for Doc and Span attributes. Returns True if one of the tokens
        is indicates part time. Since the getter is only called when we access the
        attribute, we can refer to the Token's 'is_part_time' attribute here,
        which is already set in the processing step."""
        return any([t._.get('is_part_time') for t in tokens])
    
    def has_full_time(self, tokens):
        """Getter for Doc and Span attributes. Returns True if one of the tokens
        is indicates full time. Since the getter is only called when we access the
        attribute, we can refer to the Token's 'is_full_time' attribute here,
        which is already set in the processing step."""
        return any([t._.get('is_full_time') for t in tokens])


part_time_terms = get_case_combos(["half-time", "half time", "part-time", "part time"])
full_time_terms = get_case_combos(["full-time", "full time", "40-hour equivalent", "40 hour equivalent", "full-time equivalent", "full time equivalent"])
emp_type_dict = {'PART_TIME': part_time_terms, 
                'FULL_TIME': full_time_terms}

In [5]:
class NumberWordRecognizer(object):
    """A spaCy v2.0 pipeline component that sets entity annotations
    based on list of terms. Terms are labelled as NUM_WORD. Additionally,
    ._.has_num_word and ._.is_num_word is set on the Doc/Span and Token
    respectively."""
    name = 'number_words'  # component name, will show up in the pipeline

    def __init__(self, nlp, terms=tuple(), label='NUM_WORD'):
        """Initialise the pipeline component. The shared nlp instance is used
        to initialise the matcher with the shared vocab, get the label ID and
        generate Doc objects as phrase match patterns.
        """
        self.label = nlp.vocab.strings[label]  # get entity label ID

        # Set up the PhraseMatcher – it can now take Doc objects as patterns,
        # so even if the list of terms is long, it's very efficient
        patterns = [nlp(term) for term in terms]
        self.matcher = PhraseMatcher(nlp.vocab)
        self.matcher.add('NUM_WORD', None, *patterns)

        # Register attribute on the Token. We'll be overwriting this based on
        # the matches, so we're only setting a default value, not a getter.
        Token.set_extension('is_num_word', default=False, force=True)

        # Register attributes on Doc and Span via a getter that checks if one of
        # the contained tokens is set to is_num_word == True.
        Doc.set_extension('has_num_word', getter=self.has_num_word, force=True)
        Span.set_extension('has_num_word', getter=self.has_num_word, force=True)

    def __call__(self, doc):
        """Apply the pipeline component on a Doc object and modify it if matches
        are found. Return the Doc, so it can be processed by the next component
        in the pipeline, if available.
        """
        
        matches = self.matcher(doc)
        spans = []  # collect the matched spans here

        for _, start, end in matches:
            spans.append(doc[start:end])
            # Set custom attribute on each token of the entity
        for span in spans:
            span.merge()
            for token in span:
                token._.set('is_num_word', True)
        return doc  

    def has_num_word(self, tokens):
        """Getter for Doc and Span attributes. Returns True if one of the tokens
        is a nubmer word. Since the getter is only called when we access the
        attribute, we can refer to the Token's 'is_num_word' attribute here,
        which is already set in the processing step."""
        return any([t._.get('is_num_word') for t in tokens])

singles_word_list = ["one", "two", "three", "four", "five", "six", "seven", "eight", "nine"]
teens_word_list = ["ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen", "sixteen", "seventeen", "eighteen", "nineteen"]
tens_word_list = ["twenty", "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety"]
magnitude_word_list = ["hundred", "thousand", "million", "billion"]

teen_unit_combos = [x.join([y,z])  for x in [" ", "-"] for y in get_case_combos(tens_word_list) for z in get_case_combos(singles_word_list) ]
number_word_list = get_case_combos(singles_word_list) + get_case_combos(teens_word_list) + get_case_combos(tens_word_list) + get_case_combos(magnitude_word_list) + teen_unit_combos

len(number_word_list)

1389

YearMatcher: Add custom `_.is_year` attribute to year tokens that are part of Date entities

In [6]:
class YearMatcher(object):
    name = 'year_matcher'
    
    def __init__(self, nlp, pattern_list, match_id='Year'):
        # register a new token extension to flag year tokens
        Token.set_extension('is_year', default=False, force=True)
        self.matcher = Matcher(nlp.vocab)
        self.matcher.add(match_id, None, pattern_list)

    def __call__(self, doc):
        matches = self.matcher(doc)
        spans = []  # collect the matched spans here
        for match_id, start, end in matches:
            spans.append(doc[start:end])
        for span in spans:
            span.merge()   # merge
            for token in span:
                token._.is_year = True  # mark token as a year
        return doc

year_patterns = [{'ENT_TYPE': 'DATE', 'TAG' : 'CD', 'SHAPE' : 'dddd'}]

In [7]:
nlp = spacy.load('en_core_web_lg')

emp_noun_recognizer = EmpNounRecognizer(nlp, emp_terms_list)
nlp.add_pipe(emp_noun_recognizer, last=True) 

emp_type_recognizer = EmpTypeRecognizer(nlp, emp_type_dict)
nlp.add_pipe(emp_type_recognizer, last=True) 

number_word_recognizer = NumberWordRecognizer(nlp, number_word_list)
nlp.add_pipe(number_word_recognizer, last=True) 

year_matcher = YearMatcher(nlp, year_patterns)
nlp.add_pipe(year_matcher, last=True) 

print('Pipeline', nlp.pipe_names) 

Pipeline ['tagger', 'parser', 'ner', 'employee_nouns', 'employee_types', 'number_words', 'year_matcher']


## Code for relationship extraction and helper functions

### Function: extract_emp_relations

In [164]:
from collections import namedtuple

def find_root_tok(tok):
    """Return (tok's root node, num steps to reach root)."""
    
    steps = 0 
    root_tok = tok
    while root_tok.dep_ != 'ROOT':
        steps +=1 
        root_tok = root_tok.head
    return (root_tok, steps)

def find_verb_tok(tok):
    """Return first verb ancestor of tok."""
    verb_tok = 0
    for a in tok.ancestors:
        if a.pos_ == 'VERB' and a.dep_ in ['ROOT', 'ccomp']:
            return a
    return verb_tok
    
def find_tok_side_of_root(tok, root_tok):
    """Return 'right' or 'left' if tok is in subtree of root."""
    
    for a in [tok] + list(tok.ancestors): # The ancestors of a token will either be in root.rights or root.lefts
        if a in root_tok.lefts:
            return 'left'
        elif a in root_tok.rights:
            return 'right'
    else:
        return None

def find_subject(root_tok, verbose=False):
    """Return list of nominal subject"""
    subjects = [w for w in root_tok.lefts if w.dep_ == 'nsubj']
    try:
#        for i, s in enumerate(subjects):
#            print("nsubj "+str(i)+" of " + str(root_tok) + " is : " + str(s))
        return subjects[0]
    except:
        if verbose == True:
            print("No nsubj found left of ROOT. Noun phrases left of root are:")
            print([x for x in list(root_tok.doc.noun_chunk)])
        return False
        
def get_org_span(tok):
    """Return the entity span if token has ORG ent_type_."""
    if tok.ent_type_ == 'ORG':
        subject = [e for e in tok.doc.ents if tok in e][0]
        return subject
    return tok

def check_emp_type_flags(tok):
    """Return 'Part-Time' or 'Full-Time' if corresponding flags
    are set to True."""
    
    if tok._.is_part_time == True:
        return 'Part-Time Employees'
    if tok._.is_full_time == True:
        return 'Full-Time Employees'
    return 'Other Employees'

def find_emp_type_tok(tok, verbose=False):
    """Return child token left of tok if emp_type flagged or ADJ."""
    
    flagged_toks = [t for t in tok.children if t._.is_emp_type == True]
    if flagged_toks:
        type_conjs = [t for t in list(flagged_toks[0].conjuncts) if t._.is_emp_type == True]
        if type_conjs:
            if verbose == True:
                print("type_conjs: ", type_conjs)
                print("type_conjs: ", type_conjs)
        if verbose == True:
            print("Flagged_toks: ", flagged_toks)
        return flagged_toks[0]
    
    candidate_tok = tok.doc[tok.i - 1]  
    while candidate_tok.is_punct == True:
        candidate_tok = tok.doc[candidate_tok.i - 1]
    if candidate_tok.head == tok:
        if candidate_tok.pos_ == 'ADJ' or candidate_tok.dep_ == 'compound':
            return candidate_tok
        if verbose == True:
            print("Candidate tok: ", candidate_tok)
            print("Candidate tok.pos_:  ", candidate_tok.pos_)
            print("Candidate tok.dep_:  ", candidate_tok.dep_)
    if verbose == True:
            print("No toks, returning 0.")
    return 0

def get_nummod_tok(tok, years, verbose=False):
    """Return tok.children that are nummod and card entities."""
    
    num_toks = [c for c in tok.children if c.dep_ == 'nummod' and c.ent_type_ == 'CARDINAL']
    if num_toks:
        if verbose == True:
            print("Num_toks are: " + str(num_toks))
        num_tok = num_toks[0]
        num_tok_conj = [c for c in num_tok.children if c.dep_=='conj' and c.tag_ == 'CD']
        
        if num_tok_conj:
            if verbose == True:
                print("num_tok has conjugate children:" + str(num_tok_conj))
                print("num_tok subtree is :" + str(list(num_tok.subtree)))
            cards = [(c.i, c) for c in num_tok.subtree if c.tag_== 'CD' and c.ent_type_ == 'CARDINAL']
            
            if len(years) == len(cards):
                order_indices = [years.index(y) for y in sorted(years, reverse=True, key = lambda x: x[1])]
                #year_emps = [(years[i][1].text, cards[i][1].text) for i in order_indices]
                #num_tok = max(year_emps)[1]
                year_emps = sorted([(years[i][1], cards[i][1]) for i in order_indices], reverse=True, key = lambda x: x[0].text)
                num_tok = year_emps[0][1]
                if verbose == True:
                    print("years: " + str(years))
                    print("cards: " + str(cards))
                    print("order_indices: " + str(order_indices))
                    print("year_emps: " + str(year_emps))
        return num_tok
    
    return 0    

def extract_emp_relations(doc, verb_list=False, verbose=False):
    """Return tuple of extracted relations."""
    if not verb_list:
        verb_list = ['be', 'employ', 'have']
    
    relation_tuples = []
    
    tuple_field_names = ["sent_num", "word_num", "subject", "verb", 
                         "quantity", "quantity_type", "type_token" , "word", "word_dep",  "depth", "sentence"]
    RelationDetails = namedtuple('RelationDetails', tuple_field_names)
    
    for sent_id, sent in enumerate(doc.sents):
        
        # Find the root token
        root_tok, depth = find_root_tok(sent[0])
        
        match_pairs = []
        num_tok, num_tok_conj, subject, year_conj = (False, False, False, False)
        years = [(y.i, y) for y in root_tok.subtree if y._.is_year == True] # Need to change to root.subtree to only return the word's sentence

        for word_id, word in enumerate(filter(lambda w: w.ent_type_ == 'EMP_NOUN', sent)):  
            
            if verbose == True:
                print("Word_id is : " + str(word_id))
                print("Word is : " + str(word))
            
            # Find first verb ancestor 
            verb_tok = find_verb_tok(word)
            if not verb_tok:
                continue
            
            # If verb does not have expected lemma, move to next sentence
            if root_tok.lemma_ not in verb_list:
                root_tok = verb_tok
                if verbose == True:
                        print("Root token lemma not one of ['be', 'employ', 'have']. ")
                        print("Root token, lemma are : " + str(root_tok) + " " + str(root_tok.lemma_))
                        print(list(root_tok.subtree))
                
                if verb_tok.lemma_ not in verb_list:
                    if verbose == True:
                        print("verb token lemma not one of ['be', 'employ', 'have']. ")
                        print("verb token, lemma are : " + str(verb_tok) + " " + str(verb_tok.lemma_))
                        print(list(verb_tok.subtree))
                    continue

            emp_type_tok = find_emp_type_tok(word)
            emp_type = 'Other Employees'
            if emp_type_tok:
                emp_type = check_emp_type_flags(emp_type_tok)
            parts_found = []
            # Find out if the employee noun is in subject (left) or predicate (right)
            left_side = []; right_side = []
            emp_tok_side = find_tok_side_of_root(word, root_tok)
            if emp_tok_side == 'left':
                left_side.append(word)
            elif emp_tok_side == 'right':
                right_side.append(word)
            else:
                if verbose == True:
                    print("No ancestor of'" + str(word) + "' is in root.rights or root.lefts.")    

            if verbose == True:
                print("Dep_ of EMP_NOUN is: " + str(word.dep_))
            if word.dep_ in ('attr', 'dobj', 'compound') or word.dep_ == 'pobj' and word.head.dep_ == 'prep':
                num_tok = get_nummod_tok(word, years, verbose = verbose)      
                if num_tok:
                    match_pairs.append((num_tok, word))
                else:
                    cards = [e for e in word.doc.ents if e.label_ == 'CARDINAL' and e.root in root_tok.rights]
                    if cards:
                        cards = cards + [c for c in word.doc.ents if c.root in cards[0].root.subtree and c not in cards and c.label_ == 'CARDINAL']           
                        if word in left_side:    
                            if len(years) > 0:                       
                                emp_counts = [(c.start, c) for c in sorted(cards, reverse=False, key = lambda x: x.start)]                      
                                order_indices = [years.index(y) for y in sorted(years, reverse=True, key = lambda x: x[1])]
                                try: 
                                    #year_emps = [(years[i][1].text, emp_counts[i][1].text) for i in order_indices]
                                    year_emps = sorted([(years[i][1], emp_counts[i][1]) for i in order_indices], reverse=True, key = lambda x: x[0].text)
                                    if verbose == True:
                                        print("years: " + str(years))
                                        print("emp_counts: " + str(emp_counts))
                                        print("order_indices: " + str(order_indices))
                                        print("year_emps: " + str(year_emps))
                                    #num_tok = max(year_emps)[1]
                                    num_tok = year_emps[0][1]
                                except:
                                    print(str("==" * 20))
                                    print("Length of emp_counts is : " + str(len(emp_counts)) + 
                                         " while length of years is : " + str(len(years)))
                                    num_tok = cards[0]
                                    print(str("-" * 20))
                                    print(word.doc.text)
                                if verbose == True:
                                    print("Sentence has multiple years:" + str(years))
                                    print("First card subtree is :" + str(list(cards[0].subtree)))
                                    print("years: " + str(years))
                                    print("cards: " + str(cards))
                                    print("emp_counts: " + str(emp_counts))
                    #                print("order_indices: " + str(order_indices))                   
                                match_pairs.append((num_tok, word))
                        else:
                            if verbose == True:
                                print("Emp_tok is in right side; appending first card.")
                            match_pairs.append((cards[0], word))
                    elif verb_tok.dep_ == 'relcl':
                        cards = [e for e in word.doc.ents if e.label_ == 'CARDINAL' and e.root in verb_tok.lefts]
                        if cards:
                            match_pairs.append((cards[0], word))
                            num_tok = cards[0]
                if verbose == True:
                    print("Root is at "+str(depth)+" steps from "+str(word)+".")
                subject = find_subject(root_tok)
                if not subject: # For debugging
                    if verbose == True:
                        print("No nsubj found left of ROOT. Noun phrases left of root are:")
                        left_filter = lambda e: e.root in root_tok.lefts
                        print_df(make_span_df(doc, entities=False, span_filter_func=left_filter))
                else:
                    if subject == word.head.head: # If word is part of prep phrase of subject
                        subject = doc[subject.left_edge.i : subject.right_edge.i + 1]
                    else: 
                        subject = get_org_span(subject) # Use full span of ORG entity if subject tok is in ORG 
                    parts_found.append(subject)
                    match_pairs.append((subject, word))
                    #[print(str(p) + '  :  ' + str(p.dep_)) for p in subject.subtree]
                    sub_poss = [p for p in subject.subtree if p.dep_ == 'poss']
    #                if sub_poss:
    #                    sub_poss = sub_poss[0]
    #                    match_pairs.append((sub_poss, word))
                    if root_tok:
                        parts_found.append(root_tok)
                    if num_tok:
                        parts_found.append(num_tok)
                        parts_found.append(emp_type)
                        parts_found.append(emp_type_tok)
                        parts_found.append(word)
                    elif word.head.head.head.pos_ == 'VERB':
                        if verbose == True:
                            print("No num_tok. ")
                        cards = [c for c in word.head.head.head.rights if c.tag_== 'CD' and c.ent_type_ == 'CARDINAL']
                        years = [(y.i, y) for y in root_tok.subtree if y._.is_year == True]
                        match_pairs.append((years, cards))
                        if cards:
                            match_pairs.append((cards[0], word))

            elif word.dep_ == 'conj':
                num_tok = get_nummod_tok(word, years, verbose = verbose)
                head_num_tok = [w for w in [word.head] if w.tag_ == 'CD' and w.ent_type_ == 'CARDINAL']            
                if verbose == True:
                    print("Emp_noun token has dep_ == 'conj'.")
                    print("Child num_tok: " + str(num_tok))
                    print("Head num_tok: " + str(head_num_tok))
                if num_tok and head_num_tok:
                    if verbose == True:
                        print("child_num_tok and head_num_toks")                      
                    num_toks = [num_tok] + head_num_tok
                    years = [(y.i, y) for y in root_tok.subtree if y._.is_year == True]
                    if verbose == True:
                        print("years: " + str(years))   
                        print("num_toks: " + str(num_toks))   
                    if head_num_tok[0].dep_ == 'conj' or head_num_tok[0].head.ent_type_ == 'CARDINAL' :
                        num_toks = num_toks + [w for w in [head_num_tok[0].head] if w.tag_ == 'CD']
                    if len(years) > len(num_toks):
                        if doc[head_num_tok[0].i - 2].ent_type_ == 'CARDINAL':
                            num_toks = num_toks + [doc[head_num_tok[0].i - 2]] 
                        head_num_conjucts = [c for c in head_num_tok[0].conjuncts if c.tag_ == 'CD']
                        if head_num_conjucts:
                            if head_num_conjucts[0].ent_type_ != 'CARDINAL':
                                print(str("==" * 30))
                                print("Potential series token :" + str(head_num_conjucts[0]) + 
                                     " does not have CARDINAL entity type. ")
                                print("Entity type is: " + str(head_num_conjucts[0].ent_type_))
                                print("Token index: " + str(head_num_conjucts[0].i))
                                print("Doc is: " + str(word.doc))
                            num_toks = num_toks + head_num_conjucts
                    emp_counts = sorted([(c.i, c) for c in num_toks], key = lambda x: x[0])
                    if verbose == True:
                        print("emp_counts: " + str(emp_counts))
                    order_indices = [years.index(y) for y in sorted(years, reverse=True, key = lambda x: x[1])]
                    if verbose == True:
                        print("order_indices: " + str(order_indices))
                    try:
                        year_emps = sorted([(years[i][1], emp_counts[i][1]) for i in order_indices], reverse=True, key = lambda x: x[0].text)
                    except:
                        print(str("==" * 20))
                        print("Error on doc:")
                        print(str("-" * 20))
                        print(word.doc.text)
                        print(str("-" * 20))
                        print("Error sentence:")
                        print(sent)
                        
                    if verbose == True:
                        print("year_emps: " + str(year_emps))
                    #num_tok = max(year_emps)[1]
                    num_tok = year_emps[0][1]
                match_pairs.append((num_tok, word))
                
                if not subject:
                    try:
                        subject = get_org_span(find_subject(root_tok))
                    except:
                        continue
                parts_found.append(subject)
                parts_found.append(root_tok)
                parts_found.append(num_tok)
                parts_found.append(emp_type)
                parts_found.append(emp_type_tok)
                parts_found.append(word)
            
            else:
                continue
            
            if all([subject, root_tok, num_tok, emp_type ]):
                if verbose == True:
                    print(tuple(parts_found))
                details = [sent_id, word_id] + parts_found + [word.dep_, depth, sent.text]
                relation_tuples.append(RelationDetails(*details))        
    return relation_tuples

### Function: print_doc_info 

In [9]:
def make_tok_df(doc, tok_filter_func=False):
    """Return a dataframe showing attributes for each token in doc."""
    
    if tok_filter_func:
        toks = list(filter(tok_filter_func, doc))
    else:
        toks = doc
    doc_dict = {'tok_ent' : [tok.ent_type_ for tok in toks], 
        'toks' : [tok for tok in toks], 
        'lemma' : [tok.lemma_ for tok in toks], 
        'dep' : [tok.dep_ for tok in toks], 
        'head' : [tok.head for tok in toks], 
        'h_dep' : [tok.head.dep_ for tok in toks], 
        'dep_def' : [spacy.explain(tok.dep_) for tok in toks],
        'pos' : [tok.pos_ for tok in toks], 
        'tag' : [tok.tag_ for tok in toks], 
        'tag_def' : [spacy.explain(tok.tag_) for tok in toks], 
               }
    columns = [ 'tok_ent', 'toks', 'lemma', 'dep', 'head', 'h_dep', 'pos', 'tag',  'dep_def', 'tag_def' ]
    return pd.DataFrame(doc_dict, columns=columns)

def make_span_df(doc, entities=True, span_filter_func=False):
    """Return df showing attributes for each entity or noun chunk in doc."""
    
    columns = ['tok_i', 'entity', 'ent_label', 'root', 'root_ent', 
               'root_dep','dep_def', 'root_head',  'root_head_dep', 
               'root_head_pos'  ]
    if entities:
        target_spans = doc.ents
        df_name = 'doc_entities'
    else: 
        target_spans = list(doc.noun_chunks)
        df_name = 'doc_noun_chunks'
        
    if span_filter_func:
        target_spans = list(filter(span_filter_func, target_spans))
        
    doc_dict = {'tok_i' : [e.start for e in target_spans], 
        'entity' : [e.text for e in target_spans], 
        'ent_label' : [e.label_ for e in target_spans], 
        'root' : [e.root.text for e in target_spans], 
        'root_ent' : [e.root.ent_type_ for e in target_spans], 
        'root_dep' : [e.root.dep_ for e in target_spans], 
        'dep_def' : [spacy.explain(e.root.dep_) for e in target_spans],
        'root_head' : [e.root.head for e in target_spans], 
        'root_head_dep' : [e.root.head.dep_ for e in target_spans], 
        'root_head_pos' : [e.root.head.pos_ for e in target_spans]}
#    try :
    df = pd.DataFrame(doc_dict, columns=columns) 
    df.name = df_name
    if entities:
        df_cols = [x for x in df.columns.tolist() if x != 'root_ent']
        df = df[df_cols]
    else:
        df.columns = ['tok_i', 'noun_chunk', 'ent_label', 'root', 'root_ent', 'root_dep','dep_def',
                      'root_head',  'root_head_dep', 'root_head_pos' ]
        df_cols = [x for x in df.columns.tolist() if x != 'ent_label']
        df = df[df_cols]
#    except: 
#        df = [(k, doc_dict[k]) for k in doc_dict.keys()]
    return df

from IPython.display import display, HTML
def print_df(df, print_df_name=True):
    """Display rendered HTML version of df in Jupyter notebook"""
    
    if print_df_name and hasattr(df, 'name'):
        print("DataFrame is named " + str(df.name))
    display(HTML(df.to_html()))

def print_doc_info(doc):
    """Print dfs for doc entities, noun_chunks, and CARDINAL toks"""
    
    print("doc is: ")
    print(doc)
    print('-' * 50)
    print("Entities are: ")
    print_df(make_span_df(doc), print_df_name=False)
    print('-' * 50)
    print("Noun chunks are: ")
    print_df(make_span_df(doc, entities=False), print_df_name=False)
    print('-' * 50)
    print("Cardinal entities are: ")
    card_filter = lambda w: w.ent_type_ == 'CARDINAL'
    print_df(make_tok_df(doc, card_filter), print_df_name=False)

## Define sentence structure types

In [150]:
from spacy import displacy

### `Company_noun` in subject,  `Emp_num` and `EMP_NOUN` in predicate

#### [Company]  [`was`| `had`] `Emp_Num` `EMP_NOUN`

Examples: 


`"At December 31, 2016, Bio-Rad had approximately 8,250 employees."`

Desired output: 

`(Bio-Rad, had, 8,250, employees)`

In [15]:
doc1 = "At December 31, 2016, Bio-Rad had approximately 8,250 employees."
ex1 = nlp(doc1)

print_doc_info(ex1)

ex1_emp_tok = ex1[12]
print(ex1_emp_tok.dep_)
print([x for x in ex1_emp_tok.children])
print(list(list(ex1_emp_tok.head.lefts)[0].subtree))

doc is: 
At December 31, 2016, Bio-Rad had approximately 8,250 employees.
--------------------------------------------------
Entities are: 


Unnamed: 0,tok_i,entity,ent_label,root,root_dep,dep_def,root_head,root_head_dep,root_head_pos
0,1,"December 31, 2016",DATE,December,pobj,object of preposition,At,prep,ADP
1,6,Bio-Rad,ORG,Rad,nsubj,nominal subject,had,ROOT,VERB
2,10,"approximately 8,250",CARDINAL,8250,nummod,,employees,dobj,NOUN
3,12,employees,EMP_NOUN,employees,dobj,direct object,had,ROOT,VERB


--------------------------------------------------
Noun chunks are: 


Unnamed: 0,tok_i,noun_chunk,root,root_ent,root_dep,dep_def,root_head,root_head_dep,root_head_pos
0,1,December,December,DATE,pobj,object of preposition,At,prep,ADP
1,6,Bio-Rad,Rad,ORG,nsubj,nominal subject,had,ROOT,VERB
2,10,"approximately 8,250 employees",employees,EMP_NOUN,dobj,direct object,had,ROOT,VERB


--------------------------------------------------
Cardinal entities are: 


Unnamed: 0,tok_ent,toks,lemma,dep,head,h_dep,pos,tag,dep_def,tag_def
0,CARDINAL,approximately,approximately,advmod,8250,nummod,ADV,RB,adverbial modifier,adverb
1,CARDINAL,8250,8250,nummod,employees,dobj,NUM,CD,,cardinal number


dobj
[8,250]
[At, December, 31, ,, 2016]


In [15]:
displacy.render(ex1, style='ent', jupyter=True, options={'distance': 90})

In [None]:
displacy.render(ex1, style='dep', jupyter=True, options={'distance': 90})

In [17]:
print_df(make_tok_df(ex1))

Unnamed: 0,tok_ent,toks,lemma,dep,head,h_dep,pos,tag,dep_def,tag_def
0,,At,at,prep,had,ROOT,ADP,IN,prepositional modifier,"conjunction, subordinating or preposition"
1,DATE,December,december,pobj,At,prep,PROPN,NNP,object of preposition,"noun, proper singular"
2,DATE,31,31,nummod,December,pobj,NUM,CD,,cardinal number
3,DATE,",",",",punct,December,pobj,PUNCT,",",punctuation,"punctuation mark, comma"
4,DATE,2016,2016,nummod,December,pobj,NUM,CD,,cardinal number
5,,",",",",punct,had,ROOT,PUNCT,",",punctuation,"punctuation mark, comma"
6,ORG,Bio,bio,compound,Rad,nsubj,PROPN,NNP,,"noun, proper singular"
7,ORG,-,-,punct,Rad,nsubj,PUNCT,HYPH,punctuation,"punctuation mark, hyphen"
8,ORG,Rad,rad,nsubj,had,ROOT,PROPN,NNP,nominal subject,"noun, proper singular"
9,,had,have,ROOT,had,ROOT,VERB,VBD,,"verb, past tense"


In [23]:
#[w for w in ex1_emp_tok.head.lefts if w.dep_ == 'nsubj']
print('Token:  ' + str(ex1_emp_tok) )
for c in ex1_emp_tok.children:
    if c.dep_ == 'nummod':
        print("Token child: " + str(c) + "  has dep_ == 'nummod'")
        print("       Token.child.i : " + str(c.i))
        print('       child.dep_: ' + str(c.dep_))
        print("       child.tag_: '"+ str(c.tag_) + "'" )
        print("       child.pos_: '"+ str(c.pos_) + "'" )
        for gc in c.children:
            if gc.dep_ == 'conj':
                print('       Token GC dep_ is conj.')
                print('       Token GC is ' + str(gc))
                if gc.tag_ == 'CD':
                    print("       gc.tag_ == 'CD' ")
                    print("       gc.i : " + str(gc.i))
                print('       Token head is: ' + str(c) + '       head.dep_:  '+ str(c.dep_) )
                for i, d in enumerate(ex1_emp_tok.doc.ents):
                    if d.label_ == 'DATE':
                        print(str(i) + "  " + str(d))
                        for tok in ex1_emp_tok.doc[d.start:d.end + 1]:
                            if tok._.is_year == True:
                                print("       CD tok in date: " + str(tok))
                
if ex1_emp_tok.dep_ == 'conj':
    print('Token dep_ is conj.')
    if ex1_emp_tok.head.tag_ == 'CD':
        print("       token.head.tag_ == 'CD' ")
        print("       token.head.i : " + str(ex1_emp_tok.head.i))
    print('Token head is: ' + str(ex1_emp_tok.head) + '       head.dep_:  '+ str(ex1_emp_tok.head.dep_) )
    for i, d in enumerate(ex1_emp_tok.doc.ents):
        if d.label_ == 'DATE':
            print(str(i) + "  " + str(d))
            for tok in ex1_emp_tok.doc[d.start:d.end + 1]:
                if tok._.is_year == True:
                    print("CD tok in date: " + str(tok))
                    print("CD tok i: " + str(tok.i))
#    if ex1_emp_tok.head.dep_ == ''

print('Token dep_: ' + str(ex1_emp_tok.dep_))
print('Token head: ' + str(ex1_emp_tok.head))
print('       head.dep_: ' + str(ex1_emp_tok.head.dep_))
print("       head.tag_: '"+ str(ex1_emp_tok.head.tag_) + "'" )
print("       head.pos_: '"+ str(ex1_emp_tok.head.pos_) + "'" )

print('Token conjuncts: ' )
print(str([c for c in ex1_emp_tok.conjuncts]))
print('Token head lefts: ' )
for w in ex1_emp_tok.head.lefts:
    print(str(w) + '       head.left.dep_:' + str(w.dep_))
    if w.dep_ == 'nsubj':
        print(str(w) + ' is nsubj. Subtree is:' )
        [print(x) for x in w.lefts]
        [print(s)for s in w.subtree if s.dep_ == 'poss']
print('Token head rights: ' )
for w in ex1_emp_tok.head.rights:
    print(str(w) + '       rights.left.dep_:' + str(w.dep_))
    if w.dep_ == 'nsubj':
        print(str(w) + ' is nsubj. Subtree is:' )
        [print(x) for x in w.lefts]
        [print(s)for s in w.subtree if s.dep_ == 'poss']
print('Token ancestors: ')
for w in ex1_emp_tok.ancestors:
    print(str(w) + '       ancestor.dep_:' + str(w.dep_))
print('Token children: ')
for w in ex1_emp_tok.children:
    print(str(w) + '       child.dep_:' + str(w.dep_))

Token:  employees
Token child: 8,250  has dep_ == 'nummod'
       Token.child.i : 11
       child.dep_: nummod
       child.tag_: 'CD'
       child.pos_: 'NUM'
Token dep_: dobj
Token head: had
       head.dep_: ROOT
       head.tag_: 'VBD'
       head.pos_: 'VERB'
Token conjuncts: 
[]
Token head lefts: 
At       head.left.dep_:prep
,       head.left.dep_:punct
Rad       head.left.dep_:nsubj
Rad is nsubj. Subtree is:
Bio
-
Token head rights: 
employees       rights.left.dep_:dobj
.       rights.left.dep_:punct
Token ancestors: 
had       ancestor.dep_:ROOT
Token children: 
8,250       child.dep_:nummod


In [14]:
extract_emp_relations(ex1)

[RelationDetails(sent_num=0, word_num=0, subject=Bio-Rad, verb=had, quantity=8,250, quantity_type='Other Employees', type_token=0, word=employees, word_dep='dobj', depth=1, sentence='At December 31, 2016, Bio-Rad had approximately 8,250 employees.')]

#### `we` [`employed` | `have`] `Emp_num` `EMP_NOUN`

`"As of December 31, 2016, the Company had 455 employees, an increase of 17 employees from the prior year end."`

Desired output: 

`(Company, had, 455, employees)`

In [25]:
ex2 = nlp("As of December 31, 2016, the Company had 455 employees, an increase of 17 employees from the prior year end.")

print_df(make_tok_df(ex2))

In [27]:
print_doc_info(ex2)

doc is: 
As of December 31, 2016, the Company had 455 employees, an increase of 17 employees from the prior year end.
--------------------------------------------------
Entities are: 


Unnamed: 0,tok_i,entity,ent_label,root,root_dep,dep_def,root_head,root_head_dep,root_head_pos
0,2,"December 31, 2016",DATE,December,pobj,object of preposition,of,prep,ADP
1,10,455,CARDINAL,455,nummod,,employees,dobj,NOUN
2,11,employees,EMP_NOUN,employees,dobj,direct object,had,ROOT,VERB
3,16,17,CARDINAL,17,nummod,,employees,pobj,NOUN
4,17,employees,EMP_NOUN,employees,pobj,object of preposition,of,prep,ADP
5,19,the prior year end,DATE,end,pobj,object of preposition,from,prep,ADP


--------------------------------------------------
Noun chunks are: 


Unnamed: 0,tok_i,noun_chunk,root,root_ent,root_dep,dep_def,root_head,root_head_dep,root_head_pos
0,2,December,December,DATE,pobj,object of preposition,of,prep,ADP
1,7,the Company,Company,,nsubj,nominal subject,had,ROOT,VERB
2,10,455 employees,employees,EMP_NOUN,dobj,direct object,had,ROOT,VERB
3,13,an increase,increase,,appos,appositional modifier,employees,dobj,NOUN
4,16,17 employees,employees,EMP_NOUN,pobj,object of preposition,of,prep,ADP
5,19,the prior year end,end,DATE,pobj,object of preposition,from,prep,ADP


--------------------------------------------------
Cardinal entities are: 


Unnamed: 0,tok_ent,toks,lemma,dep,head,h_dep,pos,tag,dep_def,tag_def
0,CARDINAL,455,455,nummod,employees,dobj,NUM,CD,,cardinal number
1,CARDINAL,17,17,nummod,employees,pobj,NUM,CD,,cardinal number


In [28]:
ex2_emp_tok = ex2[11]

ex2_emp_tok_2 = ex2[17]
ex2_emp_tok_2

In [None]:
#displacy.render(ex2, style='dep', jupyter=True)

In [31]:
# Properties of the first emp_tok
print('Token:  ' + str(ex2_emp_tok) )
for c in ex2_emp_tok.children:
    if c.dep_ == 'nummod':
        print("Token child: " + str(c) + "  has dep_ == 'nummod'")
        print("       Token.child.i : " + str(c.i))
        print('       child.dep_: ' + str(c.dep_))
        print("       child.tag_: '"+ str(c.tag_) + "'" )
        print("       child.pos_: '"+ str(c.pos_) + "'" )
        for gc in c.children:
            if gc.dep_ == 'conj':
                print('       Token GC dep_ is conj.')
                print('       Token GC is ' + str(gc))
                if gc.tag_ == 'CD':
                    print("       gc.tag_ == 'CD' ")
                    print("       gc.i : " + str(gc.i))
                print('       Token head is: ' + str(c) + '       head.dep_:  '+ str(c.dep_) )
                for i, d in enumerate(ex2_emp_tok.doc.ents):
                    if d.label_ == 'DATE':
                        print(str(i) + "  " + str(d))
                        for tok in ex2_emp_tok.doc[d.start:d.end + 1]:
                            if tok._.is_year == True:
                                print("       CD tok in date: " + str(tok))
                
if ex2_emp_tok.dep_ == 'conj':
    print('Token dep_ is conj.')
    if ex2_emp_tok.head.tag_ == 'CD':
        print("       token.head.tag_ == 'CD' ")
        print("       token.head.i : " + str(ex2_emp_tok.head.i))
    print('Token head is: ' + str(ex2_emp_tok.head) + '       head.dep_:  '+ str(ex2_emp_tok.head.dep_) )
    for i, d in enumerate(ex2_emp_tok.doc.ents):
        if d.label_ == 'DATE':
            print(str(i) + "  " + str(d))
            for tok in ex2_emp_tok.doc[d.start:d.end + 1]:
                if tok._.is_year == True:
                    print("CD tok in date: " + str(tok))
                    print("CD tok i: " + str(tok.i))
#    if ex2_emp_tok.head.dep_ == ''

print('Token dep_: ' + str(ex2_emp_tok.dep_))
print('Token head: ' + str(ex2_emp_tok.head))
print('       head.dep_: ' + str(ex2_emp_tok.head.dep_))
print("       head.tag_: '"+ str(ex2_emp_tok.head.tag_) + "'" )
print("       head.pos_: '"+ str(ex2_emp_tok.head.pos_) + "'" )

print('Token conjuncts: ' )
print(str([c for c in ex2_emp_tok.conjuncts]))
print('Token head lefts: ' )
for w in ex2_emp_tok.head.lefts:
    print(str(w) + '       head.left.dep_:' + str(w.dep_))
    if w.dep_ == 'nsubj':
        print(str(w) + ' is nsubj. Subtree is:' )
        [print(x) for x in w.lefts]
        [print(s)for s in w.subtree if s.dep_ == 'poss']
print('Token head rights: ' )
for w in ex2_emp_tok.head.rights:
    print(str(w) + '       rights.left.dep_:' + str(w.dep_))
    if w.dep_ == 'nsubj':
        print(str(w) + ' is nsubj. Subtree is:' )
        [print(x) for x in w.lefts]
        [print(s)for s in w.subtree if s.dep_ == 'poss']
print('Token ancestors: ')
for w in ex2_emp_tok.ancestors:
    print(str(w) + '       ancestor.dep_:' + str(w.dep_))
print('Token children: ')
for w in ex2_emp_tok.children:
    print(str(w) + '       child.dep_:' + str(w.dep_))

Token:  employees
Token child: 455  has dep_ == 'nummod'
       Token.child.i : 10
       child.dep_: nummod
       child.tag_: 'CD'
       child.pos_: 'NUM'
Token dep_: dobj
Token head: had
       head.dep_: ROOT
       head.tag_: 'VBD'
       head.pos_: 'VERB'
Token conjuncts: 
[]
Token head lefts: 
As       head.left.dep_:prep
,       head.left.dep_:punct
Company       head.left.dep_:nsubj
Company is nsubj. Subtree is:
the
Token head rights: 
employees       rights.left.dep_:dobj
.       rights.left.dep_:punct
Token ancestors: 
had       ancestor.dep_:ROOT
Token children: 
455       child.dep_:nummod
,       child.dep_:punct
increase       child.dep_:appos


In [32]:
# Properties of the second emp_tok
print('Token:  ' + str(ex2_emp_tok_2) )
for c in ex2_emp_tok_2.children:
    if c.dep_ == 'nummod':
        print("Token child: " + str(c) + "  has dep_ == 'nummod'")
        print("       Token.child.i : " + str(c.i))
        print('       child.dep_: ' + str(c.dep_))
        print("       child.tag_: '"+ str(c.tag_) + "'" )
        print("       child.pos_: '"+ str(c.pos_) + "'" )
        for gc in c.children:
            if gc.dep_ == 'conj':
                print('       Token GC dep_ is conj.')
                print('       Token GC is ' + str(gc))
                if gc.tag_ == 'CD':
                    print("       gc.tag_ == 'CD' ")
                    print("       gc.i : " + str(gc.i))
                print('       Token head is: ' + str(c) + '       head.dep_:  '+ str(c.dep_) )
                for i, d in enumerate(ex2_emp_tok_2.doc.ents):
                    if d.label_ == 'DATE':
                        print(str(i) + "  " + str(d))
                        for tok in ex2_emp_tok_2.doc[d.start:d.end + 1]:
                            if tok._.is_year == True:
                                print("       CD tok in date: " + str(tok))
                
if ex2_emp_tok_2.dep_ == 'conj':
    print('Token dep_ is conj.')
    if ex2_emp_tok_2.head.tag_ == 'CD':
        print("       token.head.tag_ == 'CD' ")
        print("       token.head.i : " + str(ex2_emp_tok_2.head.i))
    print('Token head is: ' + str(ex2_emp_tok_2.head) + '       head.dep_:  '+ str(ex2_emp_tok_2.head.dep_) )
    for i, d in enumerate(ex2_emp_tok_2.doc.ents):
        if d.label_ == 'DATE':
            print(str(i) + "  " + str(d))
            for tok in ex2_emp_tok_2.doc[d.start:d.end + 1]:
                if tok._.is_year == True:
                    print("CD tok in date: " + str(tok))
                    print("CD tok i: " + str(tok.i))
#    if ex2_emp_tok_2.head.dep_ == ''

print('Token dep_: ' + str(ex2_emp_tok_2.dep_))
print('Token head: ' + str(ex2_emp_tok_2.head))
print('       head.dep_: ' + str(ex2_emp_tok_2.head.dep_))
print("       head.tag_: '"+ str(ex2_emp_tok_2.head.tag_) + "'" )
print("       head.pos_: '"+ str(ex2_emp_tok_2.head.pos_) + "'" )

print('Token conjuncts: ' )
print(str([c for c in ex2_emp_tok_2.conjuncts]))
print('Token head lefts: ' )
for w in ex2_emp_tok_2.head.lefts:
    print(str(w) + '       head.left.dep_:' + str(w.dep_))
    if w.dep_ == 'nsubj':
        print(str(w) + ' is nsubj. Subtree is:' )
        [print(x) for x in w.lefts]
        [print(s)for s in w.subtree if s.dep_ == 'poss']
print('Token head rights: ' )
for w in ex2_emp_tok_2.head.rights:
    print(str(w) + '       rights.left.dep_:' + str(w.dep_))
    if w.dep_ == 'nsubj':
        print(str(w) + ' is nsubj. Subtree is:' )
        [print(x) for x in w.lefts]
        [print(s)for s in w.subtree if s.dep_ == 'poss']
print('Token ancestors: ')
for w in ex2_emp_tok_2.ancestors:
    print(str(w) + '       ancestor.dep_:' + str(w.dep_))
print('Token children: ')
for w in ex2_emp_tok_2.children:
    print(str(w) + '       child.dep_:' + str(w.dep_))

Token:  employees
Token child: 17  has dep_ == 'nummod'
       Token.child.i : 16
       child.dep_: nummod
       child.tag_: 'CD'
       child.pos_: 'NUM'
Token dep_: pobj
Token head: of
       head.dep_: prep
       head.tag_: 'IN'
       head.pos_: 'ADP'
Token conjuncts: 
[]
Token head lefts: 
Token head rights: 
employees       rights.left.dep_:pobj
Token ancestors: 
of       ancestor.dep_:prep
increase       ancestor.dep_:appos
employees       ancestor.dep_:dobj
had       ancestor.dep_:ROOT
Token children: 
17       child.dep_:nummod


In [237]:
extract_emp_relations(ex2)

Dep_ of EMP_NOUN is: dobj
Num_toks are: [455]
Root is at 1 steps from employees.
(Company, had, 455, 'Other', 0, employees)
Dep_ of EMP_NOUN is: pobj
Num_toks are: [17]
Root is at 1 steps from employees.
(Company, had, 17, 'Other', 0, employees)


[[(0, 0, Company, had, 455, 'Other', 0, employees),
  (0, 1, Company, had, 17, 'Other', 0, employees)]]

#### `Emp_num` below `EMP_NOUN` in parse tree, `org_tok` left of verb

Example sentece:

`"As of December 31, 2016, the subsidiaries of AEP had a total of 17,634 employees."`

Desired output: 

`(AEP, had, 17,634, employees)`

In [34]:
ex3 = nlp("As of December 31, 2016, the subsidiaries of AEP had a total of 17,634 employees.")

print_df(make_tok_df(ex3))

print([tok for tok in ex3 if tok.ent_type_ == 'ORG'])

ex3_emp_tok = ex3[16]
ex3_emp_num_ent = ex3.ents[2]
print(ex3_emp_num_ent.root.is_digit)

In [37]:
print_doc_info(ex3)

doc is: 
As of December 31, 2016, the subsidiaries of AEP had a total of 17,634 employees.
--------------------------------------------------
Entities are: 


Unnamed: 0,tok_i,entity,ent_label,root,root_dep,dep_def,root_head,root_head_dep,root_head_pos
0,2,"December 31, 2016",DATE,December,pobj,object of preposition,of,prep,ADP
1,10,AEP,ORG,AEP,pobj,object of preposition,of,prep,ADP
2,15,17634,CARDINAL,17634,nummod,,employees,pobj,NOUN
3,16,employees,EMP_NOUN,employees,pobj,object of preposition,of,prep,ADP


--------------------------------------------------
Noun chunks are: 


Unnamed: 0,tok_i,noun_chunk,root,root_ent,root_dep,dep_def,root_head,root_head_dep,root_head_pos
0,2,December,December,DATE,pobj,object of preposition,of,prep,ADP
1,7,the subsidiaries,subsidiaries,,nsubj,nominal subject,had,ROOT,VERB
2,10,AEP,AEP,ORG,pobj,object of preposition,of,prep,ADP
3,12,a total,total,,dobj,direct object,had,ROOT,VERB
4,15,"17,634 employees",employees,EMP_NOUN,pobj,object of preposition,of,prep,ADP


--------------------------------------------------
Cardinal entities are: 


Unnamed: 0,tok_ent,toks,lemma,dep,head,h_dep,pos,tag,dep_def,tag_def
0,CARDINAL,17634,17634,nummod,employees,pobj,NUM,CD,,cardinal number


In [42]:
#[w for w in ex3_emp_tok.head.lefts if w.dep_ == 'nsubj']
print('Token:  ' + str(ex3_emp_tok) )
for c in ex3_emp_tok.children:
    if c.dep_ == 'nummod':
        print("Token child: " + str(c) + "  has dep_ == 'nummod'")
        print("       Token.child.i : " + str(c.i))
        print('       child.dep_: ' + str(c.dep_))
        print("       child.tag_: '"+ str(c.tag_) + "'" )
        print("       child.pos_: '"+ str(c.pos_) + "'" )
        for gc in c.children:
            if gc.dep_ == 'conj':
                print('       Token GC dep_ is conj.')
                print('       Token GC is ' + str(gc))
                if gc.tag_ == 'CD':
                    print("       gc.tag_ == 'CD' ")
                    print("       gc.i : " + str(gc.i))
                print('       Token head is: ' + str(c) + '       head.dep_:  '+ str(c.dep_) )
                for i, d in enumerate(ex3_emp_tok.doc.ents):
                    if d.label_ == 'DATE':
                        print(str(i) + "  " + str(d))
                        for tok in ex3_emp_tok.doc[d.start:d.end + 1]:
                            if tok._.is_year == True:
                                print("       CD tok in date: " + str(tok))
                
if ex3_emp_tok.dep_ == 'conj':
    print('Token dep_ is conj.')
    if ex3_emp_tok.head.tag_ == 'CD':
        print("       token.head.tag_ == 'CD' ")
        print("       token.head.i : " + str(ex3_emp_tok.head.i))
    print('Token head is: ' + str(ex3_emp_tok.head) + '       head.dep_:  '+ str(ex3_emp_tok.head.dep_) )
    for i, d in enumerate(ex3_emp_tok.doc.ents):
        if d.label_ == 'DATE':
            print(str(i) + "  " + str(d))
            for tok in ex3_emp_tok.doc[d.start:d.end + 1]:
                if tok._.is_year == True:
                    print("CD tok in date: " + str(tok))
                    print("CD tok i: " + str(tok.i))
#    if ex3_emp_tok.head.dep_ == ''

print('Token dep_: ' + str(ex3_emp_tok.dep_))
print('Token head: ' + str(ex3_emp_tok.head))
print('Token head: ' + str(ex3_emp_tok.head.head))
print('       head.dep_: ' + str(ex3_emp_tok.head.dep_))
print("       head.tag_: '"+ str(ex3_emp_tok.head.tag_) + "'" )
print("       head.pos_: '"+ str(ex3_emp_tok.head.pos_) + "'" )

print('Token conjuncts: ' )
print(str([c for c in ex3_emp_tok.conjuncts]))
print('Token root lefts: ' )
for w in ex3_emp_tok.head.head.head.lefts:
    print(str(w) + '       head.left.dep_:' + str(w.dep_))
    if w.dep_ == 'nsubj':
        print(str(w) + ' is nsubj. Subtree is:' )
        [print(x) for x in w.lefts]
        [print(s)for s in w.subtree if s.dep_ == 'poss']
print('Token head rights: ' )
for w in ex3_emp_tok.head.head.head.rights:
    print(str(w) + '       rights.left.dep_:' + str(w.dep_))
    if w.dep_ == 'nsubj':
        print(str(w) + ' is nsubj. Subtree is:' )
        [print(x) for x in w.lefts]
        [print(s)for s in w.subtree if s.dep_ == 'poss']
print('Token ancestors: ')
for w in ex3_emp_tok.ancestors:
    print(str(w) + '       ancestor.dep_:' + str(w.dep_))
print('Check for ancestor in root.rights: ')
for w in ex3_emp_tok.ancestors:
    if w in ex3_emp_tok.head.head.head.lefts:
        print("Ancestor '" + str(w) + "' of emp_tok is in root.lefts")
        break
    elif w in ex3_emp_tok.head.head.head.lefts:
        print("Ancestor '" + str(w) + "' of emp_tok is in root.rights")
        break
    else:
        print("Ancestor '" + str(w) + "' of emp_tok is not in tok.rights or tok.lefts.")
else:
    print("No ancestor of emp_tok is not in tok.rights or tok.lefts.")
print('Token children: ')
for w in ex3_emp_tok.children:
    print(str(w) + '       child.dep_:' + str(w.dep_))

Token:  employees
Token child: 17,634  has dep_ == 'nummod'
       Token.child.i : 15
       child.dep_: nummod
       child.tag_: 'CD'
       child.pos_: 'NUM'
Token dep_: pobj
Token head: of
Token head: total
       head.dep_: prep
       head.tag_: 'IN'
       head.pos_: 'ADP'
Token conjuncts: 
[]
Token root lefts: 
As       head.left.dep_:prep
,       head.left.dep_:punct
subsidiaries       head.left.dep_:nsubj
subsidiaries is nsubj. Subtree is:
the
Token head rights: 
total       rights.left.dep_:dobj
.       rights.left.dep_:punct
Token ancestors: 
of       ancestor.dep_:prep
total       ancestor.dep_:dobj
had       ancestor.dep_:ROOT
Check for ancestor in root.rights: 
Ancestor 'of' of emp_tok is not in tok.rights or tok.lefts.
Ancestor 'total' of emp_tok is not in tok.rights or tok.lefts.
Ancestor 'had' of emp_tok is not in tok.rights or tok.lefts.
No ancestor of emp_tok is not in tok.rights or tok.lefts.
Token children: 
17,634       child.dep_:nummod


In [None]:
#displacy.render(ex3, style='dep', jupyter=True, options={'distance': 110})

In [44]:
print(ex3_emp_tok.head)
print(ex3_emp_tok.head.dep_)
print(ex3_emp_tok.head.head)
print(ex3_emp_tok.head.head.dep_)
print(ex3_emp_tok.head.head.head)
print(ex3_emp_tok.head.head.head.dep_)
print(ex3_emp_tok.head.head.head.pos_)
print(ex3_emp_tok.head.head.head.tag_)
print([c for c in ex3_emp_tok.head.head.head.children])
print([c.dep_ for c in ex3_emp_tok.head.head.head.children])
print([c.tag_ for c in ex3_emp_tok.head.head.head.children])
print([r for r in ex3_emp_tok.head.head.head.subtree if r.tag_== 'CD'])
print([c.ent_iob_ for c in ex3_emp_tok.head.head.head.children])
print([e for e in ex3.ents])
print([e.label_ for e in ex3.ents])
print([e for e in ex3.ents if e.label_ == 'DATE'])

of
prep
total
dobj
had
ROOT
VERB
VBD
[As, ,, subsidiaries, total, .]
['prep', 'punct', 'nsubj', 'dobj', 'punct']
['IN', ',', 'NNS', 'NN', '.']
[31, 2016, 17,634]
['', '', '', '', '']
[December 31, 2016, AEP, 17,634, employees]
['DATE', 'ORG', 'CARDINAL', 'EMP_NOUN']
[December 31, 2016]


In [238]:
extract_emp_relations(ex3)

Dep_ of EMP_NOUN is: pobj
Num_toks are: [17,634]
Root is at 1 steps from employees.
(subsidiaries, had, 17,634, 'Other', 0, employees)


[[(0, 0, subsidiaries, had, 17,634, 'Other', 0, employees)]]

#### `Org_tok` [employment_noun] `was` `Emp_Num` `EMP_NOUN`

`"Alcoa's total worldwide employment at the end of 2016 was approximately 14,000 employees in 15 countries."`

Desired output: 

`(employment, was, 14,000, employees)`

In [47]:
ex4 = nlp("Alcoa's total worldwide employment at the end of 2016 was approximately 14,000 employees in 15 countries.")

print_df(make_tok_df(ex4))

ex4_emp_tok = ex4[13]

In [49]:
print_doc_info(ex4)

doc is: 
Alcoa's total worldwide employment at the end of 2016 was approximately 14,000 employees in 15 countries.
--------------------------------------------------
Entities are: 


Unnamed: 0,tok_i,entity,ent_label,root,root_dep,dep_def,root_head,root_head_dep,root_head_pos
0,0,Alcoa,ORG,Alcoa,poss,possession modifier,employment,nsubj,NOUN
1,6,the end of 2016,DATE,end,pobj,object of preposition,at,prep,ADP
2,11,"approximately 14,000",CARDINAL,14000,nummod,,employees,attr,NOUN
3,13,employees,EMP_NOUN,employees,attr,attribute,was,ROOT,VERB
4,15,15,CARDINAL,15,nummod,,countries,pobj,NOUN


--------------------------------------------------
Noun chunks are: 


Unnamed: 0,tok_i,noun_chunk,root,root_ent,root_dep,dep_def,root_head,root_head_dep,root_head_pos
0,0,Alcoa's total worldwide employment,employment,,nsubj,nominal subject,was,ROOT,VERB
1,6,the end,end,DATE,pobj,object of preposition,at,prep,ADP
2,11,"approximately 14,000 employees",employees,EMP_NOUN,attr,attribute,was,ROOT,VERB
3,15,15 countries,countries,,pobj,object of preposition,in,prep,ADP


--------------------------------------------------
Cardinal entities are: 


Unnamed: 0,tok_ent,toks,lemma,dep,head,h_dep,pos,tag,dep_def,tag_def
0,CARDINAL,approximately,approximately,advmod,14000,nummod,ADV,RB,adverbial modifier,adverb
1,CARDINAL,14000,14000,nummod,employees,attr,NUM,CD,,cardinal number
2,CARDINAL,15,15,nummod,countries,pobj,NUM,CD,,cardinal number


In [51]:
#[w for w in ex4_emp_tok.head.lefts if w.dep_ == 'nsubj']
print('Token dep_: ' + str(ex4_emp_tok.dep_))
print('Token head: ' + str(ex4_emp_tok.head) + '       head.dep_:'+ str(ex4_emp_tok.head.dep_) )
for w in ex4_emp_tok.head.lefts:
    print(str(w) + '       head.left.dep_:' + str(w.dep_))
    if w.dep_ == 'nsubj':
        print(str(w) + ' is nsubj. Subtree is:' )
        [print(x) for x in w.lefts]
        [print(s)for s in w.subtree if s.dep_ == 'poss']
print('Token ancestors: ')
for w in ex4_emp_tok.ancestors:
    print(str(w) + '       ancestor.dep_:' + str(w.dep_))
print('Token children: ')
for w in ex4_emp_tok.children:
    print(str(w) + '       child.dep_:' + str(w.dep_))

Token dep_: attr
Token head: was       head.dep_:ROOT
employment       head.left.dep_:nsubj
employment is nsubj. Subtree is:
Alcoa
total
worldwide
Alcoa
Token ancestors: 
was       ancestor.dep_:ROOT
Token children: 
14,000       child.dep_:nummod
in       child.dep_:prep


In [None]:
#displacy.render(ex4, style='dep', jupyter=True, options={'distance': 90})

In [53]:
extract_emp_relations(ex4)

Dep_ of EMP_NOUN is: attr
Num_toks are: [14,000]
Root is at 2 steps from employees.
(employment, was, 14,000, 'Other', 0, employees)


[[(0, 0, employment, was, 14,000, 'Other', 0, employees)]]

#### `Org_tok` that is not the company

Desired output: 

`(Company, employed, 1,562, persons)`

In [54]:
ex5 = nlp("The Company and its subsidiaries employed 1,562 persons at December 31, 2016, 114 of whom are covered by a collective bargaining agreement with District 10 of the International Association of Machinists.")

print_df(make_tok_df(ex5))

ex5_emp_tok = ex5[7]

ex5_emp_tok.dep_

In [56]:
print_doc_info(ex5)

doc is: 
The Company and its subsidiaries employed 1,562 persons at December 31, 2016, 114 of whom are covered by a collective bargaining agreement with District 10 of the International Association of Machinists.
--------------------------------------------------
Entities are: 


Unnamed: 0,tok_i,entity,ent_label,root,root_dep,dep_def,root_head,root_head_dep,root_head_pos
0,6,1562,CARDINAL,1562,nummod,,persons,dobj,NOUN
1,7,persons,EMP_NOUN,persons,dobj,direct object,employed,ROOT,VERB
2,9,"December 31, 2016",DATE,December,pobj,object of preposition,at,prep,ADP
3,14,114,CARDINAL,114,nsubjpass,nominal subject (passive),covered,relcl,VERB
4,25,District 10,FAC,District,pobj,object of preposition,with,prep,ADP
5,28,the International Association of Machinists,ORG,Association,pobj,object of preposition,of,prep,ADP


--------------------------------------------------
Noun chunks are: 


Unnamed: 0,tok_i,noun_chunk,root,root_ent,root_dep,dep_def,root_head,root_head_dep,root_head_pos
0,0,The Company,Company,,nsubj,nominal subject,employed,ROOT,VERB
1,3,its subsidiaries,subsidiaries,,conj,conjunct,Company,nsubj,PROPN
2,6,"1,562 persons",persons,EMP_NOUN,dobj,direct object,employed,ROOT,VERB
3,9,December,December,DATE,pobj,object of preposition,at,prep,ADP
4,16,whom,whom,,pobj,object of preposition,of,prep,ADP
5,20,a collective bargaining agreement,agreement,,pobj,object of preposition,by,agent,ADP
6,25,District,District,FAC,pobj,object of preposition,with,prep,ADP
7,28,the International Association,Association,ORG,pobj,object of preposition,of,prep,ADP
8,32,Machinists,Machinists,ORG,pobj,object of preposition,of,prep,ADP


--------------------------------------------------
Cardinal entities are: 


Unnamed: 0,tok_ent,toks,lemma,dep,head,h_dep,pos,tag,dep_def,tag_def
0,CARDINAL,1562,1562,nummod,persons,dobj,NUM,CD,,cardinal number
1,CARDINAL,114,114,nsubjpass,covered,relcl,NUM,CD,nominal subject (passive),cardinal number


In [None]:
#displacy.render(ex5, style='dep', jupyter=True, options={'distance': 110})

In [60]:
print(ex5_emp_tok.head)
print(ex5_emp_tok.head.dep_)
print([c for c in ex5_emp_tok.head.lefts])
print([c.dep_ for c in ex5_emp_tok.head.head.head.children])
print([c.tag_ for c in ex5_emp_tok.head.head.head.children])
print([r for r in ex5_emp_tok.head.head.head.rights if r.tag_== 'CD'])
print([c.ent_iob_ for c in ex5_emp_tok.head.head.head.children])
print([e for e in ex5.ents])
print([e.label_ for e in ex5.ents])
print([e for e in ex5.ents if e.label_ == 'DATE'])

employed
ROOT
[Company]
['nsubj', 'dobj', 'prep', 'punct']
['NNP', 'NNS', 'IN', '.']
[]
['', 'B', '', '']
[1,562, persons, December 31, 2016, 114, District 10, the International Association of Machinists]
['CARDINAL', 'EMP_NOUN', 'DATE', 'CARDINAL', 'FAC', 'ORG']
[December 31, 2016]


In [61]:
extract_emp_relations(ex5)

Dep_ of EMP_NOUN is: dobj
Num_toks are: [1,562]
Root is at 2 steps from persons.
(Company, employed, 1,562, 'Other', 0, persons)


[[(0, 0, Company, employed, 1,562, 'Other', 0, persons)]]

### `Emp_noun` and `Company_noun`  in subject, `Emp_num` in predicate

#### `"The number of full-time employees of the Company was approximately 31,800 at December 31, 2016 and 32,300 at December 31, 2015."`

Desired output:  
`(number, was, 31,800, full-time employees)`

In [62]:
ex6 = nlp("The number of full-time employees of the Company was approximately 31,800 at December 31, 2016 and 32,300 at December 31, 2015.")

print_df(make_tok_df(ex6))

ex6_emp_tok = ex6[4]

ex6_emp_tok.dep_

In [64]:
print_doc_info(ex6)

doc is: 
The number of full-time employees of the Company was approximately 31,800 at December 31, 2016 and 32,300 at December 31, 2015.
--------------------------------------------------
Entities are: 


Unnamed: 0,tok_i,entity,ent_label,root,root_dep,dep_def,root_head,root_head_dep,root_head_pos
0,3,full-time,FULL_TIME,full-time,compound,,employees,pobj,NOUN
1,4,employees,EMP_NOUN,employees,pobj,object of preposition,of,prep,ADP
2,7,Company,ORG,Company,pobj,object of preposition,of,prep,ADP
3,9,"approximately 31,800",CARDINAL,31800,attr,attribute,was,ROOT,VERB
4,12,"December 31, 2016",DATE,December,pobj,object of preposition,at,prep,ADP
5,17,32300,CARDINAL,32300,conj,conjunct,was,ROOT,VERB
6,19,"December 31, 2015",DATE,December,pobj,object of preposition,at,prep,ADP


--------------------------------------------------
Noun chunks are: 


Unnamed: 0,tok_i,noun_chunk,root,root_ent,root_dep,dep_def,root_head,root_head_dep,root_head_pos
0,0,The number,number,,nsubj,nominal subject,was,ROOT,VERB
1,3,full-time employees,employees,EMP_NOUN,pobj,object of preposition,of,prep,ADP
2,6,the Company,Company,ORG,pobj,object of preposition,of,prep,ADP
3,12,December,December,DATE,pobj,object of preposition,at,prep,ADP
4,19,December,December,DATE,pobj,object of preposition,at,prep,ADP


--------------------------------------------------
Cardinal entities are: 


Unnamed: 0,tok_ent,toks,lemma,dep,head,h_dep,pos,tag,dep_def,tag_def
0,CARDINAL,approximately,approximately,advmod,31800,attr,ADV,RB,adverbial modifier,adverb
1,CARDINAL,31800,31800,attr,was,ROOT,NUM,CD,attribute,cardinal number
2,CARDINAL,32300,32300,conj,was,ROOT,NUM,CD,conjunct,cardinal number


In [67]:
print([c for c in ex6[4].children])

print('Token children: ')
for w in ex6[10].children:
    print(str(w) + '       child.dep_:' + str(w.dep_))
    
print('Token children: ')
for w in ex6[17].children:
    print(str(w) + '       child.dep_:' + str(w.dep_))

[full-time, of]


In [69]:
[c for c in ex6[8].conjuncts]

[32,300]

In [70]:
print(ex6_emp_tok.head.dep_)
print(ex6_emp_tok.head.head)
print(ex6_emp_tok.head.head.dep_)
print(ex6_emp_tok.head.head.head)
print(ex6_emp_tok.head.head.head.dep_)
print(ex6_emp_tok.head.head.head.pos_)
print(ex6_emp_tok.head.head.head.tag_)
print([c for c in ex6_emp_tok.head.head.head.children])
print([c.dep_ for c in ex6_emp_tok.head.head.head.children])
print([c.tag_ for c in ex6_emp_tok.head.head.head.children])
print([r for r in ex6_emp_tok.head.head.head.rights if r.tag_== 'CD'])
print([c.ent_iob_ for c in ex6_emp_tok.head.head.head.children])
print([e for e in ex6.ents])
print([e.label_ for e in ex6.ents])
print([e for e in ex6.ents if e.label_ == 'DATE'])

prep
number
nsubj
was
ROOT
VERB
VBD
[number, 31,800, at, and, 32,300, .]
['nsubj', 'attr', 'prep', 'cc', 'conj', 'punct']
['NN', 'CD', 'IN', 'CC', 'CD', '.']
[31,800, 32,300]
['', 'I', '', '', 'B', '']
[full-time, employees, Company, approximately 31,800, December 31, 2016, 32,300, December 31, 2015]
['FULL_TIME', 'EMP_NOUN', 'ORG', 'CARDINAL', 'DATE', 'CARDINAL', 'DATE']
[December 31, 2016, December 31, 2015]


In [None]:
#displacy.render(ex6, style='dep', jupyter=True, options={'distance': 110})

In [783]:
extract_emp_relations(ex6, verbose=True)

Word_id is : 0
Word is : employees
Dep_ of EMP_NOUN is: pobj
year_emps: [('2015', '32,300'), ('2016', 'approximately 31,800')]
max year_emps: ('2016', 'approximately 31,800')
Sentence has multiple years:[(15, 2016), (22, 2015)]
First card subtree is :[approximately, 31,800]
years: [(15, 2016), (22, 2015)]
cards: [approximately 31,800, 32,300]
emp_counts: [(9, approximately 31,800), (17, 32,300)]
Root is at 2 steps from employees.
(The number of full-time employees of the Company, was, 'approximately 31,800', 'Full-Time Employees', full-time, employees)


[RelationDetails(sent_num=0, word_num=0, s=The number of full-time employees of the Company, v=was, quantity='approximately 31,800', quantity_type='Full-Time Employees', type_token=full-time, word=employees)]

#### `Total workforce level at December 31, 2016 was approximately 150,500.`

Examples:  
`"Total workforce level at December 31, 2016 was approximately 150,500."`

Desired output:  
`(workforce, was, 150,500, 'Other', 'Total', workforce)`

In [153]:
ex10 = nlp("Total workforce level at December 31, 2016 was approximately 150,500.")

print_df(make_tok_df(ex10))

In [155]:
print_doc_info(ex10)

doc is: 
Total workforce level at December 31, 2016 was approximately 150,500.
--------------------------------------------------
Entities are: 


Unnamed: 0,tok_i,entity,ent_label,root,root_dep,dep_def,root_head,root_head_dep,root_head_pos
0,1,workforce,EMP_NOUN,workforce,compound,,level,nsubj,NOUN
1,4,"December 31, 2016",DATE,2016,nsubj,nominal subject,was,ROOT,VERB
2,10,150500,CARDINAL,150500,attr,attribute,was,ROOT,VERB


--------------------------------------------------
Noun chunks are: 


Unnamed: 0,tok_i,noun_chunk,root,root_ent,root_dep,dep_def,root_head,root_head_dep,root_head_pos
0,0,Total workforce level,level,,nsubj,nominal subject,was,ROOT,VERB
1,4,December,December,DATE,pobj,object of preposition,at,prep,ADP


--------------------------------------------------
Cardinal entities are: 


Unnamed: 0,tok_ent,toks,lemma,dep,head,h_dep,pos,tag,dep_def,tag_def
0,CARDINAL,150500,150500,attr,was,ROOT,NUM,CD,attribute,cardinal number


In [156]:
ex10_emp_tok = ex10[1]
ex10_emp_tok.dep_

'compound'

In [157]:
print([c for c in ex10[1].children])

[]


In [158]:
print('Token children: ')
for w in ex10[10].children:
    print(str(w) + '       child.dep_:' + str(w.dep_))

Token children: 
approximately       child.dep_:advmod


In [161]:
print(ex10_emp_tok.head.dep_)
print(ex10_emp_tok.head.head)
print(ex10_emp_tok.head.head.dep_)
print(ex10_emp_tok.head.head.head)
print(ex10_emp_tok.head.head.head.dep_)
print(ex10_emp_tok.head.head.head.pos_)
print(ex10_emp_tok.head.head.head.tag_)
print([c for c in ex10_emp_tok.head.head.head.children])
print([c.dep_ for c in ex10_emp_tok.head.head.head.children])
print([c.tag_ for c in ex10_emp_tok.head.head.head.children])
print([r for r in ex10_emp_tok.head.head.head.rights if r.tag_== 'CD'])
print([c.ent_iob_ for c in ex10_emp_tok.head.head.head.children])
print([e for e in ex10.ents])
print([e.label_ for e in ex10.ents])
print([e for e in ex10.ents if e.label_ == 'DATE'])

nsubj
was
ROOT
was
ROOT
VERB
VBD
[level, ,, 2016, 150,500, .]
['nsubj', 'punct', 'nsubj', 'attr', 'punct']
['NN', ',', 'CD', 'CD', '.']
[150,500]
['', 'I', 'I', 'B', '']
[workforce, December 31, 2016, 150,500]
['EMP_NOUN', 'DATE', 'CARDINAL']
[December 31, 2016]


In [159]:
#[w for w in ex10_emp_tok.head.lefts if w.dep_ == 'nsubj']
print('Token:  ' + str(ex10_emp_tok) )
for c in ex10_emp_tok.children:
    if c.dep_ == 'nummod':
        print("Token child: " + str(c) + "  has dep_ == 'nummod'")
        print("       Token.child.i : " + str(c.i))
        print('       child.dep_: ' + str(c.dep_))
        print("       child.tag_: '"+ str(c.tag_) + "'" )
        print("       child.pos_: '"+ str(c.pos_) + "'" )
        for gc in c.children:
            if gc.dep_ == 'conj':
                print('       Token GC dep_ is conj.')
                print('       Token GC is ' + str(gc))
                if gc.tag_ == 'CD':
                    print("       gc.tag_ == 'CD' ")
                    print("       gc.i : " + str(gc.i))
                print('       Token head is: ' + str(c) + '       head.dep_:  '+ str(c.dep_) )
                for i, d in enumerate(ex10_emp_tok.doc.ents):
                    if d.label_ == 'DATE':
                        print(str(i) + "  " + str(d))
                        for tok in ex10_emp_tok.doc[d.start:d.end + 1]:
                            if tok._.is_year == True:
                                print("       CD tok in date: " + str(tok))
                
if ex10_emp_tok.dep_ == 'conj':
    print('Token dep_ is conj.')
    if ex10_emp_tok.head.tag_ == 'CD':
        print("       token.head.tag_ == 'CD' ")
        print("       token.head.i : " + str(ex10_emp_tok.head.i))
    print('Token head is: ' + str(ex10_emp_tok.head) + '       head.dep_:  '+ str(ex10_emp_tok.head.dep_) )
    for i, d in enumerate(ex10_emp_tok.doc.ents):
        if d.label_ == 'DATE':
            print(str(i) + "  " + str(d))
            for tok in ex10_emp_tok.doc[d.start:d.end + 1]:
                if tok._.is_year == True:
                    print("CD tok in date: " + str(tok))
                    print("CD tok i: " + str(tok.i))
#    if ex10_emp_tok.head.dep_ == ''

print('Token dep_: ' + str(ex10_emp_tok.dep_))
print('Token head: ' + str(ex10_emp_tok.head))
print('       head.dep_: ' + str(ex10_emp_tok.head.dep_))
print("       head.tag_: '"+ str(ex10_emp_tok.head.tag_) + "'" )
print("       head.pos_: '"+ str(ex10_emp_tok.head.pos_) + "'" )
print('Token head lefts: ' )
print('Token conjuncts: ' )
print(str([c for c in ex10_emp_tok.conjuncts]))
for w in ex10_emp_tok.head.rights:
    print(str(w) + '       head.right.dep_:' + str(w.dep_))
    if w.dep_ == 'nsubj':
        print(str(w) + ' is nsubj. Subtree is:' )
        [print(x) for x in w.lefts]
        [print(s)for s in w.subtree if s.dep_ == 'poss']
print('Token ancestors: ')
for w in ex10_emp_tok.ancestors:
    print(str(w) + '       ancestor.dep_:' + str(w.dep_))
print('Token children: ')
for w in ex10_emp_tok.children:
    print(str(w) + '       child.dep_:' + str(w.dep_))

Token:  workforce
Token dep_: compound
Token head: level
       head.dep_: nsubj
       head.tag_: 'NN'
       head.pos_: 'NOUN'
Token head lefts: 
Token conjuncts: 
[]
at       head.right.dep_:prep
Token ancestors: 
level       ancestor.dep_:nsubj
was       ancestor.dep_:ROOT
Token children: 


In [None]:
#displacy.render(ex10, style='dep', jupyter=True, options={'distance': 110})

In [190]:
extract_emp_relations(ex10)

Dep_ of EMP_NOUN is: compound
year_emps: [('2016', '150,500')]
max year_emps: ('2016', '150,500')
Sentence has multiple years:[(7, 2016)]
First card subtree is :[approximately, 150,500]
years: [(7, 2016)]
cards: [150,500]
emp_counts: [(10, 150,500)]
Root is at 2 steps from workforce.
(level, was, '150,500', 'Other', 0, workforce)


[[(0, 0, level, was, '150,500', 'Other', 0, workforce)]]

### `Emp_num` above and below `EMP_NOUN` in parse tree

#### One emp_num is a conj child of the other emp_num

Example text:

`At December 31, 2016 and 2015, we had approximately 56,400 and 66,400 employees, respectively.`

Desired output: 

`(we, had, 56,400, employees)`

In [73]:
ex7 = nlp("At December 31, 2016 and 2015, we had approximately 56,400 and 66,400 employees, respectively.")

print_df(make_tok_df(ex7))

ex7_emp_tok = ex7[14]

ex7_emp_num_tok = ex7[11]

print(ex7_emp_tok.dep_)

print(ex7_emp_num_tok)

print([c for c in ex7_emp_num_tok.conjuncts])

In [75]:
print_doc_info(ex7)

doc is: 
At December 31, 2016 and 2015, we had approximately 56,400 and 66,400 employees, respectively.
--------------------------------------------------
Entities are: 


Unnamed: 0,tok_i,entity,ent_label,root,root_dep,dep_def,root_head,root_head_dep,root_head_pos
0,1,"December 31, 2016 and 2015",DATE,December,pobj,object of preposition,At,prep,ADP
1,10,"approximately 56,400",CARDINAL,56400,nummod,,employees,dobj,NOUN
2,13,66400,CARDINAL,66400,conj,conjunct,56400,nummod,NUM
3,14,employees,EMP_NOUN,employees,dobj,direct object,had,ROOT,VERB


--------------------------------------------------
Noun chunks are: 


Unnamed: 0,tok_i,noun_chunk,root,root_ent,root_dep,dep_def,root_head,root_head_dep,root_head_pos
0,1,December,December,DATE,pobj,object of preposition,At,prep,ADP
1,8,we,we,,nsubj,nominal subject,had,ROOT,VERB
2,10,"approximately 56,400 and 66,400 employees",employees,EMP_NOUN,dobj,direct object,had,ROOT,VERB


--------------------------------------------------
Cardinal entities are: 


Unnamed: 0,tok_ent,toks,lemma,dep,head,h_dep,pos,tag,dep_def,tag_def
0,CARDINAL,approximately,approximately,advmod,56400,nummod,ADV,RB,adverbial modifier,adverb
1,CARDINAL,56400,56400,nummod,employees,dobj,NUM,CD,,cardinal number
2,CARDINAL,66400,66400,conj,56400,nummod,NUM,CD,conjunct,cardinal number


In [81]:
print('Token children: ')
for w in ex7_emp_num_tok.children:
    print(str(w) + '       child.dep_:' + str(w.dep_))

Token children: 
approximately       child.dep_:advmod
and       child.dep_:cc
66,400       child.dep_:conj


In [82]:
#[w for w in ex7_emp_tok.head.lefts if w.dep_ == 'nsubj']
print('Token:  ' + str(ex7_emp_tok) )
for c in ex7_emp_tok.children:
    if c.dep_ == 'nummod':
        print("Token child: " + str(c) + "  has dep_ == 'nummod'")
        print("       Token.child.i : " + str(c.i))
        print('       child.dep_: ' + str(c.dep_))
        print("       child.tag_: '"+ str(c.tag_) + "'" )
        print("       child.pos_: '"+ str(c.pos_) + "'" )
        for gc in c.children:
            if gc.dep_ == 'conj':
                print('       Token GC dep_ is conj.')
                print('       Token GC is ' + str(gc))
                if gc.tag_ == 'CD':
                    print("       gc.tag_ == 'CD' ")
                    print("       gc.i : " + str(gc.i))
                print('       Token head is: ' + str(c) + '       head.dep_:  '+ str(c.dep_) )
                for i, d in enumerate(ex7_emp_tok.doc.ents):
                    if d.label_ == 'DATE':
                        print(str(i) + "  " + str(d))
                        for tok in ex7_emp_tok.doc[d.start:d.end + 1]:
                            if tok._.is_year == True:
                                print("       CD tok in date: " + str(tok))
                
if ex7_emp_tok.dep_ == 'conj':
    print('Token dep_ is conj.')
    if ex7_emp_tok.head.tag_ == 'CD':
        print("       token.head.tag_ == 'CD' ")
        print("       token.head.i : " + str(ex7_emp_tok.head.i))
    print('Token head is: ' + str(ex7_emp_tok.head) + '       head.dep_:  '+ str(ex7_emp_tok.head.dep_) )
    for i, d in enumerate(ex7_emp_tok.doc.ents):
        if d.label_ == 'DATE':
            print(str(i) + "  " + str(d))
            for tok in ex7_emp_tok.doc[d.start:d.end + 1]:
                if tok._.is_year == True:
                    print("CD tok in date: " + str(tok))
                    print("CD tok i: " + str(tok.i))
#    if ex7_emp_tok.head.dep_ == ''

print('Token dep_: ' + str(ex7_emp_tok.dep_))
print('Token head: ' + str(ex7_emp_tok.head))
print('       head.dep_: ' + str(ex7_emp_tok.head.dep_))
print("       head.tag_: '"+ str(ex7_emp_tok.head.tag_) + "'" )
print("       head.pos_: '"+ str(ex7_emp_tok.head.pos_) + "'" )
print('Token head lefts: ' )
print('Token conjuncts: ' )
print(str([c for c in ex7_emp_tok.conjuncts]))
for w in ex7_emp_tok.head.rights:
    print(str(w) + '       head.left.dep_:' + str(w.dep_))
    if w.dep_ == 'nsubj':
        print(str(w) + ' is nsubj. Subtree is:' )
        [print(x) for x in w.lefts]
        [print(s)for s in w.subtree if s.dep_ == 'poss']
print('Token ancestors: ')
for w in ex7_emp_tok.ancestors:
    print(str(w) + '       ancestor.dep_:' + str(w.dep_))
print('Token children: ')
for w in ex7_emp_tok.children:
    print(str(w) + '       child.dep_:' + str(w.dep_))

Token:  employees
Token child: 56,400  has dep_ == 'nummod'
       Token.child.i : 11
       child.dep_: nummod
       child.tag_: 'CD'
       child.pos_: 'NUM'
       Token GC dep_ is conj.
       Token GC is 66,400
       gc.tag_ == 'CD' 
       gc.i : 13
       Token head is: 56,400       head.dep_:  nummod
0  December 31, 2016 and 2015
       CD tok in date: 2016
       CD tok in date: 2015
Token dep_: dobj
Token head: had
       head.dep_: ROOT
       head.tag_: 'VBD'
       head.pos_: 'VERB'
Token head lefts: 
Token conjuncts: 
[]
employees       head.left.dep_:dobj
,       head.left.dep_:punct
respectively       head.left.dep_:advmod
.       head.left.dep_:punct
Token ancestors: 
had       ancestor.dep_:ROOT
Token children: 
56,400       child.dep_:nummod


In [None]:
#displacy.render(ex7, style='dep', jupyter=True, options={'distance': 110})

In [84]:
print(ex7_emp_tok.head)
print(ex7_emp_tok.head.dep_)
print(ex7_emp_tok.head.head)
print(ex7_emp_tok.head.head.dep_)
print(ex7_emp_tok.head.head.head)
print(ex7_emp_tok.head.head.head.dep_)
print(ex7_emp_tok.head.head.head.pos_)
print(ex7_emp_tok.head.head.head.tag_)
print([c for c in ex7_emp_tok.head.head.head.children])
print([c.dep_ for c in ex7_emp_tok.head.head.head.children])
print([c.tag_ for c in ex7_emp_tok.head.head.head.children])
print([r for r in ex7_emp_tok.head.head.head.subtree if r.tag_== 'CD'])
print([c.ent_iob_ for c in ex7_emp_tok.head.head.head.children])
print([e for e in ex7.ents])
print([e.label_ for e in ex7.ents])
print([e for e in ex7.ents if e.label_ == 'DATE'])

had
ROOT
had
ROOT
had
ROOT
VERB
VBD
[At, ,, we, employees, ,, respectively, .]
['prep', 'punct', 'nsubj', 'dobj', 'punct', 'advmod', 'punct']
['IN', ',', 'PRP', 'NNS', ',', 'RB', '.']
[31, 2016, 2015, 56,400, 66,400]
['', '', '', 'B', '', '', '']
[December 31, 2016 and 2015, approximately 56,400, 66,400, employees]
['DATE', 'CARDINAL', 'CARDINAL', 'EMP_NOUN']
[December 31, 2016 and 2015]


In [791]:
extract_emp_relations(ex7, verbose=True)

Word_id is : 0
Word is : employees
Dep_ of EMP_NOUN is: dobj
Num_toks are: [56,400]
num_tok has conjugate children:[66,400]
num_tok subtree is :[approximately, 56,400, and, 66,400]
years: [(4, 2016), (6, 2015)]
cards: [(11, 56,400), (13, 66,400)]
order_indices: [1, 0]
year_emps: [(2016, 56,400), (2015, 66,400)]
Root is at 1 steps from employees.
(we, had, 56,400, 'Other', 0, employees)


[RelationDetails(sent_num=0, word_num=0, s=we, v=had, quantity=56,400, quantity_type='Other', type_token=0, word=employees)]

#### `At March 31, 2016, 2015 and 2014, we had 3,066, 2,982 and 2,843 employees, respectively. `

Desired output: 
`(we, had, 3,066, 'Other Employees', 0, employees)`

In [591]:
ex12 = nlp("At March 31, 2016, 2015 and 2014, we had 3,066, 2,982 and 2,843 employees, respectively.")

print_df(make_tok_df(ex12))

ex12_emp_tok = ex12[17]

ex12_emp_num_tok = ex12[16]
ex12_emp_num_tok_2 = ex12[14]
ex12_emp_num_tok_3 = ex12[12]

#ex12_emp_tok = ex12[20]
#ex12_emp_num_tok = ex12[19]
#ex12_emp_num_tok_2 = ex12[17]
#ex12_emp_num_tok_3 = ex12[15]

print(ex12_emp_tok)
print(ex12_emp_tok.dep_)
print(ex12_emp_num_tok)
print([c for c in ex12_emp_num_tok_3.conjuncts])

In [593]:
print_doc_info(ex12)

doc is: 
At March 31, 2016, 2015 and 2014, we had 3,066, 2,982 and 2,843 employees, respectively.
--------------------------------------------------
Entities are: 


Unnamed: 0,tok_i,entity,ent_label,root,root_dep,dep_def,root_head,root_head_dep,root_head_pos
0,1,"March 31, 2016, 2015 and 2014",DATE,March,pobj,object of preposition,At,prep,ADP
1,12,3066,CARDINAL,3066,dobj,direct object,had,ROOT,VERB
2,14,2982,CARDINAL,2982,conj,conjunct,3066,dobj,NUM
3,16,2843,CARDINAL,2843,nummod,,employees,conj,NOUN
4,17,employees,EMP_NOUN,employees,conj,conjunct,2982,conj,NUM


--------------------------------------------------
Noun chunks are: 


Unnamed: 0,tok_i,noun_chunk,root,root_ent,root_dep,dep_def,root_head,root_head_dep,root_head_pos
0,1,March,March,DATE,pobj,object of preposition,At,prep,ADP
1,10,we,we,,nsubj,nominal subject,had,ROOT,VERB
2,16,"2,843 employees",employees,EMP_NOUN,conj,conjunct,2982,conj,NUM


--------------------------------------------------
Cardinal entities are: 


Unnamed: 0,tok_ent,toks,lemma,dep,head,h_dep,pos,tag,dep_def,tag_def
0,CARDINAL,3066,3066,dobj,had,ROOT,NUM,CD,direct object,cardinal number
1,CARDINAL,2982,2982,conj,3066,dobj,NUM,CD,conjunct,cardinal number
2,CARDINAL,2843,2843,nummod,employees,conj,NUM,CD,,cardinal number


In [599]:
print('Token children: ')
for w in ex12_emp_num_tok.children:
    print(str(w) + '       child.dep_:' + str(w.dep_))

print('Token rights: ')
for w in ex12_emp_num_tok_3.head.rights:
    print(str(w) + '       right.dep_:' + str(w.dep_))

Token children: 


In [601]:
#[w for w in ex12_emp_tok.head.lefts if w.dep_ == 'nsubj']
print('Token:  ' + str(ex12_emp_tok) )
for c in ex12_emp_tok.children:
    if c.dep_ == 'nummod':
        print("Token child: " + str(c) + "  has dep_ == 'nummod'")
        print("       Token.child.i : " + str(c.i))
        print('       child.dep_: ' + str(c.dep_))
        print("       child.tag_: '"+ str(c.tag_) + "'" )
        print("       child.pos_: '"+ str(c.pos_) + "'" )
        for gc in c.children:
            if gc.dep_ == 'conj':
                print('       Token GC dep_ is conj.')
                print('       Token GC is ' + str(gc))
                if gc.tag_ == 'CD':
                    print("       gc.tag_ == 'CD' ")
                    print("       gc.i : " + str(gc.i))
                print('       Token head is: ' + str(c) + '       head.dep_:  '+ str(c.dep_) )
                for i, d in enumerate(ex12_emp_tok.doc.ents):
                    if d.label_ == 'DATE':
                        print(str(i) + "  " + str(d))
                        for tok in ex12_emp_tok.doc[d.start:d.end + 1]:
                            if tok._.is_year == True:
                                print("       CD tok in date: " + str(tok))
                
if ex12_emp_tok.dep_ == 'conj':
    print('Token dep_ is conj.')
    if ex12_emp_tok.head.tag_ == 'CD':
        print("       token.head.tag_ == 'CD' ")
        print("       token.head.i : " + str(ex12_emp_tok.head.i))
    print('Token head is: ' + str(ex12_emp_tok.head) + '       head.dep_:  '+ str(ex12_emp_tok.head.dep_) )
    for i, d in enumerate(ex12_emp_tok.doc.ents):
        if d.label_ == 'DATE':
            print(str(i) + "  " + str(d))
            for tok in ex12_emp_tok.doc[d.start:d.end + 1]:
                if tok._.is_year == True:
                    print("CD tok in date: " + str(tok))
                    print("CD tok i: " + str(tok.i))
#    if ex12_emp_tok.head.dep_ == ''

print('Token dep_: ' + str(ex12_emp_tok.dep_))
print('Token head: ' + str(ex12_emp_tok.head))
print('       head.dep_: ' + str(ex12_emp_tok.head.dep_))
print("       head.tag_: '"+ str(ex12_emp_tok.head.tag_) + "'" )
print("       head.pos_: '"+ str(ex12_emp_tok.head.pos_) + "'" )
print('Token head lefts: ' )
print("       " + str([c for c in ex12_emp_tok.head.lefts]))
print('Token conjuncts: ' )
print("       " + str([c for c in ex12_emp_tok.conjuncts]))
for w in ex12_emp_tok.head.rights:
    print(str(w) + '       head.right.dep_:' + str(w.dep_))
    if w.dep_ == 'nsubj':
        print(str(w) + ' is nsubj. Subtree is:' )
        [print(x) for x in w.lefts]
        [print(s)for s in w.subtree if s.dep_ == 'poss']
print('Token ancestors: ')
for w in ex12_emp_tok.ancestors:
    print(str(w) + '       ancestor.dep_:' + str(w.dep_))
print('Token children: ')
for w in ex12_emp_tok.children:
    print(str(w) + '       child.dep_:' + str(w.dep_))

Token:  employees
Token child: 2,843  has dep_ == 'nummod'
       Token.child.i : 16
       child.dep_: nummod
       child.tag_: 'CD'
       child.pos_: 'NUM'
Token dep_ is conj.
       token.head.tag_ == 'CD' 
       token.head.i : 14
Token head is: 2,982       head.dep_:  conj
0  March 31, 2016, 2015 and 2014
CD tok in date: 2016
CD tok i: 4
CD tok in date: 2015
CD tok i: 6
CD tok in date: 2014
CD tok i: 8
Token dep_: conj
Token head: 2,982
       head.dep_: conj
       head.tag_: 'CD'
       head.pos_: 'NUM'
Token head lefts: 
       []
Token conjuncts: 
       []
and       head.right.dep_:cc
employees       head.right.dep_:conj
Token ancestors: 
2,982       ancestor.dep_:conj
3,066       ancestor.dep_:dobj
had       ancestor.dep_:ROOT
Token children: 
2,843       child.dep_:nummod


In [None]:
#displacy.render(ex12, style='dep', jupyter=True)

In [603]:
print(ex12_emp_tok.head)
print(ex12_emp_tok.head.dep_)
print(ex12_emp_tok.head.head)
print(ex12_emp_tok.head.head.dep_)
print(ex12_emp_tok.head.head.head)
print(ex12_emp_tok.head.head.head.dep_)
print(ex12_emp_tok.head.head.head.pos_)
print(ex12_emp_tok.head.head.head.tag_)
print([c for c in ex12_emp_tok.head.head.head.children])
print([c.dep_ for c in ex12_emp_tok.head.head.head.children])
print([c.tag_ for c in ex12_emp_tok.head.head.head.children])
print([r for r in ex12_emp_tok.head.head.head.subtree if r.tag_== 'CD'])
print([c.ent_iob_ for c in ex12_emp_tok.head.head.head.children])
print([e for e in ex12.ents])
print([e.label_ for e in ex12.ents])
print([e for e in ex12.ents if e.label_ == 'DATE'])

2,982
conj
3,066
dobj
had
ROOT
VERB
VBD
[At, ,, we, 3,066, respectively, .]
['prep', 'punct', 'nsubj', 'dobj', 'advmod', 'punct']
['IN', ',', 'PRP', 'CD', 'RB', '.']
[31, 2016, 2015, 2014, 3,066, 2,982, 2,843]
['', '', '', 'B', '', '']
[March 31, 2016, 2015 and 2014, 3,066, 2,982, 2,843, employees]
['DATE', 'CARDINAL', 'CARDINAL', 'CARDINAL', 'EMP_NOUN']
[March 31, 2016, 2015 and 2014]


In [604]:
extract_emp_relations(ex12, verbose = True)

Word_id is : 0
Word is : employees
Dep_ of EMP_NOUN is: conj
Num_toks are: [2,843]
Emp_noun token has dep_ == 'conj'.
Child num_tok: 2,843
Head num_tok: [2,982]
child_num_tok and head_num_toks
years: [(4, 2016), (6, 2015), (8, 2014)]
num_toks: [2,843, 2,982]
emp_counts: [(12, 3,066), (14, 2,982), (16, 2,843)]
order_indices: [2, 1, 0]
year_emps: [('2014', '2,843'), ('2015', '2,982'), ('2016', '3,066')]
(we, had, '3,066', 'Other', 0, employees)


[RelationDetails(sent_num=0, word_num=0, s=we, v=had, quantity='3,066', quantity_type='Other', type_token=0, word=employees)]

#### `We had a total of 9,832, 9,058, and 8,806 employees as of December 31, 2016, 2015, and 2014, respectively. `
Desired output: 
`(we, had, 3,066, 'Other Employees', 0, employees)`

In [559]:
ex13 = nlp("We had a total of 9,832, 9,058, and 8,806 employees as of December 31, 2016, 2015, and 2014, respectively.")
#ex13 = nlp("We had 9,832, 9,058, and 8,806 employees as of December 31, 2016, 2015, and 2014, respectively.")

print_df(make_tok_df(ex13))

ex13_emp_tok = ex13[11]

ex13_emp_num_tok = ex13[10]
ex13_emp_num_tok_2 = ex13[7]
ex13_emp_num_tok_3 = ex13[5]

#ex13_emp_tok = ex13[8]
#ex13_emp_num_tok = ex13[7]
#ex13_emp_num_tok_2 = ex13[4]
#ex13_emp_num_tok_3 = ex13[2]

print(ex13_emp_tok)
print(ex13_emp_tok.dep_)
print(ex13_emp_num_tok)
print([c for c in ex13_emp_num_tok_2.conjuncts])

In [522]:
print_doc_info(ex13)

doc is: 
We had 9,832, 9,058, and 8,806 employees as of December 31, 2016, 2015, and 2014, respectively.
--------------------------------------------------
Entities are: 


Unnamed: 0,tok_i,entity,ent_label,root,root_dep,dep_def,root_head,root_head_dep,root_head_pos
0,2,9832,CARDINAL,9832,dobj,direct object,had,ROOT,VERB
1,4,9058,CARDINAL,9058,conj,conjunct,9832,dobj,NUM
2,7,8806,CARDINAL,8806,nummod,,employees,conj,NOUN
3,8,employees,EMP_NOUN,employees,conj,conjunct,9058,conj,NUM
4,11,"December 31, 2016, 2015",DATE,December,pobj,object of preposition,of,prep,ADP
5,19,2014,DATE,2014,conj,conjunct,December,pobj,PROPN


--------------------------------------------------
Noun chunks are: 


Unnamed: 0,tok_i,noun_chunk,root,root_ent,root_dep,dep_def,root_head,root_head_dep,root_head_pos
0,0,We,We,,nsubj,nominal subject,had,ROOT,VERB
1,7,"8,806 employees",employees,EMP_NOUN,conj,conjunct,9058,conj,NUM
2,11,December,December,DATE,pobj,object of preposition,of,prep,ADP


--------------------------------------------------
Cardinal entities are: 


Unnamed: 0,tok_ent,toks,lemma,dep,head,h_dep,pos,tag,dep_def,tag_def
0,CARDINAL,9832,9832,dobj,had,ROOT,NUM,CD,direct object,cardinal number
1,CARDINAL,9058,9058,conj,9832,dobj,NUM,CD,conjunct,cardinal number
2,CARDINAL,8806,8806,nummod,employees,conj,NUM,CD,,cardinal number


In [498]:
print('Token children: ')
for w in ex13_emp_tok.children:
    print(str(w) + '       child.dep_:' + str(w.dep_))

Token children: 
8,806       child.dep_:nummod
as       child.dep_:prep
respectively       child.dep_:advmod


In [644]:
#[w for w in ex13_emp_tok.head.lefts if w.dep_ == 'nsubj']
print('Token:  ' + str(ex13_emp_tok) )
for c in ex13_emp_tok.children:
    if c.dep_ == 'nummod':
        print("Token child: " + str(c) + "  has dep_ == 'nummod'")
        print("       Token.child.i : " + str(c.i))
        print('       child.dep_: ' + str(c.dep_))
        print("       child.tag_: '"+ str(c.tag_) + "'" )
        print("       child.pos_: '"+ str(c.pos_) + "'" )
        for gc in c.children:
            if gc.dep_ == 'conj':
                print('       Token GC dep_ is conj.')
                print('       Token GC is ' + str(gc))
                if gc.tag_ == 'CD':
                    print("       gc.tag_ == 'CD' ")
                    print("       gc.i : " + str(gc.i))
                print('       Token head is: ' + str(c) + '       head.dep_:  '+ str(c.dep_) )
                for i, d in enumerate(ex13_emp_tok.doc.ents):
                    if d.label_ == 'DATE':
                        print(str(i) + "  " + str(d))
                        for tok in ex13_emp_tok.doc[d.start:d.end + 1]:
                            if tok._.is_year == True:
                                print("       CD tok in date: " + str(tok))
                
if ex13_emp_tok.dep_ == 'conj':
    print('Token dep_ is conj.')
    if ex13_emp_tok.head.tag_ == 'CD':
        print("       token.head.tag_ == 'CD' ")
        print("       token.head.i : " + str(ex13_emp_tok.head.i))
    print('Token head is: ' + str(ex13_emp_tok.head) + '       head.dep_:  '+ str(ex13_emp_tok.head.dep_) )
    for i, d in enumerate(ex13_emp_tok.doc.ents):
        if d.label_ == 'DATE':
            print(str(i) + "  " + str(d))
            for tok in ex13_emp_tok.doc[d.start:d.end + 1]:
                if tok._.is_year == True:
                    print("CD tok in date: " + str(tok))
                    print("CD tok i: " + str(tok.i))
#    if ex13_emp_tok.head.dep_ == ''

print('Token dep_: ' + str(ex13_emp_tok.dep_))
print('Token head: ' + str(ex13_emp_tok.head))
print('       head.dep_: ' + str(ex13_emp_tok.head.dep_))
print("       head.tag_: '"+ str(ex13_emp_tok.head.tag_) + "'" )
print("       head.pos_: '"+ str(ex13_emp_tok.head.pos_) + "'" )
print('Token head lefts: ' )
print("       " + str([c for c in ex13_emp_tok.head.lefts]))
print('Token head conjuncts: ' )
print("       " + str([c for c in ex13_emp_tok.head.conjuncts]))
print('Token conjuncts: ' )
print("       " + str([c for c in ex13_emp_tok.conjuncts]))
for w in ex13_emp_tok.head.rights:
    print(str(w) + '       head.left.dep_:' + str(w.dep_))
    if w.dep_ == 'nsubj':
        print(str(w) + ' is nsubj. Subtree is:' )
        [print(x) for x in w.lefts]
        [print(s)for s in w.subtree if s.dep_ == 'poss']
print('Token ancestors: ')
for w in ex13_emp_tok.ancestors:
    print(str(w) + '       ancestor.dep_:' + str(w.dep_))
print('Token children: ')
for w in ex13_emp_tok.children:
    print(str(w) + '       child.dep_:' + str(w.dep_))

Token:  employees
Token child: 8,806  has dep_ == 'nummod'
       Token.child.i : 7
       child.dep_: nummod
       child.tag_: 'CD'
       child.pos_: 'NUM'
Token dep_ is conj.
       token.head.tag_ == 'CD' 
       token.head.i : 4
Token head is: 9,058       head.dep_:  conj
4  December 31, 2016, 2015
CD tok in date: 2016
CD tok i: 14
CD tok in date: 2015
CD tok i: 16
5  2014
CD tok in date: 2014
CD tok i: 19
Token dep_: conj
Token head: 9,058
       head.dep_: conj
       head.tag_: 'CD'
       head.pos_: 'NUM'
Token head lefts: 
       []
Token head conjuncts: 
       []
Token conjuncts: 
       []
,       head.left.dep_:punct
and       head.left.dep_:cc
employees       head.left.dep_:conj
Token ancestors: 
9,058       ancestor.dep_:conj
9,832       ancestor.dep_:dobj
had       ancestor.dep_:ROOT
Token children: 
8,806       child.dep_:nummod
as       child.dep_:prep
respectively       child.dep_:advmod


In [None]:
#displacy.render(ex13, style='dep', jupyter=True)

In [503]:
print(ex13_emp_tok.head)
print(ex13_emp_tok.head.dep_)
print(ex13_emp_tok.head.head)
print(ex13_emp_tok.head.head.dep_)
print(ex13_emp_tok.head.head.head)
print(ex13_emp_tok.head.head.head.dep_)
print(ex13_emp_tok.head.head.head.pos_)
print(ex13_emp_tok.head.head.head.tag_)
print([c for c in ex13_emp_tok.head.head.head.children])
print([c.dep_ for c in ex13_emp_tok.head.head.head.children])
print([c.tag_ for c in ex13_emp_tok.head.head.head.children])
print([r for r in ex13_emp_tok.head.head.head.subtree if r.tag_== 'CD'])
print([c.ent_iob_ for c in ex13_emp_tok.head.head.head.children])
print([e for e in ex13.ents])
print([e.label_ for e in ex13.ents])
print([e for e in ex13.ents if e.label_ == 'DATE'])

9,058
appos
9,832
pobj
of
prep
ADP
IN
[9,832]
['pobj']
['CD']
[9,832, 9,058, 8,806, 31, 2016, 2015, 2014]
['B']
[9,832, 9,058, 8,806, employees, December 31, 2016, 2015, 2014]
['CARDINAL', 'CARDINAL', 'CARDINAL', 'EMP_NOUN', 'DATE', 'DATE']
[December 31, 2016, 2015, 2014]


In [621]:
extract_emp_relations(ex13, verbose = True)

Word_id is : 0
Word is : employees
Dep_ of EMP_NOUN is: conj
Num_toks are: [8,806]
Emp_noun token has dep_ == 'conj'.
Child num_tok: 8,806
Head num_tok: [9,058]
child_num_tok and head_num_toks
years: [(17, 2016), (19, 2015), (22, 2014)]
num_toks: [8,806, 9,058]
emp_counts: [(5, 9,832), (7, 9,058), (10, 8,806)]
order_indices: [2, 1, 0]
year_emps: [('2014', '8,806'), ('2015', '9,058'), ('2016', '9,832')]
(We, had, '9,832', 'Other', 0, employees)


[RelationDetails(sent_num=0, word_num=0, s=We, v=had, quantity='9,832', quantity_type='Other', type_token=0, word=employees)]

#### `We had 17,912, 14,533 and 10,625 employees as of December 31, 2014, 2015 and 2016, respectively.`  
Desired output: 
`(we, had, 3,066, 'Other Employees', 0, employees)`

In [622]:
ex14 = nlp("We had 17,912, 14,533 and 10,625 employees as of December 31, 2014, 2015 and 2016, respectively.")

print_df(make_tok_df(ex14))

ex14_emp_tok = ex14[7]
ex14_emp_num_tok = ex14[6]
ex14_emp_num_tok_2 = ex14[4]
ex14_emp_num_tok_3 = ex14[2]

print(ex14_emp_tok)
print(ex14_emp_tok.dep_)
print(ex14_emp_num_tok)
print([c for c in ex14_emp_num_tok_3.conjuncts])

In [624]:
print_doc_info(ex14)

doc is: 
We had 17,912, 14,533 and 10,625 employees as of December 31, 2014, 2015 and 2016, respectively.
--------------------------------------------------
Entities are: 


Unnamed: 0,tok_i,entity,ent_label,root,root_dep,dep_def,root_head,root_head_dep,root_head_pos
0,2,17912,CARDINAL,17912,dobj,direct object,had,ROOT,VERB
1,4,14533,CARDINAL,14533,conj,conjunct,17912,dobj,NUM
2,6,10625,CARDINAL,10625,nummod,,employees,conj,NOUN
3,7,employees,EMP_NOUN,employees,conj,conjunct,17912,dobj,NUM
4,10,"December 31, 2014, 2015 and 2016",DATE,December,pobj,object of preposition,of,prep,ADP


--------------------------------------------------
Noun chunks are: 


Unnamed: 0,tok_i,noun_chunk,root,root_ent,root_dep,dep_def,root_head,root_head_dep,root_head_pos
0,0,We,We,,nsubj,nominal subject,had,ROOT,VERB
1,6,"10,625 employees",employees,EMP_NOUN,conj,conjunct,17912,dobj,NUM
2,10,December,December,DATE,pobj,object of preposition,of,prep,ADP


--------------------------------------------------
Cardinal entities are: 


Unnamed: 0,tok_ent,toks,lemma,dep,head,h_dep,pos,tag,dep_def,tag_def
0,CARDINAL,17912,17912,dobj,had,ROOT,NUM,CD,direct object,cardinal number
1,CARDINAL,14533,14533,conj,17912,dobj,NUM,CD,conjunct,cardinal number
2,CARDINAL,10625,10625,nummod,employees,conj,NUM,CD,,cardinal number


In [640]:
print('Token children: ')
for w in ex14_emp_tok.children:
    print(str(w) + '       child.dep_:' + str(w.dep_))

Token children: 
10,625       child.dep_:nummod
as       child.dep_:prep


In [664]:
#[w for w in ex14_emp_tok.head.lefts if w.dep_ == 'nsubj']
print('Token:  ' + str(ex14_emp_tok) )
for c in ex14_emp_tok.children:
    if c.dep_ == 'nummod':
        print("Token child: " + str(c) + "  has dep_ == 'nummod'")
        print("       Token.child.i : " + str(c.i))
        print('       child.dep_: ' + str(c.dep_))
        print("       child.tag_: '"+ str(c.tag_) + "'" )
        print("       child.pos_: '"+ str(c.pos_) + "'" )
        for gc in c.children:
            if gc.dep_ == 'conj':
                print('       Token GC dep_ is conj.')
                print('       Token GC is ' + str(gc))
                if gc.tag_ == 'CD':
                    print("       gc.tag_ == 'CD' ")
                    print("       gc.i : " + str(gc.i))
                print('       Token head is: ' + str(c) + '       head.dep_:  '+ str(c.dep_) )
                for i, d in enumerate(ex14_emp_tok.doc.ents):
                    if d.label_ == 'DATE':
                        print(str(i) + "  " + str(d))
                        for tok in ex14_emp_tok.doc[d.start:d.end + 1]:
                            if tok._.is_year == True:
                                print("       CD tok in date: " + str(tok))
                
if ex14_emp_tok.dep_ == 'conj':
    print('Token dep_ is conj.')
    if ex14_emp_tok.head.tag_ == 'CD':
        print("       token.head.tag_ == 'CD' ")
        print("       token.head.i : " + str(ex14_emp_tok.head.i))
    print('Token head is: ' + str(ex14_emp_tok.head) + '       head.dep_:  '+ str(ex14_emp_tok.head.dep_) )
    for i, d in enumerate(ex14_emp_tok.doc.ents):
        if d.label_ == 'DATE':
            print(str(i) + "  " + str(d))
            for tok in ex14_emp_tok.doc[d.start:d.end + 1]:
                if tok._.is_year == True:
                    print("CD tok in date: " + str(tok))
                    print("CD tok i: " + str(tok.i))
#    if ex14_emp_tok.head.dep_ == ''

print('Token dep_: ' + str(ex14_emp_tok.dep_))
print('Token head: ' + str(ex14_emp_tok.head))
print('       head.dep_: ' + str(ex14_emp_tok.head.dep_))
print("       head.tag_: '"+ str(ex14_emp_tok.head.tag_) + "'" )
print("       head.pos_: '"+ str(ex14_emp_tok.head.pos_) + "'" )
print('Token head lefts: ' )
print("       " + str([c for c in ex14_emp_tok.head.lefts]))
print('Token head conjuncts: ' )
print("       " + str([c for c in ex14_emp_tok.head.conjuncts if c.ent_type_ == 'CARDINAL']))
print("       " + str([c for c in ex14_emp_tok.head.conjuncts if c.ent_type == 'CARDINAL']))
print('Token conjuncts: ' )
print("       " + str([c for c in ex14_emp_tok.conjuncts]))
for w in ex14_emp_tok.head.rights:
    print(str(w) + '       head.right.dep_:' + str(w.dep_))
    if w.dep_ == 'nsubj':
        print(str(w) + ' is nsubj. Subtree is:' )
        [print(x) for x in w.lefts]
        [print(s)for s in w.subtree if s.dep_ == 'poss']
print('Token ancestors: ')
for w in ex14_emp_tok.ancestors:
    print(str(w) + '       ancestor.dep_:' + str(w.dep_))
print('Token children: ')
for w in ex14_emp_tok.children:
    print(str(w) + '       child.dep_:' + str(w.dep_))

Token:  employees
Token child: 10,625  has dep_ == 'nummod'
       Token.child.i : 6
       child.dep_: nummod
       child.tag_: 'CD'
       child.pos_: 'NUM'
Token dep_ is conj.
       token.head.tag_ == 'CD' 
       token.head.i : 2
Token head is: 17,912       head.dep_:  dobj
4  December 31, 2014, 2015 and 2016
CD tok in date: 2014
CD tok i: 13
CD tok in date: 2015
CD tok i: 15
CD tok in date: 2016
CD tok i: 17
Token dep_: conj
Token head: 17,912
       head.dep_: dobj
       head.tag_: 'CD'
       head.pos_: 'NUM'
Token head lefts: 
       []
Token head conjuncts: 
       [14,533]
       []
Token conjuncts: 
       []
,       head.right.dep_:punct
14,533       head.right.dep_:conj
employees       head.right.dep_:conj
Token ancestors: 
17,912       ancestor.dep_:dobj
had       ancestor.dep_:ROOT
Token children: 
10,625       child.dep_:nummod
as       child.dep_:prep


In [None]:
#displacy.render(ex14, style='dep', jupyter=True)

In [667]:
print(ex14_emp_tok.head)
print(ex14_emp_tok.head.dep_)
print(ex14_emp_tok.head.head)
print(ex14_emp_tok.head.head.dep_)
print(ex14_emp_tok.head.head.head)
print(ex14_emp_tok.head.head.head.dep_)
print(ex14_emp_tok.head.head.head.pos_)
print(ex14_emp_tok.head.head.head.tag_)
print([c for c in ex14_emp_tok.head.head.head.children])
print([c.dep_ for c in ex14_emp_tok.head.head.head.children])
print([c.tag_ for c in ex14_emp_tok.head.head.head.children])
print([r for r in ex14_emp_tok.head.head.head.subtree if r.tag_== 'CD'])
print([c.ent_iob_ for c in ex14_emp_tok.head.head.head.children])
print([e for e in ex14.ents])
print([e.label_ for e in ex14.ents])
print([e for e in ex14.ents if e.label_ == 'DATE'])

17,912
dobj
had
ROOT
had
ROOT
VERB
VBD
[We, 17,912, respectively, .]
['nsubj', 'dobj', 'advmod', 'punct']
['PRP', 'CD', 'RB', '.']
[17,912, 14,533, 10,625, 31, 2014, 2015, 2016]
['', 'B', '', '']
[17,912, 14,533, 10,625, employees, December 31, 2014, 2015 and 2016]
['CARDINAL', 'CARDINAL', 'CARDINAL', 'EMP_NOUN', 'DATE']
[December 31, 2014, 2015 and 2016]


In [666]:
extract_emp_relations(ex14, verbose = True)

Word_id is : 0
Word is : employees
Dep_ of EMP_NOUN is: conj
Num_toks are: [10,625]
Emp_noun token has dep_ == 'conj'.
Child num_tok: 10,625
Head num_tok: [17,912]
child_num_tok and head_num_toks
years: [(13, 2014), (15, 2015), (17, 2016)]
num_toks: [10,625, 17,912]
emp_counts: [(2, 17,912), (4, 14,533), (6, 10,625)]
order_indices: [2, 1, 0]
year_emps: [('2016', '10,625'), ('2015', '14,533'), ('2014', '17,912')]
(We, had, '10,625', 'Other', 0, employees)


[RelationDetails(sent_num=0, word_num=0, s=We, v=had, quantity='10,625', quantity_type='Other', type_token=0, word=employees)]

### Dealing with units

#### `The number of regular employees was 71.1 thousand, 73.5 thousand, and 75.3 thousand at years ended 2016, 2015 and 2014, respectively.`

Example text:  

`"The number of regular employees was 71.1 thousand, 73.5 thousand, and 75.3 thousand at years ended 2016, 2015 and 2014, respectively."`

In [66]:
thousands_sent_doc = nlp("The number of regular employees was 71.1 thousand, 73.5 thousand, and 75.3 thousand at years ended 2016, 2015 and 2014, respectively.")

In [67]:
print_df(make_tok_df(thousands_sent_doc))

Unnamed: 0,tok_ent,toks,lemma,dep,head,h_dep,pos,tag,dep_def,tag_def
0,,The,the,det,number,nsubj,DET,DT,determiner,determiner
1,,number,number,nsubj,was,ROOT,NOUN,NN,nominal subject,"noun, singular or mass"
2,,of,of,prep,number,nsubj,ADP,IN,prepositional modifier,"conjunction, subordinating or preposition"
3,,regular,regular,amod,employees,pobj,ADJ,JJ,adjectival modifier,adjective
4,EMP_NOUN,employees,employee,pobj,of,prep,NOUN,NNS,object of preposition,"noun, plural"
5,,was,be,ROOT,was,ROOT,VERB,VBD,,"verb, past tense"
6,CARDINAL,71.1,71.1,compound,thousand,attr,NUM,CD,,cardinal number
7,CARDINAL,thousand,thousand,attr,was,ROOT,NUM,CD,attribute,cardinal number
8,,",",",",punct,thousand,attr,PUNCT,",",punctuation,"punctuation mark, comma"
9,CARDINAL,73.5,73.5,compound,thousand,appos,NUM,CD,,cardinal number


In [68]:
list(list(thousands_sent_doc[4].head.head.head.rights)[0].subtree)

[71.1, thousand, ,, 73.5, thousand, ,, and, 75.3, thousand]

In [69]:
[r for r in thousands_sent_doc[4].head.head.head.lefts]

[number]

In [70]:
print_doc_info(thousands_sent_doc)

doc is: 
The number of regular employees was 71.1 thousand, 73.5 thousand, and 75.3 thousand at years ended 2016, 2015 and 2014, respectively.
--------------------------------------------------
Entities are: 


Unnamed: 0,tok_i,entity,ent_label,root,root_dep,dep_def,root_head,root_head_dep,root_head_pos
0,4,employees,EMP_NOUN,employees,pobj,object of preposition,of,prep,ADP
1,6,71.1 thousand,CARDINAL,thousand,attr,attribute,was,ROOT,VERB
2,9,73.5 thousand,CARDINAL,thousand,appos,appositional modifier,thousand,attr,NUM
3,13,75.3 thousand,CARDINAL,thousand,conj,conjunct,thousand,attr,NUM
4,16,years ended 2016,DATE,ended,advcl,adverbial clause modifier,was,ROOT,VERB
5,20,2015,DATE,2015,conj,conjunct,2016,npadvmod,NUM
6,22,2014,DATE,2014,conj,conjunct,2015,conj,NUM


--------------------------------------------------
Noun chunks are: 


Unnamed: 0,tok_i,noun_chunk,root,root_ent,root_dep,dep_def,root_head,root_head_dep,root_head_pos
0,0,The number,number,,nsubj,nominal subject,was,ROOT,VERB
1,3,regular employees,employees,EMP_NOUN,pobj,object of preposition,of,prep,ADP
2,16,years,years,DATE,pobj,object of preposition,at,prep,ADP


--------------------------------------------------
Cardinal entities are: 


Unnamed: 0,tok_ent,toks,lemma,dep,head,h_dep,pos,tag,dep_def,tag_def
0,CARDINAL,71.1,71.1,compound,thousand,attr,NUM,CD,,cardinal number
1,CARDINAL,thousand,thousand,attr,was,ROOT,NUM,CD,attribute,cardinal number
2,CARDINAL,73.5,73.5,compound,thousand,appos,NUM,CD,,cardinal number
3,CARDINAL,thousand,thousand,appos,thousand,attr,NUM,CD,appositional modifier,cardinal number
4,CARDINAL,75.3,75.3,compound,thousand,conj,NUM,CD,,cardinal number
5,CARDINAL,thousand,thousand,conj,thousand,attr,NUM,CD,conjunct,cardinal number


In [71]:
nsub = thousands_sent_doc[1]

thousands_sent_doc[nsub.left_edge.i : nsub.right_edge.i + 1]

In [77]:
year_tok = thousands_sent_doc[18]
year_tok._.is_year

True

In [79]:
thousand_tok = thousands_sent_doc[7]
thousand_tok._.is_num_word

True

In [92]:
displacy.render(thousands_sent_doc, style='ent', jupyter=True)

In [96]:
displacy.render(thousands_sent_doc, jupyter=True)

In [241]:
extract_emp_relations(thousands_sent_doc)

Dep_ of EMP_NOUN is: pobj
year_emps: [('2014', '75.3 thousand'), ('2015', '73.5 thousand'), ('2016', '71.1 thousand')]
max year_emps: ('2016', '71.1 thousand')
Sentence has multiple years:[(18, 2016), (20, 2015), (22, 2014)]
First card subtree is :[71.1, thousand, ,, 73.5, thousand, ,, and, 75.3, thousand]
years: [(18, 2016), (20, 2015), (22, 2014)]
cards: [71.1 thousand, 73.5 thousand, 75.3 thousand]
emp_counts: [(6, 71.1 thousand), (9, 73.5 thousand), (13, 75.3 thousand)]
Root is at 2 steps from employees.
(The number of regular employees, was, '71.1 thousand', 'Other', regular, employees)


[[(0,
   0,
   The number of regular employees,
   was,
   '71.1 thousand',
   'Other',
   regular,
   employees)]]

### Identifying full-time, part-time, etc.

#### ```As of February 23, 2017, we employed approximately 41,000 full-time Team Members and approximately 33,000 part-time Team Members.```

Example text:

`As of February 23, 2017, we employed approximately 41,000 full-time Team Members and approximately 33,000 part-time Team Members.`  

Desired output: 

`(we, employed, 41,000, full-time Team Members)`   
`(we, employed, 33,000, part-time Team Members)`

In [170]:
ex8 = nlp("As of February 23, 2017, we employed approximately 41,000 full-time Team Members and approximately 33,000 part-time Team Members.")

print_df(make_tok_df(ex8))

Unnamed: 0,tok_ent,toks,lemma,dep,head,h_dep,pos,tag,dep_def,tag_def
0,,As,as,prep,employed,ROOT,ADP,IN,prepositional modifier,"conjunction, subordinating or preposition"
1,,of,of,prep,As,prep,ADP,IN,prepositional modifier,"conjunction, subordinating or preposition"
2,DATE,February,february,pobj,of,prep,PROPN,NNP,object of preposition,"noun, proper singular"
3,DATE,23,23,nummod,February,pobj,NUM,CD,,cardinal number
4,DATE,",",",",punct,February,pobj,PUNCT,",",punctuation,"punctuation mark, comma"
5,DATE,2017,2017,nummod,February,pobj,NUM,CD,,cardinal number
6,,",",",",punct,employed,ROOT,PUNCT,",",punctuation,"punctuation mark, comma"
7,,we,-PRON-,nsubj,employed,ROOT,PRON,PRP,nominal subject,"pronoun, personal"
8,,employed,employ,ROOT,employed,ROOT,VERB,VBD,,"verb, past tense"
9,CARDINAL,approximately,approximately,advmod,41000,nummod,ADV,RB,adverbial modifier,adverb


In [201]:
print_doc_info(ex8)

doc is: 
As of February 23, 2017, we employed approximately 41,000 full-time Team Members and approximately 33,000 part-time Team Members.
--------------------------------------------------
Entities are: 


Unnamed: 0,tok_i,entity,ent_label,root,root_dep,dep_def,root_head,root_head_dep,root_head_pos
0,2,"February 23, 2017",DATE,February,pobj,object of preposition,of,prep,ADP
1,9,"approximately 41,000",CARDINAL,41000,nummod,,Team Members,dobj,PROPN
2,11,full-time,FULL_TIME,full-time,compound,,Team Members,dobj,PROPN
3,12,Team Members,EMP_NOUN,Team Members,dobj,direct object,employed,ROOT,VERB
4,14,"approximately 33,000",CARDINAL,33000,nummod,,Team Members,conj,PROPN
5,16,part-time,PART_TIME,part-time,compound,,Team Members,conj,PROPN
6,17,Team Members,EMP_NOUN,Team Members,conj,conjunct,Team Members,dobj,PROPN


--------------------------------------------------
Noun chunks are: 


Unnamed: 0,tok_i,noun_chunk,root,root_ent,root_dep,dep_def,root_head,root_head_dep,root_head_pos
0,2,February,February,DATE,pobj,object of preposition,of,prep,ADP
1,7,we,we,,nsubj,nominal subject,employed,ROOT,VERB
2,9,"approximately 41,000 full-time Team Members",Team Members,EMP_NOUN,dobj,direct object,employed,ROOT,VERB
3,14,"approximately 33,000 part-time Team Members",Team Members,EMP_NOUN,conj,conjunct,Team Members,dobj,PROPN


--------------------------------------------------
Cardinal entities are: 


Unnamed: 0,tok_ent,toks,lemma,dep,head,h_dep,pos,tag,dep_def,tag_def
0,CARDINAL,approximately,approximately,advmod,41000,nummod,ADV,RB,adverbial modifier,adverb
1,CARDINAL,41000,41000,nummod,Team Members,dobj,NUM,CD,,cardinal number
2,CARDINAL,approximately,approximately,advmod,33000,nummod,ADV,RB,adverbial modifier,adverb
3,CARDINAL,33000,33000,nummod,Team Members,conj,NUM,CD,,cardinal number


In [171]:
ex8_emp_tok = ex8[12]
ex8_emp_num_tok = ex8[10]
ex8_emp_tok_2 = ex8[17]

print(ex8[ex8_emp_tok.i - 1])
print(check_emp_type_flags(find_emp_type_tok(ex8_emp_tok)))
print([t for t in ex8_emp_tok.children if t._.is_emp_type == True])
print([t for t in ex8_emp_tok_2.children if t._.is_emp_type == True][0].pos_)
print(ex8_emp_tok.dep_)
print(ex8_emp_num_tok)
print([c for c in ex8_emp_tok.conjuncts])

full-time
Full-Time Employees
[full-time]
ADJ
dobj
41,000
[Team Members]


In [109]:
print('Token children: ')
for w in ex8_emp_tok.children:
    print(str(w) + '       child.dep_:' + str(w.dep_))

Token children: 
41,000       child.dep_:nummod
full-time       child.dep_:compound
and       child.dep_:cc
Team Members       child.dep_:conj


In [172]:
#[w for w in ex8_emp_tok.head.lefts if w.dep_ == 'nsubj']
print('Token:  ' + str(ex8_emp_tok) )
for c in ex8_emp_tok.children:
    if c.dep_ == 'nummod':
        print("Token child: " + str(c) + "  has dep_ == 'nummod'")
        print("       Token.child.i : " + str(c.i))
        print('       child.dep_: ' + str(c.dep_))
        print("       child.tag_: '"+ str(c.tag_) + "'" )
        print("       child.pos_: '"+ str(c.pos_) + "'" )
        for gc in c.children:
            if gc.dep_ == 'conj':
                print('       Token GC dep_ is conj.')
                print('       Token GC is ' + str(gc))
                if gc.tag_ == 'CD':
                    print("       gc.tag_ == 'CD' ")
                    print("       gc.i : " + str(gc.i))
                print('       Token head is: ' + str(c) + '       head.dep_:  '+ str(c.dep_) )
                for i, d in enumerate(ex8_emp_tok.doc.ents):
                    if d.label_ == 'DATE':
                        print(str(i) + "  " + str(d))
                        for tok in ex8_emp_tok.doc[d.start:d.end + 1]:
                            if tok._.is_year == True:
                                print("       CD tok in date: " + str(tok))
                
if ex8_emp_tok.dep_ == 'conj':
    print('Token dep_ is conj.')
    if ex8_emp_tok.head.tag_ == 'CD':
        print("       token.head.tag_ == 'CD' ")
        print("       token.head.i : " + str(ex8_emp_tok.head.i))
    print('Token head is: ' + str(ex8_emp_tok.head) + '       head.dep_:  '+ str(ex8_emp_tok.head.dep_) )
    for i, d in enumerate(ex8_emp_tok.doc.ents):
        if d.label_ == 'DATE':
            print(str(i) + "  " + str(d))
            for tok in ex8_emp_tok.doc[d.start:d.end + 1]:
                if tok._.is_year == True:
                    print("CD tok in date: " + str(tok))
                    print("CD tok i: " + str(tok.i))
#    if ex8_emp_tok.head.dep_ == ''

print('Token dep_: ' + str(ex8_emp_tok.dep_))
print('Token head: ' + str(ex8_emp_tok.head))
print('       head.dep_: ' + str(ex8_emp_tok.head.dep_))
print("       head.tag_: '"+ str(ex8_emp_tok.head.tag_) + "'" )
print("       head.pos_: '"+ str(ex8_emp_tok.head.pos_) + "'" )
print('Token head lefts: ' )
print('Token conjuncts: ' )
print(str([c for c in ex8_emp_tok.conjuncts]))
for w in ex8_emp_tok.head.rights:
    print(str(w) + '       head.right.dep_:' + str(w.dep_))
    if w.dep_ == 'nsubj':
        print(str(w) + ' is nsubj. Subtree is:' )
        [print(x) for x in w.lefts]
        [print(s)for s in w.subtree if s.dep_ == 'poss']
print('Token ancestors: ')
for w in ex8_emp_tok.ancestors:
    print(str(w) + '       ancestor.dep_:' + str(w.dep_))
print('Token children: ')
for w in ex8_emp_tok.children:
    print(str(w) + '       child.dep_:' + str(w.dep_))

Token:  Team Members
Token child: 41,000  has dep_ == 'nummod'
       Token.child.i : 10
       child.dep_: nummod
       child.tag_: 'CD'
       child.pos_: 'NUM'
Token dep_: dobj
Token head: employed
       head.dep_: ROOT
       head.tag_: 'VBD'
       head.pos_: 'VERB'
Token head lefts: 
Token conjuncts: 
[Team Members]
Team Members       head.right.dep_:dobj
.       head.right.dep_:punct
Token ancestors: 
employed       ancestor.dep_:ROOT
Token children: 
41,000       child.dep_:nummod
full-time       child.dep_:compound
and       child.dep_:cc
Team Members       child.dep_:conj


In [None]:
displacy.render(ex8, style='dep', jupyter=True, options={'distance': 110})

In [112]:
displacy.render(ex8, style='ent', jupyter=True, options={'distance': 110})

In [202]:
extract_emp_relations(ex8)

[RelationDetails(sent_num=0, word_num=0, s=we, v=employed, quantity=41,000, quantity_type='Full-Time Employees', type_token=full-time, word=Team Members, sentence=As of February 23, 2017, we employed approximately 41,000 full-time Team Members and approximately 33,000 part-time Team Members.),
 RelationDetails(sent_num=0, word_num=1, s=we, v=employed, quantity=33,000, quantity_type='Part-Time Employees', type_token=part-time, word=Team Members, sentence=As of February 23, 2017, we employed approximately 41,000 full-time Team Members and approximately 33,000 part-time Team Members.)]

#### ```As of September 30, 2016, we had approximately 19,000 employees, of which approximately 18,000 were full-time employees.```

Example text:

`As of September 30, 2016, we had approximately 19,000 employees, of which approximately 18,000 were full-time employees.`  

Desired output: 

`(we, had, 19,000, (0, 'Other') ,employees)`  
`(we, had, 18,000, ('full-time', 'Full-Time') ,employees)`  

In [116]:
ex9 = nlp("As of September 30, 2016, we had approximately 19,000 employees, of which approximately 18,000 were full-time employees.")

In [117]:
print_df(make_tok_df(ex9))

Unnamed: 0,tok_ent,toks,lemma,dep,head,h_dep,pos,tag,dep_def,tag_def
0,,As,as,prep,had,ROOT,ADP,IN,prepositional modifier,"conjunction, subordinating or preposition"
1,,of,of,prep,As,prep,ADP,IN,prepositional modifier,"conjunction, subordinating or preposition"
2,DATE,September,september,pobj,of,prep,PROPN,NNP,object of preposition,"noun, proper singular"
3,DATE,30,30,nummod,September,pobj,NUM,CD,,cardinal number
4,DATE,",",",",punct,September,pobj,PUNCT,",",punctuation,"punctuation mark, comma"
5,DATE,2016,2016,nummod,September,pobj,NUM,CD,,cardinal number
6,,",",",",punct,had,ROOT,PUNCT,",",punctuation,"punctuation mark, comma"
7,,we,-PRON-,nsubj,had,ROOT,PRON,PRP,nominal subject,"pronoun, personal"
8,,had,have,ROOT,had,ROOT,VERB,VBD,,"verb, past tense"
9,CARDINAL,approximately,approximately,advmod,19000,nummod,ADV,RB,adverbial modifier,adverb


In [118]:
print_doc_info(ex9)

doc is: 
As of September 30, 2016, we had approximately 19,000 employees, of which approximately 18,000 were full-time employees.
--------------------------------------------------
Entities are: 


Unnamed: 0,tok_i,entity,ent_label,root,root_dep,dep_def,root_head,root_head_dep,root_head_pos
0,2,"September 30, 2016",DATE,September,pobj,object of preposition,of,prep,ADP
1,9,"approximately 19,000",CARDINAL,19000,nummod,,employees,dobj,NOUN
2,11,employees,EMP_NOUN,employees,dobj,direct object,had,ROOT,VERB
3,15,"approximately 18,000",CARDINAL,18000,nsubj,nominal subject,were,relcl,VERB
4,18,full-time,FULL_TIME,full-time,compound,,employees,attr,NOUN
5,19,employees,EMP_NOUN,employees,attr,attribute,were,relcl,VERB


--------------------------------------------------
Noun chunks are: 


Unnamed: 0,tok_i,noun_chunk,root,root_ent,root_dep,dep_def,root_head,root_head_dep,root_head_pos
0,2,September,September,DATE,pobj,object of preposition,of,prep,ADP
1,7,we,we,,nsubj,nominal subject,had,ROOT,VERB
2,9,"approximately 19,000 employees",employees,EMP_NOUN,dobj,direct object,had,ROOT,VERB
3,18,full-time employees,employees,EMP_NOUN,attr,attribute,were,relcl,VERB


--------------------------------------------------
Cardinal entities are: 


Unnamed: 0,tok_ent,toks,lemma,dep,head,h_dep,pos,tag,dep_def,tag_def
0,CARDINAL,approximately,approximately,advmod,19000,nummod,ADV,RB,adverbial modifier,adverb
1,CARDINAL,19000,19000,nummod,employees,dobj,NUM,CD,,cardinal number
2,CARDINAL,approximately,approximately,advmod,18000,nsubj,ADV,RB,adverbial modifier,adverb
3,CARDINAL,18000,18000,nsubj,were,relcl,NUM,CD,nominal subject,cardinal number


In [119]:
ex9_emp_tok = ex9[11]
ex9_emp_num_tok = ex9[16]
ex9_emp_tok_2 = ex9[19]

In [120]:
ex9[ex9_emp_tok.i - 1]

19,000

In [121]:
check_emp_type_flags(find_emp_type_tok(ex9_emp_tok_2))

'Full-Time'

In [122]:
[t for t in ex9_emp_tok.children if t._.is_emp_type == True]

[]

In [123]:
[t for t in ex9_emp_tok_2.children if t._.is_emp_type == True]

[full-time]

In [124]:
ex9_emp_tok.dep_

'dobj'

In [125]:
ex9_emp_num_tok

18,000

In [126]:
[c for c in ex9_emp_tok.conjuncts]

[]

In [127]:
print('Token children: ')
for w in ex9_emp_tok.children:
    print(str(w) + '       child.dep_:' + str(w.dep_))

Token children: 
19,000       child.dep_:nummod
,       child.dep_:punct
were       child.dep_:relcl


In [128]:
#[w for w in ex9_emp_tok.head.lefts if w.dep_ == 'nsubj']
print('Token:  ' + str(ex9_emp_tok_2) )
for c in ex9_emp_tok_2.children:
    if c.dep_ == 'nummod':
        print("Token child: " + str(c) + "  has dep_ == 'nummod'")
        print("       Token.child.i : " + str(c.i))
        print('       child.dep_: ' + str(c.dep_))
        print("       child.tag_: '"+ str(c.tag_) + "'" )
        print("       child.pos_: '"+ str(c.pos_) + "'" )
        for gc in c.children:
            if gc.dep_ == 'conj':
                print('       Token GC dep_ is conj.')
                print('       Token GC is ' + str(gc))
                if gc.tag_ == 'CD':
                    print("       gc.tag_ == 'CD' ")
                    print("       gc.i : " + str(gc.i))
                print('       Token head is: ' + str(c) + '       head.dep_:  '+ str(c.dep_) )
                for i, d in enumerate(ex9_emp_tok.doc.ents):
                    if d.label_ == 'DATE':
                        print(str(i) + "  " + str(d))
                        for tok in ex9_emp_tok.doc[d.start:d.end + 1]:
                            if tok._.is_year == True:
                                print("       CD tok in date: " + str(tok))
                
if ex9_emp_tok_2.dep_ == 'conj':
    print('Token dep_ is conj.')
    if ex9_emp_tok_2.head.tag_ == 'CD':
        print("       token.head.tag_ == 'CD' ")
        print("       token.head.i : " + str(ex9_emp_tok.head.i))
    print('Token head is: ' + str(ex9_emp_tok.head) + '       head.dep_:  '+ str(ex9_emp_tok.head.dep_) )
    for i, d in enumerate(ex9_emp_tok.doc.ents):
        if d.label_ == 'DATE':
            print(str(i) + "  " + str(d))
            for tok in ex9_emp_tok_2.doc[d.start:d.end + 1]:
                if tok._.is_year == True:
                    print("CD tok in date: " + str(tok))
                    print("CD tok i: " + str(tok.i))
#    if ex9_emp_tok.head.dep_ == ''

print('Token dep_: ' + str(ex9_emp_tok_2.dep_))
print('Token head: ' + str(ex9_emp_tok_2.head))
print('       head.dep_: ' + str(ex9_emp_tok_2.head.dep_))
print("       head.tag_: '"+ str(ex9_emp_tok_2.head.tag_) + "'" )
print("       head.pos_: '"+ str(ex9_emp_tok_2.head.pos_) + "'" )
print('Token head lefts: ' )
print(str([c for c in ex9_emp_tok_2.head.lefts]))
print('Token conjuncts: ' )
print(str([c for c in ex9_emp_tok.conjuncts]))
for w in ex9_emp_tok.head.rights:
    print(str(w) + '       head.right.dep_:' + str(w.dep_))
    if w.dep_ == 'nsubj':
        print(str(w) + ' is nsubj. Subtree is:' )
        [print(x) for x in w.lefts]
        [print(s)for s in w.subtree if s.dep_ == 'poss']
print('Token ancestors: ')
for w in ex9_emp_tok_2.ancestors:
    print(str(w) + '       ancestor.dep_:' + str(w.dep_))
print('Token children: ')
for w in ex9_emp_tok.children:
    print(str(w) + '       child.dep_:' + str(w.dep_))

Token:  employees
Token dep_: attr
Token head: were
       head.dep_: relcl
       head.tag_: 'VBD'
       head.pos_: 'VERB'
Token head lefts: 
[18,000]
Token conjuncts: 
[]
employees       head.right.dep_:dobj
.       head.right.dep_:punct
Token ancestors: 
were       ancestor.dep_:relcl
employees       ancestor.dep_:dobj
had       ancestor.dep_:ROOT
Token children: 
19,000       child.dep_:nummod
,       child.dep_:punct
were       child.dep_:relcl


In [129]:
displacy.render(ex9, style='dep', jupyter=True, options={'distance': 110})

In [130]:
displacy.render(ex9, style='ent', jupyter=True, options={'distance': 110})

In [131]:
list(ex9[19].head.lefts)

[18,000]

In [132]:
extract_emp_relations(ex9)

Dep_ of EMP_NOUN is: dobj
Num_toks are: [19,000]
Root is at 1 steps from employees.
(we, had, 19,000, 'Other', 0, employees)
Dep_ of EMP_NOUN is: attr
Root is at 1 steps from employees.
(we, had, approximately 18,000, 'Full-Time', full-time, employees)


[[(0, 0, we, had, 19,000, 'Other', 0, employees),
  (0, 1, we, had, approximately 18,000, 'Full-Time', full-time, employees)]]

### Dealing with sub-clauses

#### `Including our full and part-time personnel, we estimate that we have the equivalent of 12 full time employees.`

In [168]:
ex11 = nlp("Including our full and part-time personnel, we estimate that we have the equivalent of 12 full time employees.")

print_df(make_tok_df(ex11))

print(ex11_emp_tok = ex11[17])

#Subtree of 'have'
print(list(list(ex11_emp_tok.head.head.head.rights)[0].subtree))

print([r for r in ex11_emp_tok.head.head.head.lefts])

In [173]:
print_doc_info(ex11)

doc is: 
Including our full and part-time personnel, we estimate that we have the equivalent of 12 full time employees.
--------------------------------------------------
Entities are: 


Unnamed: 0,tok_i,entity,ent_label,root,root_dep,dep_def,root_head,root_head_dep,root_head_pos
0,4,part-time,PART_TIME,part-time,conj,conjunct,full,amod,ADJ
1,15,12,CARDINAL,12,nummod,,employees,pobj,NOUN
2,16,full time,FULL_TIME,full time,compound,,employees,pobj,NOUN
3,17,employees,EMP_NOUN,employees,pobj,object of preposition,of,prep,ADP


--------------------------------------------------
Noun chunks are: 


Unnamed: 0,tok_i,noun_chunk,root,root_ent,root_dep,dep_def,root_head,root_head_dep,root_head_pos
0,1,our full and part-time personnel,personnel,,pobj,object of preposition,Including,prep,VERB
1,7,we,we,,nsubj,nominal subject,estimate,ROOT,VERB
2,10,we,we,,nsubj,nominal subject,have,ccomp,VERB
3,12,the equivalent,equivalent,,dobj,direct object,have,ccomp,VERB
4,15,12 full time employees,employees,EMP_NOUN,pobj,object of preposition,of,prep,ADP


--------------------------------------------------
Cardinal entities are: 


Unnamed: 0,tok_ent,toks,lemma,dep,head,h_dep,pos,tag,dep_def,tag_def
0,CARDINAL,12,12,nummod,employees,pobj,NUM,CD,,cardinal number


In [174]:
#displacy.render(ex11, style='ent', jupyter=True)

In [None]:
#displacy.render(ex11, jupyter=True, options = {'distance' : 130} )

In [188]:
#[w for w in ex11_emp_tok.head.lefts if w.dep_ == 'nsubj']
print('Token:  ' + str(ex11_emp_tok) )
for c in ex11_emp_tok.children:
    if c.dep_ == 'nummod':
        print("Token child: " + str(c) + "  has dep_ == 'nummod'")
        print("       Token.child.i : " + str(c.i))
        print('       child.dep_: ' + str(c.dep_))
        print("       child.tag_: '"+ str(c.tag_) + "'" )
        print("       child.pos_: '"+ str(c.pos_) + "'" )
        for gc in c.children:
            if gc.dep_ == 'conj':
                print('       Token GC dep_ is conj.')
                print('       Token GC is ' + str(gc))
                if gc.tag_ == 'CD':
                    print("       gc.tag_ == 'CD' ")
                    print("       gc.i : " + str(gc.i))
                print('       Token head is: ' + str(c) + '       head.dep_:  '+ str(c.dep_) )
                for i, d in enumerate(ex11_emp_tok.doc.ents):
                    if d.label_ == 'DATE':
                        print(str(i) + "  " + str(d))
                        for tok in ex11_emp_tok.doc[d.start:d.end + 1]:
                            if tok._.is_year == True:
                                print("       CD tok in date: " + str(tok))
                
if ex11_emp_tok.dep_ == 'conj':
    print('Token dep_ is conj.')
    if ex11_emp_tok.head.tag_ == 'CD':
        print("       token.head.tag_ == 'CD' ")
        print("       token.head.i : " + str(ex11_emp_tok.head.i))
    print('Token head is: ' + str(ex11_emp_tok.head) + '       head.dep_:  '+ str(ex11_emp_tok.head.dep_) )
    for i, d in enumerate(ex11_emp_tok.doc.ents):
        if d.label_ == 'DATE':
            print(str(i) + "  " + str(d))
            for tok in ex11_emp_tok.doc[d.start:d.end + 1]:
                if tok._.is_year == True:
                    print("CD tok in date: " + str(tok))
                    print("CD tok i: " + str(tok.i))
#    if ex11_emp_tok.head.dep_ == ''

print('Token dep_: ' + str(ex11_emp_tok.dep_))
print('Token head: ' + str(ex11_emp_tok.head))
print('       head.dep_: ' + str(ex11_emp_tok.head.dep_))
print("       head.tag_: '"+ str(ex11_emp_tok.head.tag_) + "'" )
print("       head.pos_: '"+ str(ex11_emp_tok.head.pos_) + "'" )
print('Token head lefts: ' )
print(str([c for c in ex11_emp_tok.head.lefts]))
print('Token conjuncts: ' )
print(str([c for c in ex11_emp_tok.conjuncts]))
for w in ex11_emp_tok.head.rights:
    print(str(w) + '       head.right.dep_:' + str(w.dep_))
    if w.dep_ == 'nsubj':
        print(str(w) + ' is nsubj. Subtree is:' )
        [print(x) for x in w.lefts]
        [print(s)for s in w.subtree if s.dep_ == 'poss']
print('Token ancestors: ')
for w in ex11_emp_tok.ancestors:
    print(str(w) + '       ancestor.dep_:' + str(w.dep_))
print('Token children: ')
for w in ex11_emp_tok.children:
    print(str(w) + '       child.dep_:' + str(w.dep_))

Token:  employees
Token child: 12  has dep_ == 'nummod'
       Token.child.i : 15
       child.dep_: nummod
       child.tag_: 'CD'
       child.pos_: 'NUM'
Token dep_: pobj
Token head: of
       head.dep_: prep
       head.tag_: 'IN'
       head.pos_: 'ADP'
Token head lefts: 
[]
Token conjuncts: 
[]
employees       head.right.dep_:pobj
Token ancestors: 
of       ancestor.dep_:prep
equivalent       ancestor.dep_:dobj
have       ancestor.dep_:ccomp
estimate       ancestor.dep_:ROOT
Token children: 
12       child.dep_:nummod
full time       child.dep_:compound


In [245]:
extract_emp_relations(ex11)

Root token lemma not one of ['be', 'employ', 'have']. 
Root token, lemma are : have have
[that, we, have, the, equivalent, of, 12, full time, employees]
Dep_ of EMP_NOUN is: pobj
Num_toks are: [12]
Root is at 1 steps from employees.
(we, have, 12, 'Full-Time Employees', full time, employees)


[[(0, 0, we, have, 12, 'Full-Time Employees', full time, employees)]]

### Testing on sentences

In [14]:
test_sents = ["As of September 30, 2016, we employed approximately 7,300 employees world-wide.", 
"As of December 31, 2016, the subsidiaries of AEP had a total of 17,634 employees.", 
"At December 31, 2016 and 2015, we had approximately 56,400 and 66,400 employees, respectively.", 
"At December 31, 2016, we had approximately 9,400 full-time employees.", 
"As of October 29, 2016, we employed approximately 10,000 individuals worldwide.", 
"The number of full-time employees of the Company was approximately 31,800 at December 31, 2016 and 32,300 at December 31, 2015.", 
"As of December 31, 2016, we had 1,469 total employees.", 
"ADP employed approximately 57,000 persons as of June 30, 2016.", 
"At December 31, 2016, we employed approximately 26,400 employees.", 
"The Company and its subsidiaries employed 1,562 persons at December 31, 2016, 114 of whom are covered by a collective bargaining agreement with District 10 of the International Association of Machinists.", 
"As of December 31, 2016, the Company had 455 employees, an increase of 17 employees from the prior year end.", 
"As of December 31, 2016, we had approximately 17,500 employees worldwide.", 
"Based in Neenah, Wisconsin, at December 31, 2016, the Company employed approximately 17,500 individuals and had 59 manufacturing facilities.", 
"As of January 31, 2017, we employed 7,683 individuals.", 
"At December 31, 2016, Bio-Rad had approximately 8,250 employees.", 
"As of December 31, 2016, we had approximately 8,500 full-time employees and 600 contractors.", 
"Alcoa's total worldwide employment at the end of 2016 was approximately 14,000 employees in 15 countries.", 
"As of December 31, 2016, we employed approximately 2,100 people.", 
"At December 31, 2016, the Company had approximately 11,500 employees.",
"As of February 23, 2017, we employed approximately 41,000 full-time Team Members and approximately 33,000 part-time Team Members.", 
"As of December 31, 2016, we had 699 full-time employees and 202 temporary employees.", 
"As of September 30, 2016, we had approximately 19,000 employees, of which approximately 18,000 were full-time employees.", 
"As of December 31, 2016, we had 2,646 employees, 1,581 of whom were pilots.", 
"The number of regular employees was 71.1 thousand, 73.5 thousand, and 75.3 thousand at years ended 2016, 2015 and 2014, respectively.",
"We are a small company with approximately 61 employees.",
"Total workforce level at December 31, 2016 was approximately 150,500.",           
"Currently, the Company and its subsidiaries have an aggregate of 35 employees.",
"we employ only 31 employees", 
"We currently have 21 employees",
"We currently employ 26 full-time employees",
"Including our full and part-time personnel, we estimate that we have \
the equivalent of 12 full time employees.",
"As a REIT, we employ only 31 employees and have a cost-effective \
management structure."
             ]

Nominal subjects are almost always left of root token. 

In [134]:
for i,t in enumerate(test_sents):
    nsubs = [t for t in nlp(t) if t.dep_ == 'nsubj']
    if len(nsubs) > 1:
        print(str(i))
        for i2, tok in enumerate(nsubs):
            print("Tok "+str(i2 + 1) + " of " + str(len(nsubs)))
            print("Token root: " + str(find_root_tok(tok)[0]))
            print("Token side of root: " + find_tok_side_of_root(tok, find_root_tok(tok)[0]))
            print("Sentence is :" + str(nlp(t).text))
            print("Token, dep_, index :" + str(tok) + " " + str(tok.dep_) + " " + str(tok.i))
            print("POS, tag, lemma :" + str(tok.pos_) + " " + str(tok.tag_) + " " + str(tok.lemma_))

21
Tok 1 of 2
Token root: had
Token side of root: left
Sentence is :As of September 30, 2016, we had approximately 19,000 employees, of which approximately 18,000 were full-time employees.
Token, dep_, index :we nsubj 7
POS, tag, lemma :PRON PRP -PRON-
Tok 2 of 2
Token root: had
Token side of root: right
Sentence is :As of September 30, 2016, we had approximately 19,000 employees, of which approximately 18,000 were full-time employees.
Token, dep_, index :18,000 nsubj 16
POS, tag, lemma :NUM CD 18,000
22
Tok 1 of 2
Token root: had
Token side of root: left
Sentence is :As of December 31, 2016, we had 2,646 employees, 1,581 of whom were pilots.
Token, dep_, index :we nsubj 7
POS, tag, lemma :PRON PRP -PRON-
Tok 2 of 2
Token root: had
Token side of root: right
Sentence is :As of December 31, 2016, we had 2,646 employees, 1,581 of whom were pilots.
Token, dep_, index :1,581 nsubj 12
POS, tag, lemma :NUM CD 1,581


In [135]:
for i,t in enumerate(test_sents):
    print(str(i))
    emp_nouns = [t for t in nlp(t) if t.ent_type_ == 'EMP_NOUN']
    for i2, tok in enumerate(emp_nouns):
        print("Tok "+str(i2 + 1) + " of " + str(len(emp_nouns)))
        print("Sentence is :" + str(nlp(t).text))
        print("Token, dep_, index :" + str(tok) + " " + str(tok.dep_) + " " + str(tok.i))
        print("POS, tag, lemma :" + str(tok.pos_) + " " + str(tok.tag_) + " " + str(tok.lemma_))

0
Tok 1 of 1
Sentence is :As of September 30, 2016, we employed approximately 7,300 employees world-wide.
Token, dep_, index :employees dobj 11
POS, tag, lemma :NOUN NNS employee
1
Tok 1 of 1
Sentence is :As of December 31, 2016, the subsidiaries of AEP had a total of 17,634 employees.
Token, dep_, index :employees pobj 16
POS, tag, lemma :NOUN NNS employee
2
Tok 1 of 1
Sentence is :At December 31, 2016 and 2015, we had approximately 56,400 and 66,400 employees, respectively.
Token, dep_, index :employees dobj 14
POS, tag, lemma :NOUN NNS employee
3
Tok 1 of 1
Sentence is :At December 31, 2016, we had approximately 9,400 full-time employees.
Token, dep_, index :employees dobj 11
POS, tag, lemma :NOUN NNS employee
4
Tok 1 of 1
Sentence is :As of October 29, 2016, we employed approximately 10,000 individuals worldwide.
Token, dep_, index :individuals dobj 11
POS, tag, lemma :NOUN NNS individual
5
Tok 1 of 1
Sentence is :The number of full-time employees of the Company was approximately 3

In [370]:
for i,t in enumerate(test_sents):
    print(str(i))
    print(extract_emp_relations(nlp(t), verbose=False))

0
[RelationDetails(sent_num=0, word_num=0, s=we, v=employed, quantity=7,300, quantity_type='Other Employees', type_token=0, word=employees, sentence=As of September 30, 2016, we employed approximately 7,300 employees world-wide.)]
1
[RelationDetails(sent_num=0, word_num=0, s=subsidiaries, v=had, quantity=17,634, quantity_type='Other Employees', type_token=0, word=employees, sentence=As of December 31, 2016, the subsidiaries of AEP had a total of 17,634 employees.)]
2
[RelationDetails(sent_num=0, word_num=0, s=we, v=had, quantity=56,400, quantity_type='Other Employees', type_token=0, word=employees, sentence=At December 31, 2016 and 2015, we had approximately 56,400 and 66,400 employees, respectively.)]
3
[RelationDetails(sent_num=0, word_num=0, s=we, v=had, quantity=9,400, quantity_type='Full-Time Employees', type_token=full-time, word=employees, sentence=At December 31, 2016, we had approximately 9,400 full-time employees.)]
4
[RelationDetails(sent_num=0, word_num=0, s=we, v=employed,

## Testing with paragraphs

acc_id: `0000034088-17-000017`  
```
"The number of regular employees was 71.1 thousand, 73.5 thousand, and 75.3 thousand at years ended 2016, 2015 and 2014, respectively. Regular employees are defined as active executive, management, professional, technical and wage employees who work full time or part time for the Corporation and are covered by the Corporation's benefit plans and programs. Regular employees do not include employees of the company‑operated retail sites (CORS). The number of CORS employees was 1.6 thousand, 2.1 thousand, and 8.4 thousand at years ended 2016, 2015 and 2014, respectively. The decrease in CORS employees reflects the multi‑year transition of the company‑operated retail network to a more capital‑efficient Branded Wholesaler model."
```

In [68]:
test_paragraphs = ["The number of regular employees was 71.1 thousand, 73.5 thousand, and 75.3 thousand at years ended 2016, 2015 and 2014, respectively. Regular employees are defined as active executive, management, professional, technical and wage employees who work full time or part time for the Corporation and are covered by the Corporation's benefit plans and programs. Regular employees do not include employees of the company‑operated retail sites (CORS). The number of CORS employees was 1.6 thousand, 2.1 thousand, and 8.4 thousand at years ended 2016, 2015 and 2014, respectively. The decrease in CORS employees reflects the multi‑year transition of the company‑operated retail network to a more capital‑efficient Branded Wholesaler model.",
                  "As of December 31, 2016, we had a total of 619 employees, including 565 full-time, 20 regularly scheduled part-time employees, and 34 need-based part-time employees. We consider our current relationship with our employees to be good. Our employees are not represented by labor unions and are not subject to collective bargaining agreements.", 
                  "As of September 30, 2016, we employed approximately 7,300 employees world-wide. Approximately 860 of our employees in Mexico, 450 employees in Singapore, and 200 employees in Japan are covered by collective bargaining and other union agreements.",
                  "As of December 31, 2016, we employed 2,776 full-time employees, of which 31 held Ph.D. degrees in a science or engineering field. Of our employees, 287 are located in the U.S., 1,218 are located in Taiwan and 1,271 are located in China. None of our employees are represented by any collective bargaining agreement, but certain employees of our China subsidiary are members of a trade union. We have never suffered any work stoppage as a result of an employment related strike or any employee related dispute and believe that we have satisfactory relations with our employees.", 
                  "Our business depends on highly qualified management, operations and flight personnel. As a percentage of our consolidated operating expenses, salaries, wages and benefits accounted for approximately 25.4% in 2016, 20.7% in 2015 and 19.2% in 2014. As of December 31, 2016, we had 2,646 employees, 1,581 of whom were pilots.", 
                  "As of December 31, 2016, we had 699 full-time employees and 202 temporary employees. The breakdown of our full-time employees by department is as follows: 175 direct manufacturing employees and 524 administrative and manufacturing support employees. Of the 524 administrative and manufacturing support employees, 213 were involved in sales, marketing, communications and training. Of the 202 temporary employees, more than 92% worked in direct manufacturing roles. Our employees are not covered by any collective bargaining agreement, and we have never experienced a work stoppage. We believe that our relations with our employees are good.", 
                  "The employee cost at Jaguar Land Rover increased by 17.6% to Rs.228,730 million in Fiscal 2016 from Rs.194,467 million in Fiscal 2015. This increase includes an unfavorable foreign currency translation from GBP to Indian rupees of Rs.546 million. In GBP terms, employee costs at Jaguar Land Rover increased to GBP 2,321 million in Fiscal 2016 from GBP1,977 million in Fiscal 2015. The employee cost at Jaguar Land Rover as a percentage to revenue increased to 10.5% in Fiscal 2016 from 9.0% in Fiscal 2015. Due to consistent increases in volumes and to support new launches and product development projects, Jaguar Land Rover increased its average permanent headcount by 19.6% in Fiscal 2016 to 29,789 employees from 24,902 employees in Fiscal 2015. However, the average temporary headcount was flat at 7,216 employees in Fiscal 2016 from 7,225 employees in Fiscal 2015. Total number of permanent employees as at March 31, 2016 was 30,750, as compared to 27,004 as at March 31, 2015 for Jaguar Land Rover."]

In [69]:
test_para = nlp(test_paragraphs[0])

test_paragraphs[0]

"The number of regular employees was 71.1 thousand, 73.5 thousand, and 75.3 thousand at years ended 2016, 2015 and 2014, respectively. Regular employees are defined as active executive, management, professional, technical and wage employees who work full time or part time for the Corporation and are covered by the Corporation's benefit plans and programs. Regular employees do not include employees of the company‑operated retail sites (CORS). The number of CORS employees was 1.6 thousand, 2.1 thousand, and 8.4 thousand at years ended 2016, 2015 and 2014, respectively. The decrease in CORS employees reflects the multi‑year transition of the company‑operated retail network to a more capital‑efficient Branded Wholesaler model."

In [70]:
print_df(make_tok_df(nlp("Regular employees are defined as active executive, management, professional, technical and wage employees who work full time or part time for the Corporation and are covered by the Corporation's benefit plans and programs.")))

Unnamed: 0,tok_ent,toks,lemma,dep,head,h_dep,pos,tag,dep_def,tag_def
0,,Regular,regular,amod,employees,nsubjpass,ADJ,JJ,adjectival modifier,adjective
1,EMP_NOUN,employees,employee,nsubjpass,defined,ROOT,NOUN,NNS,nominal subject (passive),"noun, plural"
2,,are,be,auxpass,defined,ROOT,VERB,VBP,auxiliary (passive),"verb, non-3rd person singular present"
3,,defined,define,ROOT,defined,ROOT,VERB,VBN,,"verb, past participle"
4,,as,as,prep,defined,ROOT,ADP,IN,prepositional modifier,"conjunction, subordinating or preposition"
5,,active,active,amod,executive,pobj,ADJ,JJ,adjectival modifier,adjective
6,,executive,executive,pobj,as,prep,NOUN,NN,object of preposition,"noun, singular or mass"
7,,",",",",punct,executive,pobj,PUNCT,",",punctuation,"punctuation mark, comma"
8,,management,management,conj,executive,pobj,NOUN,NN,conjunct,"noun, singular or mass"
9,,",",",",punct,management,conj,PUNCT,",",punctuation,"punctuation mark, comma"


In [71]:
extract_emp_relations(test_para, verbose=False)

[RelationDetails(sent_num=0, word_num=0, s=The number of regular employees, v=was, quantity=71.1 thousand, quantity_type='Other Employees', type_token=regular, word=employees, sentence=The number of regular employees was 71.1 thousand, 73.5 thousand, and 75.3 thousand at years ended 2016, 2015 and 2014, respectively.),
 RelationDetails(sent_num=3, word_num=0, s=The number of CORS employees, v=was, quantity=1.6 thousand, quantity_type='Other Employees', type_token=CORS, word=employees, sentence=The number of CORS employees was 1.6 thousand, 2.1 thousand, and 8.4 thousand at years ended 2016, 2015 and 2014, respectively.)]

In [72]:
print_df(make_tok_df(test_para))

Unnamed: 0,tok_ent,toks,lemma,dep,head,h_dep,pos,tag,dep_def,tag_def
0,,The,the,det,number,nsubj,DET,DT,determiner,determiner
1,,number,number,nsubj,was,ROOT,NOUN,NN,nominal subject,"noun, singular or mass"
2,,of,of,prep,number,nsubj,ADP,IN,prepositional modifier,"conjunction, subordinating or preposition"
3,,regular,regular,amod,employees,pobj,ADJ,JJ,adjectival modifier,adjective
4,EMP_NOUN,employees,employee,pobj,of,prep,NOUN,NNS,object of preposition,"noun, plural"
5,,was,be,ROOT,was,ROOT,VERB,VBD,,"verb, past tense"
6,CARDINAL,71.1,71.1,compound,thousand,attr,NUM,CD,,cardinal number
7,CARDINAL,thousand,thousand,attr,was,ROOT,NUM,CD,attribute,cardinal number
8,,",",",",punct,thousand,attr,PUNCT,",",punctuation,"punctuation mark, comma"
9,CARDINAL,73.5,73.5,compound,thousand,appos,NUM,CD,,cardinal number


In [73]:
print_doc_info(test_para)

doc is: 
The number of regular employees was 71.1 thousand, 73.5 thousand, and 75.3 thousand at years ended 2016, 2015 and 2014, respectively. Regular employees are defined as active executive, management, professional, technical and wage employees who work full time or part time for the Corporation and are covered by the Corporation's benefit plans and programs. Regular employees do not include employees of the company‑operated retail sites (CORS). The number of CORS employees was 1.6 thousand, 2.1 thousand, and 8.4 thousand at years ended 2016, 2015 and 2014, respectively. The decrease in CORS employees reflects the multi‑year transition of the company‑operated retail network to a more capital‑efficient Branded Wholesaler model.
--------------------------------------------------
Entities are: 


Unnamed: 0,tok_i,entity,ent_label,root,root_dep,dep_def,root_head,root_head_dep,root_head_pos
0,4,employees,EMP_NOUN,employees,pobj,object of preposition,of,prep,ADP
1,6,71.1 thousand,CARDINAL,thousand,attr,attribute,was,ROOT,VERB
2,9,73.5 thousand,CARDINAL,thousand,appos,appositional modifier,thousand,attr,NUM
3,13,75.3 thousand,CARDINAL,thousand,conj,conjunct,thousand,attr,NUM
4,16,years ended 2016,DATE,ended,advcl,adverbial clause modifier,was,ROOT,VERB
5,20,2015,DATE,2015,conj,conjunct,2016,npadvmod,NUM
6,22,2014,DATE,2014,conj,conjunct,2015,conj,NUM
7,27,employees,EMP_NOUN,employees,nsubjpass,nominal subject (passive),defined,ROOT,VERB
8,41,employees,EMP_NOUN,employees,appos,appositional modifier,executive,pobj,NOUN
9,44,full time,FULL_TIME,full time,npadvmod,noun phrase as adverbial modifier,work,relcl,VERB


--------------------------------------------------
Noun chunks are: 


Unnamed: 0,tok_i,noun_chunk,root,root_ent,root_dep,dep_def,root_head,root_head_dep,root_head_pos
0,0,The number,number,,nsubj,nominal subject,was,ROOT,VERB
1,3,regular employees,employees,EMP_NOUN,pobj,object of preposition,of,prep,ADP
2,16,years,years,DATE,pobj,object of preposition,at,prep,ADP
3,26,Regular employees,employees,EMP_NOUN,nsubjpass,nominal subject (passive),defined,ROOT,VERB
4,31,active executive,executive,,pobj,object of preposition,as,prep,ADP
5,34,management,management,,conj,conjunct,executive,pobj,NOUN
6,40,wage,wage,,conj,conjunct,technical,conj,ADJ
7,41,employees,employees,EMP_NOUN,appos,appositional modifier,executive,pobj,NOUN
8,42,who,who,,nsubj,nominal subject,work,relcl,VERB
9,48,the Corporation,Corporation,ORG,pobj,object of preposition,for,dative,ADP


--------------------------------------------------
Cardinal entities are: 


Unnamed: 0,tok_ent,toks,lemma,dep,head,h_dep,pos,tag,dep_def,tag_def
0,CARDINAL,71.1,71.1,compound,thousand,attr,NUM,CD,,cardinal number
1,CARDINAL,thousand,thousand,attr,was,ROOT,NUM,CD,attribute,cardinal number
2,CARDINAL,73.5,73.5,compound,thousand,appos,NUM,CD,,cardinal number
3,CARDINAL,thousand,thousand,appos,thousand,attr,NUM,CD,appositional modifier,cardinal number
4,CARDINAL,75.3,75.3,compound,thousand,conj,NUM,CD,,cardinal number
5,CARDINAL,thousand,thousand,conj,thousand,attr,NUM,CD,conjunct,cardinal number
6,CARDINAL,1.6,1.6,compound,thousand,attr,NUM,CD,,cardinal number
7,CARDINAL,thousand,thousand,attr,was,ROOT,NUM,CD,attribute,cardinal number
8,CARDINAL,2.1,2.1,compound,thousand,appos,NUM,CD,,cardinal number
9,CARDINAL,thousand,thousand,appos,thousand,attr,NUM,CD,appositional modifier,cardinal number


In [74]:
list(test_para[81].children)

[CORS]

In [75]:
train_df[train_df.acc_id == '0000034088-17-000017']

Unnamed: 0_level_0,acc_id,para_text,len,emp_header,first_emp_head_block,para_text_orig,para_tag,split,label
idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
614,0000034088-17-000017,The number of regular employees was 71.1 thous...,731,False,False,The number of regular employees was 71.1 thous...,"<p class=""c64""><span class=""c23"">The number of...",train,1
616,0000034088-17-000017,Number of regular employees at year-end (thous...,55,False,False,Number of regular employees at year-end (thous...,"<p class=""c2""><span class=""c96"">Number of regu...",train,1
617,0000034088-17-000017,(3) Regular employees are defined as active ...,228,False,False,(3) Regular employees are defined as active ...,"<p class=""c103""><span class=""c23"">(3) Regula...",train,0


In [76]:
for i,t in enumerate(test_paragraphs):
    print(str(i))
    print(extract_emp_relations(nlp(t), verbose=False))

0
[RelationDetails(sent_num=0, word_num=0, s=The number of regular employees, v=was, quantity=71.1 thousand, quantity_type='Other Employees', type_token=regular, word=employees, sentence=The number of regular employees was 71.1 thousand, 73.5 thousand, and 75.3 thousand at years ended 2016, 2015 and 2014, respectively.), RelationDetails(sent_num=3, word_num=0, s=The number of CORS employees, v=was, quantity=1.6 thousand, quantity_type='Other Employees', type_token=CORS, word=employees, sentence=The number of CORS employees was 1.6 thousand, 2.1 thousand, and 8.4 thousand at years ended 2016, 2015 and 2014, respectively.)]
1
[RelationDetails(sent_num=0, word_num=0, s=we, v=had, quantity=619, quantity_type='Other Employees', type_token=0, word=employees, sentence=As of December 31, 2016, we had a total of 619 employees, including 565 full-time, 20 regularly scheduled part-time employees, and 34 need-based part-time employees.), RelationDetails(sent_num=0, word_num=1, s=we, v=had, quantit

In [77]:
extract_emp_relations(nlp(test_paragraphs[5]), verbose=False)

[RelationDetails(sent_num=0, word_num=0, s=we, v=had, quantity=699, quantity_type='Full-Time Employees', type_token=full-time, word=employees, sentence=As of December 31, 2016, we had 699 full-time employees and 202 temporary employees.),
 RelationDetails(sent_num=0, word_num=1, s=we, v=had, quantity=202, quantity_type='Other Employees', type_token=temporary, word=employees, sentence=As of December 31, 2016, we had 699 full-time employees and 202 temporary employees.),
 RelationDetails(sent_num=1, word_num=1, s=breakdown, v=is, quantity=175, quantity_type='Other Employees', type_token=manufacturing, word=employees, sentence=The breakdown of our full-time employees by department is as follows: 175 direct manufacturing employees and 524 administrative and manufacturing support employees.)]

In [78]:
print_df(make_tok_df(nlp(test_paragraphs[5])))

Unnamed: 0,tok_ent,toks,lemma,dep,head,h_dep,pos,tag,dep_def,tag_def
0,,As,as,prep,had,ROOT,ADP,IN,prepositional modifier,"conjunction, subordinating or preposition"
1,,of,of,prep,As,prep,ADP,IN,prepositional modifier,"conjunction, subordinating or preposition"
2,DATE,December,december,pobj,of,prep,PROPN,NNP,object of preposition,"noun, proper singular"
3,DATE,31,31,nummod,December,pobj,NUM,CD,,cardinal number
4,DATE,",",",",punct,December,pobj,PUNCT,",",punctuation,"punctuation mark, comma"
5,DATE,2016,2016,nummod,December,pobj,NUM,CD,,cardinal number
6,,",",",",punct,had,ROOT,PUNCT,",",punctuation,"punctuation mark, comma"
7,,we,-PRON-,nsubj,had,ROOT,PRON,PRP,nominal subject,"pronoun, personal"
8,,had,have,ROOT,had,ROOT,VERB,VBD,,"verb, past tense"
9,CARDINAL,699,699,nummod,employees,dobj,NUM,CD,,cardinal number


In [79]:
print_doc_info(nlp(test_paragraphs[5]))

doc is: 
As of December 31, 2016, we had 699 full-time employees and 202 temporary employees. The breakdown of our full-time employees by department is as follows: 175 direct manufacturing employees and 524 administrative and manufacturing support employees. Of the 524 administrative and manufacturing support employees, 213 were involved in sales, marketing, communications and training. Of the 202 temporary employees, more than 92% worked in direct manufacturing roles. Our employees are not covered by any collective bargaining agreement, and we have never experienced a work stoppage. We believe that our relations with our employees are good.
--------------------------------------------------
Entities are: 


Unnamed: 0,tok_i,entity,ent_label,root,root_dep,dep_def,root_head,root_head_dep,root_head_pos
0,2,"December 31, 2016",DATE,December,pobj,object of preposition,of,prep,ADP
1,9,699,CARDINAL,699,nummod,,employees,dobj,NOUN
2,10,full-time,FULL_TIME,full-time,compound,,employees,dobj,NOUN
3,11,employees,EMP_NOUN,employees,dobj,direct object,had,ROOT,VERB
4,13,202,CARDINAL,202,nummod,,employees,conj,NOUN
5,15,employees,EMP_NOUN,employees,conj,conjunct,employees,dobj,NOUN
6,21,full-time,FULL_TIME,full-time,compound,,employees,pobj,NOUN
7,22,employees,EMP_NOUN,employees,pobj,object of preposition,of,prep,ADP
8,29,175,CARDINAL,175,nummod,,employees,dobj,NOUN
9,32,employees,EMP_NOUN,employees,dobj,direct object,follows,advcl,VERB


--------------------------------------------------
Noun chunks are: 


Unnamed: 0,tok_i,noun_chunk,root,root_ent,root_dep,dep_def,root_head,root_head_dep,root_head_pos
0,2,December,December,DATE,pobj,object of preposition,of,prep,ADP
1,7,we,we,,nsubj,nominal subject,had,ROOT,VERB
2,9,699 full-time employees,employees,EMP_NOUN,dobj,direct object,had,ROOT,VERB
3,13,202 temporary employees,employees,EMP_NOUN,conj,conjunct,employees,dobj,NOUN
4,17,The breakdown,breakdown,,nsubj,nominal subject,is,ROOT,VERB
5,20,our full-time employees,employees,EMP_NOUN,pobj,object of preposition,of,prep,ADP
6,24,department,department,,pobj,object of preposition,by,prep,ADP
7,29,175 direct manufacturing employees,employees,EMP_NOUN,dobj,direct object,follows,advcl,VERB
8,34,524 administrative and manufacturing support e...,employees,EMP_NOUN,conj,conjunct,employees,dobj,NOUN
9,42,the 524 administrative and manufacturing suppo...,employees,EMP_NOUN,pobj,object of preposition,Of,prep,ADP


--------------------------------------------------
Cardinal entities are: 


Unnamed: 0,tok_ent,toks,lemma,dep,head,h_dep,pos,tag,dep_def,tag_def
0,CARDINAL,699,699,nummod,employees,dobj,NUM,CD,,cardinal number
1,CARDINAL,202,202,nummod,employees,conj,NUM,CD,,cardinal number
2,CARDINAL,175,175,nummod,employees,dobj,NUM,CD,,cardinal number
3,CARDINAL,524,524,nmod,employees,conj,NUM,CD,modifier of nominal,cardinal number
4,CARDINAL,524,524,nummod,employees,pobj,NUM,CD,,cardinal number
5,CARDINAL,213,213,nsubjpass,involved,ROOT,NUM,CD,nominal subject (passive),cardinal number
6,CARDINAL,202,202,nummod,employees,pobj,NUM,CD,,cardinal number


In [80]:
extract_emp_relations(nlp(test_paragraphs[6]), verbose=True)

Word_id is : 0
Word is : headcount
Root token lemma not one of ['be', 'employ', 'have']. 
Root token, lemma are : increased increase
[Due, to, consistent, increases, in, volumes, and, to, support, new, launches, and, product, development, projects, ,, Jaguar, Land, Rover, increased, its, average, permanent, headcount, by, 19.6, %, in, Fiscal, 2016, to, 29,789, employees, from, 24,902, employees, in, Fiscal, 2015, .]
verb token lemma not one of ['be', 'employ', 'have']. 
verb token, lemma are : increased increase
[Due, to, consistent, increases, in, volumes, and, to, support, new, launches, and, product, development, projects, ,, Jaguar, Land, Rover, increased, its, average, permanent, headcount, by, 19.6, %, in, Fiscal, 2016, to, 29,789, employees, from, 24,902, employees, in, Fiscal, 2015, .]
Word_id is : 1
Word is : employees
Root token lemma not one of ['be', 'employ', 'have']. 
Root token, lemma are : increased increase
[Due, to, consistent, increases, in, volumes, and, to, support

[RelationDetails(sent_num=5, word_num=1, s=headcount, v=was, quantity=7,216, quantity_type='Other Employees', type_token=0, word=employees, sentence=However, the average temporary headcount was flat at 7,216 employees in Fiscal 2016 from 7,225 employees in Fiscal 2015.),
 RelationDetails(sent_num=5, word_num=2, s=headcount, v=was, quantity=7,225, quantity_type='Other Employees', type_token=0, word=employees, sentence=However, the average temporary headcount was flat at 7,216 employees in Fiscal 2016 from 7,225 employees in Fiscal 2015.),
 RelationDetails(sent_num=6, word_num=0, s=Total number of permanent employees as at March 31, 2016, v=was, quantity=30,750, quantity_type='Other Employees', type_token=permanent, word=employees, sentence=Total number of permanent employees as at March 31, 2016 was 30,750, as compared to 27,004 as at March 31, 2015 for Jaguar Land Rover.)]

## Extract information from training paragraphs

### Initial logic testing

In [64]:
train_para_list = train_df.para_text.tolist()

val_para_list = val_df.para_text.tolist()

train_df.head(1)

In [66]:
train_df.columns

Index(['acc_id', 'para_text', 'len', 'emp_header', 'first_emp_head_block',
       'para_text_orig', 'para_tag', 'split', 'label'],
      dtype='object')

In [81]:
rels = extract_emp_relations(test_para, verbose=False)

for rel in rels:
    print(list(zip(rel._fields, [rel.sent_num, rel.word_num, rel.s, rel.v, rel.quantity, rel.quantity_type, rel.type_token , rel.word, rel.sentence])))

In [93]:
tup = rels[0]
tup._fields

In [95]:
five_paras = train_para_list[50:55]

rels_list = []
for i, p in enumerate(five_paras):
    print(i)
    print(p)
    print(extract_emp_relations(nlp(p), verbose=False))
    if extract_emp_relations(nlp(p)):
        rels_list.append(extract_emp_relations(nlp(p)))

In [97]:
print(rels_list)

[]

In [98]:
print(five_paras[1])
print_df(make_tok_df(nlp(five_paras[1])))

'In January 2016, the Company initiated a reduction in workforce that was effectively completed by the end of the first quarter. As a result of the workforce reduction, the Company recognized a $1 million non-cash curtailment loss related to its pension plan for both the curtailment-related decrease to the benefit obligation and the recognition of the proportionate share of unrecognized prior service cost and net loss from other comprehensive income (loss) in the second quarter of 2016. For the year ended December 31, 2016, the Company recognized a non-cash settlement loss of $11 million related to a total of $37 million of lump sum payments from the pension plan. Additionally, the Company recognized a non-cash curtailment gain of $6 million related to its other postretirement benefit plan in the first quarter of 2016.'

In [305]:
print_doc_info(nlp(five_paras[1]))

doc is: 
Employees' Deferred Profit Sharing Retirement Plan (Amended and Restated effective January 1, 1997). (6)
--------------------------------------------------
Entities are: 


Unnamed: 0,tok_i,entity,ent_label,root,root_dep,dep_def,root_head,root_head_dep,root_head_pos
0,0,Employees,EMP_NOUN,Employees,poss,possession modifier,Plan,ROOT,PROPN
1,12,"January 1, 1997",DATE,January,npadvmod,noun phrase as adverbial modifier,Restated,conj,VERB
2,19,6,CARDINAL,6,ROOT,,6,ROOT,NUM


--------------------------------------------------
Noun chunks are: 


Unnamed: 0,tok_i,noun_chunk,root,root_ent,root_dep,dep_def,root_head,root_head_dep,root_head_pos
0,0,Employees' Deferred Profit Sharing Retirement ...,Plan,,ROOT,,Plan,ROOT,PROPN


--------------------------------------------------
Cardinal entities are: 


Unnamed: 0,tok_ent,toks,lemma,dep,head,h_dep,pos,tag,dep_def,tag_def
0,CARDINAL,6,6,ROOT,6,ROOT,NUM,CD,,cardinal number


In [319]:
test_sent = list(nlp(five_paras[1]).sents)

print(test_sent)

for p_id, par_sent in enumerate(nlp(five_paras[1]).sents):    
    for x_id, x in enumerate(filter(lambda w: w.ent_type_ == 'EMP_NOUN', par_sent)): 
        print(x_id)
        print(x)

### Make fact dataframes

Functions

In [45]:
def make_fact_df(docs, re_func, df=0, id_fields=0, verbose = False):
    """Return data frame with rows for each fact extracted. """
    
    # Initialize list of fields to maintain from input df. 
    input_df_fields = ['doc_ids']
    
    # Turn doc into a list if a single string is passed
    if type(docs) == type(""):
        docs = [docs]
    
    if type(df) == type(pd.DataFrame()) :
        doc_ids = df.index.tolist() # Use the index to identify documents
        if id_fields:
            input_df_fields = input_df_fields + id_fields 
        docs = docs.tolist()

    else: 
        doc_ids = list(range(len(docs)))
        
    all_docs, all_rels, doc_id_list = [], [], []
    for i, doc in enumerate(docs):
        doc_rels = re_func(nlp(doc))
        if doc_rels:         
            if all_rels:
                doc_list = [doc] * len(doc_rels)
                all_docs = all_docs + doc_list
                all_rels = all_rels + doc_rels
                doc_id_list = doc_id_list + [doc_ids[i]] * len(doc_rels)
            else: 
                try: # Get field names, if availble
                    field_names = list(doc_rels[0]._fields)
                except:
                    field_names = list(range(len(doc_rels[0])))
                doc_list = [doc] * len(doc_rels)
                all_docs = all_docs + doc_list     
                all_rels = all_rels + doc_rels
                doc_id_list = doc_id_list + [doc_ids[i]] * len(doc_rels)
           
    if all_rels:
        if verbose == True:
            print("len of all_rels is: " + str(len(all_rels)))
            print("len of all_docs is: " + str(len(all_rels)))
            print("len of doc_id_list is: " + str(len(doc_id_list)))
        fact_dict = {} 
        for i, f in enumerate(field_names):
            fact_dict[f] = [rel[i] for rel in all_rels]
        
        fact_dict['doc_ids'] = doc_id_list
        output_df_fields = input_df_fields + field_names
        output_df = pd.DataFrame(fact_dict, columns=output_df_fields)
        return output_df
    return pd.DataFrame(columns = ['doc_ids', 'sent_num', 'word_num', 'subject', 'verb', 'quantity',
       'quantity_type', 'type_token', 'word', 'sentence'] ) 

tens_dict = {}
for w, n in zip([y + '-'  for y in tens_word_list ], list(range(2,10))):
    tens_dict[w] = n
teens_dict = {}
for w, n in zip(teens_word_list, list(range(10,20))):
    teens_dict[w] = n
singles_dict = {}
for w, n in zip(singles_word_list, list(range(1,10))):
    singles_dict[w] = n

import re
def add_units_and_values(df, quantity_col):
    """Create units, data_value columns from tokens in quantity_col."""
    if df.shape[0] == 0:
        return pd.DataFrame(columns = df.columns.tolist() + ['units', 'data_value'])
    
    df = df.copy()
    df.loc[:, 'units'] = 'ones'
    
    # Identify columns with more than one token
    # Rows with more than one token for quantity contain either words as numbers 
    # or non-numeric qualifiers
    quantity_bs = df.loc[:, quantity_col].apply(lambda x: len(x.text.split())) > 1
    
    # Remove tokens that are not numbers or number words
    df.loc[quantity_bs, 'units'] = df.loc[quantity_bs, quantity_col].apply(lambda x: [x for x in x if x._.is_num_word == True])
    
    
    empty_units_bs = df.units.apply(lambda x: len(x)) == 0
    not_empty_units= df.units.apply(lambda x: len(x)) == 1

    df.loc[empty_units_bs, 'units'] = 'ones'
    df.loc[not_empty_units, 'units'] = df.loc[not_empty_units, quantity_col].apply(lambda x: [x.text for x in x if x._.is_num_word == True][-1])
    
    # Initialize the values column
    df['data_value'] = '0'
    df.loc[~quantity_bs, 'data_value'] = df.loc[~quantity_bs, quantity_col].apply(lambda x: x.text)
    df.loc[quantity_bs, 'data_value'] = df.loc[quantity_bs, quantity_col].apply(lambda x: [x.text for x in x if x.like_num == True][0])  

    comma_bs = df['data_value'].str.contains(",")
    
    # Create dictionary mappings for small number words
    df.replace({'data_value' : singles_dict}, inplace=True)
    df.replace({'data_value' : teens_dict}, inplace=True)

    df.loc[comma_bs,'data_value'] = df.loc[comma_bs,'data_value'].apply(lambda x: x.replace(',', ''))
    not_num_bs = df['data_value'].astype('str').str.contains(re.compile(r"[^0-9.]"))
    df.loc[not_num_bs, 'data_value'] = 0
    df.loc[comma_bs, 'data_value'] = df.loc[comma_bs,'data_value'].apply(lambda x: float(x))
    df.loc[~comma_bs,'data_value'] = df.loc[~comma_bs,'data_value'].apply(lambda x: float(x))
    
    # Multiply values by 1000 if units = 'thousand'
    df.loc[df.units == 'thousand', 'data_value'] = df.loc[df.units == 'thousand', 'data_value'] * 1000
    
    return df

def fix_token_columns(df, col_list):
    """Return df with tokens converted to text."""
    if df.shape[0] == 0:
        return df
    df = df.copy()
    for col in col_list:
        df.loc[:,col] = df.loc[:, col].apply(lambda x: x.text)
    return df

In [46]:
print(train_df[100:105].shape)

fact_df_test = make_fact_df(train_df[100:105].para_text, extract_emp_relations, df=train_df[100:105], verbose=True)
fact_df_test = add_units_and_values(fact_df_test, 'quantity')

(5, 9)
len of all_rels is: 3
len of all_docs is: 3
len of doc_id_list is: 3


Train fact df

In [24]:
print(train_df.shape)

fact_df = make_fact_df(train_df.para_text, extract_emp_relations, df=train_df, verbose=True)

(7700, 9)
Length of emp_counts is : 1 while length of years is : 2
--------------------
(1) The total number of BCE employees at the end of 2016 was 48,090, down from 49,968 at December 31, 2015, due primarily to workforce reductions across our Bell Wireline and Bell Wireless segments attributable to normal attrition, retirements and productivity improvements.
Length of emp_counts is : 1 while length of years is : 2
--------------------
The number of employees at December 31, 2016 was 26,498, a decrease of 159 employees from December 31, 2015. This decrease primarily reflects the impact of cost reduction programs implemented during the current year partially offset by acquisitions.
Length of emp_counts is : 1 while length of years is : 2
--------------------
The employee cost at Jaguar Land Rover increased by 17.6% to Rs.228,730 million in Fiscal 2016 from Rs.194,467 million in Fiscal 2015. This increase includes an unfavorable foreign currency translation from GBP to Indian rupees of 

len of all_rels is: 2605
len of all_docs is: 2605
len of doc_id_list is: 2605


In [25]:
err_doc = nlp("We had 49, 61, and 57 full-time equivalent employees in research and development at December 31, 2016, 2015, and 2014, respectively.")
err_tok = err_doc[4]
err_ent = err_doc.ents[1]
list(err_doc[2].conjuncts)

[61, employees]

In [26]:
fact_df = add_units_and_values(fact_df, 'quantity')

fact_df.units.value_counts()

ones        2603
thousand       2
Name: units, dtype: int64

In [50]:
fact_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2605 entries, 0 to 2604
Data columns (total 14 columns):
doc_ids          2605 non-null int64
sent_num         2605 non-null int64
word_num         2605 non-null int64
subject          2605 non-null object
verb             2605 non-null object
quantity         2605 non-null object
quantity_type    2605 non-null object
type_token       2605 non-null object
word             2605 non-null object
word_dep         2605 non-null object
depth            2605 non-null int64
sentence         2605 non-null object
units            2605 non-null object
data_value       2605 non-null object
dtypes: int64(4), object(10)
memory usage: 285.0+ KB


In [51]:
fact_df.head()

Unnamed: 0,doc_ids,sent_num,word_num,subject,verb,quantity,quantity_type,type_token,word,word_dep,depth,sentence,units,data_value
0,4,0,0,we,employed,7300,Other Employees,0,employees,dobj,1,"As of September 30, 2016, we employed approxim...",ones,7300
1,13,0,0,subsidiaries,had,17634,Other Employees,0,employees,pobj,1,"As of December 31, 2016, the subsidiaries of A...",ones,17634
2,14,4,0,"(AEP, Texas)",had,1500,Other Employees,0,employees,dobj,1,"As of December 31, 2016, AEP Texas had 1,500 e...",ones,1500
3,15,3,0,APCo,had,1845,Other Employees,0,employees,dobj,1,"As of December 31, 2016, APCo had 1,845 employ...",ones,1845
4,16,2,0,(I&M),had,2475,Other Employees,0,employees,dobj,1,"As of December 31, 2016, I&M had 2,475 employees.",ones,2475


TODO:
- Keep track of year 
- Deal with ", a(n) (in|de)crease of \d" clauses

Validation fact df

In [25]:
fact_df_val = make_fact_df(val_df.para_text, extract_emp_relations, df=val_df, verbose=True)

Length of emp_counts is : 1 while length of years is : 2
--------------------
As of December 31, 2016, we had approximately 28,300 employees, as compared to approximately 19,200 employees as of December 31, 2015. During 2016, reduction in workforce activities resulted in the severance of approximately 1,950 employees of which approximately 450 employees remained as employees as of December 31, 2016. Approximately 17,900 of our total employees are represented by unions. The number of employees covered by a collective bargaining agreement that expired in 2016, but have been extended and are still effective for 2017, is approximately 600. The number of employees covered by collective bargaining agreements that expire in 2017 is approximately 3,800. We consider our relations with our employees to be good.
Length of emp_counts is : 1 while length of years is : 2
--------------------
At 30 June 2016 the group had, on a full time equivalent basis, 31,485 (2015 - 32,409; 2014 - 26,588) employe

Create units column

In [26]:
fact_df_val = add_units_and_values(fact_df_val, 'quantity')

fact_df_val.units.value_counts()

ones    853
Name: units, dtype: int64

In [27]:
fact_df_val.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 853 entries, 0 to 852
Data columns (total 12 columns):
doc_ids          853 non-null int64
sent_num         853 non-null int64
word_num         853 non-null int64
subject          853 non-null object
verb             853 non-null object
quantity         853 non-null object
quantity_type    853 non-null object
type_token       853 non-null object
word             853 non-null object
sentence         853 non-null object
units            853 non-null object
data_values      853 non-null object
dtypes: int64(3), object(9)
memory usage: 80.0+ KB


## Classification Scores

### Identify tables in labeled data

I can eliminate tables from the false negative pool, as table parsing is a separate (and easier) problem. In my labeled data set, tables have a lot of white space.

In [None]:
labeled_df = pd.read_excel('../data/train_val_employee_count_paragraphs.xlsx')
subset_df = pd.read_excel('../data/subset_employee_count_paragraphs.xlsx')

train_labeled_df = labeled_df[labeled_df.split == 'train'].copy()
val_labeled_df = labeled_df[labeled_df.split == 'val'].copy()
#print(train_labeled_df.paragraph_text.apply(lambda x: x.count(' ') / len(x)).to_frame('space_percent').sort_values('space_percent', ascending=False).head(100))

val_paragraphs = val_df.loc[:, ['acc_id', 'para_text']].copy().merge(labeled_df.loc[:,['accession_number', 'ticker']].copy().drop_duplicates(), 
                                          how = 'left', left_on = 'acc_id', right_on = 'accession_number')

In [108]:
def print_row_detail(df=val_paragraphs, header_list = ['ticker', 'accession_number' ], nrow=10, 
                    detail_list = ['data_key_friendly_name','para_text'],
                    sortby=['ticker', 'data_key_friendly_name'], ascending=True):
    df_sorted = df.sort_values(sortby, ascending=ascending).copy().reset_index()
    nrow = min(len(df), nrow)
    for i in range(0, nrow):
        for h in header_list:
            print('-'*35  + ' ' +  str(df_sorted[h][i]) + ' ' + '-'*35)
        for d in detail_list:
            print(d + '  :' + str(df_sorted[d][i]))
            print('')

In [35]:
train_labeled_df.loc[:,'space_percent'] = train_labeled_df.loc[:, 'paragraph_text'].apply(lambda x: x.count(' ') / len(x))

val_labeled_df.loc[:,'space_percent'] = val_labeled_df.loc[:, 'paragraph_text'].apply(lambda x: x.count(' ') / len(x))

space_bs = train_labeled_df.paragraph_text.apply(lambda x: x.count(' ') / len(x)) > 0.36 
space_bs_val = val_labeled_df.paragraph_text.apply(lambda x: x.count(' ') / len(x)) > 0.36 
print(sum(space_bs))
print(sum(space_bs_val))

105
37


In [109]:
print_row_detail(train_labeled_df[space_bs].head(), detail_list=['space_percent','data_key_friendly_name', 'data_value', 'reported_units', 'text', 'paragraph_text'],
                 sortby='space_percent')

----------------------------------- ADBE -----------------------------------
----------------------------------- 0000796343-17-000031 -----------------------------------
space_percent  :0.49632892804698975

data_key_friendly_name  :Other Employees

data_value  :15706.0

reported_units  :ones

text  :Worldwide employees

paragraph_text  :Fiscal Years                                           2016           2015           2014           2013           2012   Operations:   Revenue                             $  5,854,430   $  4,795,511   $  4,147,065   $  4,055,240   $  4,403,677   Gross profit                        $  5,034,522   $  4,051,194   $  3,524,985   $  3,468,683   $  3,919,895   Income before income taxes          $  1,435,138   $    873,781   $    361,376   $    356,141   $  1,118,794   Net income                          $  1,168,782   $    629,551   $    268,395   $    289,985   $    832,775   Net income per share:   Basic                               $       2.35   $     

In [56]:
print_row_detail(val_labeled_df[space_bs_val].head(), detail_list=['space_percent','data_key_friendly_name', 'data_value', 'reported_units', 'text', 'paragraph_text'],
                 sortby='space_percent')

----------------------------------- ANW -----------------------------------
----------------------------------- 0000919574-17-004377 -----------------------------------
space_percent  :0.5145631067961165

data_key_friendly_name  :Other Employees

data_value  :379.0

reported_units  :ones

text  :Shoreside personnel

paragraph_text  :Year Ended December 31,                         2016     2015   2014   Shipboard personnel     612      645    646   Shoreside personnel     379      332    314   Total                   991      977    960

----------------------------------- ANW -----------------------------------
----------------------------------- 0000919574-17-004377 -----------------------------------
space_percent  :0.5145631067961165

data_key_friendly_name  :Other Employees

data_value  :612.0

reported_units  :ones

text  :Shipboard personnel

paragraph_text  :Year Ended December 31,                         2016     2015   2014   Shipboard personnel     612      645    646   Shore

### Merge fact_df with train_df

In [61]:
train_fact_df = train_df.merge(fact_df, left_index=True, right_on='doc_ids')

train_accession_ids = pd.read_csv('../data/train_accession_ids.csv', names=['acc_id']).loc[:,'acc_id'].tolist()

negative_ids = [x for x in train_accession_ids if x not in train_labeled_df.accession_number.unique()]

train_facts = train_fact_df.loc[:, train_fact_df.columns.intersection(['acc_id', 'data_value', 'quantity_type'])].copy().drop_duplicates()

train_facts.columns = ['accession_number', 'data_key_friendly_name', 'data_value']
train_facts.shape

(2322, 3)

In [56]:
len(train_accession_ids)

1667

In [63]:
labeled_facts = train_labeled_df.loc[~space_bs, ['accession_number', 'data_key_friendly_name', 'data_value']].copy()
labeled_facts.shape

(1853, 3)

In [66]:
labeled_facts.sort_values('accession_number').head()

Unnamed: 0,accession_number,data_key_friendly_name,data_value
2261,0000004127-16-000068,Other Employees,7300.0
64,0000004904-17-000019,Other Employees,17634.0
88,0000005272-17-000017,Other Employees,56400.0
2437,0000005513-17-000018,Full-Time Employees,9400.0
47,0000006281-16-000097,Other Employees,10000.0


In [65]:
train_facts.head()

Unnamed: 0,accession_number,data_key_friendly_name,data_value
0,0000004127-16-000068,Other Employees,7300
1,0000004904-17-000019,Other Employees,17634
2,0000004904-17-000019,Other Employees,1500
3,0000004904-17-000019,Other Employees,1845
4,0000004904-17-000019,Other Employees,2475


In [134]:
missed_facts = pd.merge(labeled_facts ,train_facts , on=labeled_facts.columns.tolist(), how='outer', indicator=True).query(
    "_merge == 'left_only'").drop('_merge', 1).sort_values('accession_number')
missed_facts.shape

(549, 3)

In [140]:
missed_facts.head()

Unnamed: 0,accession_number,data_key_friendly_name,data_value
283,0000014707-16-000090,Other Employees,11000.0
696,0000018498-16-000065,Part-Time Employees,18275.0
299,0000022356-17-000009,Part-Time Employees,395.0
300,0000022356-17-000009,Full-Time Employees,4482.0
343,0000024090-17-000008,Other Employees,333.0


In [141]:
train_facts.loc[train_facts.accession_number.isin(missed_facts.head().accession_number.tolist() + ["0000034067-17-000007"])]

Unnamed: 0,accession_number,data_key_friendly_name,data_value
42,0000014707-16-000090,Full-Time Employees,11000
43,0000014707-16-000090,Other Employees,20
56,0000018498-16-000065,Other Employees,27500
66,0000022356-17-000009,Other Employees,4482
67,0000022356-17-000009,Other Employees,395


In [144]:
train_fact_df[train_fact_df.acc_id == "0000034067-17-000007"]

Unnamed: 0,acc_id,para_text,len,emp_header,first_emp_head_block,para_text_orig,para_tag,split,label,doc_ids,...,verb,quantity,quantity_type,type_token,word,word_dep,depth,sentence,units,data_value


In [69]:
doc_count = len(train_labeled_df.accession_number.unique())
#doc_count = len(labeled_facts.accession_number.unique())

input_doc_count = len(train_df.acc_id.unique())
fact_docs = len(train_fact_df.acc_id.unique())
print("Candidate paragraphs extraced for " + str(input_doc_count) + 
     " of " + str(doc_count) + " (" + str(round(input_doc_count/doc_count*100,1)) +
      " %) original documents.")

print("Facts extraced for " + str(fact_docs) + 
     " of " + str(input_doc_count) + " (" + str(round(fact_docs/input_doc_count*100,1)) +
      "%) input documents, and " + str(round(fact_docs/doc_count*100, 1)) +
      "% of original documents.")

Candidate paragraphs extraced for 1644 of 1641 (100.2 %) original documents.
Facts extraced for 1426 of 1644 (86.7%) input documents, and 86.9% of original documents.


In [70]:
train_fact_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2605 entries, 0 to 2604
Data columns (total 23 columns):
acc_id                  2605 non-null object
para_text               2605 non-null object
len                     2605 non-null int64
emp_header              2605 non-null bool
first_emp_head_block    2605 non-null bool
para_text_orig          2605 non-null object
para_tag                2605 non-null object
split                   2605 non-null object
label                   2605 non-null int64
doc_ids                 2605 non-null int64
sent_num                2605 non-null int64
word_num                2605 non-null int64
subject                 2605 non-null object
verb                    2605 non-null object
quantity                2605 non-null object
quantity_type           2605 non-null object
type_token              2605 non-null object
word                    2605 non-null object
word_dep                2605 non-null object
depth                   2605 non-null int64
sen

### Validation fact df

In [174]:
val_fact_df = val_df.merge(fact_df_val, left_index=True, right_on='doc_ids')



val_accession_ids = pd.read_csv('../data/val_accession_ids.csv', names=['acc_id'])['acc_id'].tolist()
val_negative_ids = [x for x in val_accession_ids if x not in val_labeled_df.accession_number.unique()]

val_facts = val_fact_df.loc[:, ['acc_id', 'values', 'quantity_type']].drop_duplicates().copy()
val_facts.columns = ['accession_number', 'data_value', 'data_key_friendly_name']
val_facts.shape

In [178]:
len(val_accession_ids)

556

In [183]:
labeled_facts_val = val_labeled_df.loc[~space_bs_val, ['accession_number', 'data_value', 'data_key_friendly_name']]
labeled_facts_val.shape

(630, 3)

In [184]:
missed_facts_val = pd.merge(labeled_facts_val ,val_facts , on=labeled_facts_val.columns.tolist(), how='outer', indicator=True).query(
    "_merge == 'left_only'").drop('_merge', 1)
missed_facts_val.shape

(202, 3)

In [185]:
doc_count_val = len(val_labeled_df.accession_number.unique())

input_doc_count_val = len(val_df.acc_id.unique())
fact_docs_val = len(val_fact_df.acc_id.unique())
print("Candidate paragraphs extraced for " + str(input_doc_count_val) + 
     " of " + str(doc_count_val) + " (" + str(round(input_doc_count_val/doc_count_val*100,1)) +
      " %) original documents.")

print("Facts extraced for " + str(fact_docs_val) + 
     " of " + str(input_doc_count_val) + " (" + str(round(fact_docs_val/input_doc_count_val*100,1)) +
      "%) input documents, and " + str(round(fact_docs_val/doc_count_val*100, 1)) +
      "% of original documents.")

Candidate paragraphs extraced for 552 of 551 (100.2 %) original documents.
Facts extraced for 478 of 552 (86.6%) input documents, and 86.8% of original documents.


In [186]:
val_fact_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 854 entries, 0 to 853
Data columns (total 21 columns):
acc_id                  854 non-null object
para_text               854 non-null object
len                     854 non-null int64
emp_header              854 non-null bool
first_emp_head_block    854 non-null bool
para_text_orig          854 non-null object
para_tag                854 non-null object
split                   854 non-null object
label                   854 non-null int64
doc_ids                 854 non-null int64
sent_num                854 non-null int64
word_num                854 non-null int64
s                       854 non-null object
v                       854 non-null object
quantity                854 non-null object
quantity_type           854 non-null object
type_token              854 non-null object
word                    854 non-null object
sentence                854 non-null object
units                   854 non-null object
values                  

### Scores for training set

In [79]:
len([x for x in train_labeled_df[~space_bs].accession_number.unique() if x not in train_fact_df.acc_id.unique()])

194

In [81]:
value_match = train_fact_df[['acc_id', 'data_values']].drop_duplicates().merge(train_labeled_df[['accession_number', 'data_value']].drop_duplicates(), left_on=['acc_id', 'data_values'], 
                   right_on = ['accession_number', 'data_value']).drop_duplicates()

value_match_dict = {'True Positives' : value_match.shape[0], 
                   'False Positives' : train_fact_df[['acc_id', 'data_values']].drop_duplicates().shape[0] - value_match.shape[0], 
                   'True Negatives' : len([x for x in negative_ids if x not in train_fact_df.acc_id.unique()]), 
                   'False Negatives' : len([x for x in train_labeled_df[~space_bs].accession_number.unique() if x not in train_fact_df.acc_id.unique()])
                  }

value_match_dict['recall'] = round(value_match_dict['True Positives'] /  (
    value_match_dict['True Positives'] + value_match_dict['False Negatives']), 2)
value_match_dict['precision'] = round(value_match_dict['True Positives'] /  (
    value_match_dict['True Positives'] + value_match_dict['False Positives']), 2)
value_match_dict['accuracy'] = round((value_match_dict['True Positives'] + value_match_dict['True Negatives']) /  (
    value_match_dict['True Positives'] + value_match_dict['True Negatives'] + value_match_dict['False Positives'] + value_match_dict['False Negatives']), 2)

value_match_dict

{'False Negatives': 194,
 'False Positives': 900,
 'True Negatives': 10,
 'True Positives': 1411,
 'accuracy': 0.57,
 'precision': 0.61,
 'recall': 0.88}

In [82]:
key_match = train_fact_df[['acc_id', 'quantity_type']].drop_duplicates().merge(train_labeled_df[['accession_number', 'data_key_friendly_name']].drop_duplicates(), left_on=['acc_id', 'quantity_type'], 
                   right_on = ['accession_number', 'data_key_friendly_name']).drop_duplicates()

key_match_dict = {'True Positives' : key_match.shape[0], 
                   'False Positives' : train_fact_df[['acc_id', 'quantity_type']].drop_duplicates().shape[0] - key_match.shape[0], 
                   'True Negatives' : len([x for x in negative_ids if x not in train_fact_df.acc_id.unique()]), 
                   'False Negatives' : len([x for x in train_labeled_df[~space_bs].accession_number.unique() if x not in train_fact_df.acc_id.unique()])
                  }

key_match_dict['recall'] = round(key_match_dict['True Positives'] /  (
    key_match_dict['True Positives'] + key_match_dict['False Negatives']), 2)
key_match_dict['precision'] = round(key_match_dict['True Positives'] /  (
    key_match_dict['True Positives'] + key_match_dict['False Positives']), 2)
key_match_dict['accuracy'] = round((key_match_dict['True Positives'] + key_match_dict['True Negatives']) /  (
    key_match_dict['True Positives'] + key_match_dict['True Negatives'] + key_match_dict['False Positives'] + key_match_dict['False Negatives']), 2)

key_match_dict

{'False Negatives': 194,
 'False Positives': 215,
 'True Negatives': 10,
 'True Positives': 1363,
 'accuracy': 0.77,
 'precision': 0.86,
 'recall': 0.88}

In [83]:
full_match = train_fact_df[['acc_id', 'data_values', 'quantity_type']].drop_duplicates().merge(train_labeled_df[['accession_number', 'data_value', 'data_key_friendly_name']].drop_duplicates(), 
                                                           left_on=['acc_id', 'data_values', 'quantity_type'], 
                   right_on = ['accession_number', 'data_value', 'data_key_friendly_name']).drop_duplicates()

full_match_dict = {'True Positives' : full_match.shape[0], 
                   'False Positives' : train_fact_df[['acc_id', 'data_values', 'quantity_type']].drop_duplicates().shape[0] - full_match.shape[0], 
                   'True Negatives' : len([x for x in negative_ids if x not in train_fact_df.acc_id.unique()]), 
                   'False Negatives' : len([x for x in train_labeled_df[~space_bs].accession_number.unique() if x not in train_fact_df.acc_id.unique()])
                  }

full_match_dict['recall'] = round(full_match_dict['True Positives'] /  (
    full_match_dict['True Positives'] + full_match_dict['False Negatives']), 2)
full_match_dict['precision'] = round(full_match_dict['True Positives'] /  (
    full_match_dict['True Positives'] + full_match_dict['False Positives']), 2)
full_match_dict['accuracy'] = round((full_match_dict['True Positives'] + full_match_dict['True Negatives']) /  (
    full_match_dict['True Positives'] + full_match_dict['True Negatives'] + full_match_dict['False Positives'] + full_match_dict['False Negatives']), 2)

full_match_dict

{'False Negatives': 194,
 'False Positives': 999,
 'True Negatives': 10,
 'True Positives': 1323,
 'accuracy': 0.53,
 'precision': 0.57,
 'recall': 0.87}

In [84]:
train_fact_df.quantity_type.value_counts()

Other Employees        2030
Full-Time Employees     527
Part-Time Employees      48
Name: quantity_type, dtype: int64

In [85]:
train_labeled_df[~space_bs].data_key_friendly_name.value_counts()

Other Employees        1093
Full-Time Employees     605
Part-Time Employees     155
Name: data_key_friendly_name, dtype: int64

### Scores for validation set

In [187]:
len([x for x in val_labeled_df[~space_bs_val].accession_number.unique() if x not in val_fact_df.acc_id.unique()])

63

In [188]:
value_match_val = val_fact_df[['acc_id', 'values']].drop_duplicates().merge(val_labeled_df[['accession_number', 'data_value']].drop_duplicates(), left_on=['acc_id', 'values'], 
                   right_on = ['accession_number', 'data_value']).drop_duplicates()

value_match_dict_val = {'True Positives' : value_match_val.shape[0], 
                   'False Positives' : val_fact_df[['acc_id', 'values']].drop_duplicates().shape[0] - value_match_val.shape[0], 
                   'True Negatives' : len([x for x in val_negative_ids if x not in val_fact_df.acc_id.unique()]), 
                   'False Negatives' : len([x for x in val_labeled_df[~space_bs_val].accession_number.unique() if x not in val_fact_df.acc_id.unique()])
                  }

value_match_dict_val['recall'] = round(value_match_dict_val['True Positives'] /  (
    value_match_dict_val['True Positives'] + value_match_dict_val['False Negatives']), 2)
value_match_dict_val['precision'] = round(value_match_dict_val['True Positives'] /  (
    value_match_dict_val['True Positives'] + value_match_dict_val['False Positives']), 2)
value_match_dict_val['accuracy'] = round((value_match_dict_val['True Positives'] + value_match_dict_val['True Negatives']) /  (
    value_match_dict_val['True Positives'] + value_match_dict_val['True Negatives'] + value_match_dict_val['False Positives'] + value_match_dict_val['False Negatives']), 2)

value_match_dict_val

In [192]:
key_match_val = val_fact_df[['acc_id', 'quantity_type']].drop_duplicates().merge(val_labeled_df[['accession_number', 'data_key_friendly_name']].drop_duplicates(), left_on=['acc_id', 'quantity_type'], 
                   right_on = ['accession_number', 'data_key_friendly_name']).drop_duplicates()

key_match_dict_val = {'True Positives' : key_match_val.shape[0], 
                   'False Positives' : val_fact_df[['acc_id', 'quantity_type']].drop_duplicates().shape[0] - key_match_val.shape[0], 
                   'True Negatives' : len([x for x in val_negative_ids if x not in val_fact_df.acc_id.unique()]), 
                   'False Negatives' : len([x for x in val_labeled_df[~space_bs_val].accession_number.unique() if x not in val_fact_df.acc_id.unique()])
                  }

key_match_dict_val['recall'] = round(key_match_dict_val['True Positives'] /  (
    key_match_dict_val['True Positives'] + key_match_dict_val['False Negatives']), 2)
key_match_dict_val['precision'] = round(key_match_dict_val['True Positives'] /  (
    key_match_dict_val['True Positives'] + key_match_dict_val['False Positives']), 2)
key_match_dict_val['accuracy'] = round((key_match_dict_val['True Positives'] + key_match_dict_val['True Negatives']) /  (
    key_match_dict_val['True Positives'] + key_match_dict_val['True Negatives'] + key_match_dict_val['False Positives'] + key_match_dict_val['False Negatives']), 2)

key_match_dict_val

In [194]:
full_match_val = val_fact_df[['acc_id', 'values', 'quantity_type']].drop_duplicates().merge(val_labeled_df[['accession_number', 'data_value', 'data_key_friendly_name']].drop_duplicates(), 
                                                           left_on=['acc_id', 'values', 'quantity_type'], 
                   right_on = ['accession_number', 'data_value', 'data_key_friendly_name']).drop_duplicates()

full_match_dict_val = {'True Positives' : full_match_val.shape[0], 
                   'False Positives' : val_fact_df[['acc_id', 'values', 'quantity_type']].drop_duplicates().shape[0] - full_match_val.shape[0], 
                   'True Negatives' : len([x for x in val_negative_ids if x not in val_fact_df.acc_id.unique()]), 
                   'False Negatives' : len([x for x in val_labeled_df[~space_bs_val].accession_number.unique() if x not in val_fact_df.acc_id.unique()])
                  }

full_match_dict_val['recall'] = round(full_match_dict_val['True Positives'] /  (
    full_match_dict_val['True Positives'] + full_match_dict_val['False Negatives']), 2)
full_match_dict_val['precision'] = round(full_match_dict_val['True Positives'] /  (
    full_match_dict_val['True Positives'] + full_match_dict_val['False Positives']), 2)
full_match_dict_val['accuracy'] = round((full_match_dict_val['True Positives'] + full_match_dict_val['True Negatives']) /  (
    full_match_dict_val['True Positives'] + full_match_dict_val['True Negatives'] + full_match_dict_val['False Positives'] + full_match_dict_val['False Negatives']), 2)

full_match_dict_val

In [196]:
val_fact_df.quantity_type.value_counts()

Other Employees        637
Full-Time Employees    205
Part-Time Employees     12
Name: quantity_type, dtype: int64

In [198]:
val_labeled_df[~space_bs_val].data_key_friendly_name.value_counts()

Other Employees        364
Full-Time Employees    205
Part-Time Employees     61
Name: data_key_friendly_name, dtype: int64

### Evaluate misses

In [72]:
train_fact_df.loc[:, ['acc_id', 'data_value', 'quantity_type']].drop_duplicates().shape

(2322, 3)

In [99]:
full_match_left= train_fact_df.drop_duplicates().merge(train_labeled_df.drop_duplicates(), 
                                                           how='left', left_on=['acc_id', 'data_value', 'quantity_type'], 
                   right_on = ['accession_number', 'data_value', 'data_key_friendly_name']).drop_duplicates()

full_match_right= train_fact_df.drop_duplicates().merge(train_labeled_df.loc[~space_bs,:].drop_duplicates(), 
                                                           how='right', left_on=['acc_id', 'data_value', 'quantity_type'], 
                   right_on = ['accession_number', 'data_value', 'data_key_friendly_name']).drop_duplicates()

print(full_match_left.shape)
print(full_match_right.shape)

fp_bs = full_match_left.ticker.isna()
fn_bs = full_match_right.acc_id.isna()
print(sum(fp_bs))
print(sum(fn_bs))

(2605, 32)
(2018, 32)
1109
549


In [130]:
full_match_left.quantity_type.value_counts(dropna=False)

Other Employees        2030
Full-Time Employees     527
Part-Time Employees      48
Name: quantity_type, dtype: int64

In [131]:
full_match_left.data_key_friendly_name.value_counts(dropna=False)

NaN                    1109
Other Employees        1042
Full-Time Employees     415
Part-Time Employees      39
Name: data_key_friendly_name, dtype: int64

In [129]:
full_match_right.data_key_friendly_name.value_counts(dropna=False)

Other Employees        1233
Full-Time Employees     629
Part-Time Employees     156
Name: data_key_friendly_name, dtype: int64

In [132]:
full_match_left[fp_bs].quantity_type.value_counts(dropna=False)

Other Employees        988
Full-Time Employees    112
Part-Time Employees      9
Name: quantity_type, dtype: int64

In [127]:
full_match_right.quantity_type.value_counts(dropna=False)

Other Employees        1015
NaN                     549
Full-Time Employees     415
Part-Time Employees      39
Name: quantity_type, dtype: int64

In [128]:
full_match_right[fn_bs].data_key_friendly_name.value_counts(dropna=False)

Other Employees        218
Full-Time Employees    214
Part-Time Employees    117
Name: data_key_friendly_name, dtype: int64

In [100]:
full_match_right[fn_bs].head(1)

Unnamed: 0,acc_id,para_text,len,emp_header,first_emp_head_block,para_text_orig,para_tag,split_x,label,doc_ids,...,data_value,ticker,accession_number,data_key_friendly_name,text,reported_data_value,reported_units,paragraph_text,split_y,space_percent
1469,,,,,,,,,,,...,18000.0,ABC,0001140859-16-000022,Full-Time Employees,full-time,18000.0,ones,"Employees As of September 30, 2016, we had a...",train,0.182598


In [75]:
full_match_left.columns

Index(['acc_id', 'para_text', 'len', 'emp_header', 'first_emp_head_block',
       'para_text_orig', 'para_tag', 'split_x', 'label', 'doc_ids', 'sent_num',
       'word_num', 'subject', 'verb', 'quantity', 'quantity_type',
       'type_token', 'word', 'word_dep', 'depth', 'sentence', 'units',
       'data_value', 'ticker', 'accession_number', 'data_key_friendly_name',
       'text', 'reported_data_value', 'reported_units', 'paragraph_text',
       'split_y', 'space_percent'],
      dtype='object')

In [85]:
full_match_left_columns = ['acc_id','ticker', 'data_value',  'quantity_type', 
                           'data_key_friendly_name',
 'paragraph_text',  'para_text', 'sent_num', 'sentence', 'emp_header', 'first_emp_head_block', 
 'word_num', 'subject', 'verb', 'quantity',  'type_token', 'word','len', 
       'para_text_orig', 'para_tag', 'split_x', 'label', 'doc_ids',        
        'units', 'reported_units',  'text',  'reported_data_value',
         'space_percent']

full_match_right_columns = ['accession_number','ticker', 'data_value',  'quantity_type', 
                           'data_key_friendly_name',
 'paragraph_text',  'para_text', 'sent_num', 'sentence', 'emp_header', 'first_emp_head_block', 
 'word_num', 'subject', 'verb', 'quantity',  'type_token', 'word','len', 
       'para_text_orig', 'para_tag', 'split_x', 'label', 'doc_ids',        
        'units', 'reported_units',  'text',  'reported_data_value',
         'space_percent']

In [144]:
extract_emp_relations(nlp("At July 2, 2016, Avnet had approximately 17,700 employees compared to 18,800 employees at June 27, 2015, and 19,000 at June 28, 2014."), verbose=True)

Word_id is : 0
Word is : employees
Dep_ of EMP_NOUN is: dobj
Num_toks are: [17,700]
Root is at 1 steps from employees.
(Avnet, had, 17,700, 'Other Employees', 0, employees)
Word_id is : 1
Word is : employees
Dep_ of EMP_NOUN is: pobj
Num_toks are: [18,800]
Root is at 1 steps from employees.
(Avnet, had, 18,800, 'Other Employees', 0, employees)


[RelationDetails(sent_num=0, word_num=0, s=Avnet, v=had, quantity=17,700, quantity_type='Other Employees', type_token=0, word=employees, sentence=At July 2, 2016, Avnet had approximately 17,700 employees compared to 18,800 employees at June 27, 2015, and 19,000 at June 28, 2014.),
 RelationDetails(sent_num=0, word_num=1, s=Avnet, v=had, quantity=18,800, quantity_type='Other Employees', type_token=0, word=employees, sentence=At July 2, 2016, Avnet had approximately 17,700 employees compared to 18,800 employees at June 27, 2015, and 19,000 at June 28, 2014.)]

In [143]:
print_df(make_tok_df(nlp("At July 2, 2016, Avnet had approximately 17,700 employees compared to 18,800 employees at June 27, 2015, and 19,000 at June 28, 2014.")))

Unnamed: 0,tok_ent,toks,lemma,dep,head,h_dep,pos,tag,dep_def,tag_def
0,,At,at,prep,had,ROOT,ADP,IN,prepositional modifier,"conjunction, subordinating or preposition"
1,DATE,July,july,pobj,At,prep,PROPN,NNP,object of preposition,"noun, proper singular"
2,DATE,2,2,nummod,July,pobj,NUM,CD,,cardinal number
3,DATE,",",",",punct,July,pobj,PUNCT,",",punctuation,"punctuation mark, comma"
4,DATE,2016,2016,appos,July,pobj,NUM,CD,appositional modifier,cardinal number
5,,",",",",punct,had,ROOT,PUNCT,",",punctuation,"punctuation mark, comma"
6,ORG,Avnet,avnet,nsubj,had,ROOT,PROPN,NNP,nominal subject,"noun, proper singular"
7,,had,have,ROOT,had,ROOT,VERB,VBD,,"verb, past tense"
8,CARDINAL,approximately,approximately,advmod,17700,nummod,ADV,RB,adverbial modifier,adverb
9,CARDINAL,17700,17700,nummod,employees,dobj,NUM,CD,,cardinal number


In [145]:
print_doc_info(nlp("At July 2, 2016, Avnet had approximately 17,700 employees compared to 18,800 employees at June 27, 2015, and 19,000 at June 28, 2014."))

doc is: 
At July 2, 2016, Avnet had approximately 17,700 employees compared to 18,800 employees at June 27, 2015, and 19,000 at June 28, 2014.
--------------------------------------------------
Entities are: 


Unnamed: 0,tok_i,entity,ent_label,root,root_dep,dep_def,root_head,root_head_dep,root_head_pos
0,1,"July 2, 2016",DATE,July,pobj,object of preposition,At,prep,ADP
1,6,Avnet,ORG,Avnet,nsubj,nominal subject,had,ROOT,VERB
2,8,"approximately 17,700",CARDINAL,17700,nummod,,employees,dobj,NOUN
3,10,employees,EMP_NOUN,employees,dobj,direct object,had,ROOT,VERB
4,13,18800,CARDINAL,18800,nummod,,employees,pobj,NOUN
5,14,employees,EMP_NOUN,employees,pobj,object of preposition,to,prep,ADP
6,16,"June 27, 2015",DATE,June,pobj,object of preposition,at,prep,ADP
7,22,19000,CARDINAL,19000,conj,conjunct,employees,pobj,NOUN
8,24,"June 28, 2014",DATE,June,pobj,object of preposition,at,prep,ADP


--------------------------------------------------
Noun chunks are: 


Unnamed: 0,tok_i,noun_chunk,root,root_ent,root_dep,dep_def,root_head,root_head_dep,root_head_pos
0,1,July,July,DATE,pobj,object of preposition,At,prep,ADP
1,6,Avnet,Avnet,ORG,nsubj,nominal subject,had,ROOT,VERB
2,8,"approximately 17,700 employees",employees,EMP_NOUN,dobj,direct object,had,ROOT,VERB
3,13,"18,800 employees",employees,EMP_NOUN,pobj,object of preposition,to,prep,ADP
4,16,June,June,DATE,pobj,object of preposition,at,prep,ADP
5,24,June,June,DATE,pobj,object of preposition,at,prep,ADP


--------------------------------------------------
Cardinal entities are: 


Unnamed: 0,tok_ent,toks,lemma,dep,head,h_dep,pos,tag,dep_def,tag_def
0,CARDINAL,approximately,approximately,advmod,17700,nummod,ADV,RB,adverbial modifier,adverb
1,CARDINAL,17700,17700,nummod,employees,dobj,NUM,CD,,cardinal number
2,CARDINAL,18800,18800,nummod,employees,pobj,NUM,CD,,cardinal number
3,CARDINAL,19000,19000,conj,employees,pobj,NUM,CD,conjunct,cardinal number


In [113]:
print_row_detail(df=full_match_right.loc[fn_bs, :], 
                 nrow=10, header_list = ['accession_number', 'ticker' ],
                    detail_list = ['data_key_friendly_name','data_value',  'text', 'paragraph_text'],
                    sortby=['accession_number', 'ticker', 'data_key_friendly_name' ], ascending=True)

----------------------------------- 0000014707-16-000090 -----------------------------------
----------------------------------- CAL -----------------------------------
data_key_friendly_name  :Other Employees

data_value  :11000.0

text  :full-time and part-time employees

paragraph_text  :Employees   We had approximately 11,000 full-time and part-time employees  as of January 30, 2016. In the United  States,   there are no employees subject to union contracts. In Canada, we employ approximately 20 warehouse  employees   under a union contract, which expires in October 2016

----------------------------------- 0000018498-16-000065 -----------------------------------
----------------------------------- GCO -----------------------------------
data_key_friendly_name  :Part-Time Employees

data_value  :18275.0

text  :part-time

paragraph_text  :Employees   Genesco had approximately 27,500 employees  at January 30, 2016, approximately  130 of whom were employed  in   corporate staff depar

In [112]:
print_row_detail(df=full_match_left.loc[fp_bs, :], 
                 nrow=10, header_list = ['acc_id', 'sent_num' ],
                    detail_list = ['quantity_type','data_value',  'label','subject', 'verb', 'quantity',
                                   'sentence', 'para_text'],
                    sortby=['acc_id', 'quantity_type', 'sent_num'], ascending=True)

----------------------------------- 0000004904-17-000019 -----------------------------------
----------------------------------- 2 -----------------------------------
quantity_type  :Other Employees

data_value  :2475.0

label  :1

subject  :I&M

verb  :had

quantity  :2,475

sentence  :As of December 31, 2016, I&M had 2,475 employees.

para_text  :Organized in Indiana in 1907, I&M is engaged in the generation, transmission and distribution of electric power to approximately 592,000 retail customers in northern and eastern Indiana and southwestern Michigan, and in supplying and marketing electric power at wholesale to other electric utility companies, rural electric cooperatives, municipalities and other market participants.  I&M owns or leases 3,539 MWs of generating capacity, which it uses to serve its retail and other customers.  As of December 31, 2016, I&M had 2,475 employees. Among the principal industries served are primary metals, transportation equipment, electrical and electr

para_text  :Organized in West Virginia in 1883 and reincorporated in 1911, WPCo provides electric service to approximately 41,000 retail customers in northern West Virginia. WPCo owns 780 MWs of generating capacity which it uses to serve its retail and other customers. WPCo is a member of PJM. As of December 31, 2016, WPCo had 57 employees.  WPCo is part of AEP's Vertically Integrated Utilities segment.

----------------------------------- 0000004904-17-000019 -----------------------------------
----------------------------------- 3 -----------------------------------
quantity_type  :Other Employees

data_value  :5805.0

label  :1

subject  :AEPSC

verb  :had

quantity  :5,805

sentence  :As of December 31, 2016, AEPSC had 5,805 employees.

para_text  :AEP also owns a service company subsidiary, AEPSC. AEPSC provides accounting, administrative, information systems, engineering, financial, legal, maintenance and other services at cost to AEP subsidiaries. The executive officers of AEP a

In [147]:
train_labeled_df[train_labeled_df.accession_number == '0000011199-17-000011']

Unnamed: 0,ticker,accession_number,data_key_friendly_name,text,data_value,reported_data_value,reported_units,paragraph_text,split,space_percent
342,BMS,0000011199-17-000011,Other Employees,Number of employees,17678.0,17678.0,ones,"Years Ended December 31, ...",train,0.563313


In [88]:
full_match_left.loc[fp_bs, ['acc_id', 'data_value', 'quantity_type', 'label', 'para_text']].drop_duplicates().shape

(1097, 5)

In [89]:
full_match_left.loc[:,full_match_left_columns].head()

Unnamed: 0,acc_id,ticker,data_value,quantity_type,data_key_friendly_name,paragraph_text,para_text,sent_num,sentence,emp_header,...,para_text_orig,para_tag,split_x,label,doc_ids,units,reported_units,text,reported_data_value,space_percent
0,0000004127-16-000068,SWKS,7300,Other Employees,Other Employees,"EMPLOYEES As of September 30, 2016, we emp...","As of September 30, 2016, we employed approxim...",0,"As of September 30, 2016, we employed approxim...",True,...,"As of September 30, 2016,\r\r\r\nwe employed a...","<div class=""c80""><span class=""c32"">As of</span...",train,1,4,ones,ones,employed,7300.0,0.201465
1,0000004904-17-000019,AEP,17634,Other Employees,Other Employees,"As of December 31, 2016, the subsidiaries of ...","As of December 31, 2016, the subsidiaries of A...",0,"As of December 31, 2016, the subsidiaries of A...",False,...,"As of December 31, 2016,\r\r\r\nthe subsidiari...","<div class=""c110""><span class=""c99"">As of Dece...",train,1,13,ones,ones,employees,17634.0,0.198214
2,0000004904-17-000019,,1500,Other Employees,,,"Organized in Delaware in 1925, AEP Texas was f...",4,"As of December 31, 2016, AEP Texas had 1,500 e...",False,...,"Organized in Delaware in 1925, AEP Texas was f...","<div class=""c110""><span class=""c99"">Organized ...",train,1,14,ones,,,,
3,0000004904-17-000019,,1845,Other Employees,,,"Organized in Virginia in 1926, APCo is engaged...",3,"As of December 31, 2016, APCo had 1,845 employ...",False,...,"Organized in Virginia in 1926, APCo is engaged...","<div class=""c110""><span class=""c99"">Organized ...",train,1,15,ones,,,,
4,0000004904-17-000019,,2475,Other Employees,,,"Organized in Indiana in 1907, I&M is engaged i...",2,"As of December 31, 2016, I&M had 2,475 employees.",False,...,"Organized in Indiana in 1907, I&M is engaged i...","<div class=""c110""><span class=""c99"">Organized ...",train,1,16,ones,,,,


### Miss examples

In [156]:
fn_list = [
"We had approximately 11,000 full-time and part-time employees as of January 30, 2016.", 
"As of December 31, 2016, Comerica and its subsidiaries had 7,659 full-time and 490 part-time employees.", 
"At December 31, 2016, we had approximately 1,150 employees on a full-time equivalent basis.",
"The Company employed 4,482 persons on a full-time basis and 395 persons on a part-time basis at December 31, 2016.", 
"As of December 31, 2016, we had 428 employees (200 U.S. and 228 non-U.S.), the majority of whom are engaged in manufacturing operations, with the remainder primarily in sales, marketing and administrative functions.", 
"Retail stores employ a substantial number of part-time employees, and approximately 18,275 of the Company's employees were part-time at January 30, 2016.", 
"With about 201,000 employees and 62 plants worldwide, our core business includes designing, manufacturing, marketing, and servicing a full line of Ford cars, trucks, and SUVs, as well as Lincoln luxury vehicles.",
"Our policies are sold and serviced through a home service marketing distribution system of approximately 333 employee-agents who work on a route system and through over 286 funeral homes and independent agents to sell policies, collect premiums and service policyholders.",
]
fn1 = nlp(fn_list[0])
fn2 = nlp(fn_list[1])

In [160]:
extract_emp_relations(fn1, verbose=True)[0]

Word_id is : 0
Word is : employees
Dep_ of EMP_NOUN is: dobj
Num_toks are: [11,000]
Root is at 1 steps from employees.
(We, had, 11,000, 'Full-Time Employees', full-time, employees)


RelationDetails(sent_num=0, word_num=0, subject=We, verb=had, quantity=11,000, quantity_type='Full-Time Employees', type_token=full-time, word=employees, word_dep='dobj', depth=1, sentence='We had approximately 11,000 full-time and part-time employees as of January 30, 2016.')

In [169]:
list(find_emp_type_tok(fn1[7], verbose=True).conjuncts)

Flagged_toks:  [full-time]


[part-time]

In [157]:
extract_emp_relations(fn2, verbose=True)

Word_id is : 0
Word is : employees
Dep_ of EMP_NOUN is: dobj
Num_toks are: [7,659]
Root is at 1 steps from employees.
(Comerica, had, 7,659, 'Full-Time Employees', full-time, employees)


[RelationDetails(sent_num=0, word_num=0, subject=Comerica, verb=had, quantity=7,659, quantity_type='Full-Time Employees', type_token=full-time, word=employees, word_dep='dobj', depth=1, sentence='As of December 31, 2016, Comerica and its subsidiaries had 7,659 full-time and 490 part-time employees.')]

In [148]:
fn1 = nlp(fn_list[0])
print_df(make_tok_df(fn1))

Unnamed: 0,tok_ent,toks,lemma,dep,head,h_dep,pos,tag,dep_def,tag_def
0,,We,-PRON-,nsubj,had,ROOT,PRON,PRP,nominal subject,"pronoun, personal"
1,,had,have,ROOT,had,ROOT,VERB,VBD,,"verb, past tense"
2,CARDINAL,approximately,approximately,advmod,11000,nummod,ADV,RB,adverbial modifier,adverb
3,CARDINAL,11000,11000,nummod,employees,dobj,NUM,CD,,cardinal number
4,FULL_TIME,full-time,full,nmod,employees,dobj,ADJ,JJ,modifier of nominal,adjective
5,,and,and,cc,full-time,nmod,CCONJ,CC,coordinating conjunction,"conjunction, coordinating"
6,PART_TIME,part-time,part,conj,full-time,nmod,NOUN,NN,conjunct,"noun, singular or mass"
7,EMP_NOUN,employees,employee,dobj,had,ROOT,NOUN,NNS,direct object,"noun, plural"
8,,as,as,prep,had,ROOT,ADP,IN,prepositional modifier,"conjunction, subordinating or preposition"
9,,of,of,prep,as,prep,ADP,IN,prepositional modifier,"conjunction, subordinating or preposition"


In [149]:
print_doc_info(fn1)

doc is: 
We had approximately 11,000 full-time and part-time employees as of January 30, 2016.
--------------------------------------------------
Entities are: 


Unnamed: 0,tok_i,entity,ent_label,root,root_dep,dep_def,root_head,root_head_dep,root_head_pos
0,2,"approximately 11,000",CARDINAL,11000,nummod,,employees,dobj,NOUN
1,4,full-time,FULL_TIME,full-time,nmod,modifier of nominal,employees,dobj,NOUN
2,6,part-time,PART_TIME,part-time,conj,conjunct,full-time,nmod,ADJ
3,7,employees,EMP_NOUN,employees,dobj,direct object,had,ROOT,VERB
4,10,"January 30, 2016",DATE,January,pobj,object of preposition,of,prep,ADP


--------------------------------------------------
Noun chunks are: 


Unnamed: 0,tok_i,noun_chunk,root,root_ent,root_dep,dep_def,root_head,root_head_dep,root_head_pos
0,0,We,We,,nsubj,nominal subject,had,ROOT,VERB
1,2,"approximately 11,000 full-time and part-time e...",employees,EMP_NOUN,dobj,direct object,had,ROOT,VERB
2,10,January,January,DATE,pobj,object of preposition,of,prep,ADP


--------------------------------------------------
Cardinal entities are: 


Unnamed: 0,tok_ent,toks,lemma,dep,head,h_dep,pos,tag,dep_def,tag_def
0,CARDINAL,approximately,approximately,advmod,11000,nummod,ADV,RB,adverbial modifier,adverb
1,CARDINAL,11000,11000,nummod,employees,dobj,NUM,CD,,cardinal number


In [152]:
displacy.render(fn1, jupyter=True)

In [158]:
print_df(make_tok_df(fn2))

Unnamed: 0,tok_ent,toks,lemma,dep,head,h_dep,pos,tag,dep_def,tag_def
0,,As,as,prep,had,ROOT,ADP,IN,prepositional modifier,"conjunction, subordinating or preposition"
1,,of,of,prep,As,prep,ADP,IN,prepositional modifier,"conjunction, subordinating or preposition"
2,DATE,December,december,pobj,of,prep,PROPN,NNP,object of preposition,"noun, proper singular"
3,DATE,31,31,nummod,December,pobj,NUM,CD,,cardinal number
4,DATE,",",",",punct,December,pobj,PUNCT,",",punctuation,"punctuation mark, comma"
5,DATE,2016,2016,nummod,December,pobj,NUM,CD,,cardinal number
6,,",",",",punct,had,ROOT,PUNCT,",",punctuation,"punctuation mark, comma"
7,ORG,Comerica,comerica,nsubj,had,ROOT,PROPN,NNP,nominal subject,"noun, proper singular"
8,,and,and,cc,Comerica,nsubj,CCONJ,CC,coordinating conjunction,"conjunction, coordinating"
9,,its,-PRON-,poss,subsidiaries,conj,ADJ,PRP$,possession modifier,"pronoun, possessive"


In [159]:
print_doc_info(fn2)

doc is: 
As of December 31, 2016, Comerica and its subsidiaries had 7,659 full-time and 490 part-time employees.
--------------------------------------------------
Entities are: 


Unnamed: 0,tok_i,entity,ent_label,root,root_dep,dep_def,root_head,root_head_dep,root_head_pos
0,2,"December 31, 2016",DATE,December,pobj,object of preposition,of,prep,ADP
1,7,Comerica,ORG,Comerica,nsubj,nominal subject,had,ROOT,VERB
2,12,7659,CARDINAL,7659,nummod,,employees,dobj,NOUN
3,13,full-time,FULL_TIME,full-time,nmod,modifier of nominal,employees,dobj,NOUN
4,15,490,CARDINAL,490,nummod,,part-time,conj,ADJ
5,16,part-time,PART_TIME,part-time,conj,conjunct,full-time,nmod,ADJ
6,17,employees,EMP_NOUN,employees,dobj,direct object,had,ROOT,VERB


--------------------------------------------------
Noun chunks are: 


Unnamed: 0,tok_i,noun_chunk,root,root_ent,root_dep,dep_def,root_head,root_head_dep,root_head_pos
0,2,December,December,DATE,pobj,object of preposition,of,prep,ADP
1,7,Comerica,Comerica,ORG,nsubj,nominal subject,had,ROOT,VERB
2,9,its subsidiaries,subsidiaries,,conj,conjunct,Comerica,nsubj,PROPN
3,12,"7,659 full-time and 490 part-time employees",employees,EMP_NOUN,dobj,direct object,had,ROOT,VERB


--------------------------------------------------
Cardinal entities are: 


Unnamed: 0,tok_ent,toks,lemma,dep,head,h_dep,pos,tag,dep_def,tag_def
0,CARDINAL,7659,7659,nummod,employees,dobj,NUM,CD,,cardinal number
1,CARDINAL,490,490,nummod,part-time,conj,NUM,CD,,cardinal number


## Examples that have needed handling to avoid errors (more robust treatment needed)

In [384]:
train_df.loc[7729].para_text

'Our current product, the Argus ® II System, treats outer retinal degenerations, such as retinitis pigmentosa, also referred to as RP. RP is a hereditary disease, affecting an estimated 1.5 million people worldwide including about 100,000 people in the United States, that causes a progressive degeneration of the light-sensitive cells of the retina, leading to significant visual impairment and ultimately blindness. The Argus II System is the only retinal prosthesis approved in the United States by the Food and Drug Administration (FDA), and was the first approved retinal prosthesis in the world. By restoring a form of useful vision in patients who otherwise have total sight loss, the Argus II System can provide benefits which include:'

In [476]:
train_para_list[744:747]

['Employees and Labor Relations - As of December 31, 2016, Praxair had 26,498 employees worldwide. Of this number, 10,182 are employed in the United States. Praxair has collective bargaining agreements with unions at numerous locations throughout the world, which expire at various dates. Praxair considers relations with its employees to be good.',
 'The number of employees at December 31, 2016 was 26,498, a decrease of 159 employees from December 31, 2015. This decrease primarily reflects the impact of cost reduction programs implemented during the current year partially offset by acquisitions.',
 'The number of employees at December 31, 2015 was 26,657, a decrease of 1,123 employees from December 31, 2014. This decrease primarily reflects the impact of cost reduction programs implemented during the current year.']

In [441]:
train_para_list[462]

'At March 31, 2016, 2015 and 2014, we had 3,066, 2,982 and 2,843 employees, respectively. None of our employees are covered by a collective bargaining agreement. We consider our relations with our employees to be satisfactory. However, competition for experienced asset management personnel is intense and from time to time we may experience a loss of valuable personnel. We recognize the importance to our business of hiring, training and retaining skilled professionals.'

In [489]:
train_para_list[1144]

'Our operating expenses have historically been driven in large part by personnel-related costs, including wages, commissions, bonuses, benefits, share-based compensation, and travel. Facility and information technology, or IT, departmental costs are allocated to each department based on usage and headcount. We had a total of 9,832, 9,058, and 8,806 employees as of December 31, 2016, 2015, and 2014, respectively. Our headcount increased by 774 employees, or 9%, in 2016, compared to 2015, primarily in research and development, driven by our 2016 business acquisitions, as well as higher services and sales headcount as we focus on delivering our new products to our customers.'

In [620]:
train_para_list[1694]

'We had 17,912, 14,533 and 10,625 employees as of December 31, 2014, 2015 and 2016, respectively. The following table sets forth the number of our employees categorized by our areas of operations and as a percentage of our total employees as of December 31, 2016.'

In [677]:
train_para_list[1907]

'The employee cost at Jaguar Land Rover increased by 17.6% to Rs.228,730 million in Fiscal 2016 from Rs.194,467 million in Fiscal 2015. This increase includes an unfavorable foreign currency translation from GBP to Indian rupees of Rs.546 million. In GBP terms, employee costs at Jaguar Land Rover increased to GBP 2,321 million in Fiscal 2016 from GBP1,977 million in Fiscal 2015. The employee cost at Jaguar Land Rover as a percentage to revenue increased to 10.5% in Fiscal 2016 from 9.0% in Fiscal 2015. Due to consistent increases in volumes and to support new launches and product development projects, Jaguar Land Rover increased its average permanent headcount by 19.6% in Fiscal 2016 to 29,789 employees from 24,902 employees in Fiscal 2015. However, the average temporary headcount was flat at 7,216 employees in Fiscal 2016 from 7,225 employees in Fiscal 2015. Total number of permanent employees as at March 31, 2016 was 30,750, as compared to 27,004 as at March 31, 2015 for Jaguar Land 

In [689]:
train_para_list[2204]

'•  Independent Equipment Dealer Solicitations. This origination channel focuses on soliciting and establishing relationships with independent equipment dealers in a variety of equipment categories located across the United States. Our typical independent equipment dealer has less than $12.0 million in annual revenues and fewer than 50 employees. Service is a key determinant in becoming the preferred provider of financing recommended by these equipment dealers.'

In [696]:
train_para_list[2460]

'The assets of communications services businesses are primarily their employees, and the Company is highly dependent on the talent, creative abilities and technical skills of its personnel and the relationships its personnel have with clients. The Company believes that its operating companies have established reputations in the industry that attract talented personnel. However, the Company, like all communications services businesses, is vulnerable to adverse consequences from the loss of key employees due to the competition among these businesses for talented personnel. On 31 December 2016, the Group, including all employees of associated undertakings, had approximately 198,000 employees located in over 3,000 offices in 112 countries compared with 190,000 and 179,000 as at 31 December 2015 and 2014, respectively. Excluding all employees of associated undertakings, this figure is 134,341 (2015: 128,123, 2014: 123,621). The average number of employees in 2016 was 132,657 compared to 124

In [697]:
extract_emp_relations(nlp(train_para_list[2460]), verbose=True)

Word_id is : 0
Word is : employees
Dep_ of EMP_NOUN is: attr
Root is at 2 steps from employees.
No num_tok. 
Word_id is : 0
Word is : employees
Dep_ of EMP_NOUN is: pobj
Root is at 1 steps from employees.
Word_id is : 0
Word is : employees
Dep_ of EMP_NOUN is: pobj
Root is at 1 steps from employees.
No num_tok. 
Word_id is : 1
Word is : employees
Dep_ of EMP_NOUN is: dobj
Num_toks are: [198,000]
Root is at 1 steps from employees.
(Group, had, 198,000, 'Other', 0, employees)
Word_id is : 0
Word is : employees
Dep_ of EMP_NOUN is: pobj
year_emps: [('2014', '128,123'), ('2015', '134,341')]
max year_emps: ('2015', '134,341')
Sentence has multiple years:[(143, 2015), (147, 2014)]
First card subtree is :[134,341, (, 2015, :, 128,123, ,, 2014, :, 123,621, )]
years: [(143, 2015), (147, 2014)]
cards: [134,341, 128,123, 123,621]
emp_counts: [(141, 134,341), (145, 128,123), (149, 123,621)]
Root is at 1 steps from employees.
(figure, is, '134,341', 'Other', 0, employees)
Word_id is : 0
Word is : e

[RelationDetails(sent_num=3, word_num=1, s=Group, v=had, quantity=198,000, quantity_type='Other', type_token=0, word=employees),
 RelationDetails(sent_num=4, word_num=0, s=figure, v=is, quantity='134,341', quantity_type='Other', type_token=0, word=employees),
 RelationDetails(sent_num=5, word_num=0, s=The average number of employees in 2016, v=was, quantity=132,657, quantity_type='Other', type_token=0, word=employees)]

In [706]:
len(fact_df.doc_num.unique())

1961

In [428]:
train_para_list[392]

"At September 30, 2016, HRG employed 21 persons and HRG's subsidiaries employed approximately 16,000 persons. In the normal course of business, HRG and its subsidiaries use contract personnel to supplement their employee base to meet business needs. As of September 30, 2016, none of HRG's employees were represented by labor unions or covered by collective bargaining agreements. See the remainder of this report for additional information regarding the employees of HRG's subsidiaries. HRG believes that its overall relationship with its employees is good."

In [409]:
extract_emp_relations(nlp(train_para_list[47]), verbose=True)

Word_id is : 0
Word is : individuals
Dep_ of EMP_NOUN is: dobj
Num_toks are: [7,683]
Root is at 1 steps from individuals.
(we, employed, 7,683, 'Other', 0, individuals)
Word_id is : 0
Word is : persons
Dep_ of EMP_NOUN is: dobj
Num_toks are: [7,536]
Root is at 1 steps from persons.
(subsidiaries, employed, 7,536, 'Other', 0, persons)
Word_id is : 1
Word is : persons
Dep_ of EMP_NOUN is: nsubjpass


[RelationDetails(sent_num=0, word_num=0, s=we, v=employed, quantity=7,683, quantity_type='Other', type_token=0, word=individuals),
 RelationDetails(sent_num=1, word_num=0, s=subsidiaries, v=employed, quantity=7,536, quantity_type='Other', type_token=0, word=persons)]