In [4]:
cd ..

/Users/nguyen/projects/archi


In [69]:
from src.archi_nlp import Archi, NER_data

import pandas as pd

from nltk import Tree


### Create Archi 🌈

In [6]:
# Build the project's Archi helper object used by every cell below.
# NOTE(review): the argument is presumably the name of the spaCy model to load — confirm in src/archi_nlp.
archi = Archi('en_core_web_lg')

### Get the NER train data

In [15]:
# Read the pickled NER training examples; presumably this populates archi.ner_train_data,
# which the next cell inspects.
# NOTE(review): n_copy=8 presumably replicates each example 8 times — confirm in src/archi_nlp.
archi.get_ner_train_data('data/ner/18-05-11_ner.pkl', n_copy=8)

In [16]:
# Sanity check: how many entries the loaded training data holds.
len(archi.ner_train_data)

1080

### Train the nlp model with the new entity labels

In [19]:
# Train the NER component with the extra 'AEC' label for 10 iterations; per the log below,
# the model is saved to output_dir and then reloaded into the Archi object.
# NOTE(review): the logged 'ner' loss hovers around 8,000-8,170 across all 10 iterations
# instead of decreasing — check the training setup before relying on this model.
archi.ner_train(output_dir='src/models/nlp/0511_nlp', new_label=['AEC'], n_iter=10)

  ret = sqrt(sqnorm)


{'ner': 8032.2865986598163}
{'ner': 8066.4323207047037}
{'ner': 8149.8010117247195}
{'ner': 8135.4443989713618}
{'ner': 8170.5112680544898}
{'ner': 8133.8771987438458}
{'ner': 8077.9465594933099}
{'ner': 8006.6611939708391}
{'ner': 8063.4386998443806}
{'ner': 8048.9531531616149}
Saved model to src/models/nlp/0511_nlp
Loading from src/models/nlp/0511_nlp
Archi object has been updated to new model


### Get the raw data

In [20]:
# Load the raw data from its pickle (path is relative to the project root).
# NOTE(review): presumably a pickled DataFrame, judging by the 'raw_df' folder name — confirm.
archi.get_raw_data('data/raw_df/ibc.pkl')

### Fit the raw data with the new nlp model

In [21]:
# Run the (retrained) nlp model over the raw data loaded above.
# NOTE(review): where the results are stored is not visible here — see src/archi_nlp.
archi.fit_nlp()

### Pickle the nlp model to avoid training the default model again

In [22]:
# Persist the result to disk so the expensive steps above need not be repeated.
# NOTE(review): the header says "nlp model", but the method name and the 'nlp_df' target path
# suggest this pickles the nlp-processed raw data rather than the model — confirm.
archi.pickle_raw_nlp('data/nlp_df/raw_nlp_0511.pkl')

### Check the results by building a new random sample of NER train data

In [23]:
# Build a fresh sample of NER training candidates to spot-check the new model.
# NOTE(review): sampling is presumably random (per the header); no seed is set in this notebook.
archi.build_ner_train()

In [70]:
def tok_format(tok):
    """Return the tree-node label for a token: its verbatim text and its tag joined by '_'.

    For example, a token with ``orth_ == 'permitted'`` and ``tag_ == 'VBN'``
    yields ``'permitted_VBN'``.
    """
    pieces = (tok.orth_, tok.tag_)
    return "_".join(pieces)


def to_nltk_tree(node):
    """Convert a dependency-parse node and everything below it into an nltk Tree.

    A node without children (n_lefts + n_rights == 0) is returned as its plain
    string label from tok_format; any other node becomes a Tree whose label is
    tok_format(node) and whose children are the recursively converted child nodes.
    """
    has_children = node.n_lefts + node.n_rights > 0
    if not has_children:
        return tok_format(node)

    label = tok_format(node)
    subtrees = [to_nltk_tree(kid) for kid in node.children]
    return Tree(label, subtrees)

### Serve one row of data at a time to the reviewer

In [96]:
# Fetch the next record to review. view_data displays it as a
# (text, {'entities': [...]}) tuple, as shown in the output below.
docket = archi.review_ner_train()
docket.view_data

('Alternate setbacks and clearances are permitted, subject to the approval of the building official. The building official shall be permitted to require a geotechnical investigation as set forth in Section 1803.5.10.',
 {'entities': []})

In [97]:
# Keep a handle on the docket's doc; the cells below use it as a parsed document
# (iterating tokens, .sents, .noun_chunks).
doc = docket.doc

In [98]:
# Draw the dependency tree of every sentence in the doc.
# A plain loop is used instead of a list comprehension so the cell does not build
# (and echo) a throwaway list of None values just for the printing side effect.
for sent in doc.sents:
    sent_tree = to_nltk_tree(sent.root)
    if isinstance(sent_tree, Tree):
        sent_tree.pretty_print()
    else:
        # A root without children comes back as a bare string label,
        # which has no pretty_print() method.
        print(sent_tree)

                             permitted_VBN                                                          
    _______________________________|__________________________________                               
   |     |   |                     |                              subject_JJ                        
   |     |   |                     |                                  |                              
   |     |   |                     |                                to_IN                           
   |     |   |                     |                                  |                              
   |     |   |                     |                             approval_NN                        
   |     |   |                     |                         _________|___________                   
   |     |   |                     |                        |                   of_IN               
   |     |   |                     |                        |                     |    

[None, None]

In [99]:
# One line per token: text, dependency label, head text, head coarse POS,
# list of direct children, lemma.
for tok in doc:
    head = tok.head
    kids = list(tok.children)
    print(tok.text, tok.dep_, head.text, head.pos_, kids, tok.lemma_)

Alternate amod setbacks NOUN [] alternate
setbacks nsubjpass permitted VERB [Alternate, and, clearances] setback
and cc setbacks NOUN [] and
clearances conj setbacks NOUN [] clearance
are auxpass permitted VERB [] be
permitted ROOT permitted VERB [setbacks, are, ,, subject, .] permit
, punct permitted VERB [] ,
subject amod permitted VERB [to] subject
to prep subject ADJ [approval] to
the det approval NOUN [] the
approval pobj to ADP [the, of] approval
of prep approval NOUN [official] of
the det official NOUN [] the
building compound official NOUN [] building
official pobj of ADP [the, building] official
. punct permitted VERB [] .
The det official NOUN [] the
building compound official NOUN [] building
official nsubjpass permitted VERB [The, building] official
shall aux permitted VERB [] shall
be auxpass permitted VERB [] be
permitted ROOT permitted VERB [official, shall, be, require, .] permit
to aux require VERB [] to
require xcomp permitted VERB [to, investigation, set] require
a d

In [100]:
# Text of every token in the doc whose coarse part-of-speech tag is NOUN.
nouns = [tok.text for tok in doc if tok.pos_ == 'NOUN']

In [89]:
# NOTE(review): this cell's recorded output comes from an earlier execution (In [89]) on a
# different doc — none of the listed nouns occur in the sentence reviewed above.
# Re-run after the `nouns = ...` cell to refresh it.
nouns

['penetrations', 'fire', 'resistance', 'assembly']

In [106]:
# Print each noun chunk found in the doc, one per line.
for noun_phrase in doc.noun_chunks:
    print(noun_phrase)

Alternate setbacks
clearances
the approval
the building official
The building official
a geotechnical investigation


In [108]:
# Show the parse as nested dicts (word, lemma, POS_coarse, POS_fine, arc, NE, modifiers),
# as seen in the output below.
# NOTE(review): Doc.print_tree is deprecated from spaCy v2.1 in favour of Doc.to_json —
# confirm against the installed spaCy version before upgrading.
doc.print_tree()

[{'NE': '',
  'POS_coarse': 'VERB',
  'POS_fine': 'VBN',
  'arc': 'ROOT',
  'lemma': 'permit',
  'modifiers': [{'NE': '',
    'POS_coarse': 'NOUN',
    'POS_fine': 'NNS',
    'arc': 'nsubjpass',
    'lemma': 'setback',
    'modifiers': [{'NE': '',
      'POS_coarse': 'ADJ',
      'POS_fine': 'JJ',
      'arc': 'amod',
      'lemma': 'alternate',
      'modifiers': [],
      'word': 'Alternate'},
     {'NE': '',
      'POS_coarse': 'CCONJ',
      'POS_fine': 'CC',
      'arc': 'cc',
      'lemma': 'and',
      'modifiers': [],
      'word': 'and'},
     {'NE': '',
      'POS_coarse': 'NOUN',
      'POS_fine': 'NNS',
      'arc': 'conj',
      'lemma': 'clearance',
      'modifiers': [],
      'word': 'clearances'}],
    'word': 'setbacks'},
   {'NE': '',
    'POS_coarse': 'VERB',
    'POS_fine': 'VBP',
    'arc': 'auxpass',
    'lemma': 'be',
    'modifiers': [],
    'word': 'are'},
   {'NE': '',
    'POS_coarse': 'PUNCT',
    'POS_fine': ',',
    'arc': 'punct',
    'lemma': ',',
    '