## Part II: `Candidate` Extraction

In [1]:
# If necessary: reset snorkel.db from the 'snorkel.db corpus' snapshot
# (presumably the copy saved at the end of Part I -- confirm).
import os
import shutil

corpus_snapshot = 'snorkel.db corpus'
if not os.path.exists(corpus_snapshot):
    # Fail before touching snorkel.db, so a missing snapshot cannot cost us the working copy.
    raise IOError("Snapshot '%s' not found; snorkel.db left untouched" % corpus_snapshot)
# shutil.copy overwrites the destination, so no separate os.remove is needed
# and the cell also works when snorkel.db does not exist yet.
shutil.copy(corpus_snapshot, 'snorkel.db');

In [2]:
# Enable autoreload so edits to imported modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

import os
import sys
# Put the tables tutorial directory on the import path; requires the SNORKELHOME
# environment variable (a KeyError is raised here if it is unset).
sys.path.append(os.environ['SNORKELHOME'] + '/tutorials/tables/')

from snorkel import SnorkelSession
# Session object shared by every database read and write in this notebook.
session = SnorkelSession()

## Loading the Training `Corpus`

First, we will load the `Corpus` that we preprocessed in Part I:

In [3]:
from snorkel.models import Corpus
from snorkel.utils import get_ORM_instance

# Fetch the Corpus named 'Hardware Training' through the shared session.
corpus = get_ORM_instance(Corpus, session, 'Hardware Training')
# len(corpus) is reported as the number of Documents.
print "%s contains %d Documents" % (corpus, len(corpus))

Corpus (Hardware Training) contains 78 Documents


## Defining a `Candidate` Schema

In [4]:
from snorkel.models import candidate_subclass

# Declare the Part_Temp candidate type with two fields, 'part' and 'temp'.
# A candidate's part is later read as c.part when counting candidates per document.
Part_Temp = candidate_subclass('Part_Temp', ['part','temp'])

## Writing a basic `CandidateExtractor`

In [5]:
# from snorkel.matchers import DictionaryMatch

# from hardware_utils import load_extended_parts_dict
# gold_file ='data/hardware/hardware_gold.csv'
# parts_dict = load_extended_parts_dict(gold_file) # NOTE: this include A/B/C/-16/-25/-40 
# print "Loaded %d part numbers." % len(parts_dict)
# parts_matcher = DictionaryMatch(d=parts_dict)

In [6]:
from snorkel.matchers import RegexMatchSpan, Union

# Part-number patterns, one matcher per naming family. The patterns are raw strings
# (like the temperature pattern elsewhere in this notebook) so that \s, \d and \/
# reach the regex engine verbatim instead of passing through Python's string-escape
# handling. The pattern text itself is unchanged.
eeca_matcher = RegexMatchSpan(rgx=r'([b]{1}[abcdefklnpqruyz]{1}[\swxyz]?[0-9]{3,5}[\s]?[A-Z\/]{0,5}[0-9]?[A-Z]?([-][A-Z0-9]{1,7})?([-][A-Z0-9]{1,2})?)')
jedec_matcher = RegexMatchSpan(rgx=r'([123]N\d{3,4}[A-Z]{0,5}[0-9]?[A-Z]?)')
jis_matcher = RegexMatchSpan(rgx=r'(2S[abcdefghjkmqrstvz]{1}[\d]{2,4})')
# Fixed manufacturer prefixes followed by 2-4 digits and optional suffix groups.
others_matcher = RegexMatchSpan(rgx=r'((NSVBC|SMBT|MJ|MJE|MPS|MRF|RCA|TIP|ZTX|ZT|TIS|TIPL|DTC|MMBT|PZT){1}[\d]{2,4}[A-Z]{0,3}([-][A-Z0-9]{0,6})?([-][A-Z0-9]{0,1})?)')
# Combine the four family matchers into the single matcher used for the 'part' field.
parts_matcher = Union(eeca_matcher, jedec_matcher, jis_matcher, others_matcher)

In [7]:
from snorkel.matchers import RegexMatchSpan

# Temperature pattern: a minus sign, a digit 5-7, then 0 or 5 (-50, -55, ..., -75).
temp_matcher = RegexMatchSpan(rgx=r'-[5-7][05]', longest_match_only=False)

# TEMP

In [8]:
# import os
# from hardware_utils import get_gold_dict
# from collections import defaultdict

# gold_file = os.environ['SNORKELHOME'] + '/tutorials/tables/data/hardware/hardware_gold.csv'
# gold_parts = get_gold_dict(gold_file, doc_on=True, part_on=True, val_on=False)
# gold_parts_by_doc = defaultdict(set)
# for part in gold_parts:
#     gold_parts_by_doc[part[0]].add(part[1])

In [9]:
# print len(gold_parts_by_doc)

In [10]:
# from snorkel.models import Corpus
# from snorkel.utils import get_ORM_instance
# from hardware_utils import OmniNgramsPart
# from snorkel.matchers import RegexMatchSpan

In [11]:
# %%time
# group_matcher = RegexMatchSpan(rgx=r'^(A|B|C|-?16|-?25|-?40)$', ignore_case=False)

# corpus = get_ORM_instance(Corpus, session, 'Hardware')
# part_ngrams_2 = OmniNgramsPart(n_max=3)
# parts_by_doc = defaultdict(set)
# groups_by_doc = defaultdict(set)
# for doc in corpus.documents:
#     for tc in part_ngrams_2.apply(doc):
#         if parts_matcher.f(tc):
#             parts_by_doc[tc.parent.document.name.upper()].add(tc.get_span())
#         if group_matcher._f(tc):
#             if 
#             groups_by_doc[tc.parent.document.name.upper()].add(tc.get_span())

In [12]:
# print set(parts_by_doc.keys()).difference(set(gold_parts_by_doc.keys()))
# print set(gold_parts_by_doc.keys()).difference(set(parts_by_doc.keys()))

In [13]:
# print len(gold_parts_by_doc)
# print len(parts_by_doc)
# print gold_parts_by_doc['PJECS00521-1']

In [14]:
# print groups_by_doc

In [15]:
# from itertools import chain
# from pprint import pprint

# gold_parts = [x for x in chain.from_iterable(gold_parts_by_doc.values())]
# found_parts = [x for x in chain.from_iterable(parts_by_doc.values())]
# print len(gold_parts)
# print len(found_parts)
# print set(gold_parts).difference(set(found_parts))
# missed = []
# for doc in gold_parts_by_doc.keys():
#     for part in gold_parts_by_doc[doc]:
#         if part not in parts_by_doc[doc]:
#              missed.append((doc, part))
# print len(missed)
# pprint(missed)

In [16]:
from snorkel.throttlers import PartThrottler

# Throttler passed as the last argument to the CandidateExtractor below; what it
# filters is defined in snorkel.throttlers, not in this notebook.
part_throttler = PartThrottler()

# TEMP

In [17]:
# TODO: replace dictionary-based parts_by_doc with first-pass parts_by_doc
from hardware_utils import get_gold_dict
from collections import defaultdict

# Gold-label CSV inside the Snorkel checkout.
gold_file = os.environ['SNORKELHOME'] + '/tutorials/tables/data/hardware/hardware_gold.csv'
# Gold entries requested with the doc and part fields on and the value field off.
gold_parts = get_gold_dict(gold_file, doc_on=True, part_on=True, val_on=False)

# Group the second field of each entry under its first field
# (presumably part number under document name -- confirm against get_gold_dict).
parts_by_doc = defaultdict(set)
for part in gold_parts:
    # TODO: change gold_parts to work with namedTuples
    doc_key, part_number = part[0], part[1]
    parts_by_doc[doc_key].add(part_number)

In [18]:
from hardware_utils import OmniNgramsPart, OmniNgramsTemp

# One span generator per candidate field, both configured with n_max=3.
# The part generator additionally receives the gold parts grouped by document.
part_ngrams = OmniNgramsPart(parts_by_doc=parts_by_doc, n_max=3)
temp_ngrams = OmniNgramsTemp(n_max=3)

In [19]:
from snorkel.candidates import CandidateExtractor

# Build the extractor for Part_Temp candidates. Both lists are given in the same
# (part, temp) order as the schema fields; the throttler is passed last.
ce = CandidateExtractor(Part_Temp, 
                        [part_ngrams, temp_ngrams], 
                        [parts_matcher, temp_matcher], 
                        part_throttler)

## Running the `CandidateExtractor`

In [20]:
# Run the extractor over the training documents (timed) and name the resulting set.
%time train = ce.extract(corpus.documents, 'Hardware Training Candidates', session)
print "%s contains %d Candidates" % (train, len(train))


CPU times: user 42.8 s, sys: 340 ms, total: 43.1 s
Wall time: 43.5 s
Candidate Set (Hardware Training Candidates) contains 22335 Candidates


In [21]:
# Print the first three candidates as a quick sanity check.
for c in train[:3]:
    print c

Part_Temp(Span("BC550", parent=18435, chars=[32,36], words=[8,8]), ImplicitSpan("-65", parent=100729, words=[0,0], position=[0]))
Part_Temp(ImplicitSpan("BC546BCTA", parent=18435, words=[0,0], position=[0]), ImplicitSpan("-65", parent=100729, words=[0,0], position=[0]))
Part_Temp(ImplicitSpan("BC546ATA", parent=18435, words=[0,0], position=[1]), ImplicitSpan("-65", parent=100729, words=[0,0], position=[0]))


In [22]:
# from collections import defaultdict
# from snorkel.utils import ProgressBar

# def get_candidate_id(c):
#     return c.part.get_stable_id() + c.temp.get_stable_id()

# seen = defaultdict(int)
# pb = ProgressBar(len(train))
# for i, c in enumerate(train):
#     pb.bar(i)
#     seen[get_candidate_id(c)] += 1
#     if seen[get_candidate_id(c)] == 2:
#         import pdb; pdb.set_trace()
# pb.close()

In [23]:
# print type(train[0])

In [24]:
# print len(train) == len(set([c for c in train]))

### Saving the extracted candidates

In [25]:
# Add the training candidate set to the session, then commit.
session.add(train)
session.commit()

### Reloading the candidates

In [26]:
from snorkel.models import CandidateSet
from snorkel.utils import get_ORM_instance

# Look the committed set up again by name to confirm it can be reloaded.
train = get_ORM_instance(CandidateSet, session, 'Hardware Training Candidates')
print "%s contains %d Candidates" % (train, len(train))

Candidate Set (Hardware Training Candidates) contains 22335 Candidates


### Repeating for development and test corpora

In [27]:
# NOTE(review): only the development corpus is listed, although the section heading
# also mentions a test corpus -- confirm whether it should be added to this list.
# NOTE: the loop rebinds `corpus`, so afterwards it no longer refers to the training corpus.
for corpus_name in ['Hardware Development']:
    corpus = get_ORM_instance(Corpus, session, corpus_name)
    print "Extracting Candidates from %s" % corpus
    # Each candidate set is named "<corpus name> Candidates".
    %time candidates = ce.extract(corpus.documents, corpus_name + ' Candidates', session)
    session.add(candidates)
    print "%s contains %d Candidates" % (candidates, len(candidates))
# A single commit after the loop covers every set added above.
session.commit()

Extracting Candidates from Corpus (Hardware Development)

CPU times: user 15.7 s, sys: 84 ms, total: 15.8 s
Wall time: 16.3 s
Candidate Set (Hardware Development Candidates) contains 7914 Candidates


In [28]:
# train = get_ORM_instance(Corpus, session, 'Hardware Training')
# dev = get_ORM_instance(Corpus, session, 'Hardware Development')
# test = get_ORM_instance(Corpus, session, 'Hardware Test')
# trainies = [d.name for d in train.documents]
# len(trainies)
# for d in test.documents:
#     if d.name in trainies:
#         print 'YES!'
# # for d in test.documents[:10]: print d

## TEMPORARY - Assessing Total Recall

In [29]:
from hardware_utils import entity_level_total_recall
from snorkel.utils import get_ORM_instance

# Reload both candidate sets by name with get_ORM_instance, the same helper used when
# the training set was reloaded after saving (CandidateSet is imported in that cell).
train = get_ORM_instance(CandidateSet, session, 'Hardware Training Candidates')
dev = get_ORM_instance(CandidateSet, session, 'Hardware Development Candidates')

# Pool every candidate from both splits into one set for the recall computation.
total_set = set(train) | set(dev)

In [30]:
print list(total_set)[0]

Part_Temp(ImplicitSpan("BC858C", parent=88990, words=[2,4], position=[11]), ImplicitSpan("-65", parent=88993, words=[0,0], position=[0]))


In [31]:
print len(train)
train_set = set([c for c in train])
print len(train_set)

print len(dev)
dev_set = set([c for c in dev])
print len(dev_set)

print len(total_set)

22335
22335
7914
7914
30249


In [32]:
import os
gold_file = os.environ['SNORKELHOME'] + '/tutorials/tables/data/hardware/hardware_gold.csv'
# Score the pooled candidates against the gold file for the 'stg_temp_min' attribute.
# The call returns three values, unpacked here as (tp, fp, fn).
(tp, fp, fn) = entity_level_total_recall(total_set, gold_file, 'stg_temp_min', relation=True)

Preparing candidates...

Scoring on Entity-Level Total Recall
Entity-level Candidates extracted: 1168 
Entity-level Gold: 831
Intersection Candidates: 804
----------------------------------------
Overlap with Gold:  0.9675



In [33]:
# Display fn from the recall computation above
# (presumably the gold entries that no candidate matched -- confirm in hardware_utils).
fn

{('LTSCS02912-1', 'BC846AW', '-55'),
 ('LTSCS02912-1', 'BC846BW', '-55'),
 ('TKCGS00622-1', 'DTC114TE', '-55'),
 ('TKCGS00622-1', 'DTC114YE', '-55'),
 ('TKCGS00622-1', 'DTC123EE', '-55'),
 ('TKCGS00622-1', 'DTC124EE', '-55'),
 ('TKCGS00622-1', 'DTC124XE', '-55'),
 ('TKCGS00622-1', 'DTC143EE', '-55'),
 ('TKCGS00622-1', 'DTC143TE', '-55'),
 ('TKCGS00622-1', 'DTC143ZE', '-55'),
 ('TKCGS00622-1', 'DTC144EE', '-55'),
 ('UTCLS02155-1', 'BC807', '-65'),
 ('UTCLS02155-1', 'BC807-16', '-65'),
 ('UTCLS02155-1', 'BC807-25', '-65'),
 ('UTCLS02155-1', 'BC807-40', '-65'),
 ('UTCLS02155-1', 'BC808', '-65'),
 ('UTCLS02155-1', 'BC808-16', '-65'),
 ('UTCLS02155-1', 'BC808-25', '-65'),
 ('UTCLS02155-1', 'BC808-40', '-65'),
 ('VSMIS00373-1', 'BC337', '-65'),
 ('VSMIS00373-1', 'BC337-16', '-65'),
 ('VSMIS00373-1', 'BC337-25', '-65'),
 ('VSMIS00373-1', 'BC337-40', '-65'),
 ('VSMIS00373-1', 'BC338', '-65'),
 ('VSMIS00373-1', 'BC338-16', '-65'),
 ('VSMIS00373-1', 'BC338-25', '-65'),
 ('VSMIS00373-1', 'BC338-40', '-65')}

In [34]:
from hardware_utils import get_gold_dict
gold_file = os.environ['SNORKELHOME'] + '/tutorials/tables/data/hardware/hardware_gold.csv'
gold_attrib = 'stg_temp_min'
# NOTE(review): gold_attrib is passed positionally here, whereas the earlier call in this
# notebook passes doc_on/part_on/val_on by keyword after the filename. Confirm that the
# second positional parameter of get_gold_dict really is the attribute filter; the
# count printed below is far larger than the entity-level gold count reported by the
# recall cell, which may mean no attribute filtering is applied.
gold = gold_dict = get_gold_dict(gold_file, gold_attrib)
print len(gold)

12611


In [35]:
from collections import defaultdict
gold_dict_by_doc = defaultdict(set)
for g in gold_dict:
    gold_dict_by_doc[g[0]].add(g)
print sum([len(gold_dict_by_doc[g]) for g in gold_dict_by_doc])

12611


In [36]:
# from snorkel.utils import ProgressBar
# target = sorted(list(fn))[-1]
# print target
# print "-------------------------------"
# pb = ProgressBar(len(candidates))
# for i, c in enumerate(list(candidates)[:]):
#     pb.bar(i)
#     if (c.part.parent.document.name.upper() == target[0].upper())
#         and c.part.get_span().upper() == target[1].upper()):
#         print c
# pb.close()
# print len(tp)
# for c in sorted(list(tp))[:5]:
#     print c
# print "-------------------------------"
# print len(fn)
# for c in sorted(list(fn))[:50]:
#     print c

In [37]:
# from hardware_utils import part_error_analysis

# for c in total_set:
#     if c.part.parent.document.name.upper()=='BC546-D' and c.part.get_span() == 'BC547':
#         part_error_analysis(c)
#         import pdb; pdb.set_trace()

In [38]:
# corpus = session.query(Corpus).filter(Corpus.name == 'Hardware').one()

# for doc in corpus.documents:
#     if doc.name == 'PNJIS00254-1':
#         d = doc
#         break
# print d

In [39]:
# for phrase in d.phrases:
#     if '55' in phrase.words:
#         p = phrase
#         print p.cell
#         import pdb; pdb.set_trace()

In [40]:
# candies = sorted(candidates, key=lambda x: x[0])

In [41]:
# from hardware_utils import count_hardware_labels

# filename = os.environ['SNORKELHOME'] + '/tutorials/tables/data/hardware/hardware_gold.csv'
# %time count_hardware_labels(candidates, filename, attrib='stg_temp_min', attrib_class='temp')

In [42]:
import os
import shutil

# Restore snorkel.db from the 'snorkel.db candidates' snapshot when one exists.
# A missing snapshot is skipped quietly, matching the previous shell `cp` whose
# failure was ignored; shutil.copy avoids the shell and the escaped-space path.
if os.path.exists('snorkel.db candidates'):
    shutil.copy('snorkel.db candidates', 'snorkel.db')

In [43]:
candidates_by_doc = defaultdict(int)
for c in total_set:
    candidates_by_doc[c.part.parent.document.name] += 1
print sum(sorted(candidates_by_doc.values())[:50])

1173


## TEMPORARY - Return to Normalcy

In [44]:
# If necessary
import os
os.system('cp snorkel.db snorkel.db\ candidates');

Next, in Part III, we will load `Labels` for each of our `Candidates` so that we can evaluate performance.