# CB_V_MAX

## Setup

In [None]:
# If necessary: start over from an empty database.
import os
# Guarded so that running this on a fresh checkout (no snorkel.db yet)
# does not raise OSError.
if os.path.exists('snorkel.db'):
    os.remove('snorkel.db')

In [None]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import sys
sys.path.append(os.environ['SNORKELHOME'] + '/tutorials/tables/')

## Parsing

In [None]:
from snorkel import SnorkelSession
# Session handle that the parsing, extraction and annotation calls in the
# following cells all receive.
session = SnorkelSession()

### Define Parser

In [None]:
import os
from snorkel.parser import CorpusParser, HTMLParser, OmniParser
from snorkel.utils import get_ORM_instance
from snorkel.queries import split_corpus

# Directory of hardware HTML documents, located relative to $SNORKELHOME.
docs_path = os.environ['SNORKELHOME'] + '/tutorials/tables/data/hardware/hardware100_html/'
doc_parser = HTMLParser(path=docs_path)
context_parser = OmniParser()
# max_docs=100 presumably caps how many documents are parsed -- confirm
# against CorpusParser.
cp = CorpusParser(doc_parser, context_parser, max_docs=100) 

### Run Parser

In [None]:
# Parse the documents into a corpus named 'Hardware'; %time reports the cost.
%time corpus = cp.parse_corpus(name='Hardware', session=session)

# Stage the corpus on the session and commit it.
session.add(corpus)
session.commit()

### Split Corpus

In [None]:
from snorkel.models import Corpus

# get_ORM_instance and split_corpus come from the imports in the
# "Define Parser" cell.
corpus = get_ORM_instance(Corpus, session, 'Hardware')
# 80% training / 20% development, no test portion. Later cells look up the
# corpora 'Hardware Training' and 'Hardware Development', presumably
# created by this call -- confirm in split_corpus.
split_corpus(session, corpus, train=0.8, development=0.2, test=0, seed=9)

In [None]:
# If necessary: snapshot the database after parsing.
import os
import shutil
# shutil.copy raises when the copy fails; os.system('cp ...') only returned
# an exit status, which was being discarded.
shutil.copy('snorkel.db', 'snorkel.db corpus')

## Extraction

In [None]:
# If necessary:
import os
os.remove('snorkel.db');
os.system('cp snorkel.db\ corpus snorkel.db');

%load_ext autoreload
%autoreload 2
%matplotlib inline

import sys
sys.path.append(os.environ['SNORKELHOME'] + '/tutorials/tables/')

from snorkel import SnorkelSession
session = SnorkelSession()

In [None]:
from snorkel.models import candidate_subclass

# Candidate class for the (part, voltage) relation; the labeling functions
# below read its `part` and `voltage` fields.
Part_Voltage = candidate_subclass('Part_Voltage', ['part','voltage'])

### Define Matchers

In [None]:
from snorkel.matchers import RegexMatchSpan, Union

# Part-number matchers, one per naming scheme (scheme names taken from the
# variable names). The patterns are raw strings so that backslash sequences
# reach the regex engine verbatim, consistent with cb_v_matcher below.
eeca_matcher = RegexMatchSpan(rgx=r'([b]{1}[abcdefklnpqruyz]{1}[\swxyz]?[0-9]{3,5}[\s]?[A-Z\/]{0,5}[0-9]?[A-Z]?([-][A-Z0-9]{1,7})?([-][A-Z0-9]{1,2})?)')
jedec_matcher = RegexMatchSpan(rgx=r'([123]N\d{3,4}[A-Z]{0,5}[0-9]?[A-Z]?)')
jis_matcher = RegexMatchSpan(rgx=r'(2S[abcdefghjkmqrstvz]{1}[\d]{2,4})')
others_matcher = RegexMatchSpan(rgx=r'((NSVBC|SMBT|MJ|MJE|MPS|MRF|RCA|TIP|ZTX|ZT|TIS|TIPL|DTC|MMBT|PZT){1}[\d]{2,4}[A-Z]{0,3}([-][A-Z0-9]{0,6})?([-][A-Z0-9]{0,1})?)')
parts_matcher = Union(eeca_matcher, jedec_matcher, jis_matcher, others_matcher)

# NOTE: This is super specific -- an optional minus sign followed by a digit
# from 2 to 8 and a literal 0 (i.e. 20, 30, ..., 80).
cb_v_matcher = RegexMatchSpan(rgx=r'\-?[2-8]0', longest_match_only=False)

### Define ContextSpaces

In [None]:
import os
from collections import defaultdict
from hardware_utils import OmniNgramsPart, OmniNgramsTemp, get_gold_dict

# Make parts list
gold_file = os.environ['SNORKELHOME'] + '/tutorials/tables/data/hardware/hardware_gold.csv'
gold_parts = get_gold_dict(gold_file, doc_on=True, part_on=True, val_on=False)
# Group the gold entries by their first field, collecting the second field
# of each (presumably document -> part names, given doc_on/part_on above).
parts_by_doc = defaultdict(set)
for part in gold_parts:
    parts_by_doc[part[0]].add(part[1])

part_ngrams = OmniNgramsPart(parts_by_doc=parts_by_doc, n_max=3)

# NOTE(review): the original TODO here mentioned "current represented as an
# Amp rather than a milliamp", which looks carried over from a current
# notebook; the voltage context space also reuses OmniNgramsTemp. Confirm
# both are intended for cb_v_max.
cb_v_ngrams = OmniNgramsTemp(n_max=3)

In [None]:
from snorkel.lf_helpers import *

def CBVThrottler(candidate_spans):
    """
    Decide whether a (part, voltage) span pair is kept as a candidate.

    The pair is kept when the part span is not inside a table. Otherwise it
    is kept only when both spans belong to the same table and the row
    n-grams of the voltage span contain 'vcbo', 'v cbo' or 'collector-base'.

    Args:
        candidate_spans: the pair ``(part_span, attr_span)``.

    Returns:
        True to keep the pair, False to drop it.
    """
    # Unpack in the body: tuple parameters in a signature are Python 2 only.
    part_span, attr_span = candidate_spans

    if part_span.parent.table is None:
        return True

    # Spans that live in different tables are never kept.
    if not (part_span.parent.table == attr_span.parent.table):
        return False

    row_ngrams = set(get_row_ngrams(attr_span))
    return any(key in row_ngrams
               for key in ('vcbo', 'v cbo', 'collector-base'))

### Run CandidateExtractor

In [None]:
from snorkel.models import Corpus
from snorkel.candidates import CandidateExtractor
from snorkel.utils import get_ORM_instance


# Pair each context space with its matcher (parts first, voltage second) and
# filter the resulting pairs through CBVThrottler.
ce = CandidateExtractor(Part_Voltage, 
                        [part_ngrams, cb_v_ngrams], 
                        [parts_matcher, cb_v_matcher],
                        throttler=CBVThrottler)

# Extract one candidate set per corpus split, named '<corpus> Candidates'.
for corpus_name in ['Hardware Training', 'Hardware Development']:
    corpus = get_ORM_instance(Corpus, session, corpus_name)
    print "Extracting Candidates from %s" % corpus
    %time candidates = ce.extract(\
        corpus.documents, corpus_name + ' Candidates', session)
    session.add(candidates)
    print "%s contains %d Candidates" % (candidates, len(candidates))
# Single commit after both splits have been added.
session.commit()

### Assess Recall

In [None]:
from snorkel.models import CandidateSet
from hardware_utils import entity_level_total_recall, most_common_document, get_gold_dict
from snorkel.models import Candidate

# Every candidate in the database, regardless of split.
all_candidates = session.query(Candidate).all()
gold_file = os.environ['SNORKELHOME'] + '/tutorials/tables/data/hardware/hardware_gold.csv'

# Compare the candidates with the 'cb_v_max' gold entries and print the sizes
# of the three returned collections (tp, fp, fn).
(tp, fp, fn) = entity_level_total_recall(
    all_candidates, gold_file, 'cb_v_max', relation=True, integerize=True)
print len(tp)
print len(fp)
print len(fn)

In [None]:
from pprint import pprint
# Print the `fn` entries from the recall cell in sorted order.
fns = list(fn)
pprint(sorted(fns))

In [None]:
# If necessary: snapshot the database after candidate extraction.
import os
import shutil
# shutil.copy raises when the copy fails; os.system('cp ...') only returned
# an exit status, which was being discarded.
shutil.copy('snorkel.db', 'snorkel.db candidates')

## Gold Labels

In [None]:
# If necessary
import os
os.remove('snorkel.db');
os.system('cp snorkel.db\ candidates snorkel.db');

%load_ext autoreload
%autoreload 2
%matplotlib inline

import sys
sys.path.append(os.environ['SNORKELHOME'] + '/tutorials/tables/')

from snorkel import SnorkelSession
session = SnorkelSession()

from snorkel.models import candidate_subclass
Part_Voltage = candidate_subclass('Part_Voltage', ['part','voltage'])

In [None]:
import os
from snorkel.models import CandidateSet
from hardware_utils import load_hardware_labels

gold_file = os.environ['SNORKELHOME'] + '/tutorials/tables/data/hardware/hardware_gold.csv'
# For each split: fetch its candidate set, load 'cb_v_max' gold labels for
# it, then report how many candidates ended up in the gold set.
for set_name in ['Training', 'Development']:
    candidate_set_name = 'Hardware %s Candidates' % set_name
    candidates = session.query(CandidateSet).filter(
        CandidateSet.name == candidate_set_name).one()
    label_set_name = 'Hardware %s Candidates -- Gold' % set_name
    annotation_key_name = 'Hardware %s Labels -- Gold' % set_name
    %time gold_candidates, annotation_key = load_hardware_labels(session,\
                           label_set_name, \
                           annotation_key_name, \
                           candidates, \
                           gold_file, \
                           attrib='cb_v_max')
    # Same name as label_set_name above, re-queried from the database.
    candidates_gold = session.query(CandidateSet).filter(
        CandidateSet.name == candidate_set_name + ' -- Gold').one()
    print "%d/%d Candidates in %s have positive Labels" % (
        len(candidates_gold), len(candidates), candidates)

In [None]:
# If necessary: snapshot the database after loading gold labels.
import os
import shutil
# shutil.copy raises when the copy fails; os.system('cp ...') only returned
# an exit status, which was being discarded.
shutil.copy('snorkel.db', 'snorkel.db labels')

## Features

In [1]:
# If necessary:
import os
os.remove('snorkel.db');
os.system('cp snorkel.db\ labels snorkel.db');

%load_ext autoreload
%autoreload 2
%matplotlib inline

import sys
sys.path.append(os.environ['SNORKELHOME'] + '/tutorials/tables/')

from snorkel import SnorkelSession
session = SnorkelSession()

from snorkel.models import candidate_subclass
Part_Voltage = candidate_subclass('Part_Voltage', ['part','voltage'])

### Extract Features

In [2]:
from snorkel.models import CandidateSet
from snorkel.fast_annotations import FeatureManager
from snorkel.utils import get_ORM_instance

train = get_ORM_instance(CandidateSet, session, 'Hardware Training Candidates')
dev   = get_ORM_instance(CandidateSet, session, 'Hardware Development Candidates')

feature_manager = FeatureManager()
# Build the 'Train Features' key set from the training candidates.
%time F_train = feature_manager.create(session, train, 'Train Features')
# Featurize the development candidates under the same key set;
# expand_key_set=False presumably keeps dev-only features out -- confirm
# in FeatureManager.update.
%time F_dev = feature_manager.update(session, dev, 'Train Features', expand_key_set=False)

Bulk upserting 27236950 annotations...
Done.
Loading sparse Feature matrix...
CPU times: user 58min 1s, sys: 1min 51s, total: 59min 52s
Wall time: 1h 11s
Bulk upserting 3448799 annotations...
Done.
Loading sparse Feature matrix...
CPU times: user 12min 56s, sys: 29 s, total: 13min 25s
Wall time: 13min 25s


In [3]:
# If necessary: snapshot the database after feature extraction.
import os
import shutil
# shutil.copy raises when the copy fails; os.system('cp ...') only returned
# an exit status, which was being discarded.
shutil.copy('snorkel.db', 'snorkel.db featurized')

## LFs

In [None]:
# If necessary
import os
os.remove('snorkel.db');
os.system('cp snorkel.db\ featurized snorkel.db');

%load_ext autoreload
%autoreload 2
%matplotlib inline

import sys
sys.path.append(os.environ['SNORKELHOME'] + '/tutorials/tables/')

from snorkel import SnorkelSession
session = SnorkelSession()

from snorkel.models import candidate_subclass
Part_Voltage = candidate_subclass('Part_Voltage', ['part','voltage'])
from snorkel.models import CandidateSet
train = session.query(CandidateSet).filter(
    CandidateSet.name == 'Hardware Training Candidates').one()

### Define LFs

In [None]:
from snorkel.fast_annotations import LabelManager
from snorkel.lf_helpers import *
# Manager used in the "Apply LFs" cell to run the labeling functions.
label_manager = LabelManager()

# Helpers
def set_all_in_set(a, b):
    '''Return True when every element of a is also a member of the set b.'''
    return b >= set(a)

def set_none_in_set(a, b):
    '''Return True when a and the set b have no element in common.'''
    return b.isdisjoint(a)

def set_any_in_set(a, b):
    '''Return True when at least one element of a is a member of the set b.'''
    return not b.isdisjoint(a)

# Registry of labeling functions; each LF below appends itself, and the
# list is handed to label_manager.create in the "Apply LFs" cell.
LFs = []

###################################################################
# POSITIVE
###################################################################

def LF_voltage_inside_table(c):
    """Return 1 when the voltage span's parent has a row, 0 otherwise."""
    if c.voltage.parent.row is None:
        return 0
    return 1
LFs.append(LF_voltage_inside_table)

# def LF_part_is_aligned(c):
#     return 1 if (c.part.parent.table == c.voltage.parent.table and
#                 (c.part.parent.row_num == c.voltage.parent.row_num or
#                  c.part.parent.col_num == c.voltage.parent.col_num)) else 0
# LFs.append(LF_part_is_aligned)
    
def LF_ce_keywords(c):
    """Return 1 when the voltage's row n-grams (with spaces removed) contain
    every word of at least one keyword group, 0 otherwise.

    NOTE(review): both groups describe collector-*emitter* voltage although
    this notebook targets cb_v_max -- confirm these keywords are intended.
    """
    keyword_groups = (
        set(['collector', 'emitter', 'voltage']),
        set(['collector-emitter', 'voltage']),
    )
    stripped = set(gram.replace(' ', '')
                   for gram in get_row_ngrams(c.voltage, infer=True))
    for group in keyword_groups:
        if set_all_in_set(group, stripped):
            return 1
    return 0
LFs.append(LF_ce_keywords)

def LF_pos_keywords_in_row(c):
    """Return 1 when any positive keyword is among the voltage's row
    n-grams, 0 otherwise.

    NOTE(review): the 'ceo' keywords refer to collector-emitter ratings --
    confirm they are intended for cb_v_max.
    """
    wanted = set(['v ceo', 'ceo', 'vceo', 'value', 'rating'])
    row_ngrams = set(get_row_ngrams(c.voltage, infer=True))
    return 1 if set_any_in_set(wanted, row_ngrams) else 0
LFs.append(LF_pos_keywords_in_row)

def LF_low_table_num(c):
    """Return 1 when the voltage span's table compares <= 2, otherwise -1.

    NOTE(review): this compares `parent.table` itself with an integer; if
    `table` is an object rather than a number the comparison may not mean
    "one of the first tables" -- confirm the intended attribute.
    """
    return 1 if c.voltage.parent.table <= 2 else -1
LFs.append(LF_low_table_num)

def LF_whole_phrase_in_row(c):
    """Return 1 when 'collector-emitter voltage' is one of the voltage's row
    n-grams, 0 otherwise.

    NOTE(review): the phrase names the collector-emitter voltage -- confirm
    it is intended for cb_v_max.
    """
    phrase_present = 'collector-emitter voltage' in set(get_row_ngrams(c.voltage))
    return 1 if phrase_present else 0
LFs.append(LF_whole_phrase_in_row)


###################################################################
# NEGATIVE
###################################################################

def LF_specific_neg_row_keywords(c):
    """Return -1 when the voltage's row n-grams include 'continuous', 'dc'
    or 'cut-off', 0 otherwise."""
    row_ngrams = set(get_row_ngrams(c.voltage, infer=True))
    blocked = set(['continuous', 'dc', 'cut-off'])
    return -1 if set_any_in_set(blocked, row_ngrams) else 0
LFs.append(LF_specific_neg_row_keywords)

def LF_equals_in_row(c):
    """Return -1 when '=' is one of the voltage's row n-grams, 0 otherwise."""
    has_equals = '=' in set(get_row_ngrams(c.voltage))
    return -1 if has_equals else 0
LFs.append(LF_equals_in_row)

def LF_i_in_row(c):
    """Return -1 when 'i' is one of the voltage's row n-grams, 0 otherwise."""
    has_i = 'i' in set(get_row_ngrams(c.voltage))
    return -1 if has_i else 0
LFs.append(LF_i_in_row)

def LF_first_row(c):
    """Return -1 when the voltage span's parent has row_num 0, 0 otherwise."""
    return -1 if c.voltage.parent.row_num == 0 else 0
LFs.append(LF_first_row)
    
def LF_not_ce_relevant(c):
    """Return 1 when any of 'collector', 'emitter' or 'collector-emitter' is
    among the voltage's aligned n-grams, otherwise -1.

    NOTE(review): relevance is judged by collector-emitter vocabulary --
    confirm this is intended for cb_v_max.
    """
    aligned_ngrams = set(get_aligned_ngrams(c.voltage))
    relevant = set(['collector', 'emitter', 'collector-emitter'])
    if set_any_in_set(relevant, aligned_ngrams):
        return 1
    return -1
LFs.append(LF_not_ce_relevant)

def LF_too_many_numbers_row(c):
    """Return -1 when at least four 'number' NER tags appear among the
    voltage's row n-grams, 0 otherwise."""
    number_tags = sum(
        1 for tag in get_row_ngrams(c.voltage, attrib="ner_tags")
        if tag == 'number')
    if number_tags >= 4:
        return -1
    return 0
LFs.append(LF_too_many_numbers_row)

def LF_negative_keywords(c):
    """Return -1 when the voltage's row or column n-grams contain a
    negative keyword, 0 otherwise.

    The row list previously lacked commas after 'vcbo' and 'storage', so
    Python's implicit string concatenation produced the keys 'vcbopeak' and
    'storagebandwidth' and the four intended keywords never matched.

    NOTE(review): 'cbo', 'vcbo' and 'base' are treated as negative evidence
    here, while CBVThrottler keeps in-table candidates only when the row
    mentions 'vcbo' / 'collector-base' -- confirm this list is intended for
    cb_v_max.
    """
    row_neg_keys = set([
        'ambient', 'small-signal', 'cut-off', 'na', 'ma', 'cex',
        'resistance', 'power', 'junction', 'dissipation', 'breakdown',
        'current', 'cbo', 'vcbo', 'peak', '=', 'f', 'p', 'base', 'mw',
        'ebo', 'vebo', 'i c', 'total', 'device', 'c', 'mhz', 'temperature',
        'saturation', 'operating', 'storage', 'bandwidth', 'derate',
        'above', 'product', 'figure', 'conditions', 'current gain',
        'min', 'min.', 'typ', 'typ.', 'max', 'max.', 'gain', 'thermal',
        'test',
    ])
    col_neg_keys = set([
        'conditions', 'condition', 'parameter',
        'min', 'min.', 'typ', 'typ.', 'max', 'max.',
        'test',
    ])
    row_ngrams = set(get_row_ngrams(c.voltage))
    col_ngrams = set(get_col_ngrams(c.voltage))
    if set_any_in_set(row_neg_keys, row_ngrams):
        return -1
    if set_any_in_set(col_neg_keys, col_ngrams):
        return -1

    return 0

LFs.append(LF_negative_keywords)
    
# def LF_negative_keywords_in_col(c):
#     neg_keys = set(['conditions',
#                     'condition',
#                     'parameter',
#                     'test'])
#     ngrams = set(get_col_ngrams(c.voltage))
#     if set_any_in_set(neg_keys, ngrams):
#         return -1
#     else:
#         return 0

# LFs.append(LF_negative_keywords_in_col)

# def LF_negative_keywords_in_part_aligned(c):
#     ngrams = set(get_aligned_ngrams(c.part))
#     return -1 if (
#         'gain'          in ngrams or
#         'small-signal'  in ngrams or
#         'small'         in ngrams or
#         'cbo'         in ngrams or
#         'collector-emitter' in ngrams or
#         'value'         in ngrams or
#         'thermal'       in ngrams) else 0
# LFs.append(LF_negative_keywords_in_part_aligned)

# def LF_negative_keywords(c):
#     ngrams = set(get_aligned_ngrams(c.voltage))
#     return -1 if (
#         'collector-base'    in ngrams or
#         'cut-off'           in ngrams or
#         '='                 in ngrams or
#         'gain'              in ngrams or
#         'h fe'              in ngrams or
#         'typ.'              in ngrams or
#         'typ'               in ngrams or
#         'min'               in ngrams or
#         'min.'              in ngrams or
#         'saturation'        in ngrams or
#         'mhz'               in ngrams or
#         'gain'              in ngrams or
#         'obo'               in ngrams or
#         'c obo'             in ngrams) else 0
# LFs.append(LF_negative_keywords)


### Apply LFs

In [None]:
# Run every function in LFs over the training candidates and store the
# result under the key set 'LF Labels'.
%time L_train = label_manager.create(session, train, 'LF Labels', f=LFs)
# Last expression of the cell: the notebook displays L_train.
L_train

### Assess LF accuracy

In [None]:
# Gold candidate set for the training split, used as the reference for the
# per-LF statistics.
train_gold = session.query(CandidateSet).filter(
    CandidateSet.name == 'Hardware Training Candidates -- Gold').one()
%time L_train.lf_stats(train_gold)

In [None]:
# If necessary: snapshot the database after applying the labeling functions.
import os
import shutil
# shutil.copy raises when the copy fails; os.system('cp ...') only returned
# an exit status, which was being discarded.
shutil.copy('snorkel.db', 'snorkel.db features')

## Learn and Evaluate

In [None]:
# If necessary:
import os
os.remove('snorkel.db');
os.system('cp snorkel.db\ features snorkel.db');

%load_ext autoreload
%autoreload 2
%matplotlib inline

import sys
sys.path.append(os.environ['SNORKELHOME'] + '/tutorials/tables/')

from snorkel import SnorkelSession
session = SnorkelSession()

from snorkel.models import candidate_subclass
Part_Voltage = candidate_subclass('Part_Voltage', ['part','voltage'])
from snorkel.models import CandidateSet
train = session.query(CandidateSet).filter(
    CandidateSet.name == 'Hardware Training Candidates').one()
dev = session.query(CandidateSet).filter(
    CandidateSet.name == 'Hardware Development Candidates').one()

In [None]:
from snorkel.fast_annotations import FeatureManager, LabelManager
feature_manager = FeatureManager()
# Both splits are loaded under the 'Train Features' key set created in the
# "Extract Features" cell.
%time F_train = feature_manager.load(session, train, 'Train Features')
%time F_dev = feature_manager.load(session, dev, 'Train Features')

label_manager = LabelManager()
# LF outputs stored under 'LF Labels' by the "Apply LFs" cell.
%time L_train = label_manager.load(session, train, 'LF Labels')

In [None]:
from snorkel.learning import NaiveBayes

# Fit the generative model on the LF label matrix, persist its parameters,
# then compute the training marginals consumed by the discriminative model.
gen_model = NaiveBayes()
gen_model.train(L_train, n_iter=15000, rate=1e-2)
%time gen_model.save(session, 'Generative Params')
train_marginals = gen_model.marginals(L_train)

In [None]:
from snorkel.learning import LogReg

# Fit the discriminative model on the training features against the
# generative model's marginals, then persist its parameters.
disc_model = LogReg()
disc_model.train(F_train, train_marginals, n_iter=4000, rate=1e-4)
%time disc_model.save(session, "Discriminative Params")

In [None]:
# CandidateSet is used here before this cell's own import of it; that works
# only because an earlier cell already imported it.
train_gold = session.query(CandidateSet).filter(
    CandidateSet.name == 'Hardware Training Candidates -- Gold').one()

dev_gold = session.query(CandidateSet).filter(
    CandidateSet.name == 'Hardware Development Candidates -- Gold').one()

from snorkel.models import CandidateSet
# NOTE(review): the other cells import LabelManager from
# snorkel.fast_annotations; this import rebinds the name to the
# snorkel.annotations class -- confirm that is what the gold labels need.
from snorkel.annotations import LabelManager
label_manager = LabelManager()
# Key name matches the annotation_key_name used when the gold labels were
# loaded in the "Gold Labels" section.
L_dev = label_manager.load(session, dev, 'Hardware Development Labels -- Gold')

tp, fp, tn, fn = disc_model.score(F_dev, L_dev, dev_gold)

In [None]:
from snorkel.models import Corpus
from hardware_utils import entity_level_f1
import os

gold_file = os.environ['SNORKELHOME'] + '/tutorials/tables/data/hardware/hardware_gold.csv'
corpus = session.query(Corpus).filter(Corpus.name == 'Hardware Development').one()
# Score against 'cb_v_max', the gold attribute used for the recall check and
# for gold-label loading; this call previously passed 'eb_v_max', which
# would have evaluated the predictions against a different attribute.
(TP, FP, FN) = entity_level_f1(tp, fp, tn, fn, gold_file, corpus, 'cb_v_max')


In [None]:
from pprint import pprint
# Sorted views of the entity-level results; show the first 60 misses.
FN_list = sorted(FN)
FP_list = sorted(FP)
TP_list = sorted(TP)
pprint(FN_list[:60])

In [None]:
from hardware_utils import entity_to_candidates

# NOTE(review): index 61 is hard-coded; this raises IndexError when FN_list
# has fewer than 62 entries. Adjust to the entity you want to inspect.
entity = FN_list[61]
print entity
print

# Candidates among the dev `fn` results that correspond to this entity.
matches = entity_to_candidates(entity, fn)
print "# Matches: %d" % len(matches)
# Assumes at least one match exists; matches[0] fails on an empty list.
candidate = matches[0]
print candidate

print "\nPhrase:"
print candidate.voltage.parent

# Model score and per-feature weights for the chosen candidate.
print disc_model.get_candidate_score(candidate, F_dev)
print
pprint(disc_model.get_candidate_feature_weights(candidate, F_dev))

In [None]:

# Snapshot the database after learning and evaluation.
import os
import shutil
# shutil.copy raises when the copy fails; os.system('cp ...') only returned
# an exit status, which was being discarded.
shutil.copy('snorkel.db', 'snorkel.db final')