# CE_V_MAX

## Setup

In [None]:
# If necessary: start from a clean database.
# Guarded so that re-running the cell when no database exists is a no-op
# instead of an OSError.
import os
if os.path.exists('snorkel.db'):
    os.remove('snorkel.db')

In [None]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

# `os` is imported here as well so this cell does not depend on the optional
# database-reset cell having been executed first.
import os
import sys
# Make the tutorial's helper modules importable.
sys.path.append(os.environ['SNORKELHOME'] + '/tutorials/tables/')

## Parsing

In [None]:
from snorkel import SnorkelSession
# Session handle that the parsing, splitting and query calls below receive.
session = SnorkelSession()

### Define Parser

In [None]:
import os
from snorkel.parser import CorpusParser, HTMLParser, OmniParser
from snorkel.utils import get_ORM_instance
from snorkel.queries import split_corpus

# Directory of hardware HTML documents inside the SNORKELHOME checkout.
docs_path = os.environ['SNORKELHOME'] + '/tutorials/tables/data/hardware/hardware100_html/'
# Document-level and context-level parsers handed to the CorpusParser.
doc_parser = HTMLParser(path=docs_path)
context_parser = OmniParser()
# max_docs=100 presumably caps the number of documents parsed — confirm.
cp = CorpusParser(doc_parser, context_parser, max_docs=100) 

### Run Parser

In [None]:
# Parse the corpus under the name 'Hardware' (timed via IPython's %time).
%time corpus = cp.parse_corpus(name='Hardware', session=session)

# Register the parsed corpus with the session and commit it.
session.add(corpus)
session.commit()

### Split Corpus

In [None]:
from snorkel.models import Corpus

# Look up the parsed 'Hardware' corpus and split it with fractions 0.8 train /
# 0.2 development / 0 test, using a fixed seed.
# NOTE(review): later cells look up 'Hardware Training' and
# 'Hardware Development' — presumably the names this call creates; confirm.
corpus = get_ORM_instance(Corpus, session, 'Hardware')
split_corpus(session, corpus, train=0.8, development=0.2, test=0, seed=6)

In [None]:
# If necessary: snapshot the database after parsing as 'snorkel.db corpus'.
# shutil.copyfile avoids the shell (and the backslash-escaped space in the
# command string) and raises if the copy fails instead of failing silently.
import os
import shutil
shutil.copyfile('snorkel.db', 'snorkel.db corpus')

## Extraction

In [None]:
# If necessary:
import os
os.remove('snorkel.db');
os.system('cp snorkel.db\ corpus snorkel.db');

%load_ext autoreload
%autoreload 2
%matplotlib inline

import sys
sys.path.append(os.environ['SNORKELHOME'] + '/tutorials/tables/')

from snorkel import SnorkelSession
session = SnorkelSession()

In [None]:
from snorkel.models import candidate_subclass

# Candidate class named 'Part_Voltage' with the two fields 'part' and 'voltage'.
Part_Voltage = candidate_subclass('Part_Voltage', ['part','voltage'])

### Define Matchers

In [None]:
from snorkel.matchers import RegexMatchSpan, Union

# Part-number patterns (eeca, jedec, jis, others), combined with Union below.
# Raw strings keep the regex escapes (\s, \d, \/) out of Python's own
# string-escape processing; the pattern text itself is unchanged.
eeca_matcher = RegexMatchSpan(rgx=r'([b]{1}[abcdefklnpqruyz]{1}[\swxyz]?[0-9]{3,5}[\s]?[A-Z\/]{0,5}[0-9]?[A-Z]?([-][A-Z0-9]{1,7})?([-][A-Z0-9]{1,2})?)')
jedec_matcher = RegexMatchSpan(rgx=r'([123]N\d{3,4}[A-Z]{0,5}[0-9]?[A-Z]?)')
jis_matcher = RegexMatchSpan(rgx=r'(2S[abcdefghjkmqrstvz]{1}[\d]{2,4})')
others_matcher = RegexMatchSpan(rgx=r'((NSVBC|SMBT|MJ|MJE|MPS|MRF|RCA|TIP|ZTX|ZT|TIS|TIPL|DTC|MMBT|PZT){1}[\d]{2,4}[A-Z]{0,3}([-][A-Z0-9]{0,6})?([-][A-Z0-9]{0,1})?)')
parts_matcher = Union(eeca_matcher, jedec_matcher, jis_matcher, others_matcher)

# Voltage values: an optional minus sign, a digit 2-6, then 0 or 5.
# TODO(review): the note previously attached here spoke of values given in
# amps rather than milliamps, which does not fit a voltage matcher — confirm
# what, if anything, this pattern is still missing.
ce_v_matcher = RegexMatchSpan(rgx=r'\-?[2-6][05]', longest_match_only=False)

### Define ContextSpaces

In [None]:
import os
from collections import defaultdict
from hardware_utils import OmniNgramsPart, OmniNgramsTemp, get_gold_dict

# Collect the gold part names per document.
gold_file = os.environ['SNORKELHOME'] + '/tutorials/tables/data/hardware/hardware_gold.csv'
gold_parts = get_gold_dict(gold_file, doc_on=True, part_on=True, val_on=False)
parts_by_doc = defaultdict(set)
for gold_entry in gold_parts:
    # Entry layout assumed from doc_on/part_on: index 0 is the document,
    # index 1 the part — TODO confirm against get_gold_dict.
    doc_key, part_name = gold_entry[0], gold_entry[1]
    parts_by_doc[doc_key].add(part_name)

part_ngrams = OmniNgramsPart(parts_by_doc=parts_by_doc, n_max=3)

# TODO(review): the earlier note here mentioned currents given in amps rather
# than milliamps; this space feeds the voltage matcher, so the note looks
# stale — confirm.
ce_v_ngrams = OmniNgramsTemp(n_max=3)

In [None]:
from snorkel.lf_helpers import *

def CEVThrottler(candidate_spans):
    """
    Decide whether a (part, attribute) span pair is kept as a candidate.

    The pair is kept when the part is not inside a table. Otherwise it is
    kept only when both spans are in the same table and share a row or a
    column, and the attribute's row n-grams contain 'ceo', 'vceo' or
    'collector-emitter'.

    :param candidate_spans: a 2-tuple ``(part_span, attr_span)``.
    :return: True to keep the candidate, False to drop it.
    """
    # Unpack in the body: tuple parameters in a signature are Python 2 only
    # syntax (removed by PEP 3113). Callers still pass one tuple argument.
    part_span, attr_span = candidate_spans

    def aligned(span1, span2):
        # Same table, and same row or same column.
        return (span1.parent.table == span2.parent.table and
            (span1.parent.row_num == span2.parent.row_num or
             span1.parent.col_num == span2.parent.col_num))

    if part_span.parent.table is None:
        return True

    if not aligned(part_span, attr_span):
        return False

    ngrams = set(get_row_ngrams(attr_span))
    return any(keyword in ngrams
               for keyword in ('ceo', 'vceo', 'collector-emitter'))

### Run CandidateExtractor

In [None]:
from snorkel.models import Corpus
from snorkel.candidates import CandidateExtractor
from snorkel.utils import get_ORM_instance


ce = CandidateExtractor(Part_Voltage, 
                        [part_ngrams, ce_v_ngrams], 
                        [parts_matcher, ce_v_matcher],
                        throttler=CEVThrottler)

for corpus_name in ['Hardware Training', 'Hardware Development']:
    corpus = get_ORM_instance(Corpus, session, corpus_name)
    print "Extracting Candidates from %s" % corpus
    %time candidates = ce.extract(\
        corpus.documents, corpus_name + ' Candidates', session)
    session.add(candidates)
    print "%s contains %d Candidates" % (candidates, len(candidates))
session.commit()

### Assess Recall

In [None]:
from snorkel.models import CandidateSet
from hardware_utils import entity_level_total_recall, most_common_document, get_gold_dict
from snorkel.models import Candidate

# Entity-level recall of all extracted candidates against the gold file for
# the 'ce_v_max' attribute.
all_candidates = session.query(Candidate).all()
gold_file = os.environ['SNORKELHOME'] + '/tutorials/tables/data/hardware/hardware_gold.csv'

# gold_dict = get_gold_dict(gold_file, attrib='c_current_max')

(tp, fp, fn) = entity_level_total_recall(
    all_candidates, gold_file, 'ce_v_max', relation=True)
# Sizes of the three result collections; print(...) with one argument works
# on both Python 2 and 3.
print(len(tp))
print(len(fp))
print(len(fn))

In [None]:
from pprint import pprint
# Materialize the false negatives and pretty-print them in sorted order.
fns = list(fn)
missed_sorted = sorted(fns)
pprint(missed_sorted)

In [None]:
# If necessary: snapshot the database after extraction as
# 'snorkel.db candidates'.
# shutil.copyfile avoids the shell (and the backslash-escaped space in the
# command string) and raises if the copy fails instead of failing silently.
import os
import shutil
shutil.copyfile('snorkel.db', 'snorkel.db candidates')

## Gold Labels

In [None]:
# If necessary
import os
os.remove('snorkel.db');
os.system('cp snorkel.db\ candidates snorkel.db');

%load_ext autoreload
%autoreload 2
%matplotlib inline

import sys
sys.path.append(os.environ['SNORKELHOME'] + '/tutorials/tables/')

from snorkel import SnorkelSession
session = SnorkelSession()

from snorkel.models import candidate_subclass
Part_Voltage = candidate_subclass('Part_Voltage', ['part','voltage'])

In [None]:
import os
from snorkel.models import CandidateSet
from hardware_utils import load_hardware_labels

gold_file = os.environ['SNORKELHOME'] + '/tutorials/tables/data/hardware/hardware_gold.csv'
for set_name in ['Training', 'Development']:
    candidate_set_name = 'Hardware %s Candidates' % set_name
    candidates = session.query(CandidateSet).filter(
        CandidateSet.name == candidate_set_name).one()
    label_set_name = 'Hardware %s Candidates -- Gold' % set_name
    annotation_key_name = 'Hardware %s Labels -- Gold' % set_name
    %time gold_candidates, annotation_key = load_hardware_labels(session,\
                           label_set_name, \
                           annotation_key_name, \
                           candidates, \
                           gold_file, \
                           attrib='ce_v_max')
    candidates_gold = session.query(CandidateSet).filter(
        CandidateSet.name == candidate_set_name + ' -- Gold').one()
    print "%d/%d Candidates in %s have positive Labels" % (
        len(candidates_gold), len(candidates), candidates)

In [None]:
# If necessary: snapshot the database after gold labelling as
# 'snorkel.db labels'.
# shutil.copyfile avoids the shell (and the backslash-escaped space in the
# command string) and raises if the copy fails instead of failing silently.
import os
import shutil
shutil.copyfile('snorkel.db', 'snorkel.db labels')

## Features

In [None]:
# If necessary:
import os
os.remove('snorkel.db');
os.system('cp snorkel.db\ labels snorkel.db');

%load_ext autoreload
%autoreload 2
%matplotlib inline

import sys
sys.path.append(os.environ['SNORKELHOME'] + '/tutorials/tables/')

from snorkel import SnorkelSession
session = SnorkelSession()

from snorkel.models import candidate_subclass
Part_Voltage = candidate_subclass('Part_Voltage', ['part','voltage'])

### Extract Features

In [None]:
from snorkel.models import CandidateSet
from snorkel.fast_annotations import FeatureManager
from snorkel.utils import get_ORM_instance

# Candidate sets produced by the extraction step.
train = get_ORM_instance(CandidateSet, session, 'Hardware Training Candidates')
dev   = get_ORM_instance(CandidateSet, session, 'Hardware Development Candidates')

feature_manager = FeatureManager()
# Create the 'Train Features' annotations from the training candidates ...
%time F_train = feature_manager.create(session, train, 'Train Features')
# ... then annotate the development candidates under the same name, with
# expand_key_set=False (presumably so no new feature keys are added — confirm).
%time F_dev = feature_manager.update(session, dev, 'Train Features', expand_key_set=False)

In [None]:
# If necessary: snapshot the database after featurization as
# 'snorkel.db featurized'.
# shutil.copyfile avoids the shell (and the backslash-escaped space in the
# command string) and raises if the copy fails instead of failing silently.
import os
import shutil
shutil.copyfile('snorkel.db', 'snorkel.db featurized')

## LFs

In [1]:
# If necessary
import os
os.remove('snorkel.db');
os.system('cp snorkel.db\ featurized snorkel.db');

%load_ext autoreload
%autoreload 2
%matplotlib inline

import sys
sys.path.append(os.environ['SNORKELHOME'] + '/tutorials/tables/')

from snorkel import SnorkelSession
session = SnorkelSession()

from snorkel.models import candidate_subclass
Part_Voltage = candidate_subclass('Part_Voltage', ['part','voltage'])
from snorkel.models import CandidateSet
train = session.query(CandidateSet).filter(
    CandidateSet.name == 'Hardware Training Candidates').one()

### Define LFs

In [None]:
from snorkel.fast_annotations import LabelManager
from snorkel.lf_helpers import *
# Manager used below to apply the labeling functions to the training set.
label_manager = LabelManager()

# Labeling functions are appended to this list as they are defined.
LFs = []

# POSITIVE

# def LF_head_is_IC(c):
#     head_ngrams = set(get_head_ngrams(c.current, axis='row'))
#     return 1 if ('ic' in head_ngrams) else 0
# LFs.append(LF_head_is_IC)

# def LF_current_inside_table(c):
#     return 1 if c.current.parent.row is not None else 0
# LFs.append(LF_current_inside_table)

# def LF_part_is_aligned(c):
#     return 1 if (c.part.parent.row == c.voltage.parent.row or
#                   c.part.parent.col == c.voltage.parent.col) else 0
# LFs.append(LF_part_is_aligned)

# def LF_current_row(c):
#     return 1 if 'current' in get_row_ngrams(c.current) else 0
# LFs.append(LF_current_row)

# def LF_collector_row(c):
#     return 1 if 'collector' in get_row_ngrams(c.current) else 0
# LFs.append(LF_collector_row)

# def LF_all_one(c):
#     return 1
# LFs.append(LF_all_one)
    
# def LF_ce_keywords(c):
#     row_ngrams = set(get_row_ngrams(c.voltage))
#     return 1 if (
#         ('collector-emitter' in row_ngrams or
#          ('collector' in row_ngrams and
#           'emitter' in row_ngrams)) and
#         'voltage'           in row_ngrams) else 0
# LFs.append(LF_ce_keywords)

def LF_pos_keywords_in_row(c):
    """Return 1 if the voltage's row n-grams (infer=True) contain a V_CEO keyword, else 0."""
    row_ngrams = set(get_row_ngrams(c.voltage, infer=True))
    positive_keywords = ('v ceo', 'ceo', 'vceo')
    if any(keyword in row_ngrams for keyword in positive_keywords):
        return 1
    return 0
LFs.append(LF_pos_keywords_in_row)

# NEGATIVE

def LF_continuous_left(c):
    """Return -1 if 'continuous' or 'dc' is among the left n-grams (window=7) of the voltage, else 0."""
    window_ngrams = set(get_left_ngrams(c.voltage, window=7))
    if window_ngrams & {'continuous', 'dc'}:
        return -1
    return 0
LFs.append(LF_continuous_left)

def LF_not_ce_relevant(c):
    """Return -1 if none of the collector/emitter keywords is aligned with the voltage, else 0."""
    # Materialize the helper's result once, as the other labeling functions
    # do: if it is a one-shot iterator, the first failed `in` test would
    # exhaust it and the remaining keywords could never match.
    ngrams = set(get_aligned_ngrams(c.voltage))
    relevant_keywords = ('collector', 'emitter', 'collector-emitter')
    if any(keyword in ngrams for keyword in relevant_keywords):
        return 0
    return -1
LFs.append(LF_not_ce_relevant)

def LF_too_many_numbers_row(c):
    """Return -1 if the voltage's row has four or more 'number' NER tags, else 0."""
    row_tags = get_row_ngrams(c.voltage, attrib="ner_tags")
    number_count = sum(1 for tag in row_tags if tag == 'number')
    if number_count >= 4:
        return -1
    return 0
LFs.append(LF_too_many_numbers_row)

# def LF_thermal_row(c):
#     return -1 if 'thermal' in get_row_ngrams(c.current) else 0
# LFs.append(LF_collector_row)

# def LF_junction_row(c):
#     return -1 if 'junction' in get_row_ngrams(c.current) else 0
# LFs.append(LF_junction_row)

# def LF_resistance_row(c):
#     return -1 if 'resistance' in get_row_ngrams(c.current) else 0
# LFs.append(LF_resistance_row)

# def LF_small_signal_row(c):
#     return -1 if 'small-signal' in get_row_ngrams(c.current) else 0
# LFs.append(LF_small_signal_row)

# def LF_ambient_row(c):
#     return -1 if 'ambient' in get_row_ngrams(c.current) else 0
# LFs.append(LF_ambient_row)



def LF_negative_keywords_in_row(c):
    """Return -1 if the voltage's row n-grams contain any of the negative keywords, else 0."""
    negative_keywords = (
        'ambient',
        'small-signal',
        'resistance',
        'power',
        'junction',
        'dissipation',
        'current',
        'cbo',
        'peak',
        'base',
        'mw',
        'ebo',
        'p',
        'thermal',
    )
    row_ngrams = set(get_row_ngrams(c.voltage))
    return 0 if row_ngrams.isdisjoint(negative_keywords) else -1
LFs.append(LF_negative_keywords_in_row)
    

# def LF_negative_keywords_in_part_aligned(c):
#     ngrams = set(get_aligned_ngrams(c.part))
#     return -1 if (
#         'gain'          in ngrams or
#         'small-signal'  in ngrams or
#         'small'         in ngrams or
#         'cbo'         in ngrams or
#         'collector-emitter' in ngrams or
#         'value'         in ngrams or
#         'thermal'       in ngrams) else 0
# LFs.append(LF_negative_keywords_in_part_aligned)

# def LF_negative_keywords(c):
#     ngrams = set(get_aligned_ngrams(c.voltage))
#     return -1 if (
#         'collector-base'    in ngrams or
#         'cut-off'           in ngrams or
#         '='                 in ngrams or
#         'gain'              in ngrams or
#         'h fe'              in ngrams or
#         'typ.'              in ngrams or
#         'typ'               in ngrams or
#         'min'               in ngrams or
#         'min.'              in ngrams or
#         'saturation'        in ngrams or
#         'mhz'               in ngrams or
#         'gain'              in ngrams or
#         'obo'               in ngrams or
#         'c obo'             in ngrams) else 0
# LFs.append(LF_negative_keywords)

# def LF_voltage_row_current(c):
#     ngrams = set(get_row_ngrams(c.current))
#     return -1 if (
#         'voltage' in ngrams or
#         'cbo'     in ngrams or
#         'ceo'     in ngrams or
#         'ce'      in ngrams or
#         'ebo'     in ngrams or
#         'v'       in ngrams) else 0
# LFs.append(LF_voltage_row_current)

# def LF_voltage_row_part(c):
#     ngrams = set(get_row_ngrams(c.part))
#     return -1 if (
#         'voltage' in ngrams or
#         'cbo'     in ngrams or
#         'ceo'     in ngrams or
#         'ebo'     in ngrams or
#         'v'       in ngrams) else 0
# LFs.append(LF_voltage_row_part)

# def LF_test_condition_row(c):
#     ngrams = set(get_row_ngrams(c.current))
#     return -1 if ('test'      in ngrams and
#                   'condition' in ngrams) else 0
# LFs.append(LF_test_condition_row)


### Apply LFs

In [None]:
# Apply every labeling function in LFs to the training candidates and store
# the result under the name 'LF Labels' (timed via %time).
%time L_train = label_manager.create(session, train, 'LF Labels', f=LFs)
# Bare expression so the notebook displays the result.
L_train

[==                                      ] 3%

### Assess LF accuracy

In [None]:
# Gold candidate set for the training split, used as the reference for the
# labeling-function statistics.
train_gold = session.query(CandidateSet).filter(
    CandidateSet.name == 'Hardware Training Candidates -- Gold').one()
%time L_train.lf_stats(train_gold)

In [None]:
# If necessary: snapshot the database after applying the labeling functions
# as 'snorkel.db features'.
# shutil.copyfile avoids the shell (and the backslash-escaped space in the
# command string) and raises if the copy fails instead of failing silently.
import os
import shutil
shutil.copyfile('snorkel.db', 'snorkel.db features')

## Learn and Evaluate

In [None]:
# If necessary:
import os
os.remove('snorkel.db');
os.system('cp snorkel.db\ features snorkel.db');

%load_ext autoreload
%autoreload 2
%matplotlib inline

import sys
sys.path.append(os.environ['SNORKELHOME'] + '/tutorials/tables/')

from snorkel import SnorkelSession
session = SnorkelSession()

from snorkel.models import candidate_subclass
Part_Voltage = candidate_subclass('Part_Voltage', ['part','voltage'])
from snorkel.models import CandidateSet
train = session.query(CandidateSet).filter(
    CandidateSet.name == 'Hardware Training Candidates').one()
dev = session.query(CandidateSet).filter(
    CandidateSet.name == 'Hardware Development Candidates').one()

In [None]:
from snorkel.annotations import FeatureManager, LabelManager
# Reload the stored 'Train Features' annotations for both candidate sets.
feature_manager = FeatureManager()
%time F_train = feature_manager.load(session, train, 'Train Features')
%time F_dev = feature_manager.load(session, dev, 'Train Features')

# Reload the stored 'LF Labels' annotations for the training set.
label_manager = LabelManager()
%time L_train = label_manager.load(session, train, 'LF Labels')

In [None]:
from snorkel.learning import NaiveBayes

# Train the NaiveBayes model on the labeling-function output, save it under
# 'Generative Params', and compute marginals for the training candidates.
gen_model = NaiveBayes()
gen_model.train(L_train, n_iter=15000, rate=1e-2)
%time gen_model.save(session, 'Generative Params')
train_marginals = gen_model.marginals(L_train)

In [None]:
from snorkel.learning import LogReg

# Train the LogReg model on the training features against the marginals from
# the generative model, then save it under "Discriminative Params".
disc_model = LogReg()
disc_model.train(F_train, train_marginals, n_iter=2000, rate=1e-4)
%time disc_model.save(session, "Discriminative Params")

In [None]:
# Gold candidate sets for the training and development splits.
train_gold = session.query(CandidateSet).filter(
    CandidateSet.name == 'Hardware Training Candidates -- Gold').one()

dev_gold = session.query(CandidateSet).filter(
    CandidateSet.name == 'Hardware Development Candidates -- Gold').one()

from snorkel.models import CandidateSet
from snorkel.annotations import LabelManager
# Load the gold labels stored for the development candidates.
label_manager = LabelManager()
L_dev = label_manager.load(session, dev, 'Hardware Development Labels -- Gold')

# Score the discriminative model on the development set; the four results are
# unpacked as tp, fp, tn, fn and reused by the cells that follow.
tp, fp, tn, fn = disc_model.score(F_dev, L_dev, dev_gold)

In [None]:
from snorkel.models import Corpus
from hardware_utils import entity_level_f1
import os

# Entity-level evaluation of the development-set predictions against the gold
# file for the 'ce_v_max' attribute.
gold_file = os.environ['SNORKELHOME'] + '/tutorials/tables/data/hardware/hardware_gold.csv'
corpus = session.query(Corpus).filter(Corpus.name == 'Hardware Development').one()
(TP, FP, FN) = entity_level_f1(tp, fp, tn, fn, gold_file, corpus, 'ce_v_max')


In [None]:
from pprint import pprint
# Sorted copies of the entity-level results, so entries can be indexed.
TP_list = sorted(TP)
FP_list = sorted(FP)
FN_list = sorted(FN)
# Show the first 60 false negatives.
pprint(FN_list[:60])

In [None]:
from hardware_utils import entity_to_candidates

# Inspect one false-negative entity: find its candidates among `tn`, then show
# the model's score and feature weights for the first one.
# Guards replace the IndexError the bare indexing raised on short results;
# print(...) with a single argument works on both Python 2 and 3.
entity_index = 11  # position in FN_list of the entity to inspect
if entity_index >= len(FN_list):
    print("Only %d false-negative entities; no entry at index %d" % (
        len(FN_list), entity_index))
else:
    entity = FN_list[entity_index]
    print(entity)
    print("")

    matches = entity_to_candidates(entity, tn)
    print("# Matches: %d" % len(matches))
    if len(matches) > 0:
        candidate = matches[0]
        print(candidate)
        print("")

        print(disc_model.get_candidate_score(candidate, F_dev))
        print("")
        pprint(disc_model.get_candidate_feature_weights(candidate, F_dev))

In [None]:

# Snapshot the final database as 'snorkel.db final'.
# shutil.copyfile avoids the shell (and the backslash-escaped space in the
# command string) and raises if the copy fails instead of failing silently.
import os
import shutil
shutil.copyfile('snorkel.db', 'snorkel.db final')