imports

In [1]:
from fonduer.supervision import Labeler
from fonduer.supervision.models import GoldLabel
from fonduer.features import Featurizer
from fonduer.candidates.models import Candidate

from snorkel.labeling import LFAnalysis
from snorkel.labeling.model import LabelModel
from fonduer.supervision.models import LabelKey

from MeMoKBC.pipeline.utils import get_session, load_candidates, match_label_matrix
from MeMoKBC.definitions.candidates import NameFullAbbr, NameAbbrTask
from MeMoKBC.pipeline.lfs.name_short_long_lfs import short_long_lfs
from MeMoKBC.pipeline.lfs.name_short_task_lfs import name_abbr_task_lfs
from MeMoKBC.gold_label_matcher import match_gold_label
from importlib import reload
import csv

Get session object

In [2]:
# Open the session object that the later cells pass to the Labeler and the
# candidate / label-matrix loaders; "pipeline6" is the database name handed to
# the project helper.
session = get_session(db_name="pipeline6")

[2023-07-02 18:54:09,513][INFO] fonduer.meta:49 - Setting logging directory to: /tmp/2023-07-02_18-54-09


Define the candidate classes (the Labeler object is created in a later cell)

In [3]:
# Candidate classes handled in this notebook.
# Keep this order: later cells unpack the per-class label matrices and gold
# labels as (NFA, NAT), which presumably follows the order of this list.
candidates = [NameFullAbbr, NameAbbrTask]

Load gold labels

In [4]:
# Read the annotated gold labels from the JSON file and match them against the
# candidates stored in the database, for both candidate classes.
gold_labels = match_gold_label(
    "pipeline6",
    "/data/Goldlabel_biomedRxiv/goldlabel1_docs801-840_laura/goldlabel_authorlong_short_task_medRxiv.json",
    [NameAbbrTask, NameFullAbbr],
)


def _mentions_share_sentence(cand) -> bool:
    """Tell whether the first two mentions of ``cand`` lie in the same sentence."""
    first_sentence_id = cand[0].context.sentence.id
    second_sentence_id = cand[1].context.sentence.id
    return first_sentence_id == second_sentence_id


# Collect the ids of the matched gold candidates, one list per candidate class.
nat_cands = []
nfa_cands = []
for cand in gold_labels:
    cand_class = type(cand)
    if cand_class == NameFullAbbr:
        # every matched NameFullAbbr candidate is kept
        nfa_cands.append(cand.id)
    elif cand_class == NameAbbrTask and _mentions_share_sentence(cand):
        # NameAbbrTask candidates are kept only when both mentions share a sentence
        nat_cands.append(cand.id)

[2023-07-02 18:54:10,156][INFO] fonduer.meta:134 - Connecting user:postgres to fonduer-postgres-dev:5432/pipeline6
[2023-07-02 18:54:10,157][INFO] fonduer.meta:162 - Initializing the storage schema
[2023-07-02 18:54:10,450][INFO] root:88 - Found relations for 22 documents
[2023-07-02 18:54:10,794][INFO] root:93 - Found 6820 candidates for <class 'fonduer.candidates.models.candidate.NameAbbrTask'>
[2023-07-02 18:54:11,357][INFO] root:93 - Found 28105 candidates for <class 'fonduer.candidates.models.candidate.NameFullAbbr'>
[2023-07-02 18:54:11,746][INFO] root:102 - Found candidates for 40 documents


In [5]:
# create labeler object
labeler = Labeler(session, candidates)

# Snapshot the gold candidate ids as frozensets so that every membership test
# inside `gold` is a hash lookup instead of a linear scan over a list (the
# function is called once per candidate).
# NOTE: this is a snapshot -- re-run this cell if nat_cands / nfa_cands change.
_nat_gold_ids = frozenset(nat_cands)
_nfa_gold_ids = frozenset(nfa_cands)


def gold(c: Candidate) -> int:
    """Return the gold label of a candidate.

    Args:
        c: A candidate of one of the classes handled in this notebook
            (NameAbbrTask or NameFullAbbr).

    Returns:
        1 if the candidate's id is among the gold-label candidate ids collected
        for its class, otherwise 0 (FALSE). Candidates of any other class also
        get 0.
    """
    if isinstance(c, NameAbbrTask):
        return 1 if c.id in _nat_gold_ids else 0

    if isinstance(c, NameFullAbbr):
        return 1 if c.id in _nfa_gold_ids else 0

    # not one of the known candidate classes: treat as FALSE
    return 0


# Apply the gold label function for each candidate class: one list of labeling
# functions per entry of `candidates`, written to the GoldLabel table.
labeler.apply(lfs=[[gold], [gold]], table=GoldLabel, train=True)

  .filter(candidate_class.id.in_(sub_query))
[2023-07-02 18:54:25,546][INFO] fonduer.supervision.labeler:330 - Clearing Labels (split 0)
  query = self.session.query(table).filter(table.candidate_id.in_(sub_query))
[2023-07-02 18:54:25,593][INFO] fonduer.utils.udf:67 - Running UDF...


  0%|          | 0/14 [00:00<?, ?it/s]

In [6]:
# Index of the split used in this cell; the gold labels above were applied with
# train=True, which the log reports as split 0.
TRAIN_SPLIT = 0

# Fetch the candidates of that split for the candidate classes.
train_cands = load_candidates(session, TRAIN_SPLIT, candidates)

# Pair the candidates with the labeling-function outputs; the result is unpacked
# into one label matrix for NameFullAbbr and one for NameAbbrTask, which serve as
# input for the LF analysis / label model.
train_label_matrices = match_label_matrix(session, candidates, TRAIN_SPLIT)
L_train_NFA, L_train_NAT = train_label_matrices

# Fetch the stored gold labels for the same candidates, unpacked the same way.
train_gold_labels = labeler.get_gold_labels(train_cands)
L_gold_train_NFA, L_gold_train_NAT = train_gold_labels

## LF analysis

NameFull + Abbreviation

In [7]:
# Labeling functions for NameFullAbbr ordered by name -- presumably so that they
# line up with the columns of the label matrix (TODO confirm against
# match_label_matrix).
nfa_lfs_sorted = sorted(short_long_lfs, key=lambda lf: lf.name)

# Summarise each labeling function and score it against the flattened gold labels.
nfa_lf_analysis = LFAnalysis(L_train_NFA, lfs=nfa_lfs_sorted)
nfa_lf_analysis.lf_summary(Y=L_gold_train_NFA.reshape(-1))

Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts,Correct,Incorrect,Emp. Acc.
check_all_uppercase_letters,0,[1],0.238114,0.238114,0.237132,134,1078,0.110561
check_horizont_abr_short,1,[],0.0,0.0,0.0,0,0,0.0
check_long_name_not_upper,2,[0],0.965422,0.510216,0.510216,4780,134,0.972731
check_uppercase_letters,3,[1],0.144794,0.144794,0.143811,131,606,0.177748
check_uppercase_letters_short_in_long,4,[1],0.144794,0.144794,0.143811,131,606,0.177748
name_full_in_top_percentile_sentence_wise,5,[1],0.259921,0.256778,0.255599,0,1323,0.0
name_short_outside_half_percentile_sentence_wise,6,[],0.0,0.0,0.0,0,0,0.0
small_letter_count,7,[1],0.069745,0.068173,0.066994,0,355,0.0
word_count,8,[1],0.098232,0.095874,0.095481,0,500,0.0


NameAbbr + Task

In [8]:
# Labeling functions for NameAbbrTask ordered by name -- presumably so that they
# line up with the columns of the label matrix (TODO confirm against
# match_label_matrix).
nat_lfs_sorted = sorted(name_abbr_task_lfs, key=lambda lf: lf.name)

# Summarise each labeling function and score it against the flattened gold labels.
nat_lf_analysis = LFAnalysis(L_train_NAT, lfs=nat_lfs_sorted)
nat_lf_analysis.lf_summary(Y=L_gold_train_NAT.reshape(-1))

Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts,Correct,Incorrect,Emp. Acc.
is_medical_abbreviation,0,[0],1.0,0.366935,0.366935,1627,109,0.937212
lf_length_more_than_three_words,1,[0],0.020737,0.020737,0.020737,36,0,1.0
lf_name_short_in_first_words,2,[1],0.366935,0.366935,0.366935,80,557,0.125589
