In [18]:
from fonduer.supervision import Labeler
from fonduer.supervision.models import GoldLabel
from fonduer.features import Featurizer
from fonduer.candidates.models import Candidate
from fonduer.parser.models import Document

from snorkel.labeling import LFAnalysis
from snorkel.labeling.model import LabelModel

from MeMoKBC.pipeline.utils import get_session, load_candidates, match_label_matrix
from MeMoKBC.definitions.candidates import NameFullAbbr, NameAbbrTask
from MeMoKBC.pipeline.lfs.name_short_long_lfs import short_long_lfs
from MeMoKBC.pipeline.lfs.name_short_task_lfs import name_abbr_task_lfs
from MeMoKBC.gold_label_matcher import match_gold_label

In [3]:
# Open the session for the "pipeline6" database through the project helper.
session = get_session("pipeline6")

INFO:fonduer.meta:Connecting user:postgres to fonduer-postgres-dev:5432/pipeline6
INFO:fonduer.meta:Initializing the storage schema


In [4]:
# Candidate classes handled in this notebook. Later cells unpack per-class
# results positionally (NameFullAbbr first, NameAbbrTask second), so the
# order of this list matters.
candidates = [NameFullAbbr, NameAbbrTask]

In [25]:
# Match the annotated relations in the gold-label JSON against the extracted
# candidates of both classes.
# NOTE(review): the annotation path is absolute and machine-specific; consider
# deriving it from a configurable data directory.
gold_labels = match_gold_label(
    "pipeline6",
    "/data/Goldlabel_biomedRxiv/goldlabel1_docs801-840_laura/goldlabel_authorlong_short_task_medRxiv.json",
    [NameAbbrTask, NameFullAbbr]
)

# Ids of the gold-positive candidates, kept per candidate class.
# Sets instead of lists: `gold` runs one membership test per candidate, and a
# list lookup there makes the labeling pass quadratic in the number of candidates.
nat_cands = set()
nfa_cands = set()
for cand in gold_labels:
    if isinstance(cand, NameAbbrTask):
        # Keep a name/task pair only when both members lie in the same sentence.
        if cand[0].context.sentence.id == cand[1].context.sentence.id:
            nat_cands.add(cand.id)
    elif isinstance(cand, NameFullAbbr):
        nfa_cands.add(cand.id)

INFO:fonduer.meta:Connecting user:postgres to fonduer-postgres-dev:5432/pipeline6
INFO:fonduer.meta:Initializing the storage schema
INFO:root:Found relations for 22 documents
INFO:root:Found 6820 candidates for <class 'fonduer.candidates.models.candidate.NameAbbrTask'>
INFO:root:Found 28105 candidates for <class 'fonduer.candidates.models.candidate.NameFullAbbr'>
INFO:root:Found candidates for 40 documents


In [14]:
# Labeler bound to the session and to both candidate classes.
labeler = Labeler(session, candidates)

In [26]:
def gold(c: Candidate) -> int:
    """Gold labeling function: 1 for a gold-matched candidate, otherwise 0.

    The candidate id is looked up in the id collection that belongs to the
    candidate's class (``nat_cands`` for NameAbbrTask, ``nfa_cands`` for
    NameFullAbbr). Candidates of any other class are labelled 0.
    """
    cand_class = type(c)
    if cand_class == NameAbbrTask:
        positive_ids = nat_cands
    elif cand_class == NameFullAbbr:
        positive_ids = nfa_cands
    else:
        return 0
    return 1 if c.id in positive_ids else 0

In [29]:
# Apply `gold` as the only labeling function for each of the two candidate
# classes and write the results to the GoldLabel table (the recorded log shows
# the labels of split 0 being cleared first).
labeler.apply(lfs=[[gold], [gold]], table=GoldLabel, train=True)

  .filter(candidate_class.id.in_(sub_query))
INFO:fonduer.supervision.labeler:Clearing Labels (split 0)
  query = self.session.query(table).filter(table.candidate_id.in_(sub_query))
INFO:fonduer.utils.udf:Running UDF...


  0%|          | 0/14 [00:00<?, ?it/s]

In [31]:
# Load the candidates of split 0 for both candidate classes.
# NOTE(review): presumably one list per class, in `candidates` order — confirm
# against the project helper.
train_cands = load_candidates(session, 0, candidates)

  cands = (session.query(candidate_class).filter(candidate_class.id.in_(sub_query)).order_by(candidate_class.id).all())


In [34]:
# Read back the stored gold labels for the training candidates.
# NOTE(review): annotator="gold" presumably selects the labels written by the
# `gold` labeling function above — confirm.
L_gold_train = labeler.get_gold_labels(train_cands, annotator="gold")

In [30]:
from pathlib import Path

# One Snorkel label model per candidate class, each with two classes
# (cardinality=2): NFA = NameFullAbbr, NAT = NameAbbrTask.
gen_model_NFA = LabelModel(cardinality=2)
gen_model_NAT = LabelModel(cardinality=2)

In [11]:
# Label matrices for split 0, unpacked as (NameFullAbbr, NameAbbrTask).
# NOTE(review): assumes the helper returns them in `candidates` order — confirm.
L_train_NFA, L_train_NAT = match_label_matrix(session, candidates, 0)

  cands = (session.query(candidate_class).filter(candidate_class.id.in_(sub_query)).order_by(candidate_class.id).all())


In [12]:
# Fit the NameFullAbbr label model for 500 epochs, logging the loss every 100.
gen_model_NFA.fit(L_train_NFA, n_epochs=500, log_freq=100)

INFO:root:Computing O...
INFO:root:Estimating \mu...
  0%|          | 0/500 [00:00<?, ?epoch/s]INFO:root:[0 epochs]: TRAIN:[loss=0.066]
INFO:root:[100 epochs]: TRAIN:[loss=0.003]
 27%|██▋       | 133/500 [00:00<00:00, 1326.84epoch/s]INFO:root:[200 epochs]: TRAIN:[loss=0.002]
 55%|█████▌    | 277/500 [00:00<00:00, 1388.29epoch/s]INFO:root:[300 epochs]: TRAIN:[loss=0.001]
INFO:root:[400 epochs]: TRAIN:[loss=0.000]
100%|██████████| 500/500 [00:00<00:00, 1405.08epoch/s]
INFO:root:Finished Training


In [13]:
# Fit the NameAbbrTask label model with the same settings as the NFA model.
# NOTE(review): the recorded run of this cell failed with
# "ValueError: L_train should have at least 3 labeling functions", so
# L_train_NAT appears to have fewer than three labeling-function columns.
# More NameAbbrTask LFs are needed before this model can be trained.
gen_model_NAT.fit(L_train_NAT, n_epochs=500, log_freq=100)

ValueError: L_train should have at least 3 labeling functions

In [None]:


# Reuse the cached label models when both pickles exist; otherwise fit both
# models and cache them.
#
# Fix: this cell used `gen_model_NFT` / `L_train_NFT`, which are never defined
# in this notebook (NameError). The NameAbbrTask objects created above are
# `gen_model_NAT` / `L_train_NAT`. The cache file keeps its historical "NFT"
# spelling so an already existing cache is still found.
#
# NOTE(review): the .pkl caches are presumably unpickled on load — only load
# files that were produced by this notebook.
NFA_MODEL_PATH = Path("models/label_model_NFA_v1.pkl")
NAT_MODEL_PATH = Path("models/label_model_NFT_v1.pkl")

if NFA_MODEL_PATH.is_file() and NAT_MODEL_PATH.is_file():
    gen_model_NFA.load(source=str(NFA_MODEL_PATH))
    gen_model_NAT.load(source=str(NAT_MODEL_PATH))
    print("Loaded Models")
else:
    # Saving fails if the cache directory is missing, so create it up front.
    NFA_MODEL_PATH.parent.mkdir(parents=True, exist_ok=True)

    gen_model_NAT.fit(L_train_NAT, n_epochs=500, log_freq=100)
    gen_model_NAT.save(str(NAT_MODEL_PATH))

    gen_model_NFA.fit(L_train_NFA, n_epochs=500, log_freq=100)
    gen_model_NFA.save(str(NFA_MODEL_PATH))
    print("Fit and saved models")

### Generating train marginals

In [None]:
# Class probabilities per training candidate from the NameFullAbbr label model
# (later cells index the result as a 2-D array with one column per class).
train_marginals_NFA = gen_model_NFA.predict_proba(L_train_NFA)

In [None]:
# Class probabilities per training candidate from the NameAbbrTask label model.
# Fix: `gen_model_NFT` / `L_train_NFT` are not defined anywhere in this
# notebook; the NameAbbrTask model and label matrix are `gen_model_NAT` /
# `L_train_NAT`. The result keeps the name `train_marginals_NFT` because the
# following cells read it under that name.
train_marginals_NFT = gen_model_NAT.predict_proba(L_train_NAT)

In [None]:
import matplotlib.pyplot as plt

# Histogram of the TRUE-class probability per training candidate, one panel
# per label model.
#
# Fix: the panels are titled "(TRUE)" but plotted column 0. This notebook
# encodes TRUE as class 1 (`gold` returns 1 for a match and the training-target
# cell reads column 1), so column 1 is the TRUE probability.
# Also replaces `fig.set_tight_layout("w_pad")`, which passed a string where a
# bool or a dict of padding values is expected.
TRUE_COLUMN = 1
bins = 20

fig, ax = plt.subplots(1, 2, figsize=(12, 4))

ax[0].hist(train_marginals_NFA[:, TRUE_COLUMN], bins=bins)
ax[0].set_title("NFA(TRUE)")
ax[0].set_xlabel("P(TRUE)")
ax[0].set_ylabel("number of candidates")

# `train_marginals_NFT` holds the NameAbbrTask (NAT) marginals.
ax[1].hist(train_marginals_NFT[:, TRUE_COLUMN], bins=bins)
ax[1].set_title("NAT(TRUE)")
ax[1].set_xlabel("P(TRUE)")
ax[1].set_ylabel("number of candidates")

fig.tight_layout()
plt.show()

### Iterate on LFs

In [None]:
# Fix: the candidate classes were missing from the call; the Labeler is built
# from the session and the candidate classes, exactly as earlier in this notebook.
labeler = Labeler(session, candidates)

In [None]:
# Load the split-1 (dev) candidates for both candidate classes.
dev_cands = load_candidates(session, split=1, candidate_list=candidates)

# One label matrix per candidate class: NameFullAbbr first, NameAbbrTask second.
L_dev_NFA, L_dev_NFT = labeler.get_label_matrices(dev_cands)

## Discriminative Model

extract words from train_cands and count them

In [None]:
import numpy as np

# Tunable thresholds (previously inline magic numbers; the old comment quoted
# 0.000001 while the code used 0.2).
MIN_CONFIDENCE_GAP = 0.2   # minimum gap between the class probabilities
DECISION_THRESHOLD = 0.5   # P(TRUE) above this becomes target 1

# Confidence of the label model per candidate: the gap between its largest and
# smallest class probability. The larger the gap, the more certain the model.
# Example: TRUE = 0.4, FALSE = 0.6 -> gap = 0.6 - 0.4 = 0.2, an unsure
# prediction that is filtered out.
diffs = train_marginals_NFT.max(axis=1) - train_marginals_NFT.min(axis=1)

# Keep only candidates whose gap exceeds MIN_CONFIDENCE_GAP.
train_idxs = np.where(diffs > MIN_CONFIDENCE_GAP)[0].astype(np.int64)
filtered = train_marginals_NFT[train_idxs, 1]

# Cast the continuous TRUE probabilities to binary targets for the
# logistic regression model.
y = np.where(filtered > DECISION_THRESHOLD, 1, 0)

Get the feature matrix and keep only the rows selected by the confidence filter from the previous cell.

In [None]:
# Featurizer bound to the session and to both candidate classes.
featurizer = Featurizer(session, candidates)

In [None]:
F_train_NFT = featurizer.get_feature_matrices(train_cands)[0]
X = F_train_NFT[train_idxs, :]


train logistic regression model

In [None]:
from sklearn.linear_model import LogisticRegression, BayesianRidge

clf = LogisticRegression(max_iter=200).fit(X, y)

In [None]:
test_cands = load_candidates(session, 2, candidates)
F_test_NFT = featurizer.get_feature_matrices(test_cands)[0]

preds = clf.predict(F_test_NFT)

In [None]:
# Fit a Bayesian ridge regressor on the same filtered features and binary
# targets; `.toarray()` converts X to a dense array first.
# NOTE(review): `reg` is not used in the visible part of the notebook, and the
# dense copy may be memory-heavy for a large feature matrix — confirm this
# cell is still needed.
reg = BayesianRidge().fit(X.toarray(), y)