# Labelfunction and model analysis

imports

In [1]:
from fonduer.supervision import Labeler
from fonduer.features import Featurizer
from fonduer.candidates.models import Candidate

from snorkel.labeling import LFAnalysis
from snorkel.labeling.model import LabelModel

from MeMoKBC.pipeline.utils import get_session, load_candidates, match_label_matrix
from MeMoKBC.definitions.candidates import NameFullAbbr, NameAbbrTask
from MeMoKBC.pipeline.lfs.name_short_long_lfs import short_long_lfs
from MeMoKBC.pipeline.lfs.name_short_task_lfs import name_abbr_task_lfs

Get session object

In [2]:
session = get_session(db_name="pipeline2")

[2023-06-09 14:55:39,667][INFO] fonduer.meta:49 - Setting logging directory to: /tmp/2023-06-09_14-55-39


Define candidates and Labeler object

In [3]:
candidates = [NameAbbrTask, NameFullAbbr]


Load candidates and labels

In [4]:
L_train_NAT, L_train_NFA = match_label_matrix(session, candidates, 0)

  cands = (session.query(candidate_class).filter(candidate_class.id.in_(sub_query)).order_by(candidate_class.id).all())


## LF analysis

NameFull + Abrreviation

In [5]:
labeler = Labeler(session, candidates)

In [11]:
LFAnalysis(
    L_train_NFA,
    lfs=sorted(short_long_lfs, key=lambda lf: lf.name)
).lf_summary()

Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts
check_all_uppercase_letters,0,[],0.0,0.0,0.0
name_full_in_top_percentile_sentence_wise,1,[1],0.256985,0.0427,0.0
name_short_outside_half_percentile_sentence_wise,2,[1],0.016922,0.006887,0.0
small_letter_count,3,[1],0.069067,0.050964,0.0
word_count,4,[1],0.097993,0.054309,0.0


NameAbbr + Task

In [12]:
LFAnalysis(
    L_train_NAT,
    lfs=sorted(name_abbr_task_lfs, key=lambda lf: lf.name)
).lf_summary()

Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts
lf_length_more_than_three_words,0,[],0.0,0.0,0.0
lf_name_short_in_first_words,1,[0],0.018817,0.0,0.0


## Model analysis !! continue when LFs are ready !!

List of models

| Modelname | Candidate | description | n_epochs |
| --------- | --------- | ----------- | -------- |
| label_model_v1_NFA | NameFullAbbr | label model with random label functions | 500 |
| label_model_v1_NFT | NameFullTask | label model with random label functions | 500 |

In [7]:
from pathlib import Path

gen_model_NFT = LabelModel(cardinality=2)
gen_model_NFA = LabelModel(cardinality=2)


if Path("models/label_model_NFA_v1.pkl").is_file() and Path("models/label_model_NFT_v1.pkl").is_file():
    gen_model_NFA.load(source="models/label_model_NFA_v1.pkl")
    gen_model_NFT.load(source="models/label_model_NFT_v1.pkl")
    print("Loaded Models")
else:
    gen_model_NFT.fit(L_train_NFT, n_epochs=500, log_freq=100)
    gen_model_NFT.save("models/label_model_NFT_v1.pkl")

    gen_model_NFA.fit(L_train_NFA, n_epochs=500, log_freq=100)
    gen_model_NFA.save("models/label_model_NFA_v1.pkl")
    print("Fit and saved models")

NameError: name 'L_train_NFT' is not defined

### Generating train marginals

In [None]:
train_marginals_NFA = gen_model_NFA.predict_proba(L_train_NFA)
train_marginals_NFT = gen_model_NFT.predict_proba(L_train_NFT)

In [None]:
import matplotlib.pyplot as plt

fig, ax = plt.subplots(1,2)
fig.set_figheight(4)
fig.set_figwidth(12)
fig.set_tight_layout("w_pad")

bins=20

ax[0].hist(train_marginals_NFA[:, 0], bins=bins)
ax[0].set_title("NFA(TRUE)")

ax[1].hist(train_marginals_NFT[:, 0], bins=bins)
ax[1].set_title("NFT(TRUE)")
plt.show()

### Iterate on LFs

In [None]:
dev_cands = load_candidates(session, split=1, candidate_list=candidates)

L_dev_NFA, L_dev_NFT = labeler.get_label_matrices(dev_cands)

## Discriminative Model

extract words from train_cands and count them

In [None]:
import numpy as np
# calculate the diff between true and false prediction probability of each candidate
# the bigger the difference the more certain the model is
# Example True = 0.4 False = 0.6
# diff = 0.6 - 0.4 = 0.2 --> model is very unsure 
diffs = train_marginals_NFT.max(axis=1) - train_marginals_NFT.min(axis=1)

# filter out all candidates where labelmodel is very unsure
# unsure is a diff of smaller then 0.000001
train_idxs = np.where(diffs > 0.2)[0].astype(np.int64)
filtered = train_marginals_NFT[train_idxs, 1]

# Cast continous values to binary for logistic regression model
y = np.where(filtered > 0.5, 1, 0)

Get feature matrix and filter with previous filter

In [None]:
featurizer = Featurizer(session, candidates)

In [None]:
F_train_NFT = featurizer.get_feature_matrices(train_cands)[0]
X = F_train_NFT[train_idxs, :]


train logistic regression model

In [None]:
from sklearn.linear_model import LogisticRegression, BayesianRidge

clf = LogisticRegression(max_iter=200).fit(X, y)

In [None]:
test_cands = load_candidates(session, 2, candidates)
F_test_NFT = featurizer.get_feature_matrices(test_cands)[0]

preds = clf.predict(F_test_NFT)

In [None]:
reg = BayesianRidge().fit(X.toarray(), y)