In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

import sys
sys.path.append("../") # go to parent dir

import numpy as np
import torch
from torch import nn, optim
import matplotlib.pyplot as plt

from itertools import product

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
# Seed numpy and torch from one named constant so the stochastic steps below
# are reproducible and the seed value is defined in a single place.
SEED = 123
np.random.seed(SEED)
# The trailing ';' keeps the returned Generator repr out of the cell output.
torch.manual_seed(SEED);

<torch._C.Generator at 0x7f1353238a70>

### Step 1: Generate Data

In [156]:
from synthetic.generate import SingleTaskTreeDepsGenerator

# Problem size: K classes, M labeling functions (LFs), N data points
K, M, N = 2, 10, 10000

# True class balance to be recovered: a uniform prior plus random jitter,
# renormalized so the entries sum to 1
class_balance = np.full(K, 1.0 / K) + np.random.random(K) / 5.0
class_balance = class_balance / class_balance.sum()

# Synthetic data generated with a single dependency edge between LF 0 and LF 1
data = SingleTaskTreeDepsGenerator(N, M, K, class_balance, edges=[(0, 1)])
print(f"LF Dependencies: {data.E}")
print(f"Class Balance: {data.p}")

LF Dependencies: [(0, 1)]
Class Balance: [0.50537711 0.49462289]


In [157]:
from metal.analysis import lf_summary

# Per-LF summary table built from the label matrix and the true labels
lf_summary(data.L, data.Y)

Unnamed: 0,Polarity,Coverage,Overlaps,Conflicts,Correct,Incorrect,Emp. Acc.
0,"[1, 2]",0.861,0.861,0.7254,7277,1333,0.84518
1,"[1, 2]",0.9091,0.9091,0.7636,8136,955,0.894951
2,"[1, 2]",0.8522,0.8522,0.7199,7009,1513,0.82246
3,"[1, 2]",0.7493,0.7493,0.6409,5010,2483,0.668624
4,"[1, 2]",0.7259,0.7259,0.6224,4598,2661,0.633421
5,"[1, 2]",0.8725,0.8725,0.7301,7427,1298,0.851232
6,"[1, 2]",0.8936,0.8936,0.7495,7864,1072,0.880036
7,"[1, 2]",0.7878,0.7878,0.6674,5861,2017,0.743971
8,"[1, 2]",0.8234,0.8234,0.6908,6567,1667,0.797547
9,"[1, 2]",0.8248,0.8248,0.6976,6465,1783,0.783826


### Step 2: Learn Dependencies using `DependencyLearnerModel`
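**TEMP: dependency learning is not run yet; for now the dependency edges are hard-coded to the true edges from the data generator.**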

In [158]:
# Placeholder for dependency learning: reuse the generator's edges as-is, and
# densify the label matrix into a plain ndarray for column indexing below
edges, L = data.E, np.array(data.L.todense())

### Step 3: Recover the class balance using the subset of independent LFs

In [159]:
# Generate indices for independent LFs
ind_lfs = []
for i in range(M):
    if i not in list(sum(edges, ())):
        ind_lfs.append(i)
L_ind = L[:,ind_lfs]

In [160]:
%%time
from metal.label_model.class_balance import ClassBalanceModel

cb_model = ClassBalanceModel(K, abstains=True)
cb_model.train_model(L=L_ind, verbose=False)

print(f"Estimated class balance: {cb_model.class_balance}")
print(f"True class balance: {class_balance}")
print()

Estimated class balance: [0.50483036 0.49531323]
True class balance: [0.50537711 0.49462289]

CPU times: user 3.28 s, sys: 152 ms, total: 3.43 s
Wall time: 922 ms


### Step 4: Train `LabelModel` using $\mu$ from `ClassBalanceModel`

In [161]:
%%time
from metal.label_model import LabelModel
label_model = LabelModel(verbose=True,k=K, seed=123)

label_model.train_model(
    data.L, 
    cond_probs = cb_model.cond_probs, 
    ind_lfs = ind_lfs, 
    deps = edges, 
    lr = 1e-3, 
    n_epochs = 1000, 
    log_train_every = 100
)

print()
for y in range(K):
    mu_idx = [il*K + y for il in ind_lfs]
    err = np.linalg.norm(label_model.mu[mu_idx,:].detach().numpy() - 
                         cb_model.cond_probs[:,y+1,:])/(M*1.0)
    print(f"Error between mu and cond_probs for y = {y+1}: {err}")
print()

Computing O^{-1}...
Estimating Z...
[100 epo]: TRAIN:[loss=8.079]
[200 epo]: TRAIN:[loss=8.028]
[300 epo]: TRAIN:[loss=8.028]
[400 epo]: TRAIN:[loss=8.028]
[500 epo]: TRAIN:[loss=8.028]
[600 epo]: TRAIN:[loss=8.028]
[700 epo]: TRAIN:[loss=8.028]
[800 epo]: TRAIN:[loss=8.028]
[900 epo]: TRAIN:[loss=8.028]
[1000 epo]: TRAIN:[loss=8.028]
Finished Training
Estimating \mu...
[100 epo]: TRAIN:[loss=0.067]
[200 epo]: TRAIN:[loss=0.037]
[300 epo]: TRAIN:[loss=0.021]
[400 epo]: TRAIN:[loss=0.012]
[500 epo]: TRAIN:[loss=0.007]
[600 epo]: TRAIN:[loss=0.004]
[700 epo]: TRAIN:[loss=0.002]
[800 epo]: TRAIN:[loss=0.001]
[900 epo]: TRAIN:[loss=0.001]
[1000 epo]: TRAIN:[loss=0.000]
Finished Training

Error between mu and cond_probs for y = 1: 0.0028541550040245057
Error between mu and cond_probs for y = 2: 0.003642233833670616

CPU times: user 22.1 s, sys: 2.64 s, total: 24.7 s
Wall time: 1.47 s


In [162]:
# Score the current `label_model` on the generated labels
print("Trained Label Model Metrics:")
scores = label_model.score(
    (data.L, data.Y),
    metric=["accuracy", "precision", "recall", "f1"],
)

Trained Label Model Metrics:
Accuracy: 0.974
Precision: 0.972
Recall: 0.975
F1: 0.974
        y=1    y=2   
 l=1   4924    140   
 l=2    125   4811   


### Step 4b: Compare to `MajorityLabelVoter` and a `LabelModel` trained without class balance or dependencies

**Baseline: No Class Balance or Dependencies**

In [163]:
%%time
from metal.label_model import LabelModel
label_model = LabelModel(k=K, seed=123)

label_model.train_model(data.L, lr=1e-3, n_epochs=500, log_train_every=50)

Computing O...
Estimating \mu...
[50 epo]: TRAIN:[loss=0.114]
[100 epo]: TRAIN:[loss=0.017]
[150 epo]: TRAIN:[loss=0.014]
[200 epo]: TRAIN:[loss=0.014]
[250 epo]: TRAIN:[loss=0.014]
[300 epo]: TRAIN:[loss=0.014]
[350 epo]: TRAIN:[loss=0.014]
[400 epo]: TRAIN:[loss=0.014]
[450 epo]: TRAIN:[loss=0.014]
[500 epo]: TRAIN:[loss=0.014]
Finished Training
CPU times: user 14 s, sys: 88 ms, total: 14.1 s
Wall time: 516 ms


In [164]:
# Score whichever model `label_model` currently refers to
# (the baseline model when the cells are run in order)
print("Trained Label Model Metrics:")
scores = label_model.score(
    (data.L, data.Y),
    metric=["accuracy", "precision", "recall", "f1"],
)

Trained Label Model Metrics:
Accuracy: 0.968
Precision: 0.976
Recall: 0.960
F1: 0.968
        y=1    y=2   
 l=1   4846    119   
 l=2    203   4832   


**Baseline: Majority Vote**

In [165]:
from metal.label_model.baselines import MajorityLabelVoter

# No train_model call here: the voter is scored directly on the same data
mv = MajorityLabelVoter(k=K, seed=123)
print("Majority Label Voter Metrics:")
scores = mv.score(
    (data.L, data.Y),
    metric=["accuracy", "precision", "recall", "f1"],
)

Majority Label Voter Metrics:
Accuracy: 0.967
Precision: 0.973
Recall: 0.961
F1: 0.967
        y=1    y=2   
 l=1   4854    135   
 l=2    195   4816   
