In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

import sys
sys.path.append("../") # go to parent dir

import numpy as np
import torch
from torch import nn, optim
import matplotlib.pyplot as plt

from itertools import product

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
# Seed the NumPy and PyTorch global random generators so that the random draws
# made later in this notebook are repeatable across runs.
np.random.seed(123)
torch.manual_seed(123)

<torch._C.Generator at 0x7faf33c1b9f0>

### Step 1: Generate Data

In [3]:
from synthetic.generate import SingleTaskTreeDepsGenerator

# Problem size passed to the generator: K classes, M labeling functions (LFs),
# N data points.
K = 2
M = 10
N = 10000

# Ground-truth class balance that later cells try to recover: start from the
# uniform distribution, add one random perturbation per class, then renormalize
# so the entries sum to 1.
uniform_prior = np.ones(K) / K
perturbation = np.random.random(K) / 3.
class_balance = uniform_prior + perturbation
class_balance = class_balance / class_balance.sum()

# Generate the data with a single dependency edge between LF 0 and LF 1.
data = SingleTaskTreeDepsGenerator(N, M, K, class_balance, edges=[(0, 1)])
print(f"LF Dependencies: {data.E}")
print(f"Class Balance: {data.p}")

LF Dependencies: [(0, 1)]
Class Balance: [0.55151521 0.44848479]


In [4]:
from metal.analysis import lf_summary
# Display the per-LF summary table for the label matrix data.L, evaluated
# against the labels in data.Y. Kept as the last expression so the notebook
# renders the returned table.
lf_summary(data.L,data.Y)

Unnamed: 0,Polarity,Coverage,Overlaps,Conflicts,Correct,Incorrect,Emp. Acc.
0,"[1, 2]",0.8838,0.8838,0.7348,7668,1170,0.867617
1,"[1, 2]",0.9546,0.9546,0.7919,9095,451,0.952755
2,"[1, 2]",0.8089,0.8089,0.6776,6170,1919,0.762764
3,"[1, 2]",0.7636,0.7636,0.6415,5310,2326,0.69539
4,"[1, 2]",0.8361,0.8361,0.698,6683,1678,0.799306
5,"[1, 2]",0.855,0.855,0.7117,7126,1424,0.83345
6,"[1, 2]",0.923,0.923,0.7657,8414,816,0.911593
7,"[1, 2]",0.8526,0.8526,0.7105,7029,1497,0.824419
8,"[1, 2]",0.7597,0.7597,0.6416,5237,2360,0.689351
9,"[1, 2]",0.7497,0.7497,0.6371,5085,2412,0.678271


### Step 2: Learn Dependencies using `DependencyLearnerModel`
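**TEMP: dependency learning is not run yet; for now the edges are hardcoded by reusing the generator's own edge list (`data.E`).**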

In [5]:
# Dependency learning is skipped for now: reuse the edge list exposed by the
# data generator as the set of known LF dependencies.
edges = data.E
# Dense ndarray copy of the label matrix (data.L exposes .todense(), so it is
# presumably stored as a sparse matrix).
L = np.array(data.L.todense())

### Step 3: Recover the class balance using the subset of independent LFs

In [6]:
# Generate indices for independent LFs
ind_lfs = []
for i in range(M):
    if i not in list(sum(edges, ())):
        ind_lfs.append(i)
L_ind = L[:,ind_lfs]

In [7]:
%%time
from metal.label_model.class_balance import ClassBalanceModel

# Fit the class-balance model on the independent-LF columns only (L_ind).
# NOTE(review): abstains=True presumably tells the model that LFs may abstain
# on some points (coverage in the summary table is below 1) -- confirm against
# the ClassBalanceModel API.
cb_model = ClassBalanceModel(K, abstains=True)
cb_model.train_model(L=L_ind, verbose=False)

# Compare the estimate with the ground-truth balance used to generate the data.
print(f"Estimated class balance: {cb_model.class_balance}")
print(f"True class balance: {class_balance}")
print()

Estimated class balance: [0.5619966  0.43781662]
True class balance: [0.55151521 0.44848479]

CPU times: user 5.42 s, sys: 12 ms, total: 5.43 s
Wall time: 929 ms


# Debugging

**Model with no Class Balance Passed In**

In [8]:
from metal.label_model import LabelModel

# Dense copies of the label matrix and of the labels in data.Y.
# Y is only needed by the disabled experiment that appends the labels to L as
# one extra column: L = np.vstack((L.T, Y)).T
L = np.array(data.L.todense())
Y = np.array(data.Y)

# Train on all LFs with the known dependency edges; no class balance is
# supplied to train_model in this variant.
label_model = LabelModel(verbose=True, k=K, seed=123)
train_settings = dict(
    deps=edges,
    lr=1e-3,
    n_epochs=500,
    log_train_every=100,
)
label_model.train_model(L, **train_settings)

# Report metrics against data.Y.
print()
print('Trained Label Model Metrics:')
eval_metrics = ['accuracy', 'precision', 'recall', 'f1']
scores = label_model.score((data.L, data.Y), metric=eval_metrics)

Computing O^{-1}...
Estimating Z...
[100 epo]: TRAIN:[loss=10.870]
[200 epo]: TRAIN:[loss=10.714]
[300 epo]: TRAIN:[loss=10.714]
[400 epo]: TRAIN:[loss=10.714]
[500 epo]: TRAIN:[loss=10.714]
Finished Training
Estimating \mu...
[100 epo]: TRAIN:[loss=0.010]
[200 epo]: TRAIN:[loss=0.005]
[300 epo]: TRAIN:[loss=0.003]
[400 epo]: TRAIN:[loss=0.002]
[500 epo]: TRAIN:[loss=0.001]
Finished Training

Trained Label Model Metrics:
Accuracy: 0.986
Precision: 0.989
Recall: 0.985
F1: 0.987
        y=1    y=2   
 l=1   5491    61    
 l=2    83    4365   


**Model with Class Balance Passed In**

In [15]:
from metal.label_model import LabelModel

# Dense copies of the label matrix and of the labels in data.Y.
# Y is only needed by the disabled experiment that appends the labels to L as
# one extra column: L = np.vstack((L.T, Y)).T
L = np.array(data.L.todense())
Y = np.array(data.Y)

# Same training setup as the variant without a class balance, except that the
# balance estimated by cb_model is passed to train_model.
estimated_balance = cb_model.class_balance.astype(float)
label_model = LabelModel(verbose=True, k=K, seed=123)
label_model.train_model(
    L,
    deps=edges,
    lr=1e-3,
    n_epochs=500,
    class_balance=estimated_balance,
    log_train_every=100,
)

# Report metrics against data.Y.
print()
print('Trained Label Model Metrics:')
eval_metrics = ['accuracy', 'precision', 'recall', 'f1']
scores = label_model.score((data.L, data.Y), metric=eval_metrics)

Computing O^{-1}...
Estimating Z...
[100 epo]: TRAIN:[loss=10.870]
[200 epo]: TRAIN:[loss=10.714]
[300 epo]: TRAIN:[loss=10.714]
[400 epo]: TRAIN:[loss=10.714]
[500 epo]: TRAIN:[loss=10.714]
Finished Training
Estimating \mu...
[100 epo]: TRAIN:[loss=0.004]
[200 epo]: TRAIN:[loss=0.002]
[300 epo]: TRAIN:[loss=0.001]
[400 epo]: TRAIN:[loss=0.001]
[500 epo]: TRAIN:[loss=0.000]
Finished Training

Trained Label Model Metrics:
Accuracy: 0.981
Precision: 0.972
Recall: 0.994
F1: 0.983
        y=1    y=2   
 l=1   5539    159   
 l=2    35    4267   


### Step 4: Train `LabelModel` using $\mu$ from `ClassBalanceModel`

In [16]:
%%time
from metal.label_model import LabelModel
label_model = LabelModel(verbose=True,k=K, seed=123)

# Train with the conditional probabilities estimated by cb_model for the
# independent LFs (ind_lfs), plus the known dependency edges.
label_model.train_model(
    data.L, 
    cond_probs = cb_model.cond_probs, 
    ind_lfs = ind_lfs, 
    deps = edges, 
    # The estimated class balance is deliberately not passed in this variant:
    #class_balance = cb_model.class_balance.astype(float),
    lr = 1e-3, 
    n_epochs = 500, 
    log_train_every = 100
)

# For each class, measure how far the trained mu is from mu_init on the entries
# that belong to the independent LFs.
print()
for y in range(K):
    # Row il*K + y is selected for every independent LF il.
    # NOTE(review): this assumes mu stores K consecutive rows per LF -- confirm
    # against the LabelModel implementation.
    mu_idx = [il*K + y for il in ind_lfs]
    # L2 norm of (mu - mu_init) over the selected entries, divided by M.
    # NOTE(review): the divisor is M (all LFs) although only len(ind_lfs)
    # entries enter the norm -- confirm this normalization is intended.
    diff = np.linalg.norm(label_model.mu[mu_idx,y].detach().numpy() - 
                         label_model.mu_init[mu_idx,y].detach().numpy())/(M*1.0)
    # Classes are reported 1-indexed (y+1).
    print(f"Diff between mu and mu_init for y = {y+1}: {diff}")
print()

Computing O^{-1}...
Estimating Z...
[100 epo]: TRAIN:[loss=10.870]
[200 epo]: TRAIN:[loss=10.714]
[300 epo]: TRAIN:[loss=10.714]
[400 epo]: TRAIN:[loss=10.714]
[500 epo]: TRAIN:[loss=10.714]
Finished Training
Estimating \mu...
[100 epo]: TRAIN:[loss=0.006]
[200 epo]: TRAIN:[loss=0.003]
[300 epo]: TRAIN:[loss=0.002]
[400 epo]: TRAIN:[loss=0.001]
[500 epo]: TRAIN:[loss=0.001]
Finished Training

Diff between mu and mu_init for y = 1: 0.009306653589010238
Diff between mu and mu_init for y = 2: 0.009698057174682617

CPU times: user 10.4 s, sys: 2.43 s, total: 12.8 s
Wall time: 611 ms


In [18]:
# Score whichever label_model is currently in scope against the labels in
# data.Y; the result overwrites the global `scores`.
print('Trained Label Model Metrics:')
scores = label_model.score((data.L, data.Y), metric=['accuracy','precision', 'recall', 'f1'])

Trained Label Model Metrics:
Accuracy: 0.985
Precision: 0.990
Recall: 0.983
F1: 0.986
        y=1    y=2   
 l=1   5477    54    
 l=2    97    4372   


**With $\mu$ and `class_balance`**

In [19]:
%%time
from metal.label_model import LabelModel

label_model = LabelModel(verbose=True, k=K, seed=123)

# Train with both outputs of cb_model: the conditional probabilities for the
# independent LFs and the estimated class balance.
fit_options = {
    "cond_probs": cb_model.cond_probs,
    "ind_lfs": ind_lfs,
    "deps": edges,
    "class_balance": cb_model.class_balance.astype(float),
    "lr": 1e-3,
    "n_epochs": 500,
    "log_train_every": 100,
}
label_model.train_model(data.L, **fit_options)

# Report metrics against data.Y.
print('Trained Label Model Metrics:')
scores = label_model.score(
    (data.L, data.Y),
    metric=['accuracy', 'precision', 'recall', 'f1'],
)

Computing O^{-1}...
Estimating Z...
[100 epo]: TRAIN:[loss=10.870]
[200 epo]: TRAIN:[loss=10.714]
[300 epo]: TRAIN:[loss=10.714]
[400 epo]: TRAIN:[loss=10.714]
[500 epo]: TRAIN:[loss=10.714]
Finished Training
Estimating \mu...
[100 epo]: TRAIN:[loss=0.006]
[200 epo]: TRAIN:[loss=0.003]
[300 epo]: TRAIN:[loss=0.002]
[400 epo]: TRAIN:[loss=0.001]
[500 epo]: TRAIN:[loss=0.001]
Finished Training
Trained Label Model Metrics:
Accuracy: 0.980
Precision: 0.971
Recall: 0.994
F1: 0.982
        y=1    y=2   
 l=1   5542    167   
 l=2    32    4259   
CPU times: user 15.8 s, sys: 1.97 s, total: 17.8 s
Wall time: 1.16 s


**Baseline: Majority Vote**

In [20]:
from metal.label_model.baselines import MajorityLabelVoter

# Baseline to compare against the trained label models. No train_model call is
# made here; the voter is scored directly on the full label matrix.
mv = MajorityLabelVoter(k=K,seed=123)
print('Majority Label Voter Metrics:')
scores = mv.score((data.L, data.Y), metric=['accuracy','precision', 'recall', 'f1'])

Majority Label Voter Metrics:
Accuracy: 0.978
Precision: 0.974
Recall: 0.987
F1: 0.980
        y=1    y=2   
 l=1   5500    149   
 l=2    74    4277   
