In [1]:
import pyagrum as gum
import pyagrum.lib.notebook as gnb
import numpy as np 
import pandas as pd
from sklearn.model_selection import train_test_split
from math import prod
from sklearn.metrics import confusion_matrix

In [2]:
np.random.seed(42)
gum.initRandom(seed=42)

In [3]:
# Import BN
gt = gum.loadBN(f"./gt_bns/cancer.bif")

gnb.flow.row(gt)

In [4]:
# Generate data
g = gum.BNDatabaseGenerator(gt)
g.drawSamples(1000)
g.setDiscretizedLabelModeRandom()
data = g.to_pandas()

data.describe()

Unnamed: 0,Pollution,Smoker,Cancer,Xray,Dyspnoea
count,1000,1000,1000,1000,1000
unique,2,2,2,2,2
top,low,False,False,negative,False
freq,905,711,992,791,686


In [5]:
## Inference example
# Create object for inference
gt_tmp = gum.BayesNet(gt)
gt_ie = gum.LazyPropagation(gt_tmp)

# Set evidence
evid = {"Xray": "positive", "Dyspnoea": "True"}
gt_ie.setEvidence(evid)

# Set target
target = "Cancer"

# Compute posterior
p = gt_ie.posterior(target)
print(f"+++ Posterior of '{target}': {p}")

# Compute MPE
mpe_idx = p.argmax()[0][0].get(target)
print(f"+++ MPE for '{target}': {gt.variable(target).label(mpe_idx)}")

+++ Posterior of 'Cancer': 
  Cancer           |
True     |False    |
---------|---------|
 0.1029  | 0.8971  |

+++ MPE for 'Cancer': False


In [6]:
# Split data
train, test = train_test_split(data, test_size=0.3)

# Learn bn from train
bn_learner=gum.BNLearner(train)
bn_learner.useSmoothingPrior(1e-5)
bn = bn_learner.learnParameters(gt.dag())

In [7]:
# Create noisy bn (Zhang et al., 2017)
bn_ie = gum.LazyPropagation(bn)
bn_ie.makeInference()

d = bn.size()                   # number of nodes
n = len(data)                   # sample size
eps = 0.1                       # smaller 'eps' = more privacy.
scale = (2 * d) / (n * eps)     # scale for Laplace distrib. (mean=0)

bn_noisy = gum.BayesNet(bn)

# For each node ...
for node in bn.names():

    # Get the joint P(X, Pa(X))
    joint = bn_ie.jointPosterior(bn.family(node))

    # Add noise to P(X, Pa(X)) and normalize
    noise = np.random.laplace(scale=scale, size=joint.shape)
    noisy_joint = np.clip(joint.toarray() + noise, a_min=10e-8, a_max=None)
    noisy_joint = noisy_joint / np.sum(noisy_joint)
    joint.fillWith(noisy_joint.flatten())

    # Compute the conditional P(X | Pa(X))
    cond = joint / joint.sumOut(node)

    # Fill noisy BN
    bn_noisy.cpt(node).fillWith(cond)

# Check noisy bn
bn_noisy.check()                # OK if = ().

()

In [8]:
# Plot
idx = 4
gnb.flow.row(gt, gt.cpt(idx), bn.cpt(idx), bn_noisy.cpt(idx), captions=["GT", "GT CPT", "BN CPT", "Noisy BN CPT"])

Unnamed: 0_level_0,Dyspnoea,Dyspnoea
Cancer,True,False
True,0.65,0.35
False,0.3,0.7

Unnamed: 0_level_0,Dyspnoea,Dyspnoea
Cancer,False,True
False,0.6999,0.3001
True,0.4286,0.5714

Unnamed: 0_level_0,Dyspnoea,Dyspnoea
Cancer,False,True
False,0.6197,0.3803
True,0.0,1.0


In [9]:
# Define MPE function
def mpe(bn_ie, row, target_var, evid_vars):

    # Set evidence from 'evid_vars'
    evid = dict(row.loc[evid_vars])
    bn_ie.setEvidence(evid)

    # Compute P(target_var | evidence)
    posterior = bn_ie.posterior(target_var)

    # Compute MPE
    mpe_idx = posterior.argmax()[0][0].get(target_var)
    mpe = bn.variable(target_var).label(mpe_idx)

    return mpe

# Inference on test set
bn_ie = gum.LazyPropagation(bn)
bn_noisy_ie = gum.LazyPropagation(bn_noisy)

target_var = "Cancer"
evid_vars = ["Dyspnoea", "Xray"]

true = test["Cancer"]

pred_bn = test.apply(lambda x: mpe(bn_ie, x, target_var, evid_vars), axis = 1)
pred_bn_noisy = test.apply(lambda x: mpe(bn_noisy_ie, x, target_var, evid_vars), axis = 1)

In [10]:
# Confusion matrix
'''
C_{ij} means i, predicted as j

Predicted
 -----------
| TP  |  FN |
|-----------|
| FP  |  TN |
 -----------
 
'''

labels = ["True", "False"]
print("Labels:", labels) 
print("\n")
print("BN:\n ", confusion_matrix(true, pred_bn, labels=labels))
print("\n")
print("Noisy BN:\n ", confusion_matrix(true, pred_bn_noisy, labels=labels))

Labels: ['True', 'False']


BN:
  [[  0   1]
 [  0 299]]


Noisy BN:
  [[  1   0]
 [ 24 275]]
