# Bayesian Reasoning and Machine Learning

### Michel Mariën, Assignment 2

## Notebook setup

#### Installing required packages

In [1]:
#!pip install pyagrum
#!pip install scikit-learn
#!pip install graphviz
#!pip install pydot
#!pip install scikit-learn
#!pip show pyagrum
#!pip install tqdm

#### Importing required packages

In [2]:
import numpy as np
import pandas as pd
import math
import graphviz
import matplotlib.pyplot as plt
import pydot
import pyagrum as gum
import pyagrum.lib.notebook as gnb
import pyagrum.causal as csl
import pyagrum.causal.notebook as cslnb
import pyagrum as gum
import pyagrum.lib.notebook as gnb

### Data preprocessing

#### Upload data

## Bayesian Network Creation

## Causal inference

In [3]:
import pyagrum
import pyagrum.lib.explain as explain
import pyagrum.lib.bn_vs_bn as bn_vs_bn
from pylab import *

In [4]:
# Task 2.2a: Investigating the effect of sample size on structure learning
## Generate 1000 samples
## Generate network structures
## Investigate the effect of the dataset size on the structure of the resulting Bayesian network
## Do this for both learning algorithms: constraint-based (MIIC) and score-based (greedy hill climbing)
## Evaluate how well each learned network recovers the structure of the original Bayesian network (manual comparison or visualisation of the structure)

In [5]:
## Generate 1000 samples
import os

# Location of the CNC network definition. Set the CNC_BIF_PATH environment variable to
# point at a local copy; the fallback keeps the original machine-specific path.
bif_file_path = os.environ.get(
    "CNC_BIF_PATH",
    'C:\\Users\\michel.marien_icarew\\Documents\\GitHub\\BRML\\BRML- Assignment 2\\CNC_bif.bif',
)
no_samples = 1000

# Fix the random generator so the sampled database is the same on every run.
gum.initRandom(42)

bn_cnc = gum.loadBN(bif_file_path)
# Last expression of the cell: its return value is shown as the cell output.
gum.generateSample(bn_cnc, no_samples, "sample_cnc.csv", True)

sample_cnc.csv: 100%|██████████████████████████████████████|

Log2-Likelihood : -10863.703887007629





-10863.703887007629

In [None]:
# Parameter learning only: the structure is taken from the original network,
# and bn_cnc also serves as the template for variables and labels.
reference_dag = bn_cnc.dag()

learner = gum.BNLearner("sample_cnc.csv", bn_cnc)
learner.useSmoothingPrior(weight=1)

bn_cnc_2 = learner.learnParameters(reference_dag)
gnb.showBN(bn_cnc_2)

In [None]:
# Structure learning with local search + tabu list; bn_cnc is the template for
# variables and labels.
learner = gum.BNLearner("sample_cnc.csv", bn_cnc)
learner.useSmoothingPrior(weight=1)
learner.useLocalSearchWithTabuList()
print(learner)

bn_cnc_2 = learner.learnBN()

elapsed_ms = 1000 * learner.currentTime()
print(f"Learned in {elapsed_ms}ms")

#gnb.flow.row(bn_cnc, bn_cnc_2, explain.getInformation(bn_cnc_2), captions=["Original BN", "Learned BN", "information"])
gnb.flow.row(
    bn_cnc,
    bn_cnc_2,
    captions=["Original BN", "Learned BN"],
)

In [None]:
# Compare the original network with the learned one; the value returned by
# compute() is the last expression and is therefore shown as the cell output.
kl = gum.ExactBNdistance(bn_cnc, bn_cnc_2)
kl.compute()

In [None]:
import pyagrum.lib.explain as explain

# Assuming bn_cnc_2 is your learned Bayesian Network

# explain.getInformation() returns the rendering as a string.
# NOTE(review): confirm that this string is standalone SVG (and not HTML wrapping
# the graphic) before relying on the .svg extension.
svg_code = explain.getInformation(bn_cnc_2)

# Save the string to a file. The encoding is explicit so the write does not depend
# on the platform's default code page.
svg_filepath = "bn_information.svg"
with open(svg_filepath, "w", encoding="utf-8") as f:
    f.write(svg_code)

print(f"✅ Successfully saved vector graphic to '{svg_filepath}'")

In [None]:
# Display the learned network (bn_cnc_2).
gnb.showBN(bn_cnc_2)

In [None]:
# Display the original network loaded from the BIF file (bn_cnc).
gnb.showBN(bn_cnc)

In [None]:
# Display the learned network again, directly after the original, for visual comparison.
gnb.showBN(bn_cnc_2)

In [None]:
import pyagrum.lib.bn_vs_bn as bnvsbn

In [None]:
def _scores_as_html(score_table):
  """Render every entry except "count" as 'name : value' lines joined by <br/>."""
  rendered = []
  for name, value in score_table.items():
    if name == "count":
      continue
    rendered.append(f"{name} : {value:.2f}")
  return "<br/>".join(rendered)


# First row: the two networks side by side.
gnb.flow.row(bn_cnc, bn_cnc_2, captions=["bn_cnc", "bn_cnc_2"])

# Second row: graphical differences in both directions, plus the legend.
diff_original_vs_learned = bnvsbn.graphDiff(bn_cnc, bn_cnc_2)
diff_learned_vs_original = bnvsbn.graphDiff(bn_cnc_2, bn_cnc)
gnb.flow.row(
  diff_original_vs_learned,
  diff_learned_vs_original,
  bnvsbn.graphDiffLegend(),
  captions=["bn_cnc versus bn_cnc_2", "bn_cnc_2 versus bn_cnc", ""],
)

# Numeric comparison of the two structures.
gcmp = bnvsbn.GraphicalBNComparator(bn_cnc, bn_cnc_2)
gnb.flow.add_html(_scores_as_html(gcmp.skeletonScores()), "Skeleton scores")
gnb.flow.add_html(_scores_as_html(gcmp.scores()), "Scores")

gnb.flow.display()

In [None]:
# Structure learning with greedy hill climbing; bn_cnc is the template for
# variables and labels.
learner = gum.BNLearner("sample_cnc.csv", bn_cnc)
learner.useGreedyHillClimbing()
print(learner)

bn2 = learner.learnBN()

elapsed_ms = 1000 * learner.currentTime()
print(f"Learned in {elapsed_ms}ms")

# Original, learned network, their graphical diff and the information view in one row.
panel_captions = ["Original BN", "Learned BN", "Graphical diff", "information"]
gnb.sideBySide(
  bn_cnc,
  bn2,
  gnb.getBNDiff(bn_cnc, bn2),
  explain.getInformation(bn2),
  captions=panel_captions,
)

In [None]:
from IPython.display import HTML
# Show the tabu-list result (bn_cnc_2) next to the greedy hill-climbing result (bn2).
# The networks are passed directly: gnb.showBN() draws immediately instead of
# returning something sideBySide can lay out, so wrapping the arguments in it
# renders the graphs outside the table.
gnb.sideBySide(bn_cnc_2, bn2,
               ncols=2)

In [None]:
# Information rendering for the network learned with greedy hill climbing (bn2).
svg_code2 = explain.getInformation(bn2)

# Save this string to its own file. Write svg_code2 (not the string produced for
# bn_cnc_2 earlier), and use an explicit encoding so the write does not depend on
# the platform's default code page.
svg_filepath2 = "bn_information2.svg"
with open(svg_filepath2, "w", encoding="utf-8") as f:
    f.write(svg_code2)

print(f"✅ Successfully saved vector graphic to '{svg_filepath2}'")

In [None]:
# NOTE(review): `rows` is not referenced by the visible cells — confirm it is still needed.
rows = 3
# Numbers of data rows used in the sample-size experiments.
# NOTE(review): sample_cnc.csv is generated above with no_samples = 1000, so every
# size above 1000 yields the same extract — confirm the sample is regenerated with
# at least max(sizes) rows before interpreting the results.
sizes = [400, 500, 700, 1000, 2000, 5000, 10000, 50000, 75000, 100000, 150000, 175000, 200000, 300000, 500000]


def extract_cnc(n, source_path="sample_cnc.csv", target_path="extract_cnc.csv"):
  """
  Copy the first n + 1 lines of a text file to another file.

  With the defaults this copies from sample_cnc.csv to extract_cnc.csv; the extra
  line is presumably the CSV header, so n counts data rows.

  Parameters:
    n: number of lines to copy after the first line. Values below 0 copy nothing.
    source_path: file to read from (default "sample_cnc.csv").
    target_path: file to overwrite (default "extract_cnc.csv").

  If the source holds fewer than n + 1 lines, everything available is copied and
  the function stops at end of file.
  """
  line_budget = max(n + 1, 0)
  with open(source_path, "r") as src:
    with open(target_path, "w") as dst:
      for line_no, line in enumerate(src):
        if line_no >= line_budget:
          break
        dst.write(line)

In [None]:
# Effect of the sample size on greedy hill climbing: for each size, learn a network
# from the first `i` rows and compare it with the original network.
gnb.flow.clear()
nbr = 0
l = []  # log(klPQ) per sample size, in the order of `sizes`
for i in sizes:
  extract_cnc(i)
  learner = gum.BNLearner("extract_cnc.csv", bn_cnc)  # using bn as template for variables
  learner.useGreedyHillClimbing()
  print(learner.state()["Size"][0])
  extract_cnc_2 = learner.learnBN()

  # Compare against the network learned in THIS iteration, not the bn_cnc_2 left
  # over from an earlier cell; otherwise every sample size reports the same result.
  kl = gum.ExactBNdistance(bn_cnc, extract_cnc_2)
  r = kl.compute()
  l.append(np.log(r["klPQ"]))

  gnb.flow.add(gnb.getBNDiff(bn_cnc, extract_cnc_2, size="3!"), f"size={i}")

gnb.flow.display()
plt.plot(sizes, l)
plt.xlabel("sample size")
plt.ylabel("log(klPQ)")
plt.title("Greedy hill climbing")
print(f"final value computed : {l[-1]}")

In [None]:
# Effect of the sample size on local search with tabu list: for each size, learn a
# network from the first `i` rows and compare it with the original network.
gnb.flow.clear()
nbr = 0
l = []  # log(klPQ) per sample size, in the order of `sizes`
for i in sizes:
  extract_cnc(i)
  learner = gum.BNLearner("extract_cnc.csv", bn_cnc)  # using bn as template for variables
  learner.useLocalSearchWithTabuList()
  print(learner.state()["Size"][0])
  extract_cnc_2 = learner.learnBN()

  # Compare against the network learned in THIS iteration, not the bn_cnc_2 left
  # over from an earlier cell; otherwise every sample size reports the same result.
  kl = gum.ExactBNdistance(bn_cnc, extract_cnc_2)
  r = kl.compute()
  l.append(np.log(r["klPQ"]))

  gnb.flow.add(gnb.getBNDiff(bn_cnc, extract_cnc_2, size="3!"), f"size={i}")

gnb.flow.display()
plt.plot(sizes, l)
plt.xlabel("sample size")
plt.ylabel("log(klPQ)")
plt.title("Local search with tabu list")
print(f"final value computed : {l[-1]}")

In [None]:
# Task 2.2b
## Questions about the breast cancer data set