# Task 1

* Name: Sachin Shivaramaiah
* Student ID: 341994037

In [34]:
from IPython.display import Image
from pgmpy.utils import get_example_model
from pgmpy.models import BayesianNetwork
from pgmpy.factors.discrete import TabularCPD
from pgmpy.estimators import MaximumLikelihoodEstimator
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from pgmpy.inference import VariableElimination
from pgmpy.sampling import BayesianModelSampling,GibbsSampling
import numpy as np
from pgmpy.factors.discrete import State
import json
import random
import itertools
import os
import shutil
import zipfile
from pgmpy.readwrite import BIFReader, BIFWriter

In [36]:
def visualize(bayesianNet, file_name = 'bayesNet'):
    viz = bayesianNet.to_graphviz()
    viz.draw('%s.png' % file_name, prog='dot')
    return Image('%s.png' % file_name)

#### Load your model using BIFReader


In [40]:
bif_reader = BIFReader("A2_task1_model.bif")
bn_model = bif_reader.get_model()

Q1: Compute P(C_NI_12_00=2) and P(C_NI_12_45=2), and justify your answers.

In [42]:
# Step 2: Perform exact inference using Variable Elimination
inference = VariableElimination(bn_model)


result_C_NI_12_00 = inference.query(variables=["C_NI_12_00"])
print(f"P(C_NI_12_00=2): {result_C_NI_12_00.values[2]}")  # Index 2 corresponds to P(C_NI_12_00=2)


result_C_NI_12_45 = inference.query(variables=["C_NI_12_45"])
print(f"P(C_NI_12_45=2): {result_C_NI_12_45.values[2]}")  # Index 2 corresponds to P(C_NI_12_45=2)

P(C_NI_12_00=2): 0.4144947797980204
P(C_NI_12_45=2): 0.3166464815775267


##### Explanation: Variable Elimination is a widely used exact inference algorithm in probabilistic graphical models, especially in Bayesian Networks. It computes marginal distributions for specific variables by eliminating the other variables systematically.

##### Variable Elimination guarantees exact results. This means that the computed probabilities like P(C_NI_12_00=2) and P(C_NI_12_45=2) are accurate, without any approximations

##### Variable Elimination efficiently handles marginalization i.e. summing over unobserved variables by eliminating one variable at a time and keeping track of the intermediate factors. This makes it well-suited for queries like P(C_NI_12_00=2) and P(C_NI_12_45=2) where we want to sum over all the other variables except the one of interest

Explain your approach and justify your answers...



Q2: Answer the following questions regarding d-separation,explain why two variables are d-separated or
d-connected by listing all relevant blocked and unblocked paths. Additionally, explain why each of those
paths is blocked or unblocked.

#### Functionality of the below code 
* Pathfinding: Identifies all possible paths between two nodes in a Bayesian network using networkx.
* Collider vs. Non-collider Check: Differentiates between colliders (nodes with two incoming arrows) and non-colliders (chains or forks).
* Colliders: Path is blocked if the collider or its descendants aren’t conditioned on.
* Non-colliders: Path is blocked if any node on the path is conditioned.
* Conditioning Set: The code checks if paths are blocked based on a given set of conditioned variables.
* D-Separation Check: If all paths are blocked, the two nodes are d-separated (conditionally independent). If any path is unblocked, they are d-connected (dependent).

In [62]:
import networkx as nx

def find_paths(graph, source, target):
    """Function to find all paths between two nodes in the Bayesian network graph."""
    return list(nx.all_simple_paths(graph, source, target))

def is_blocked_by_conditioning(path, conditioning_set, bn_model):
    """Check if a path is blocked by the variables in the conditioning set."""
    blocked = False
    for i in range(len(path) - 1):
        node = path[i]
        next_node = path[i + 1]
        
        # Check if the path involves a collider
        if bn_model.has_edge(next_node, node):  # Collider case (arrow pointing inwards)
            if node not in conditioning_set and next_node not in conditioning_set:
                # Path is blocked if neither the collider nor its descendant is conditioned upon
                blocked = True
        else:  # Non-collider case (chain or fork)
            if node in conditioning_set:
                # Path is blocked if a non-collider node is conditioned upon
                blocked = True
    return blocked

def analyze_paths_between_nodes(bn_model, source, target, conditioning_set):
    """Function to analyze all paths between source and target and determine if they are blocked or unblocked."""
    # Convert BayesianNetwork to a networkx graph
    graph = nx.DiGraph(bn_model.edges())
    
    # Find all paths between source and target
    paths = find_paths(graph, source, target)
    
    print(f"Total paths between {source} and {target}: {len(paths)}\n")
    
    unblocked_paths_found = False
    blocked_paths = 0
    
    # Check each path for blockage
    for path in paths:
        print(f"Analyzing path: {' -> '.join(path)}")
        
        if is_blocked_by_conditioning(path, conditioning_set, bn_model):
            print(f"Path is BLOCKED by conditioning on {conditioning_set}\n")
            blocked_paths += 1
        else:
            print(f"Path is UNBLOCKED\n")
            unblocked_paths_found = True
    
    # Determine if the nodes are d-separated or d-connected
    if unblocked_paths_found:
        print(f"Conclusion: {source} and {target} are **d-connected** (not d-separated) given {conditioning_set}\n")
    else:
        print(f"Conclusion: {source} and {target} are **d-separated** given {conditioning_set}. All paths are blocked.\n")
    
    print(f"Number of blocked paths: {blocked_paths}")
    if unblocked_paths_found:
        print("At least one unblocked path found.\n")

#### a) Are C_NI_12_00 and C_NI_12_45 d-separated?

In [65]:
# Example of loading the Bayesian network model from .bif file
from pgmpy.readwrite import BIFReader

bif_reader = BIFReader('A2_task1_model.bif')
bn_model = bif_reader.get_model()

# Q2a: Analyze paths without any conditioning
print("Q2a: Checking d-separation without any conditioning")
analyze_paths_between_nodes(bn_model, "C_NI_12_00", "C_NI_12_45", conditioning_set=[])

Q2a: Checking d-separation without any conditioning
Total paths between C_NI_12_00 and C_NI_12_45: 1

Analyzing path: C_NI_12_00 -> C_NI_12_15 -> C_NI_12_30 -> C_NI_12_45
Path is UNBLOCKED

Conclusion: C_NI_12_00 and C_NI_12_45 are **d-connected** (not d-separated) given []

Number of blocked paths: 0
At least one unblocked path found.



#### b) Are C_NI_12_00 and C_NI_12_45 d-separated by ['CNOD_12_45', 'CKND_12_45']?

In [67]:
# Q2b: Analyze paths with conditioning on ['CNOD_12_45', 'CKND_12_45']
print("\nQ2b: Checking d-separation with conditioning on ['CNOD_12_45', 'CKND_12_45']")
analyze_paths_between_nodes(bn_model, "C_NI_12_00", "C_NI_12_45", conditioning_set=['CNOD_12_45', 'CKND_12_45'])


Q2b: Checking d-separation with conditioning on ['CNOD_12_45', 'CKND_12_45']
Total paths between C_NI_12_00 and C_NI_12_45: 1

Analyzing path: C_NI_12_00 -> C_NI_12_15 -> C_NI_12_30 -> C_NI_12_45
Path is UNBLOCKED

Conclusion: C_NI_12_00 and C_NI_12_45 are **d-connected** (not d-separated) given ['CNOD_12_45', 'CKND_12_45']

Number of blocked paths: 0
At least one unblocked path found.



#### c)Are C_NI_12_00 and C_NI_12_45 d-separated by ['CKNI_12_15']?

In [69]:
# Q2c: Analyze paths with conditioning on ['CKNI_12_15']
print("\nQ2c: Checking d-separation with conditioning on ['CKNI_12_15']")
analyze_paths_between_nodes(bn_model, "C_NI_12_00", "C_NI_12_45", conditioning_set=['CKNI_12_15'])


Q2c: Checking d-separation with conditioning on ['CKNI_12_15']
Total paths between C_NI_12_00 and C_NI_12_45: 1

Analyzing path: C_NI_12_00 -> C_NI_12_15 -> C_NI_12_30 -> C_NI_12_45
Path is UNBLOCKED

Conclusion: C_NI_12_00 and C_NI_12_45 are **d-connected** (not d-separated) given ['CKNI_12_15']

Number of blocked paths: 0
At least one unblocked path found.



#### d) Find a smallest set of variables that make C_NI_12_00 and C_NI_12_45 d-separated. Explain your solution in details. If there are multiple smallest sets, choose one of them as your solution.

* Objective: This function is designed to identify the minimal set of variables that d-separates two nodes in a Bayesian network. D-separation is a fundamental concept in Bayesian networks, determining conditional independence between variables.
* Exploring Combinations: By iterating over all possible combinations of variables (starting with smaller sets), the function explores how different sets of variables can block all causal paths between the source and target nodes.
* Path Blocking: D-separation involves blocking all paths between two variables. A path is blocked if conditioning on a set of variables satisfies the rules for blocking like colliders, chains, and forks. The function checks if the selected combination of nodes successfully blocks all paths between the source and target.
* Minimal Separation: The function identifies the smallest conditioning set that ensures d-separation. If all paths are blocked by the conditioning set, this is considered the minimal d-separating set, ensuring conditional independence.
* Significance: The process helps determine the most efficient conditions for d-separation without over-conditioning, improving understanding of causal relationships in Bayesian networks.


In [71]:
from itertools import combinations

def find_minimal_d_separation(bn_model, source, target):
    """
    Finds the smallest set of variables that d-separates source and target.
    """
    # Convert the BayesianNetwork to a networkx graph
    graph = nx.DiGraph(bn_model.edges())
    
    # Get all nodes in the Bayesian network, excluding source and target
    all_nodes = set(graph.nodes()) - {source, target}
    
    # Iterate through all combinations of nodes, starting with smaller sets
    for r in range(1, len(all_nodes) + 1):
        for conditioning_set in combinations(all_nodes, r):
            print(f"Trying conditioning set: {conditioning_set}")
            paths = find_paths(graph, source, target)
            
            all_blocked = True  
            for path in paths:
                if not is_blocked_by_conditioning(path, conditioning_set, bn_model):
                    all_blocked = False  # If any path is unblocked, this set doesn't work
                    break
            
            if all_blocked:
                # If all paths are blocked, this is a valid d-separating set
                return conditioning_set
    
    return None  # Return None if no d-separating set is found 

# Finding the smallest d-separating set for Q2d
print("Q2d: Finding the smallest set that d-separates C_NI_12_00 and C_NI_12_45")
smallest_d_separation_set = find_minimal_d_separation(bn_model, "C_NI_12_00", "C_NI_12_45")

if smallest_d_separation_set:
    print(f"\nSmallest d-separating set: {smallest_d_separation_set}")
else:
    print("No d-separating set found.")

Q2d: Finding the smallest set that d-separates C_NI_12_00 and C_NI_12_45
Trying conditioning set: ('CNOD_12_45',)
Trying conditioning set: ('CBODN_12_15',)
Trying conditioning set: ('CKNN_12_30',)
Trying conditioning set: ('CKNI_12_30',)
Trying conditioning set: ('CNON_12_00',)
Trying conditioning set: ('CBODD_12_45',)
Trying conditioning set: ('CNOD_12_00',)
Trying conditioning set: ('CKNI_12_15',)
Trying conditioning set: ('CKND_12_45',)
Trying conditioning set: ('CNON_12_15',)
Trying conditioning set: ('C_NI_12_30',)

Smallest d-separating set: ('C_NI_12_30',)


Q3: Compute P(C_NI_12_00=1,C_NI_12_45=1 | CNOD_12_45=0, CKND_12_45=1) and explain why a
particular inference method is chosen.

In [75]:
from pgmpy.inference import VariableElimination

# Assuming bn_model is already loaded
# The Bayesian network is represented by bn_model

# Step 1: Initialize Variable Elimination
inference = VariableElimination(bn_model)

# Step 2: Set the evidence variables (state names should be strings)
# CNOD_12_45 = '0' and CKND_12_45 = '1'
evidence = {'CNOD_12_45': '0', 'CKND_12_45': '1'}

# Step 3: Compute joint probability P(C_NI_12_00=1, C_NI_12_45=1 | CNOD_12_45=0, CKND_12_45=1)
result_joint = inference.query(variables=['C_NI_12_00', 'C_NI_12_45'], evidence=evidence)

# Step 4: Extract the specific joint probability for C_NI_12_00 = '1' and C_NI_12_45 = '1'
joint_prob = result_joint.values[1, 1]  # Index 1 corresponds to C_NI_12_00='1' and C_NI_12_45='1'

# Step 5: Output the joint probability
print(f"Joint Probability P(C_NI_12_00=1, C_NI_12_45=1 | CNOD_12_45=0, CKND_12_45=1): {joint_prob}")


Joint Probability P(C_NI_12_00=1, C_NI_12_45=1 | CNOD_12_45=0, CKND_12_45=1): 0.053227538795820414


This is a method used for exact inference in Bayesian networks, meaning it gives precise probabilities. Variable Elimination eliminates the unnecessary variables one by one to compute the required probabilities efficiently.

Conditioning on Evidence: The evidence provided (CNOD_12_45=0, CKND_12_45=1) is used to update the network, restricting the possible values for CNOD_12_45 and CKND_12_45. This is the conditional aspect of the joint probability.

Marginalizing Over Unnecessary Variables: Variable Elimination sums over all possible configurations of other random variables that aren’t directly relevant to the query i.e., variables other than C_NI_12_00, C_NI_12_45, CNOD_12_45, and CKND_12_45.

Result: The result, P(C_NI_12_00=1, C_NI_12_45=1), is then derived by summing the joint distribution and incorporating the effects of the evidence provided.

Process:
Query Setup: The query asks for the joint probability of two variables (C_NI_12_00=1 and C_NI_12_45=1), conditioned on the evidence (CNOD_12_45=0, CKND_12_45=1).

Exact Inference:
Using the Variable Elimination method, we start with the full joint distribution over the network.
We fix the values of the evidence nodes (CNOD_12_45=0, CKND_12_45=1).
We eliminate (marginalize out) the other variables not relevant to the query i.e., any variables that are neither evidence nor query variables.

Result: The algorithm calculates the exact joint probability of C_NI_12_00=1 and C_NI_12_45=1, given the evidence.

Explain your approach and justify your answers...


Q4: Put the Q4 question here from the question sheet.

In [81]:
from pgmpy.inference import VariableElimination

# Initialize the variable elimination algorithm
inference = VariableElimination(bn_model)

# Evidence: CNOD_12_45=2, CKND_12_45=0
evidence = {'CNOD_12_45': '2', 'CKND_12_45': '0'}

# Step 3: Query the conditional probability of C_NI_12_45 given the evidence
result = inference.query(variables=['C_NI_12_45'], evidence=evidence)

# Step 4: Calculate the expected value (weighted sum of values 0, 1, 2)
expected_value_exact = sum(value * prob for value, prob in enumerate(result.values))

# Output the expected value
print(f"Expected value (Exact Inference) of C_NI_12_45: {expected_value_exact}")

Expected value (Exact Inference) of C_NI_12_45: 0.9422625822278925


* Variable Elimination algorithm is used for exact inference in a Bayesian network.
* Evidence is provided: CNOD_12_45=2 and CKND_12_45=0.
* The conditional probability of C_NI_12_45 is queried given the evidence.
* The expected value of C_NI_12_45 is calculated as a weighted sum of possible values (0, 1, 2) multiplied by their respective probabilities.
* The result represents the expected value of C_NI_12_45 using exact inference, providing a precise prediction based on the network and evidence.

Explain your approach and justify your answers...


In [86]:
from pgmpy.sampling import GibbsSampling
import numpy as np

# Initialize Gibbs Sampling on the Bayesian network model
gibbs = GibbsSampling(bn_model)

# Evidence: CNOD_12_45=2, CKND_12_45=0
evidence = {'CNOD_12_45': 2, 'CKND_12_45': 0}

# Generate 1000 samples using Gibbs Sampling
samples = gibbs.sample(size=1000, evidence=evidence)

# Extract the 'C_NI_12_45' column
samples_C_NI_12_45 = samples['C_NI_12_45'].astype(int)

# Compute the expected value of C_NI_12_45 from the samples
expected_value_approx = np.mean(samples_C_NI_12_45)

# Output the expected value
print(f"Expected value (Approximate Inference - Gibbs Sampling) of C_NI_12_45: {expected_value_approx}")




KeyboardInterrupt: 