### NOTEARS: Synthetic

Random graphs $+$ Data generating model $\to$ data 

In [1]:
from HCD.notears_utils import *
from pathlib import Path
import os
import pandas as pd


# Experiment configuration: n samples, d variables, and the graph / SEM type
# strings handed to the HCD.notears_utils simulators.
n, d, graph_type, sem_type = 1000, 500, 'ER', 'logistic'
# s0 is passed to simulate_dag (presumably the target number of edges -- confirm).
s0 = 2 * d

# Column labels X0..X{d-1}. They are the same for every replicate, so build
# them once. Named `var_names` rather than `vars` to avoid shadowing the builtin.
var_names = [f"X{j}" for j in range(d)]

# All replicates live under one configuration-specific folder.
base_dir = Path(f'data/{graph_type}-{sem_type}-{n}-{d}-{s0}')

for i in range(10):
    # One seed per replicate. The simulate_* calls are kept in their original
    # order so that (presumably) the seeded random stream is consumed identically.
    set_random_seed(2025 + i)   # Do not change this
    B_true = simulate_dag(d, s0, graph_type)
    W_true = simulate_parameter(B_true)
    X = simulate_linear_sem(W_true, n, sem_type)

    # exist_ok=True keeps the cell re-runnable without a separate existence check.
    out_dir = base_dir / str(i)
    out_dir.mkdir(parents=True, exist_ok=True)

    data_df = pd.DataFrame(X, columns=var_names)
    groundtruth = pd.DataFrame(B_true, columns=var_names)

    # index=False: only the column header is written, no row labels.
    data_df.to_csv(out_dir / 'data.csv', index=False)
    groundtruth.to_csv(out_dir / 'groundtruth.csv', index=False)


### Bnlearn: Real

Deterministic graphs $+$ fixed (pre-specified) probabilistic generating model $\to$ data <br>

These deterministic elements are provided by [bnlearn](https://www.bnlearn.com/bnrepository/)

In [2]:
from pgmpy.readwrite import BIFReader

# Name of the bnlearn network to load (also used for the output folder later)
dataset = "win95pts"

# Read the BIF file for that network and build the model object from it
bif_path = f'data/bnlearn/{dataset}.bif'
reader = BIFReader(bif_path)
model = reader.get_model()

In [3]:
import pandas as pd
import networkx as nx

# `model.edges` holds the directed edges of the network loaded from the BIF file
dag = model.edges

# Rebuild the structure as a plain networkx DiGraph
G = nx.DiGraph()
G.add_edges_from(dag)
# Edges alone miss any variable that has neither parents nor children, which
# would later drop its column from the sampled data. Adding the model's nodes
# after the edges appends such variables without reordering the existing ones.
G.add_nodes_from(model.nodes)

# Adjacency matrix as a labelled DataFrame (row = edge source, column = edge target).
# .toarray() yields a plain ndarray rather than the discouraged np.matrix.
node_order = list(G.nodes)
adj_matrix = pd.DataFrame(nx.adjacency_matrix(G).toarray(), index=node_order, columns=node_order)

In [4]:
from pgmpy.sampling import BayesianModelSampling

# Number of joint samples to draw from the network
n = 10000

# Forward sampling from the loaded model with a fixed seed for reproducibility
inference = BayesianModelSampling(model)
samples = inference.forward_sample(size=n, seed=2025)

# Put the columns in the same order as the graph's nodes
df = samples[list(G.nodes)]

  0%|          | 0/76 [00:00<?, ?it/s]

In [5]:
import os
from pathlib import Path

# Output folder for this dataset. exist_ok=True replaces the separate
# "check, then create" step and keeps the cell safely re-runnable.
out_dir = Path(f"data/{dataset}")
out_dir.mkdir(parents=True, exist_ok=True)

# Sampled data: one column per variable, no row index written
df.to_csv(out_dir / f"data_{n}.csv", index=False)
# Adjacency matrix: index=False means only the column header carries the node
# names; rows are in the same node order as the columns.
adj_matrix.to_csv(out_dir / "graph.csv", index=False)

### Semi-synthetic

Bnlearn deterministic graphs $+$ NOTEARS data generating model $\to$ data

In [8]:
from pgmpy.readwrite import BIFReader
import pandas as pd
import networkx as nx

# Name of the bnlearn network whose structure is reused
dataset = "win95pts"

reader = BIFReader(f'data/bnlearn/{dataset}.bif')
model = reader.get_model()

# `model.edges` holds the directed edges of the network loaded from the BIF file
dag = model.edges

# Rebuild the structure as a plain networkx DiGraph
G = nx.DiGraph()
G.add_edges_from(dag)
# Edges alone miss any variable that has neither parents nor children, which
# would shrink the number of simulated variables. Adding the model's nodes
# after the edges appends such variables without reordering the existing ones.
G.add_nodes_from(model.nodes)

# Adjacency matrix as a NumPy array (row = edge source, column = edge target),
# in G.nodes order. .toarray() gives the ndarray directly, so no DataFrame
# round-trip is needed.
adj_matrix = nx.adjacency_matrix(G).toarray()

In [9]:
from HCD.notears_utils import *
from pathlib import Path
import os
import pandas as pd

# Number of variables is taken from the adjacency matrix of the bnlearn graph
d = adj_matrix.shape[0]
n, sem_type = 1000, 'gauss'

# The graph is fixed across replicates; only parameters and noise are re-drawn.
B_true = adj_matrix

# Column labels X0..X{d-1}. They are the same for every replicate, so build
# them once. Named `var_names` rather than `vars` to avoid shadowing the builtin.
var_names = [f"X{j}" for j in range(d)]

# All replicates live under one configuration-specific folder.
base_dir = Path(f'data/{dataset}-{sem_type}-{n}')

for i in range(10):
    # One seed per replicate. The simulate_* calls are kept in their original
    # order so that (presumably) the seeded random stream is consumed identically.
    set_random_seed(2025 + i)   # Do not change this
    W_true = simulate_parameter(B_true)
    X = simulate_linear_sem(W_true, n, sem_type)

    # exist_ok=True keeps the cell re-runnable without a separate existence check.
    out_dir = base_dir / str(i)
    out_dir.mkdir(parents=True, exist_ok=True)

    data_df = pd.DataFrame(X, columns=var_names)
    groundtruth = pd.DataFrame(B_true, columns=var_names)

    # index=False: only the column header is written, no row labels.
    data_df.to_csv(out_dir / 'data.csv', index=False)
    groundtruth.to_csv(out_dir / 'groundtruth.csv', index=False)
