In [1]:
import pandas as pd
import numpy as np

In [20]:
# File to analyse. Alternative dataset used during development: "asia10K.csv".
DATA_PATH = "bupa.data"

# sep=None with the python engine lets pandas sniff the delimiter;
# cells containing '?' are read as missing values.
raw_df = pd.read_csv(DATA_PATH, sep=None, engine='python', na_values='?')

# Drop incomplete rows *before* looking for constant columns: a column can
# become constant only once the rows with missing values are gone, and such a
# column must not reach the later cells.
complete_df = raw_df.dropna()

# Columns with a single distinct value carry no information; remove them.
index_constant = np.where(complete_df.nunique() == 1)[0]
constant_columns = [complete_df.columns[i] for i in index_constant]
df = complete_df.drop(columns=constant_columns)

# Normalise dtypes in one pass: object columns become 'category', every
# numeric column becomes 'float64'. Later cells branch on exactly these two
# dtypes.
dtype_map = {col: 'category' for col in df.select_dtypes('object').columns}
dtype_map.update({col: 'float64' for col in df.select_dtypes('number').columns})
df = df.astype(dtype_map)

In [21]:
import pybnesian as pbn
# from tigramite.independence_tests.cmiknn_mixed import CMIknnMixed

In [22]:
# Forbid every arc that goes from a float64 column to a category column.
# A column has exactly one dtype, so source and target can never coincide.
float_columns = [col for col in df.columns if df[col].dtype == 'float64']
category_columns = [col for col in df.columns if df[col].dtype == 'category']

node_children_blacklist = [
    [source, target]
    for source in float_columns
    for target in category_columns
]

In [23]:
# Independence test handed to PC below.
# NOTE(review): the meaning of k / samples / scaling / gamma_approx / adaptive_k
# is defined by the installed pybnesian build and is not visible here; the
# values are kept exactly as they were.
mskcmi = pbn.MixedKMutualInformation(
    df=df,
    k=50,
    samples=10,
    scaling="min_max",
    gamma_approx=True,
    adaptive_k=True,
)

# Structure learning with PC; the float64 -> category arcs collected in
# node_children_blacklist are passed as the arc blacklist, all other
# white/black lists are empty.
pdag = pbn.PC().estimate(
    hypot_test=mskcmi,
    allow_bidirected=False,
    arc_blacklist=node_children_blacklist,
    arc_whitelist=[],
    edge_blacklist=[],
    edge_whitelist=[],
    verbose=1,
)

✔  [████████████████████████████████████████] 100% [00m:01s] Finished PC!uless


In [24]:
# Show what PC learned: the arcs first, then the edges of the partially
# directed graph.
learned_arcs = pdag.arcs()
learned_edges = pdag.edges()
learned_arcs, learned_edges

([('sgpt', 'sgot'),
  ('gammagt', 'sgot'),
  ('sgpt', 'gammagt'),
  ('selector', 'gammagt')],
 [('sgpt', 'drinks'), ('mcv', 'drinks')])

In [25]:
# Turn the partially directed graph into a DAG and keep its topological order
# restricted to the non-categorical columns.
dag = pdag.to_approximate_dag()
categorical_nodes = {col for col in df if df[col].dtype == 'category'}
top_sort = [node for node in dag.topological_sort() if node not in categorical_nodes]
top_sort

['mcv', 'alkphos', 'sgpt', 'drinks', 'selector', 'gammagt', 'sgot']

In [26]:
# Fixed seed for the cross-validated score, so the fold split (and therefore
# every delta computed from it) is the same on each run of the notebook.
CV_SEED = 0

# Initial node types: category columns are discrete, every other column starts
# as a linear Gaussian CPD.
node_types = [
    (name, pbn.DiscreteFactorType() if column.dtype == 'category' else pbn.LinearGaussianCPDType())
    for name, column in df.items()
]

# Semiparametric network over all columns, carrying the arcs of the learned DAG.
spbn = pbn.SemiparametricBN(nodes=df.columns, node_types=node_types)
for parent, child in dag.arcs():
    spbn.add_arc(parent, child)

# 5-fold cross-validated likelihood, seeded for reproducibility.
score = pbn.CVLikelihood(df, k=5, seed=CV_SEED)

# Operator set that proposes node-type changes; cache its scores for the
# starting network.
op_set = pbn.ChangeNodeTypeSet()
op_set.cache_scores(spbn, score)


In [27]:
# Visit the nodes left in top_sort parents-first. On each pass, take the best
# type change among the "ready" nodes; apply it if its delta is non-negative,
# otherwise leave that node's type alone. Either way the node is retired from
# top_sort, so the loop ends after at most len(top_sort) passes.
while top_sort:
    # A node is ready once none of its parents is still waiting in top_sort.
    pending = set(top_sort)
    ready_nodes = [
        node for node in top_sort
        if not any(parent in pending for parent in dag.parents(node))
    ]

    # NOTE(review): delta is indexed here in the same order as spbn.nodes(),
    # and only the first delta of each node is considered - confirm against
    # the installed pybnesian build.
    node_names = spbn.nodes()
    delta = op_set.get_delta()
    type_changes_ops = []
    for idx, node_deltas in enumerate(delta):
        if len(node_deltas) == 0:
            continue
        node = node_names[idx]
        if node not in ready_nodes:
            continue
        # Toggle between the two continuous factor types by name.
        if str(spbn.node_type(node)) == 'LinearGaussianFactor':
            new_type = 'CKDEType'
        else:
            new_type = 'LinearGaussianCPDType'
        type_changes_ops.append(['ChangeNodeType', node, new_type, node_deltas[0]])

    if not type_changes_ops:
        # max() would raise ValueError on an empty list. With no candidate for
        # the ready nodes, retire them so their children become ready; if
        # nothing is ready at all, stop instead of spinning forever.
        if not ready_nodes:
            break
        for node in ready_nodes:
            top_sort.remove(node)
        continue

    selected_op = max(type_changes_ops, key=lambda candidate: candidate[3])
    if selected_op[3] < 0:
        # Best available change lowers the score: keep the current type.
        top_sort.remove(selected_op[1])
        continue

    op = pbn.ChangeNodeType(
        node=selected_op[1],
        node_type=getattr(pbn, selected_op[2])(),
        delta=selected_op[3],
    )
    op.apply(spbn)
    # Refresh the cached scores for the nodes touched by this operator.
    op_set.update_scores(spbn, score, op.nodes_changed(spbn))
    top_sort.remove(selected_op[1])
    print(selected_op)

['ChangeNodeType', 'selector', 'CKDEType', np.float64(301.30459090317436)]
['ChangeNodeType', 'sgpt', 'CKDEType', np.float64(127.59542685397287)]
['ChangeNodeType', 'gammagt', 'CKDEType', np.float64(88.26956040449636)]
['ChangeNodeType', 'alkphos', 'CKDEType', np.float64(12.637694592526941)]
['ChangeNodeType', 'drinks', 'CKDEType', np.float64(41.46302390029507)]


In [28]:
# Report the final factor type of each node, one "<name> <type>" line per node.
for node_name in spbn.nodes():
    print(f"{node_name} {spbn.node_type(node_name)}")

mcv LinearGaussianFactor
alkphos CKDEFactor
sgpt CKDEFactor
sgot LinearGaussianFactor
gammagt CKDEFactor
drinks CKDEFactor
selector CKDEFactor
