# Imports

In [None]:
import pandas as pd
import os
import pyAgrum as gum
import pyAgrum.lib.notebook as gnb
from sklearn.decomposition import PCA
import numpy as np
import networkx as nx

# Remove Redundant

In [2]:
# Directories holding the score CSVs and the artefacts that must match them.
scores_dir = "C:/Users/jacco/Documents/GitHub/Thesis/newOutputs/scores3/"
structures_dir = "C:/Users/jacco/Documents/GitHub/Thesis/newOutputs/structures3/"
groundtruths_dir = "C:/Users/jacco/Documents/GitHub/Thesis/newOutputs/groundtruths3/"

score_files = os.listdir(scores_dir)
known_scores = set(score_files)

# A structure file is kept only when "<part0>_<part1>.csv" exists among the
# score files, where the parts come from splitting its name on "_".
for structure_name in os.listdir(structures_dir):
    name_parts = structure_name.split("_")
    expected_score = name_parts[0] + "_" + name_parts[1] + ".csv"
    if expected_score in known_scores:
        continue
    os.remove(structures_dir + structure_name)

# A ground-truth file is kept only when its name, minus the last seven
# characters and with ".csv" appended, exists among the score files.
for truth_name in os.listdir(groundtruths_dir):
    expected_score = truth_name[:-7] + ".csv"
    if expected_score in known_scores:
        continue
    os.remove(groundtruths_dir + truth_name)

# Define Functions

In [3]:
def get_max_degree(bn):
    """Return the largest out-degree of the network divided by its node count.

    ``bn`` must provide ``nodes()`` and ``arcs()``; both are loaded into a
    ``networkx.DiGraph`` before the degrees are read.
    """
    node_ids = bn.nodes()
    graph = nx.DiGraph()
    graph.add_nodes_from(node_ids)
    graph.add_edges_from(bn.arcs())

    largest_out_degree = max(degree for _, degree in graph.out_degree)
    return largest_out_degree / len(node_ids)

def get_nr_roots(bn):
    """Return the fraction of nodes that have no incoming arc.

    ``bn`` must provide ``nodes()`` and ``arcs()``; both are loaded into a
    ``networkx.DiGraph`` before the in-degrees are read.
    """
    node_ids = bn.nodes()
    graph = nx.DiGraph()
    graph.add_nodes_from(node_ids)
    graph.add_edges_from(bn.arcs())

    root_count = sum(1 for node in graph.nodes() if graph.in_degree(node) == 0)
    return root_count / len(node_ids)

def get_nr_leaves(bn):
    """Return the fraction of nodes that have no outgoing arc.

    ``bn`` must provide ``nodes()`` and ``arcs()``; both are loaded into a
    ``networkx.DiGraph`` before the out-degrees are read.
    """
    node_ids = bn.nodes()
    graph = nx.DiGraph()
    graph.add_nodes_from(node_ids)
    graph.add_edges_from(bn.arcs())

    leaf_count = sum(1 for node in graph.nodes() if graph.out_degree(node) == 0)
    return leaf_count / len(node_ids)

def get_longest_path(bn):
    """Return the length of the longest path divided by the node count.

    The path comes from ``nx.dag_longest_path`` and its length is the number
    of nodes in the returned list.
    """
    node_ids = bn.nodes()
    graph = nx.DiGraph()
    graph.add_nodes_from(node_ids)
    graph.add_edges_from(bn.arcs())

    path_nodes = nx.dag_longest_path(graph)
    return len(path_nodes) / len(node_ids)

def get_nr_communities(bn):
    """Return the number of second-level Girvan-Newman communities per node.

    The network's nodes and arcs are loaded into a ``networkx.DiGraph`` and
    passed to ``nx.community.girvan_newman``. The size of the second partition
    it yields is divided by the number of nodes in ``bn``.

    If the generator produces only one partition, that first partition is
    used instead of letting ``StopIteration`` escape.
    """
    G = nx.DiGraph()
    G.add_nodes_from(bn.nodes())
    G.add_edges_from(bn.arcs())

    levels = nx.community.girvan_newman(G)
    first_level = next(levels)
    # Fall back to the first level when no second split exists.
    second_level = next(levels, first_level)
    # Only the number of communities matters, so no sorting is needed.
    return len(second_level) / len(bn.nodes())

def get_mean_constraint(bn):
    """Return the mean of ``nx.constraint`` over all nodes of the network.

    NaN entries are replaced via ``np.nan_to_num`` before averaging.
    """
    graph = nx.DiGraph()
    graph.add_nodes_from(bn.nodes())
    graph.add_edges_from(bn.arcs())

    per_node = dict(nx.constraint(graph))
    cleaned = np.nan_to_num(list(per_node.values()))
    return np.mean(cleaned)

def get_mean_efficiency(bn):
    """Return the mean of effective size divided by degree over all nodes.

    The effective size of each node comes from ``nx.effective_size`` and is
    divided by the node's total degree in the directed graph. NaN ratios are
    replaced via ``np.nan_to_num`` before averaging.

    A node with degree 0 contributes 0.0. Dividing by its degree would raise
    ``ZeroDivisionError``, so it is given the same value that
    ``np.nan_to_num`` assigns to NaN ratios.
    """
    G = nx.DiGraph()
    G.add_nodes_from(bn.nodes())
    G.add_edges_from(bn.arcs())

    esize = nx.effective_size(G)
    ratios = []
    for node, size in esize.items():
        degree = G.degree(node)
        # Guard the division: an isolated node has degree 0.
        ratios.append(size / degree if degree else 0.0)
    efficiencies = np.nan_to_num(ratios)
    return np.mean(efficiencies)

def get_s_metric(bn):
    """Return ``nx.s_metric`` of the network divided by its number of arcs.

    The metric is requested with ``normalized=False``.
    """
    arcs = bn.arcs()
    graph = nx.DiGraph()
    graph.add_nodes_from(bn.nodes())
    graph.add_edges_from(arcs)

    raw_metric = nx.s_metric(graph, normalized=False)
    return raw_metric / len(arcs)

def get_mean_similarity(bn):
    """Return the mean of the column means of the SimRank similarity table.

    The table is built by wrapping ``nx.simrank_similarity`` in a DataFrame.
    """
    graph = nx.DiGraph()
    graph.add_nodes_from(bn.nodes())
    graph.add_edges_from(bn.arcs())

    similarity = pd.DataFrame(nx.simrank_similarity(graph))
    column_means = similarity.mean()
    return column_means.mean()

def get_mean_nr_similarities(bn):
    """Return the mean count of non-zero SimRank entries per column, per node.

    The SimRank table is cast to bool, summed column-wise, averaged, and the
    result is divided by the number of nodes in ``bn``.
    """
    node_ids = bn.nodes()
    graph = nx.DiGraph()
    graph.add_nodes_from(node_ids)
    graph.add_edges_from(bn.arcs())

    similarity = pd.DataFrame(nx.simrank_similarity(graph))
    nonzero_per_column = similarity.astype(bool).sum(axis=0)
    return nonzero_per_column.mean() / len(node_ids)

def get_mean_degrees(bn):
    """Return the mean out-degree of the network divided by its node count."""
    node_ids = bn.nodes()
    graph = nx.DiGraph()
    graph.add_nodes_from(node_ids)
    graph.add_edges_from(bn.arcs())

    out_degrees = [degree for _, degree in graph.out_degree()]
    return np.mean(out_degrees) / len(node_ids)

def get_PHI(bn):
    """Return the sum of ``2 ** out_degree`` over all nodes of the network."""
    graph = nx.DiGraph()
    graph.add_nodes_from(bn.nodes())
    graph.add_edges_from(bn.arcs())

    return sum(2 ** degree for _, degree in graph.out_degree())

# Classify 

In [4]:
%%time
input_columns = ["items","nodes","arcs_ratio","timesteps","values","algorithm","scorefunction","nodestates","arcs","items/nodestates","items/arcs"]

# Columns kept from every score file.
score_columns = ["items","nodes","timesteps","arcs_ratio","values","algorithm","scorefunction","fscore","tp","fp","tn","fn"]


def _count_components_above(ratios, threshold=0.95):
    """Return how many leading ratios must be summed to exceed *threshold*.

    When no prefix exceeds the threshold, the total number of ratios is
    returned. The previous inline loop reported one less than that when it
    ran to the end, although it reported the exact count when it stopped
    early.
    """
    cumulative = 0.0
    for count, weight in enumerate(ratios, start=1):
        cumulative += weight
        if cumulative > threshold:
            return count
    return len(ratios)


def _graph_features(bn):
    """Return the structural features of *bn*, keyed by output column name."""
    return {
        "maxdegree/nodes": get_max_degree(bn),
        "roots/nodes": get_nr_roots(bn),
        "leaves/nodes": get_nr_leaves(bn),
        "longestpath/nodes": get_longest_path(bn),
        "communities/nodes": get_nr_communities(bn),
        "mean_constraint": get_mean_constraint(bn),
        "mean_efficiency": get_mean_efficiency(bn),
        "s_metric/arcs": get_s_metric(bn),
        "mean_similarity": get_mean_similarity(bn),
        "mean_nr_similarities/nodes": get_mean_nr_similarities(bn),
        "mean_degree": get_mean_degrees(bn),
        "PHI": get_PHI(bn),
    }


def _pca_features(bn, n_items, normaliser):
    """Return PCA features of the covariance of one sample generated from *bn*.

    ``n_items`` is forwarded to ``gum.generateSample``. ``normaliser`` divides
    the number of components needed to exceed 95% explained-variance ratio.
    The first four components are indexed directly, so the covariance matrix
    must yield at least four of them.
    """
    data, _ = gum.generateSample(bn, n_items, None, False)
    # Order columns by name length, then by the integer after the first ".",
    # then by the text before it.
    ordered_columns = sorted(data.columns, key=lambda x: (len(x), int(x.split(".")[1]), x.split(".")[0]))
    data = data.reindex(ordered_columns, axis=1).astype('int')

    # The PCA is fitted on the covariance matrix itself, as before.
    pca = PCA().fit(data.cov())
    explained = pca.explained_variance_
    ratios = pca.explained_variance_ratio_

    return {
        "PC1": explained[0],
        "PC2": explained[1],
        "PC3": explained[2],
        "PC4": explained[3],
        "PCweight1": ratios[0],
        "PCweight2": ratios[1],
        "PCweight3": ratios[2],
        "PCweight4": ratios[3],
        "PCAsAbove95": _count_components_above(ratios, 0.95) / normaliser,
    }


files = os.listdir("newOutputs/scores3")
per_file_frames = []
for ifile, file in enumerate(files):
# for ifile, file in enumerate(np.random.choice(files,10)):
    # File names look like "structure<structure>_<rep>.csv".
    structure, rep = file[9:-4].split("_")

    extra = pd.read_csv('newOutputs/scores3/'+file, index_col=0)[score_columns]
    extra.loc[:,"structure_id"] = structure

    bn = gum.BayesNet()
    bn.loadBIFXML(f"newOutputs/groundtruths3/structure{structure}_{rep}.bifxml")

    # The structural features depend only on the network, so compute them once
    # per file instead of once per score row.
    graph_features = _graph_features(bn)

    # One feature row per score row; a fresh sample is generated for each row
    # using that row's "items" value.
    feature_rows = [
        {**_pca_features(bn, items, nodes * timesteps), **graph_features}
        for items, nodes, timesteps in zip(extra["items"], extra["nodes"], extra["timesteps"])
    ]
    pcas = pd.DataFrame(feature_rows)

    extra = pd.concat([extra, pcas], axis=1)
    per_file_frames.append(extra)

    print(ifile/len(files))

# Concatenate once at the end instead of growing the frame inside the loop.
df2 = pd.concat(per_file_frames, ignore_index=True) if per_file_frames else pd.DataFrame()
df2

0.0
0.0004761904761904762
0.0009523809523809524
0.0014285714285714286
0.0019047619047619048
0.002380952380952381
0.002857142857142857
0.0033333333333333335
0.0038095238095238095
0.004285714285714286
0.004761904761904762
0.005238095238095238
0.005714285714285714
0.006190476190476191
0.006666666666666667
0.007142857142857143
0.007619047619047619
0.008095238095238095
0.008571428571428572
0.009047619047619047
0.009523809523809525
0.01
0.010476190476190476
0.010952380952380953
0.011428571428571429
0.011904761904761904
0.012380952380952381
0.012857142857142857
0.013333333333333334
0.01380952380952381
0.014285714285714285
0.014761904761904763
0.015238095238095238
0.015714285714285715
0.01619047619047619
0.016666666666666666
0.017142857142857144
0.017619047619047618
0.018095238095238095
0.018571428571428572
0.01904761904761905
0.019523809523809523
0.02
0.020476190476190478
0.02095238095238095
0.02142857142857143
0.021904761904761906
0.02238095238095238
0.022857142857142857
0.023333333333333334

0.20238095238095238
0.20285714285714285
0.20333333333333334
0.2038095238095238
0.2042857142857143
0.20476190476190476
0.20523809523809525
0.2057142857142857
0.2061904761904762
0.20666666666666667
0.20714285714285716
0.20761904761904762
0.20809523809523808
0.20857142857142857
0.20904761904761904
0.20952380952380953
0.21
0.21047619047619048
0.21095238095238095
0.21142857142857144
0.2119047619047619
0.2123809523809524
0.21285714285714286
0.21333333333333335
0.2138095238095238
0.21428571428571427
0.21476190476190476
0.21523809523809523
0.21571428571428572
0.21619047619047618
0.21666666666666667
0.21714285714285714
0.21761904761904763
0.2180952380952381
0.21857142857142858
0.21904761904761905
0.2195238095238095
0.22
0.22047619047619046
0.22095238095238096
0.22142857142857142
0.2219047619047619
0.22238095238095237
0.22285714285714286
0.22333333333333333
0.22380952380952382
0.22428571428571428
0.22476190476190477
0.22523809523809524
0.2257142857142857
0.2261904761904762
0.22666666666666666
0.

0.4095238095238095
0.41
0.4104761904761905
0.41095238095238096
0.4114285714285714
0.4119047619047619
0.4123809523809524
0.41285714285714287
0.41333333333333333
0.4138095238095238
0.4142857142857143
0.4147619047619048
0.41523809523809524
0.4157142857142857
0.41619047619047617
0.4166666666666667
0.41714285714285715
0.4176190476190476
0.4180952380952381
0.4185714285714286
0.41904761904761906
0.4195238095238095
0.42
0.4204761904761905
0.42095238095238097
0.42142857142857143
0.4219047619047619
0.42238095238095236
0.4228571428571429
0.42333333333333334
0.4238095238095238
0.42428571428571427
0.4247619047619048
0.42523809523809525
0.4257142857142857
0.4261904761904762
0.4266666666666667
0.42714285714285716
0.4276190476190476
0.4280952380952381
0.42857142857142855
0.42904761904761907
0.42952380952380953
0.43
0.43047619047619046
0.430952380952381
0.43142857142857144
0.4319047619047619
0.43238095238095237
0.4328571428571429
0.43333333333333335
0.4338095238095238
0.4342857142857143
0.4347619047619

0.6214285714285714
0.621904761904762
0.6223809523809524
0.6228571428571429
0.6233333333333333
0.6238095238095238
0.6242857142857143
0.6247619047619047
0.6252380952380953
0.6257142857142857
0.6261904761904762
0.6266666666666667
0.6271428571428571
0.6276190476190476
0.628095238095238
0.6285714285714286
0.6290476190476191
0.6295238095238095
0.63
0.6304761904761905
0.6309523809523809
0.6314285714285715
0.6319047619047619
0.6323809523809524
0.6328571428571429
0.6333333333333333
0.6338095238095238
0.6342857142857142
0.6347619047619047
0.6352380952380953
0.6357142857142857
0.6361904761904762
0.6366666666666667
0.6371428571428571
0.6376190476190476
0.638095238095238
0.6385714285714286
0.6390476190476191
0.6395238095238095
0.64
0.6404761904761904
0.6409523809523809
0.6414285714285715
0.6419047619047619
0.6423809523809524
0.6428571428571429
0.6433333333333333
0.6438095238095238
0.6442857142857142
0.6447619047619048
0.6452380952380953
0.6457142857142857
0.6461904761904762
0.6466666666666666
0.647

0.8352380952380952
0.8357142857142857
0.8361904761904762
0.8366666666666667
0.8371428571428572
0.8376190476190476
0.8380952380952381
0.8385714285714285
0.839047619047619
0.8395238095238096
0.84
0.8404761904761905
0.840952380952381
0.8414285714285714
0.8419047619047619
0.8423809523809523
0.8428571428571429
0.8433333333333334
0.8438095238095238
0.8442857142857143
0.8447619047619047
0.8452380952380952
0.8457142857142858
0.8461904761904762
0.8466666666666667
0.8471428571428572
0.8476190476190476
0.8480952380952381
0.8485714285714285
0.849047619047619
0.8495238095238096
0.85
0.8504761904761905
0.8509523809523809
0.8514285714285714
0.8519047619047619
0.8523809523809524
0.8528571428571429
0.8533333333333334
0.8538095238095238
0.8542857142857143
0.8547619047619047
0.8552380952380952
0.8557142857142858
0.8561904761904762
0.8566666666666667
0.8571428571428571
0.8576190476190476
0.8580952380952381
0.8585714285714285
0.8590476190476191
0.8595238095238096
0.86
0.8604761904761905
0.8609523809523809


Unnamed: 0,items,nodes,timesteps,arcs_ratio,values,algorithm,scorefunction,fscore,tp,fp,...,leaves/nodes,longestpath/nodes,communities/nodes,mean_constraint,mean_efficiency,s_metric/arcs,mean_similarity,mean_nr_similarities/nodes,mean_degree,PHI
0,100,4,1,0.999999,2,HC,BDEU,0.000000,0,1,...,0.750000,0.5,0.75,0.083333,0.250000,3.0,0.587500,0.625000,0.187500,11
1,100,4,1,0.999999,2,HC,AIC,0.000000,0,1,...,0.750000,0.5,0.75,0.083333,0.250000,3.0,0.587500,0.625000,0.187500,11
2,100,4,1,0.999999,2,HC,BIC,0.500000,1,0,...,0.750000,0.5,0.75,0.083333,0.250000,3.0,0.587500,0.625000,0.187500,11
3,100,4,1,0.999999,2,HC,K2,0.400000,1,1,...,0.750000,0.5,0.75,0.083333,0.250000,3.0,0.587500,0.625000,0.187500,11
4,100,4,1,0.999999,2,HC,L2L,0.571429,2,2,...,0.750000,0.5,0.75,0.083333,0.250000,3.0,0.587500,0.625000,0.187500,11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94495,10000,6,1,0.999999,2,MMHC,BDEU,0.800000,4,1,...,0.333333,0.5,0.50,0.500000,0.666667,3.8,0.183333,0.222222,0.138889,12
94496,10000,6,1,0.999999,2,MMHC,BIC,1.000000,5,0,...,0.333333,0.5,0.50,0.500000,0.666667,3.8,0.183333,0.222222,0.138889,12
94497,10000,6,1,0.999999,2,MMHC,K2,0.600000,3,2,...,0.333333,0.5,0.50,0.500000,0.666667,3.8,0.183333,0.222222,0.138889,12
94498,10000,6,1,0.999999,2,3OFF2,L2L,0.800000,4,1,...,0.333333,0.5,0.50,0.500000,0.666667,3.8,0.183333,0.222222,0.138889,12


In [5]:
# Derived size columns, added in the same order as before.
state_space = df2["values"] ** df2["nodes"]
arc_counts = np.floor(df2["nodes"] * df2["arcs_ratio"]).astype(int)
df2["nodestates"] = state_space
df2["arcs"] = arc_counts
df2["items/nodestates"] = df2["items"] / state_space
df2["items/arcs"] = df2["items"] / arc_counts

# For these algorithms the score function column is overwritten with the
# algorithm's own name.
for algorithm_name in ("3OFF2", "MIIC"):
    df2.loc[df2["algorithm"] == algorithm_name, "scorefunction"] = algorithm_name

# Share of (tn + tp) among all four confusion-matrix counts.
agreeing = df2["tn"] + df2["tp"]
all_counts = df2["tp"] + df2["fp"] + df2["tn"] + df2["fn"]
df2["hamming"] = agreeing / all_counts
df2["items/PHI"] = df2["items"] / df2["PHI"]

df2.to_csv("newOutputs/classifiers.csv")
df2

Unnamed: 0,items,nodes,timesteps,arcs_ratio,values,algorithm,scorefunction,fscore,tp,fp,...,mean_similarity,mean_nr_similarities/nodes,mean_degree,PHI,nodestates,arcs,items/nodestates,items/arcs,hamming,items/PHI
0,100,4,1,0.999999,2,HC,BDEU,0.000000,0,1,...,0.587500,0.625000,0.187500,11,16,3,6.25,33.333333,0.666667,9.090909
1,100,4,1,0.999999,2,HC,AIC,0.000000,0,1,...,0.587500,0.625000,0.187500,11,16,3,6.25,33.333333,0.666667,9.090909
2,100,4,1,0.999999,2,HC,BIC,0.500000,1,0,...,0.587500,0.625000,0.187500,11,16,3,6.25,33.333333,0.833333,9.090909
3,100,4,1,0.999999,2,HC,K2,0.400000,1,1,...,0.587500,0.625000,0.187500,11,16,3,6.25,33.333333,0.750000,9.090909
4,100,4,1,0.999999,2,HC,L2L,0.571429,2,2,...,0.587500,0.625000,0.187500,11,16,3,6.25,33.333333,0.750000,9.090909
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94495,10000,6,1,0.999999,2,MMHC,BDEU,0.800000,4,1,...,0.183333,0.222222,0.138889,12,64,5,156.25,2000.000000,0.933333,833.333333
94496,10000,6,1,0.999999,2,MMHC,BIC,1.000000,5,0,...,0.183333,0.222222,0.138889,12,64,5,156.25,2000.000000,1.000000,833.333333
94497,10000,6,1,0.999999,2,MMHC,K2,0.600000,3,2,...,0.183333,0.222222,0.138889,12,64,5,156.25,2000.000000,0.866667,833.333333
94498,10000,6,1,0.999999,2,3OFF2,3OFF2,0.800000,4,1,...,0.183333,0.222222,0.138889,12,64,5,156.25,2000.000000,0.933333,833.333333
