# Complex Bayesian Network notebook

#### Install dependencies

In [None]:
# Install dependencies (uncomment the lines below to install into the kernel's environment)
#%pip install pgmpy==0.1.14
#%pip install tabulate
#%pip install pandas
#%pip install networkx
#%pip install matplotlib

#### Load libraries

In [None]:
# Load libraries
import os

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import pgmpy as pg
import tabulate as tb

## Data import and cleaning

#### Import BigPerf data

In [None]:
# Import data
# The CSV location can be overridden with the BIGPERF_CSV environment variable so the
# notebook can run on another machine without editing this cell; when the variable is
# unset, the original local path is used unchanged.
DATA_PATH = os.environ.get(
    "BIGPERF_CSV",
    "C:\\Users\\Keir McGlinn-Shaw\\Documents\\Stage-3\\Dissertation\\Data\\Use\\BigPerfMetrics.csv",
)

df = pd.read_csv(DATA_PATH)

df.head()

#### Clean data of unwanted variables and missing values

In [None]:
# Clean data
# `DataFrame.dropna()` and `DataFrame.drop()` return new frames, so the result has to be
# assigned back; a bare `df.dropna()` leaves `df` unchanged and keeps the incomplete rows.
# The identifier columns are removed first so that a missing ID value cannot cause an
# otherwise complete row to be discarded.
df = (
    df
    .drop(columns=['Block_ID', 'Mapper_ID', 'Reducer_ID', 'Block_ID.1'])
    .dropna()
)

df.head()

#### Rename columns to better fit in drawn charts and for ease of use

In [None]:
# Rename column headers to abbreviations

# 13 short labels, one per column expected to remain in `df` at this point.
nodes = ['HBW', 'HTT', 'MMIPS', 'MET', 'MBW', 'MTT', 'RMIPS', 'RET', 'RBW', 'RTT', 'BBW', 'BTT', 'TT']

# Positional assignment: the i-th label replaces the i-th existing header, so the order
# of `nodes` must match the current column order of `df`. pandas raises ValueError when
# the number of labels differs from the number of columns.
df.columns = nodes

# Preview the first 10 rows under the new headers.
df.head(10)

#### Define function to discretise data into three categories

In [None]:
def split_tertiles(df, labels=('Low', 'Medium', 'High')):
    """Discretise every column of ``df`` into ordered, equal-width categories.

    Each column is binned independently with ``pandas.cut``
    (https://pandas.pydata.org/docs/reference/api/pandas.cut.html), which divides the
    column's observed value range into ``len(labels)`` intervals of equal *width*.
    The bins are therefore not guaranteed to contain equal numbers of rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are to be discretised.
    labels : sequence of str, optional
        Category names ordered from the lowest bin to the highest; one bin is
        created per label. Defaults to ``('Low', 'Medium', 'High')``.

    Returns
    -------
    pandas.DataFrame
        New frame with the same column names as ``df`` in which every column is an
        ordered categorical taking values from ``labels``. ``df`` is not modified.
    """
    tertile_labels = list(labels)

    # Build all binned columns first and construct the frame once, rather than
    # growing an empty DataFrame one column at a time.
    binned_columns = {
        column_name: pd.cut(
            df[column_name],
            len(tertile_labels),
            labels=tertile_labels,
            ordered=True,
        )
        for column_name in df
    }

    return pd.DataFrame(binned_columns)

#### Discretise data into categories

In [None]:
# Discretise BigPerf dataset
# split_tertiles returns a new frame of Low/Medium/High categories, one column per
# column of `df`; the numeric frame `df` itself is left untouched.
cat_df = split_tertiles(df)

# Preview the categorical frame.
cat_df.head()

In [None]:
from itertools import combinations

import networkx as nx
from sklearn.metrics import f1_score
from pgmpy.models import BayesianModel, NaiveBayes
from pgmpy.estimators import PC, HillClimbSearch, ExhaustiveSearch
from pgmpy.estimators import K2Score, BicScore, BDeuScore
from pgmpy.utils import get_example_model
from pgmpy.sampling import BayesianModelSampling

In [None]:
# NOTE(review): `scoring_method` and `max_iter` are assigned here but are not passed to
# `estimate()` below; only the commented-out call references them.
scoring_method = K2Score(data=cat_df)
max_iter = 10000
# est = HillClimbSearch(data=cat_df)
# model = est.estimate(scoring_method=scoring_method, max_indegree=4, max_iter=max_iter)
# Learn the network structure from the discretised data by hill climbing.
# `estimate()` is called with no arguments, so the search runs with the library's
# default settings -- TODO confirm those defaults are the intended configuration.
hc = HillClimbSearch(cat_df)
model = hc.estimate()

In [None]:
# Build the Bayesian network from the learned structure.
# Constructing the model from the edge list alone would silently omit any variable
# that the structure search left without edges, so the learned nodes are added
# explicitly as well; this leaves the edges unchanged.
bayes_model = BayesianModel(model.edges)
bayes_model.add_nodes_from(model.nodes)

In [None]:
# from pgmpy.models import BayesianModel
# from pgmpy.estimators import BayesianEstimator

# model.fit(cat_df, estimator=BayesianEstimator, prior_type="BDeu") # default equivalent_sample_size=5
# for cpd in model.get_cpds():
#     print(cpd)

In [None]:
from pgmpy.estimators import BayesianEstimator, MaximumLikelihoodEstimator
# `display` and `HTML` come from the public `IPython.display` module; importing
# `display` from `IPython.core.display` is deprecated.
from IPython.display import display, HTML

# disable text wrapping in output cell
display(HTML("<style>div.output_area pre {white-space: pre;}</style>"))

# Discard any CPDs already attached to the model so that re-running this cell
# starts the parameter fit from a clean state.
bayes_model.cpds = []

# Estimate the conditional probability tables from the discretised data using
# Bayesian parameter estimation with a BDeu prior. `complete_samples_only=False`
# is forwarded to the estimator -- presumably so rows with a missing value are not
# dropped wholesale; verify against the pgmpy documentation.
bayes_model.fit(data=cat_df,
    estimator=BayesianEstimator,
    prior_type='BDeu',
    complete_samples_only=False)

In [None]:
# Copy the model's structure into a plain networkx directed graph for plotting.
G = nx.DiGraph()
G.add_edges_from(bayes_model.edges)
# Also add the model's nodes so that a node with no edges in the model is still drawn.
G.add_nodes_from(bayes_model.nodes)
# Compute node positions with the Kamada-Kawai layout.
pos = nx.kamada_kawai_layout(G)
nx.draw(G, pos=pos, with_labels=True, arrowsize=30, node_size=800, alpha=0.7, font_weight="bold")  # Draw the model structure with node labels

In [None]:
#print(f'Check model: {bayes_model.check_model()}\n')

# Write every conditional probability table of the fitted model to CPDs.txt.
# The context manager guarantees the file is flushed and closed when the block ends,
# even if formatting a CPD raises; the original handle was opened and never closed.
with open('CPDs.txt', 'w', encoding='utf-8') as file:
    for cpd in bayes_model.get_cpds():
        file.write(f'CPT of {cpd.variable}:\n')
        file.write(f'{cpd}\n')

In [None]:
# cpd_list = bayes_model.get_cpds()

# for cpd in cpd_list:

#     values = cpd.

#     df = pd.DataFrame(values, columns=cpd.variables)
    #print(values)

In [None]:
# print(f'Check model: {bayes_model.check_model()}\n')
# for cpd in bayes_model.get_cpds():
#     print(f'CPT of {cpd.variable}:')
#     print(cpd, '\n')