#### Wymagane zależności
```sh
pip install liac-arff
pip install graphviz
```
Python 3

In [None]:
import math
import graphviz
import math
import copy
import sys

### Entropia
<img src="img/h.png" style="height: 100px">


In [None]:
class EntropyMetric(object):
    """Entropy term H(Bs, D) of a network structure evaluated on a data set.

    ``Bs`` is a list of node dicts with the keys ``'name'``, ``'parent'``
    (``''`` when the node has no parent), ``'r'`` (the node's states) and
    ``'q'`` (the parent's states; a one-element list for parentless nodes).
    ``D`` is a sequence of samples, each indexable by attribute name.
    """

    def __init__(self, Bs, D):
        self.N = len(D)
        self.Bs = Bs
        self.D = D

    def check_occurences(self, node_name, parent_name, node_state, parent_state):
        """Count samples where the node is in ``node_state``.

        When ``parent_name`` is not ``''`` the parent must additionally be in
        ``parent_state``; otherwise ``parent_state`` is ignored.
        """
        has_parent = parent_name != ''
        count = 0

        for sample in self.D:
            node_matches = sample[node_name] == node_state
            if has_parent:
                parent_matches = sample[parent_name] == parent_state
                if node_matches and parent_matches:
                    count += 1
            elif node_matches:
                count += 1

        return count

    def check_parent_occurences(self, parent_name, parent_state):
        """Count samples where the parent is in ``parent_state``.

        With no parent (``parent_name == ''``) the count is defined as the
        total number of samples N.
        """
        if parent_name == '':
            return self.N

        return sum(1 for sample in self.D if sample[parent_name] == parent_state)

    def compute_entropy_metric(self):
        """Return ``-N * sum_ijk (N_ijk / N) * log10(N_ijk / N_ij)``.

        Terms with a zero count are skipped, so an empty data set or an
        empty network yields 0.
        """
        total = 0

        for node in self.Bs:
            name = node['name']
            parent = node['parent']

            for parent_state in node['q']:
                # N_ij depends only on the parent configuration, so it is
                # computed once here rather than once per child state.
                N_i_j = self.check_parent_occurences(parent, parent_state)
                if N_i_j == 0:
                    # No sample has this parent configuration, hence every
                    # N_ijk below would be 0 and contribute nothing.
                    continue

                for state in node['r']:
                    N_i_j_k = self.check_occurences(name, parent, state, parent_state)
                    if N_i_j_k != 0:
                        total += (N_i_j_k / self.N) * math.log10(N_i_j_k / N_i_j)

        return -1 * self.N * total

### Metryka AIC 
<img src="img/qaic.png">
<img src="img/k.png">

In [None]:
class AICMetric(object):
    """AIC-style metric: entropy term plus the parameter count K."""

    def compute_aic_metric(self, Bs, D):
        """Return ``H(Bs, D) + K`` for network ``Bs`` and data ``D``."""
        entropy = EntropyMetric(Bs, D).compute_entropy_metric()
        return entropy + self.compute_k_factor(Bs)

    def compute_k_factor(self, Bs):
        """Return K, the sum over nodes of ``(len(r) - 1) * len(q)``."""
        return sum((len(node['r']) - 1) * len(node['q']) for node in Bs)

### Metryka MDL (Minimum Description Length)
<img src="img/mdl.png" style="height: 60px"> <br/>
<img src="img/k.png" style="height: 60px">

In [None]:
class MDLMetric(object):
    """MDL metric: entropy term plus ``(K / 2) * log10(N)``."""

    def compute_mdl_metric(self, Bs, D):
        """Return ``H(Bs, D) + (K / 2) * log10(len(D))``."""
        entropy = EntropyMetric(Bs, D).compute_entropy_metric()
        penalty = (self.compute_k_factor(Bs) / 2) * math.log10(len(D))
        return entropy + penalty

    def compute_k_factor(self, Bs):
        """Return K, the sum over nodes of ``(len(r) - 1) * len(q)``."""
        return sum((len(node['r']) - 1) * len(node['q']) for node in Bs)

### Bayesian metric
<img src="img/bayes.png" style="height: 100px">

In [None]:
class BayesianMetric(object):
    """Bayesian (Dirichlet-prior) score of a network structure given data.

    For every node i and parent configuration j the score multiplies

        Gamma(N'_ij) / Gamma(N'_ij + N_ij)
            * prod_k Gamma(N'_ijk + N_ijk) / Gamma(N'_ijk)

    The product is accumulated as a sum of ``math.lgamma`` terms because
    ``math.gamma`` raises OverflowError for arguments above roughly 171,
    i.e. for any data set with more than a couple of hundred samples.
    """

    def compute_bayesian_metric(self, Bs, D):
        """Return the Bayesian score of network ``Bs`` on data ``D``.

        ``Bs`` is a list of node dicts with keys ``'name'``, ``'parent'``
        (``''`` for no parent), ``'r'`` (node states) and ``'q'`` (parent
        states). ``D`` is a sequence of samples indexable by attribute name.
        The result is a float; for very large data sets it can underflow
        to 0.0.
        """
        self.D = D
        log_score = 0.0

        for node in Bs:
            name = node['name']
            parent = node['parent']
            states = node['r']

            # Parentless node: N'_ij and N'_ijk are 1, N_ij is len(D) and
            # N_ijk is the number of samples with the node in state k.
            if parent == "":
                counts = [self.check_occurences(name, '', state, state)
                          for state in states]
                log_score += self._log_family_term(len(D), counts, 1, 1)
                continue

            parent_states = node['q']
            N_prim_i_j = 1 / len(parent_states)
            N_prim_i_j_k = 1 / (len(parent_states) * len(states))

            for parent_state in parent_states:
                # One counting pass per (state, parent_state); N_ij is the
                # sum of these counts.
                counts = [self.check_occurences(name, parent, state, parent_state)
                          for state in states]
                log_score += self._log_family_term(
                    sum(counts), counts, N_prim_i_j, N_prim_i_j_k)

        return math.exp(log_score)

    @staticmethod
    def _log_family_term(N_i_j, counts, N_prim_i_j, N_prim_i_j_k):
        """Return the log of one (node, parent configuration) factor."""
        term = math.lgamma(N_prim_i_j) - math.lgamma(N_prim_i_j + N_i_j)
        for N_i_j_k in counts:
            term += math.lgamma(N_prim_i_j_k + N_i_j_k) - math.lgamma(N_prim_i_j_k)
        return term

    def check_occurences(self, node_name, parent_name, node_state, parent_state):
        """Count samples in ``self.D`` where the node is in ``node_state``.

        When ``parent_name`` is not ``''`` the parent must additionally be in
        ``parent_state``; otherwise ``parent_state`` is ignored.
        """
        has_parent = parent_name != ''
        count = 0

        for sample in self.D:
            node_matches = sample[node_name] == node_state
            if has_parent:
                parent_matches = sample[parent_name] == parent_state
                if node_matches and parent_matches:
                    count += 1
            elif node_matches:
                count += 1

        return count
    

------

### Algorytm K2

In [None]:
class K2Algorithm(object):
    """Greedy K2-style structure search giving each node at most one parent.

    Attributes are processed in the order given; a node may only take one of
    the attributes that precede it as its parent.

    ``attributes`` is a list of dicts with keys ``'name'`` and ``'states'``,
    ``test_data`` is a sequence of samples indexable by attribute name, and
    ``scoring_method`` is one of ``'aic'``, ``'mdl'`` or ``'bayes'``.
    """

    # AIC and MDL are an entropy term plus a complexity penalty, i.e. costs
    # where lower is better. They are negated inside the search so that
    # "higher is better" holds for every scoring method.
    _COST_METRICS = ('aic', 'mdl')

    def __init__(self, attributes, test_data, scoring_method):
        self.test_data = test_data
        self.scoring_method = scoring_method
        self.attributes = attributes

    @staticmethod
    def _make_node(attribute, parent=None):
        """Build the node dict for ``attribute``, optionally with a parent."""
        if parent is None:
            return {'r': attribute['states'], 'q': [''], 'name': attribute['name'], 'parent': ''}
        return {'r': attribute['states'], 'q': parent['states'], 'name': attribute['name'], 'parent': parent['name']}

    def _score(self, net):
        """Return the metric of ``net`` oriented so that higher is better."""
        raw = self.compute_metric(net)
        return -raw if self.scoring_method in self._COST_METRICS else raw

    def new_find_optimal_net(self):
        """Return the list of node dicts chosen by the greedy search.

        Each node is first scored without a parent. A parent is attached only
        when the best single-parent candidate scores strictly higher than
        that parentless baseline.
        """
        optimal_net = []

        for i, attribute in enumerate(self.attributes):
            parentless = self._make_node(attribute)

            # The first attribute has no predecessors, so it cannot have a
            # parent and there is nothing to score.
            if i == 0:
                optimal_net.append(parentless)
                continue

            # The baseline must contain the new node too; comparing against
            # the net without it would compare scores of different node sets.
            old_score = self._score(optimal_net + [parentless])

            parent_index_with_max_score, new_score = self.find_parent_with_max_score(i, optimal_net, attribute, old_score)

            if new_score > old_score:
                optimal_net.append(self._make_node(attribute, self.attributes[parent_index_with_max_score]))
            else:
                optimal_net.append(parentless)

        return optimal_net

    def find_parent_with_max_score(self, i, optimal_net, attribute, old_score):
        """Try each of the first ``i`` attributes as the parent of ``attribute``.

        Returns ``(parent_index, score)`` for the best candidate, where the
        score is oriented so that higher is better (cost metrics are
        negated). With no candidates the result is ``(-1, -sys.maxsize - 1)``.
        ``old_score`` is accepted for interface compatibility and not used.
        """
        parent_index_with_max_score = -1
        new_score = -sys.maxsize - 1

        for parent_index in range(0, i):
            candidate = self._make_node(attribute, self.attributes[parent_index])
            # The metrics only read the net, so a new list sharing the
            # existing node dicts is enough; no deep copy is needed.
            score = self._score(optimal_net + [candidate])

            if score > new_score:
                new_score = score
                parent_index_with_max_score = parent_index

        return (parent_index_with_max_score, new_score)

    def compute_metric(self, net):
        """Return the raw metric of ``net`` for the configured scoring method.

        Raises:
            ValueError: if ``scoring_method`` is not 'aic', 'mdl' or 'bayes'.
        """
        if self.scoring_method == 'aic':
            return AICMetric().compute_aic_metric(net, self.test_data)
        elif self.scoring_method == 'mdl':
            return MDLMetric().compute_mdl_metric(net, self.test_data)
        elif self.scoring_method == 'bayes':
            return BayesianMetric().compute_bayesian_metric(net, self.test_data)

        raise ValueError(self.scoring_method + " is not a valid scoring method!")

------

### Ładowanie pliku ARFF

In [None]:
import arff
import pprint

pp = pprint.PrettyPrinter(depth=6)

with open('data/weather.arff') as fh:
    data = arff.load(fh)

    # One {'name', 'states'} dict per ARFF attribute, in file order.
    attributes = [
        {'name': entry[0], 'states': entry[1]}
        for entry in data['attributes']
    ]

    # Each data row becomes a dict keyed by attribute name.
    sample_data = [
        {attributes[column]['name']: value for column, value in enumerate(row)}
        for row in data['data']
    ]

------

### Wybór argumentu - klasy

In [None]:
    index_of_class_attribute = 4
    print("Class argument: ")
    pp.pprint(attributes[index_of_class_attribute])

    # The class attribute goes first; the remaining attributes keep their
    # original relative order.
    final_attributes = [attributes[index_of_class_attribute]] + [
        attribute
        for position, attribute in enumerate(attributes)
        if position != index_of_class_attribute
    ]

-----
### Główne wywołanie algorytmu i rysowanie grafu

In [None]:
# Network scoring methods: 'aic', 'mdl' or 'bayes'
k2_search = K2Algorithm(final_attributes, sample_data, 'bayes')
bayesian_network = k2_search.new_find_optimal_net()

graph = graphviz.Digraph('generated graph')
for entry in bayesian_network:
    if entry['parent'] != "":
        # Only the edge is declared for a node that has a parent.
        graph.edge(entry['parent'], entry['name'])
    else:
        graph.node(entry['name'], label=entry['name'])
graph

In [None]:
# NOTE(review): despite its name this dict holds raw occurrence counts keyed
# by state value alone, so equal state names in two attributes overwrite each
# other. Confirm whether count / number_of_all_samples was intended.
probabilities = {}

for argument in bayesian_network:
    column = argument['name']
    for state in argument['r']:
        # Computed but not used anywhere in this cell.
        number_of_states = len(argument['r'])
        number_of_all_samples = len(sample_data)

        number_of_found_states = sum(
            1 for sample in sample_data if sample[column] == state
        )

        probabilities[state] = number_of_found_states

print(probabilities)