In [25]:
import pandas as pd
import numpy as np
import math
from numpy import log2 as log
from anytree import Node, RenderTree
from anytree.exporter import DotExporter

In [28]:
class ID3():
    """ID3 decision-tree learner for a categorical CSV data set.

    Layout the code relies on: the first CSV column is ignored (presumably a
    row/day identifier), the columns after it are the candidate attributes,
    and the last column holds the class label.
    """

    def __init__(self, document):
        """Read ``document`` with ``pandas.read_csv`` and cache derived views.

        Args:
            document: Path (or anything else ``pandas.read_csv`` accepts).
        """
        self.document = document
        self.df = pd.read_csv(document)
        self.df_columns = self.df.columns.values.tolist()
        # Every column except the first one; the class label stays last.
        self.attributes = self.df_columns[1:]
        self.df_attributes = pd.DataFrame(self.df, columns=self.attributes)
        self.num_days = self.df.shape[0]
        # Column count minus the ignored first column and the class label.
        self.num_features = self.df.shape[1] - 2

    def __prob_of_playing(self):
        """Return ``[P(label == "Yes"), P(label != "Yes")]`` for the data set.

        The label is read from the last column so that this method agrees
        with the entropy helpers, which also treat the last column as target.
        Every label other than the literal ``"Yes"`` counts as a negative.
        """
        labels = self.df.iloc[:, -1]
        total = len(labels)
        num_yes = int((labels == "Yes").sum())
        return [num_yes / total, (total - num_yes) / total]

    def __system_entropy(self):
        """Return the entropy of the whole data set, rounded to 3 decimals.

        Zero-probability terms are skipped (they contribute nothing in the
        limit), so a data set with a single class yields 0.0 instead of
        raising a math domain error.
        """
        entropy = sum(
            -p * math.log2(p) for p in self.__prob_of_playing() if p > 0
        )
        return round(entropy, 3)

    def __attribute_entropy(self, df_attribute, attribute):
        """Return the weighted class entropy after splitting on ``attribute``.

        Args:
            df_attribute: DataFrame whose last column is the class label.
            attribute: Name of the column to split on.

        Returns:
            Sum over the attribute's values of (share of rows with that
            value) * (class entropy within those rows), rounded to 3 decimals.
        """
        target = df_attribute.columns[-1]
        total = len(df_attribute)
        weighted_entropy = 0.0
        for _, labels in df_attribute.groupby(attribute)[target]:
            if len(labels) == 0:
                # Unobserved categories of a categorical column.
                continue
            counts = labels.value_counts().to_numpy()
            probs = counts[counts > 0] / len(labels)
            group_entropy = -(probs * np.log2(probs)).sum()
            weighted_entropy += (len(labels) / total) * group_entropy
        return round(weighted_entropy, 3)

    def __df_attribute_table(self, df_attributes, attribute, value):
        """Return the rows where ``attribute == value`` with a fresh index."""
        matching = df_attributes[df_attributes[attribute] == value]
        return matching.reset_index(drop=True)

    def __best_attribute(self, df_attributes):
        """Return the attribute with the highest information gain.

        The gain is ``H(S) - H(S | attribute)`` and ``H(S)`` is identical for
        every candidate, so the winner is simply the attribute with the
        lowest conditional entropy. Ties go to the left-most column.

        Raises:
            ValueError: If ``df_attributes`` has no column besides the label.
        """
        candidates = list(df_attributes.columns[:-1])
        if not candidates:
            raise ValueError("no attributes left to split on")
        entropies = [
            self.__attribute_entropy(df_attributes, key) for key in candidates
        ]
        return candidates[int(np.argmin(entropies))]

    def info(self):
        """Print the source document, system entropy and root attribute."""
        print("CSV document: {document}".format(document=self.document))
        print("System entropy: ", self.__system_entropy(), "(rounded to 3 decimals)")
        print("Best attribute from first iteration (parent of node): ",
              self.__best_attribute(self.df_attributes))

    def __node_dict(self, df, tree=None):
        """Recursively build the decision tree as nested dictionaries.

        Args:
            df: DataFrame of attributes with the class label as last column.
            tree: Optional dictionary to add the new node to.

        Returns:
            ``{attribute: {value: label_or_subtree, ...}}``.
        """
        target = df.columns[-1]
        best_candidate = self.__best_attribute(df)

        if tree is None:
            tree = {}
        branches = tree.setdefault(best_candidate, {})

        for value in np.unique(df[best_candidate]):
            subtable = self.__df_attribute_table(df, best_candidate, value)
            classes, counts = np.unique(subtable[target], return_counts=True)
            # The attribute just used is constant inside the subtable, so it
            # is removed; this also guarantees that the recursion terminates.
            remaining = subtable.drop(columns=best_candidate)
            if len(classes) == 1:
                branches[value] = classes[0]
            elif remaining.shape[1] == 1:
                # Only the label is left: contradictory rows, so fall back
                # to the majority class instead of recursing forever.
                branches[value] = classes[np.argmax(counts)]
            else:
                branches[value] = self.__node_dict(remaining)
        return tree

    def __attach_branches(self, branches, attribute_node):
        """Add anytree nodes for ``branches`` underneath ``attribute_node``.

        Each attribute value becomes a child node; below it hangs either a
        ``"Class: <label>"`` leaf or the next attribute node with its own
        branches, to whatever depth the dictionary has.
        """
        for value, outcome in branches.items():
            value_node = Node(value, parent=attribute_node)
            if isinstance(outcome, dict):
                for attribute, sub_branches in outcome.items():
                    child = Node(attribute, parent=value_node)
                    self.__attach_branches(sub_branches, child)
            else:
                Node("Class: {}".format(outcome), parent=value_node)

    def create_tree(self):
        """Build the tree, print it, export it to PNG and return it.

        Prints the root key and a text rendering of the tree, then writes
        ``id3_tree.png`` to the current working directory.

        Returns:
            The nested-dictionary tree produced from ``self.df_attributes``.
        """
        tree = self.__node_dict(self.df_attributes)
        print(tree.keys())

        # Convert the nested dictionary into an anytree graph.
        root_name = next(iter(tree))
        leader_node = Node(root_name)
        self.__attach_branches(tree[root_name], leader_node)

        # Text rendering of the tree.
        for pre, fill, node in RenderTree(leader_node):
            print("%s%s" % (pre, node.name))

        # DotExporter identifies nodes by name by default, which would merge
        # every repeated name (e.g. all "Class: Yes" leaves) into one graph
        # node; use a per-object id and keep the name as the visible label.
        DotExporter(
            leader_node,
            nodenamefunc=lambda node: str(id(node)),
            nodeattrfunc=lambda node: 'label="{}"'.format(node.name),
        ).to_picture("id3_tree.png")

        return tree

In [29]:
# Create the model; the constructor reads "tennis.csv" with pandas.read_csv.
id3 = ID3("tennis.csv")

In [30]:
# Build the tree, print its text rendering, export id3_tree.png and echo the
# returned nested dictionary as the cell result.
id3.create_tree()

dict_keys(['outlook'])
outlook
├── Overcast
│   └── Class: Yes
├── Rain
│   └── wind
│       ├── Strong
│       │   └── Class: No
│       └── Weak
│           └── Class: Yes
└── Sunny
    └── humidity
        ├── High
        │   └── Class: No
        └── Normal
            └── Class: Yes


{'outlook': {'Overcast': 'Yes',
  'Rain': {'wind': {'Strong': 'No', 'Weak': 'Yes'}},
  'Sunny': {'humidity': {'High': 'No', 'Normal': 'Yes'}}}}

In [24]:
# Inspect the column list kept by the model: the first CSV column is dropped
# and the class label ('play') is still the last entry.
id3.attributes

['outlook', 'temp', 'humidity', 'wind', 'play']

In [19]:
# Load the same CSV into a standalone DataFrame (df2 is not used in the
# cells visible here).
df2 = pd.read_csv("tennis.csv")