In [1]:
import pandas as pd
from treelib import Tree
# Module-level treelib tree; DecisionTree.add_children and the root-building
# cell register their display nodes in this shared object.
tree = Tree()

In [2]:
# Load the training table from a CSV file resolved relative to the kernel's
# working directory. Later cells read the 'Answer' column as the class label.
dataset = pd.read_csv('id3data.csv')

In [3]:
# Show the loaded table as the cell output.
dataset

Unnamed: 0,Outlook,Temperature,Humidity,Wind,Answer
0,sunny,hot,high,weak,no
1,sunny,hot,high,strong,no
2,overcast,hot,high,weak,yes
3,rain,mild,high,weak,yes
4,rain,cool,normal,weak,yes
5,rain,cool,normal,strong,no
6,overcast,cool,normal,strong,yes
7,sunny,mild,high,weak,no
8,sunny,cool,normal,weak,yes
9,rain,mild,normal,weak,yes


In [4]:
import math
# Kept only so existing references to `eps` keep working; calculate_entropy
# no longer needs a log(0) fudge factor.
eps = 1e-5
def calculate_entropy(dataset):
    """Return the binary Shannon entropy (in bits) of the 'Answer' column.

    Rows whose 'Answer' equals 'yes' are counted as positive; every other
    row is counted as negative.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Table with an 'Answer' column. Any index is accepted.

    Returns
    -------
    float
        Entropy in the range [0, 1]. An empty table has entropy 0.0.
    """
    answers = dataset['Answer']
    total = len(answers)
    if total == 0:
        # No rows means no uncertainty; also avoids dividing by zero.
        return 0.0

    pos = int((answers == 'yes').sum())
    entropy = 0.0
    for count in (pos, total - pos):
        # A class with zero members contributes 0 (the limit of p*log2(p)),
        # so skip it instead of nudging the logarithm with an epsilon.
        if count:
            frac = count / total
            entropy -= frac * math.log2(frac)
    return entropy

In [5]:
def calculate_gain(dataset, feature):
    """Return the information gain obtained by splitting `dataset` on `feature`.

    The gain is the entropy of the whole table minus the size-weighted
    entropy of each partition that shares one value of `feature`.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Table containing the `feature` column and an 'Answer' column.
    feature : str
        Name of the column to split on.

    Returns
    -------
    float
        Entropy before the split minus the weighted entropy after it.
    """
    distinct_values = set(dataset[feature])
    base_entropy = calculate_entropy(dataset)

    weighted_entropy = 0
    for current in distinct_values:
        # Renumber the partition 0..n-1 before handing it to calculate_entropy.
        subset = dataset[dataset[feature] == current].reset_index(drop=True)
        weighted_entropy += len(subset) / len(dataset) * calculate_entropy(subset)

    return base_entropy - weighted_entropy

In [6]:
def get_poscount(dataset):
    """Count the rows of `dataset` whose 'Answer' value is 'yes'.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Table with an 'Answer' column. Any index is accepted, so filtered
        frames do not have to be renumbered first.

    Returns
    -------
    int
        Number of positive rows (0 for an empty table).
    """
    # Vectorised comparison: no per-row label lookup, hence no dependency on
    # the index being exactly 0..n-1.
    return int((dataset['Answer'] == 'yes').sum())

In [7]:
class DecisionTree:
    """One node of an ID3 decision tree built over a pandas DataFrame.

    The creator of a node is expected to fill in `dataset`, `attributes`
    and `type` after construction; `add_children` then splits the node on
    `feature` and registers each child in the module-level treelib `tree`.
    """

    def __init__(self, feature):
        """Create an unpopulated node that splits on `feature`."""
        self.feature = feature      # column this node splits on
        self.children = dict()      # feature value -> child DecisionTree
        self.attributes = None      # candidate feature names for this node
        self.dataset = None         # rows that reach this node
        self.type = None            # 'leaf' or 'node'
        self.label = None           # 'Answer' value predicted by a leaf
        # Identifier of this node inside the treelib tree. None means the
        # node was registered under its feature name (as the root is).
        self.node_id = None

    def _child_features(self):
        """Return a fresh list of candidate features without `self.feature`."""
        return [name for name in (self.attributes or []) if name != self.feature]

    def add_children(self):
        """Split this node on `self.feature`, creating one child per value.

        For each distinct value the matching rows become the child's
        dataset. A child is a leaf when all of its rows agree on 'Answer',
        or when no candidate feature is left to split on (the most frequent
        'Answer' is then used as the label). Otherwise it is an internal
        node splitting on the remaining feature with the highest gain.
        Every child is also added to the module-level `tree` for display.
        """
        parent_id = self.feature if self.node_id is None else self.node_id
        # Built once as a copy: removing the feature from `self.attributes`
        # in place would also alter the caller's list and every sibling's.
        sub_features = self._child_features()

        for value in set(self.dataset[self.feature]):
            matching = self.dataset[self.dataset[self.feature] == value]
            sub_dataset = matching.reset_index(drop=True)

            poscount = get_poscount(sub_dataset)

            max_val = -math.inf
            max_feature = None
            for feature in sub_features:
                gain = calculate_gain(sub_dataset, feature)
                if gain > max_val:
                    max_val = gain
                    max_feature = feature

            new_node = DecisionTree(max_feature)
            # Path-style ids stay unique even when two branches pick the
            # same feature, which a bare feature name would not.
            new_node.node_id = f'{parent_id}/{value}'

            is_pure = poscount in [len(sub_dataset), 0]
            if is_pure or max_feature is None:
                new_node.type = 'leaf'
                if is_pure:
                    new_node.label = sub_dataset.iloc[0]['Answer']
                else:
                    # Mixed answers but nothing left to split on: fall back
                    # to the most frequent answer.
                    new_node.label = sub_dataset['Answer'].mode().iloc[0]
                tree.create_node(f'( {value} ) --> {new_node.label}', new_node.node_id, parent=parent_id)
            else:
                new_node.type = 'node'
                tree.create_node(f'( {value} )--> {new_node.feature.upper()}', new_node.node_id, parent=parent_id)

            new_node.dataset = sub_dataset
            # Each child owns its list so later splits cannot interfere.
            new_node.attributes = list(sub_features)
            self.children[value] = new_node


In [8]:
# Candidate columns the tree may split on; 'Answer' is the class label and is
# therefore not listed.
features = ['Outlook', 'Temperature', 'Humidity', 'Wind']

In [9]:
# State for choosing the root split; the next cell fills these in.
root = None          # root DecisionTree node, built only while this is None
max_feature = None   # feature with the highest gain seen so far
max_val = 0          # best gain so far; a feature must exceed 0 to be chosen

In [10]:
# Build the root once: pick the feature with the highest information gain,
# register it in the display tree, then create its first level of children.
if root is None:
    for feature in features:
        val = calculate_gain(dataset, feature)
        if val > max_val:
            max_val = val
            max_feature = feature

    if max_feature is None:
        # No feature beat the initial threshold, so there is nothing to split on.
        raise ValueError('no feature has positive information gain; cannot choose a root split')

    root = DecisionTree(max_feature)
    tree.create_node(root.feature.upper(), root.feature)

    if get_poscount(dataset) in [len(dataset), 0]:
        root.type = 'leaf'
    else:
        root.type = 'node'

    root.dataset = dataset
    # Hand over a copy so building the tree never alters the global `features`.
    root.attributes = list(features)
    root.add_children()

In [11]:
# Expand every non-leaf child of the root by one more level.
for child in root.children.values():
    if child.type == 'node':
        child.add_children()

In [12]:
# Print the text rendering of the tree assembled in the cells above.
tree.show()

OUTLOOK
├── ( overcast ) --> yes
├── ( rain )--> WIND
│   ├── ( strong ) --> no
│   └── ( weak ) --> yes
└── ( sunny )--> HUMIDITY
    ├── ( high ) --> no
    └── ( normal ) --> yes

