In [1]:
import pandas as pd

In [2]:
dataset = pd.read_csv('id3data.csv')

In [3]:
dataset

Unnamed: 0,Outlook,Temperature,Humidity,Wind,Answer
0,sunny,hot,high,weak,no
1,sunny,hot,high,strong,no
2,overcast,hot,high,weak,yes
3,rain,mild,high,weak,yes
4,rain,cool,normal,weak,yes
5,rain,cool,normal,strong,no
6,overcast,cool,normal,strong,yes
7,sunny,mild,high,weak,no
8,sunny,cool,normal,weak,yes
9,rain,mild,normal,weak,yes


In [4]:
import math
eps = 1e-5
def calculate_entropy(dataset):
    pos = 0
    neg = 0
    for i in range(len(dataset)):
        answer = dataset.loc[i]['Answer']
        if answer == 'yes': 
            pos += 1
        else:
            neg += 1
    posfrac = pos/(pos + neg)
    negfrac = neg/(pos + neg)
    entropy = -(posfrac*math.log2(posfrac+eps) + negfrac*math.log2(negfrac+eps))
    return entropy

In [5]:
print(calculate_entropy(dataset))

0.9402571050839952


In [6]:
df = dataset[dataset['Outlook'] == 'sunny']
df.set_index(pd.Index([n for n in range(len(df))]))

Unnamed: 0,Outlook,Temperature,Humidity,Wind,Answer
0,sunny,hot,high,weak,no
1,sunny,hot,high,strong,no
2,sunny,mild,high,weak,no
3,sunny,cool,normal,weak,yes
4,sunny,mild,normal,strong,yes


In [7]:
def calculate_gain(dataset, feature):
    feature_values = set(dataset[feature])
    #feature_values.remove('overcast')
    entropy = calculate_entropy(dataset)
    sum = 0
    
    for value in feature_values:
        partial_dataset = dataset[dataset[feature] == value]
        partial_dataset.set_index(pd.Index([n for n in range(len(partial_dataset))]), inplace = True)
        sum += len(partial_dataset)/len(dataset)*calculate_entropy(partial_dataset)
    
    gain = entropy - sum
    return gain

In [8]:
print(calculate_gain(dataset, 'Outlook'))

0.24674569786749656


In [9]:
class Node:
    def __init__(self, feature):
        self.feature = feature
        self.branch_names = list(set(dataset[self.feature]))
        self.children = []
        self.node_type = None

    def add_children(self):
        pass


In [10]:
features = ['Outlook', 'Temperature', 'Humidity', 'Wind']

In [11]:
def get_poscount(dataset):
    pos = 0
    for i in range(len(dataset)):
        if dataset.loc[i]['Answer'] == 'yes':
            pos += 1
    return pos

In [12]:
max = 0
max_feature = None
for feature in features:
    if calculate_gain(dataset, feature) > max:
        max = calculate_gain(dataset, feature)
        max_feature = feature

root = Node(max_feature)
sub_features = features
sub_features.remove(root.feature)
for branch in root.branch_names:
    sub_dataset = dataset[dataset[root.feature] == branch]
    sub_dataset.set_index(pd.Index([n for n in range(len(sub_dataset))]), inplace = True)
    poscount = get_poscount(sub_dataset)
    if poscount == len(sub_dataset):
        root.children.append('yes')
    elif poscount == 0:
        root.children.append('no')
    else:
        max = 0
        max_feature = None
        for feature in sub_features:
            if calculate_gain(sub_dataset, feature) > max:
                max = calculate_gain(sub_dataset, feature)
                max_feature = feature
        root.children.append(Node(max_feature))

print(root.children)
        

[<__main__.Node object at 0x7ff77578c1c0>, 'yes', <__main__.Node object at 0x7ff77578c130>]
