In [1]:
import pandas as pd

In [2]:
dataset = pd.read_csv('id3data.csv')

In [3]:
dataset

Unnamed: 0,Outlook,Temperature,Humidity,Wind,Answer
0,sunny,hot,high,weak,no
1,sunny,hot,high,strong,no
2,overcast,hot,high,weak,yes
3,rain,mild,high,weak,yes
4,rain,cool,normal,weak,yes
5,rain,cool,normal,strong,no
6,overcast,cool,normal,strong,yes
7,sunny,mild,high,weak,no
8,sunny,cool,normal,weak,yes
9,rain,mild,normal,weak,yes


In [4]:
import math
eps = 1e-5
def calculate_entropy(dataset):
    pos = 0
    neg = 0
    for i in range(len(dataset)):
        answer = dataset.loc[i]['Answer']
        if answer == 'yes': 
            pos += 1
        else:
            neg += 1
    posfrac = pos/(pos + neg)
    negfrac = neg/(pos + neg)
    entropy = -(posfrac*math.log2(posfrac+eps) + negfrac*math.log2(negfrac+eps))
    return entropy

In [5]:
print(calculate_entropy(dataset))

0.9402571050839954


In [6]:
df = dataset[dataset['Outlook'] == 'sunny']
df.set_index(pd.Index([n for n in range(len(df))]))

Unnamed: 0,Outlook,Temperature,Humidity,Wind,Answer
0,sunny,hot,high,weak,no
1,sunny,hot,high,strong,no
2,sunny,mild,high,weak,no
3,sunny,cool,normal,weak,yes
4,sunny,mild,normal,strong,yes


In [7]:
def calculate_gain(dataset, feature):
    feature_values = set(dataset[feature])
    #feature_values.remove('overcast')
    entropy = calculate_entropy(dataset)
    sum = 0
    
    for value in feature_values:
        partial_dataset = dataset[dataset[feature] == value]
        partial_dataset.set_index(pd.Index([n for n in range(len(partial_dataset))]), inplace = True)
        sum += len(partial_dataset)/len(dataset)*calculate_entropy(partial_dataset)
    
    gain = entropy - sum
    return gain

In [8]:
print(calculate_gain(dataset, 'Outlook'))

0.24674569786749678


In [9]:
class Tree:
    def __init__(self, feature):
        self.feature = feature
        self.children = dict()
        self.dataset = None
        self.type = None
        self.label = None

In [10]:
features = ['Outlook', 'Temperature', 'Humidity', 'Wind']

In [11]:
def get_poscount(dataset):
    pos = 0
    for i in range(len(dataset)):
        if dataset.loc[i]['Answer'] == 'yes':
            pos += 1
    return pos

In [12]:
root = None
max_feature = None
max_val = 0

In [13]:
if root == None:
    for feature in features:
        val = calculate_gain(dataset, feature)
        if val > max_val:
            max_val = val
            max_feature = feature
    root = Tree(max_feature)
print(max_feature)

Outlook


In [14]:
feature_values = list(set(dataset[root.feature]))
if get_poscount(dataset) in [len(dataset), 0]:
    root.type = 'leaf'
else:
    root.type = 'node'
root.dataset = dataset

In [15]:
for value in feature_values:
    sub_dataset = dataset[dataset[root.feature] == value]
    sub_dataset.set_index(pd.Index(range(len(sub_dataset))), inplace = True)
    poscount = get_poscount(sub_dataset)
    sub_features = features
    if root.feature in sub_features:
        sub_features.remove(root.feature)
    
    max_val = 0
    max_feature = None
    
    for feature in sub_features:
        gain = calculate_gain(sub_dataset, feature)
        if gain > max_val:
            max_val = gain
            max_feature = feature
    new_node = Tree(max_feature)
    
    if poscount in [len(sub_dataset), 0]:
        new_node.type = 'leaf'
        new_node.label = sub_dataset.iloc[0]['Answer']
    else:
        new_node.type = 'node'
    new_node.dataset = sub_dataset
    root.children[value] = new_node

In [16]:
for key in root.children.keys():
    child = root.children[key]
    if child.type != 'leaf':
        feature_values = list(set(dataset[child.feature]))
        for value in feature_values:
            sub_dataset = child.dataset[child.dataset[child.feature] == value]
            sub_dataset.set_index(pd.Index(range(len(sub_dataset))), inplace = True)
            poscount = get_poscount(sub_dataset)
            sub_features = features
            if root.feature in sub_features:
                sub_features.remove(root.feature)
            if child.feature in sub_features:
                sub_features.remove(child.feature)
    
            max_val = 0
            max_feature = None
    
            for feature in sub_features:
                gain = calculate_gain(sub_dataset, feature)
                if gain > max_val:
                    max_val = gain
                    max_feature = feature
            new_node = Tree(max_feature)
    
            if poscount in [len(sub_dataset), 0]:
                new_node.type = 'leaf'
                new_node.label = sub_dataset.iloc[0]['Answer']
            else:
                new_node.type = 'node'
            new_node.dataset = sub_dataset
            child.children[value] = new_node

In [20]:
for key in root.children.keys():
    print(root.children[key].feature)
    print(root.children[key].dataset)
    for child_key in root.children[key].children.keys():
        print(root.children[key].children[child_key].dataset)

None
    Outlook Temperature Humidity    Wind Answer
0  overcast         hot     high    weak    yes
1  overcast        cool   normal  strong    yes
2  overcast        mild     high  strong    yes
3  overcast         hot   normal    weak    yes
Humidity
  Outlook Temperature Humidity    Wind Answer
0   sunny         hot     high    weak     no
1   sunny         hot     high  strong     no
2   sunny        mild     high    weak     no
3   sunny        cool   normal    weak    yes
4   sunny        mild   normal  strong    yes
  Outlook Temperature Humidity    Wind Answer
0   sunny        cool   normal    weak    yes
1   sunny        mild   normal  strong    yes
  Outlook Temperature Humidity    Wind Answer
0   sunny         hot     high    weak     no
1   sunny         hot     high  strong     no
2   sunny        mild     high    weak     no
Wind
  Outlook Temperature Humidity    Wind Answer
0    rain        mild     high    weak    yes
1    rain        cool   normal    weak    yes
2    