In [1]:
import pprint

import numpy as np
import pandas as pd
from numpy import log2 as log

# Machine epsilon for Python floats: a tiny positive constant that can be added
# to a denominator or to a log argument to avoid division by zero / log2(0)
eps = np.finfo(float).eps

In [2]:
# WORKFLOW OF ID3

# 1. Compute the entropy of the whole data set.
# 2. For every attribute/feature:
#        1. calculate the entropy for each of its categorical values
#        2. take the weighted average of those entropies for the current attribute
#        3. calculate the information gain for the current attribute
# 3. Pick the attribute with the highest gain.
# 4. Repeat on each subset until we get the desired tree.

In [3]:
# Toy data set: five parallel lists of 14 entries each; the values at the same
# position across the lists describe one observation.

# weather, 3 options: 'sun', 'cloud', 'rain'
weat  = ['sun', 'sun', 'cloud', 'rain', 'rain', 'rain', 'cloud', 
         'sun', 'sun', 'rain', 'sun', 'cloud', 'cloud', 'rain']

# temperature, 3 options: 'hot', 'med' (medium), 'cold'
temp  = ['hot', 'hot', 'hot', 'med', 'cold', 'cold', 'cold', 
         'med', 'cold', 'med', 'med', 'med', 'hot', 'med']

# humidity, 2 options: 'high', 'norm' (normal)
hum   = ['high', 'high', 'high', 'high', 'norm', 'norm', 'norm', 
         'high', 'norm', 'norm', 'norm', 'high', 'norm', 'high']

# wind, 2 options: 'strong', 'weak'
wind  = ['weak', 'strong', 'weak', 'weak', 'weak', 'strong', 'strong', 
         'weak', 'weak', 'weak', 'strong', 'strong', 'weak', 'strong']

# target, 2 classes: 'active' (activity is still on) or 'cancel' (canceled)
activ = ['cancel', 'cancel', 'active', 'active', 'active', 'cancel', 'active', 
         'cancel', 'active', 'active', 'active', 'active', 'active', 'cancel']

In [4]:
# Assemble the attribute lists into a single DataFrame; the dict's insertion
# order fixes the column order, with the target ('activity') as the last column
dataset = {
    'weather': weat,
    'temperature': temp,
    'humidity': hum,
    'wind': wind,
    'activity': activ,
}
df = pd.DataFrame(dataset, columns=list(dataset))
df

Unnamed: 0,weather,temperature,humidity,wind,activity
0,sun,hot,high,weak,cancel
1,sun,hot,high,strong,cancel
2,cloud,hot,high,weak,active
3,rain,med,high,weak,active
4,rain,cold,norm,weak,active
5,rain,cold,norm,strong,cancel
6,cloud,cold,norm,strong,active
7,sun,med,high,weak,cancel
8,sun,cold,norm,weak,active
9,rain,med,norm,weak,active


In [5]:
# Name of the target (class-label) column: the last column of df.
# (A module-level `global` statement has no effect, so none is needed here.)
Class = df.columns[-1]

In [6]:
#------------------Compute Entropy Of Whole Dataset ------------------#

def Entropy(df, target=None):
    """Return the Shannon entropy (in bits) of the target column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data set whose class distribution is measured.
    target : str, optional
        Name of the class-label column. Defaults to the last column of
        ``df``, which is the convention used throughout this notebook.

    Returns
    -------
    float
        ``-sum(p * log2(p))`` over the relative frequencies ``p`` of the
        distinct target values; ``0.0`` for an empty or pure data set.
    """
    if target is None:
        target = df.columns[-1]

    # Relative class frequencies, computed once instead of once per class
    probs = df[target].value_counts(normalize=True).to_numpy()
    # A categorical target can report unused categories with frequency 0;
    # they contribute nothing (0 * log 0 := 0) and would otherwise yield NaN
    probs = probs[probs > 0]

    entropy = -(probs * np.log2(probs)).sum()
    # "+ 0.0" turns the -0.0 produced for pure/empty input into a plain 0.0
    return float(entropy) + 0.0


# Entropy of the target column over the full data set, printed with 4 decimals
entropy = Entropy(df)
print('Entropy for the whole dataset is: {0: .4f}'.format(entropy))

Entropy for the whole dataset is:  0.9403


In [7]:
#------------------Compute Entropy Of Attributes------------------#

def Entropy_Att(df, attribute, target=None):
    """Return the weighted entropy of the target after splitting on ``attribute``.

    For every distinct value of ``attribute`` the entropy of the target
    within that subset is computed; the subset entropies are then averaged,
    weighted by the share of rows each subset holds.

    Parameters
    ----------
    df : pandas.DataFrame
        Data set containing both ``attribute`` and the target column.
    attribute : str
        Name of the column to split on.
    target : str, optional
        Name of the class-label column. Defaults to the last column of
        ``df``, which is the convention used throughout this notebook.

    Returns
    -------
    float
        Weighted average entropy in bits; ``0.0`` when every subset is pure
        or when ``df`` is empty.
    """
    if target is None:
        target = df.columns[-1]

    total = len(df)
    if total == 0:
        return 0.0

    weighted_entropy = 0.0
    # sort=False keeps the groups in order of first appearance
    for _, group in df.groupby(attribute, sort=False):
        probs = group[target].value_counts(normalize=True).to_numpy()
        # Classes absent from this subset contribute 0 (0 * log 0 := 0);
        # dropping them gives the exact value without an epsilon fudge
        probs = probs[probs > 0]
        group_entropy = -(probs * np.log2(probs)).sum()
        weighted_entropy += (len(group) / total) * group_entropy

    return float(weighted_entropy)

# Calculate the split entropy for every column except the last one (the target)
a_entropy = {feature: Entropy_Att(df, feature) for feature in df.columns[:-1]}
print('Entropies for all the columns are:')
a_entropy

Entropies for all the columns are:


{'weather': 0.6935361388961914,
 'temperature': 0.9110633930116756,
 'humidity': 0.7884504573082889,
 'wind': 0.892158928262361}

In [8]:
def find_winner(df):
    """Return the name of the attribute with the highest information gain.

    Every column of ``df`` except the last is treated as a candidate
    attribute; the last column is the target. The gain of an attribute is
    ``Entropy(df) - Entropy_Att(df, attribute)``. On ties the first
    attribute in column order wins (``np.argmax`` semantics).

    Raises
    ------
    ValueError
        If ``df`` has no candidate attribute columns (raised by ``np.argmax``
        on an empty sequence).
    """
    features = df.columns[:-1]
    # The entropy of the whole set does not depend on the attribute, so it is
    # computed once rather than once per candidate
    base_entropy = Entropy(df)
    gains = [base_entropy - Entropy_Att(df, feature) for feature in features]
    return features[np.argmax(gains)]

# Attribute with the highest information gain on the full data set
find_winner(df)

'weather'

In [9]:
def get_subtable(df, node, value):
    """Return the rows of ``df`` whose ``node`` column equals ``value``.

    The result is a new frame with all columns kept and a fresh 0-based
    index; ``df`` itself is left untouched.
    """
    matches = df[node] == value
    selected = df.loc[matches]
    return selected.reset_index(drop=True)

# for v in attValue:
#     subtable = get_subtable(df, node, v)
#     print(subtable)

In [10]:
def buildTree(df, tree=None):
    """Recursively build an ID3 decision tree as nested dictionaries.

    Parameters
    ----------
    df : pandas.DataFrame
        Training data; every column but the last is a candidate attribute and
        the last column is the target.
    tree : dict, optional
        Existing dictionary to add this node to. A new one is created when
        omitted.

    Returns
    -------
    dict
        ``{attribute: {value: subtree_or_label, ...}}`` where a pure subset
        becomes a leaf holding its class label and an impure subset becomes a
        nested tree of the same shape.

    Raises
    ------
    ValueError
        If ``df`` has no attribute column to split on.
    """
    if df.shape[1] < 2:
        raise ValueError('buildTree needs at least one attribute column besides the target')

    target = df.columns[-1]

    # Attribute with the maximum information gain becomes this node
    node = find_winner(df)

    # Distinct values of that attribute: one branch per value
    attValue = np.unique(df[node])

    if tree is None:
        tree = {}
    # setdefault also covers a caller-supplied dict that lacks this node yet
    branches = tree.setdefault(node, {})

    for value in attValue:
        subtable = get_subtable(df, node, value)
        clValue, counts = np.unique(subtable[target], return_counts=True)

        if len(counts) == 1:
            # Pure subset: the branch ends in a leaf with that class
            branches[value] = clValue[0]
            continue

        # The attribute just used is constant inside the subset, so drop it.
        # This guarantees termination even when rows with identical attribute
        # values carry different labels.
        remaining = subtable.drop(columns=node)
        if remaining.shape[1] > 1:
            branches[value] = buildTree(remaining)
        else:
            # No attribute left to split on: fall back to the majority class
            branches[value] = clValue[np.argmax(counts)]

    return tree

In [11]:
# Fit the ID3 tree on the full data set and pretty-print the nested dict
# (pprint is imported with the other imports in the first cell)
t = buildTree(df)

pprint.pprint(t)

{'weather': {'cloud': 'active',
             'rain': {'wind': {'strong': 'cancel', 'weak': 'active'}},
             'sun': {'humidity': {'high': 'cancel', 'norm': 'active'}}}}
