In [1]:
import pprint

import numpy as np
import pandas as pd
from numpy import log2 as log

In [2]:
# WORKFLOW OF ID3

# 1. Compute the entropy of the whole data set.
# 2. For every attribute/feature:
#        1. Calculate the entropy of each categorical value.
#        2. Take the weighted average of those entropies for the current attribute.
#        3. Calculate the information gain for the current attribute.
# 3. Pick the attribute with the highest gain.
# 4. Repeat on each branch until every leaf is pure.

In [3]:
#------------------Example No.1------------------#

In [4]:
# Example 1 data: 14 observations stored as parallel lists
# (entry k of every list belongs to observation k).

# Weather attribute, 3 options: 'sun', 'cloud', 'rain'
weat  = ['sun', 'sun', 'cloud', 'rain', 'rain', 'rain', 'cloud', 
         'sun', 'sun', 'rain', 'sun', 'cloud', 'cloud', 'rain']

# Temperature attribute, 3 options: 'hot', 'med' (medium), 'cold'
temp  = ['hot', 'hot', 'hot', 'med', 'cold', 'cold', 'cold', 
         'med', 'cold', 'med', 'med', 'med', 'hot', 'med']

# Humidity attribute, 2 options: 'high', 'norm' (normal)
hum   = ['high', 'high', 'high', 'high', 'norm', 'norm', 'norm', 
         'high', 'norm', 'norm', 'norm', 'high', 'norm', 'high']

# Wind attribute, 2 options: 'strong', 'weak'
wind  = ['weak', 'strong', 'weak', 'weak', 'weak', 'strong', 'strong', 
         'weak', 'weak', 'weak', 'strong', 'strong', 'weak', 'strong']

# Target (class), 2 options: 'active' (activity is still on) or 'cancel' (canceled)
activ = ['cancel', 'cancel', 'active', 'active', 'active', 'cancel', 'active', 
         'cancel', 'active', 'active', 'active', 'active', 'active', 'cancel']

In [5]:
# Combine the parallel attribute lists into one pandas DataFrame.
# The dict's key order fixes the column order; the class column ('activity') is last.
dataset = {
    'weather': weat,
    'temperature': temp,
    'humidity': hum,
    'wind': wind,
    'activity': activ,
}
df = pd.DataFrame(dataset, columns=list(dataset))
df

Unnamed: 0,weather,temperature,humidity,wind,activity
0,sun,hot,high,weak,cancel
1,sun,hot,high,strong,cancel
2,cloud,hot,high,weak,active
3,rain,med,high,weak,active
4,rain,cold,norm,weak,active
5,rain,cold,norm,strong,cancel
6,cloud,cold,norm,strong,active
7,sun,med,high,weak,cancel
8,sun,cold,norm,weak,active
9,rain,med,norm,weak,active


In [6]:
#------------------Compute Entropy Of Whole Dataset ------------------#

In [7]:
def Entropy(df, target=None):
    """Compute the Shannon entropy (in bits) of the class column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table that contains the class column.
    target : str, optional
        Name of the class column. When omitted, the notebook-level global
        ``Class`` is used, so existing ``Entropy(df)`` calls keep working.

    Returns
    -------
    float
        ``-sum(p * log2(p))`` over the distinct class labels, where ``p`` is
        the share of rows carrying that label. 0 for an empty or
        single-class column.
    """
    if target is None:
        target = Class  # notebook-level global naming the class column

    labels = df[target]
    counts = labels.value_counts()  # counted once, not once per label
    total = len(labels)

    entropy = 0
    for v in labels.unique():  # e.g. cancel, active
        fraction = counts[v] / total  # e.g. 9/14, 5/14
        entropy += -fraction * np.log2(fraction)  # -(9/14*log2(9/14) + 5/14*log2(5/14))
    return entropy

# entropy = Entropy(df)
# print('Entropy for the whole dataset is: {0: .4f}'.format(entropy))

In [8]:
#------------------Compute Entropy Of Attributes------------------#

In [9]:
def Entropy_Att(df, attribute, target=None):
    """Compute the class entropy that remains after splitting ``df`` on ``attribute``.

    For every distinct value of ``attribute`` the entropy (in bits) of the
    class labels inside that subset is computed; the result is the average of
    those entropies weighted by each subset's share of the rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Table that contains both ``attribute`` and the class column.
    attribute : str
        Name of the column to split on.
    target : str, optional
        Name of the class column. When omitted, the notebook-level global
        ``Class`` is used, so existing ``Entropy_Att(df, attribute)`` calls
        keep working.

    Returns
    -------
    float
        Non-negative weighted entropy; 0.0 when ``df`` has no rows.
    """
    if target is None:
        target = Class  # notebook-level global naming the class column

    total = len(df)
    if total == 0:
        return 0.0

    entropy_attribute = 0.0
    for value in df[attribute].unique():
        # A single mask built on df itself selects the subset's class labels,
        # so no re-alignment of a full-length mask against a filtered Series.
        subset_labels = df.loc[df[attribute] == value, target]

        probs = subset_labels.value_counts(normalize=True).to_numpy()
        # A class that does not occur contributes 0 to the entropy, so drop
        # zero shares instead of padding the logarithm with an epsilon.
        probs = probs[probs > 0]
        subset_entropy = -np.sum(probs * np.log2(probs))

        # weight the subset's entropy by its share of the rows
        entropy_attribute += (len(subset_labels) / total) * subset_entropy

    return entropy_attribute

# a_entropy = {i: Entropy_Att(df, i) for i in df.columns[:-1]} #calcuate entropy for all columns except the last one
# print('Entropies for all the columns are:')
# a_entropy

In [10]:
#------------------Compute Information Gain------------------#

In [11]:
def info_gain_win(df, show_stat):
    """Return the attribute of ``df`` with the largest information gain.

    Every column except the last one is treated as a candidate attribute.
    The gain of an attribute is ``Entropy(df) - Entropy_Att(df, attribute)``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data set whose last column is not considered as a split attribute.
    show_stat : bool
        When truthy, print a header naming the winning attribute followed by
        the remaining entropy ("info") and the gain of every attribute.

    Returns
    -------
    Column label of the attribute with the highest gain. ``np.argmax`` keeps
    the first maximum, so ties go to the earliest column.
    """
    attributes = df.columns[:-1]

    # The data-set entropy does not depend on the attribute, and each
    # attribute entropy is needed both for the gain and for the printout:
    # compute every value exactly once.
    base_entropy = Entropy(df)
    att_entropies = [Entropy_Att(df, att) for att in attributes]
    IG = [base_entropy - att_entropy for att_entropy in att_entropies]  # information gain

    IG_max = attributes[np.argmax(IG)]

    if show_stat:
        print('#-------------{}-------------#'.format(IG_max.upper()))
        for att, att_entropy, gain in zip(attributes, att_entropies, IG):
            print("{0}\n info ={1: .4f}, Info Gain ={2: .4f}\n".format(att, att_entropy, gain))

    # return the attribute with the largest IG
    return IG_max

# info_gain_win(df, show_stat=True)

In [12]:
#------------------Output Decision tree as Dictionary------------------#

In [13]:
# create subtable for branching
def get_subtable(df, node, value):
    """Return the rows of ``df`` whose ``node`` column equals ``value``, re-indexed from 0."""
    row_mask = df[node] == value
    selected = df.loc[row_mask]
    return selected.reset_index(drop=True)

def buildTree(df, show_stat, tree=None):
    """Recursively build an ID3 decision tree as nested dictionaries.

    The result has the shape ``{attribute: {value: subtree_or_class_label}}``:
    the attribute with the highest information gain becomes the node, and each
    of its values maps either to a class label (leaf) or to the tree built
    from the rows that carry that value.

    Parameters
    ----------
    df : pandas.DataFrame
        Data set to split. The class column is named by the notebook-level
        global ``Class``.
    show_stat : bool
        Forwarded to ``info_gain_win``, which prints split statistics when set.
    tree : dict, optional
        Existing dictionary to fill in. A new one is created when omitted.

    Returns
    -------
    dict
        The (possibly caller-supplied) tree dictionary.
    """
    # attribute with highest information gain
    node = info_gain_win(df, show_stat)
    # distinct values of that attribute (np.unique returns them sorted)
    att_targs = np.unique(df[node])

    # start a fresh tree, and make sure a caller-supplied dict has a slot
    # for this node so the assignments below cannot raise KeyError
    if tree is None:
        tree = {}
    tree.setdefault(node, {})

    # fill the tree with attributes recursively
    for value in att_targs:
        subtable = get_subtable(df, node, value)
        clValue, counts = np.unique(subtable[Class], return_counts=True)

        if len(counts) == 1:
            # pure subset: every row has the same class, so emit a leaf
            tree[node][value] = clValue[0]
        elif len(subtable) == len(df):
            # The split separated nothing, so recursing would repeat this
            # exact call until the recursion limit is hit. Stop here with
            # the majority class (first label in sorted order on a tie).
            tree[node][value] = clValue[np.argmax(counts)]
        else:
            # impure subset: keep splitting
            tree[node][value] = buildTree(subtable, show_stat)

    return tree

In [14]:
# Machine epsilon, kept as a guard against a 0 in a denominator.
eps = np.finfo(float).eps

# Name of the class (target) column: the last column of the data set.
# This is a plain module-level assignment; a `global` statement has no
# effect outside a function, so none is needed.
Class = df.columns[-1]

# Build the tree for the first example, printing the statistics of every split.
t = buildTree(df, show_stat=True)

# print in a pretty way
pprint.pprint(t)

#-------------WEATHER-------------#
weather
 info = 0.6935, Info Gain = 0.2467

temperature
 info = 0.9111, Info Gain = 0.0292

humidity
 info = 0.7885, Info Gain = 0.1518

wind
 info = 0.8922, Info Gain = 0.0481

#-------------WIND-------------#
weather
 info = 0.9710, Info Gain = 0.0000

temperature
 info = 0.9510, Info Gain = 0.0200

humidity
 info = 0.9510, Info Gain = 0.0200

wind
 info = 0.0000, Info Gain = 0.9710

#-------------HUMIDITY-------------#
weather
 info = 0.9710, Info Gain = 0.0000

temperature
 info = 0.4000, Info Gain = 0.5710

humidity
 info = 0.0000, Info Gain = 0.9710

wind
 info = 0.9510, Info Gain = 0.0200

{'weather': {'cloud': 'active',
             'rain': {'wind': {'strong': 'cancel', 'weak': 'active'}},
             'sun': {'humidity': {'high': 'cancel', 'norm': 'active'}}}}


In [15]:
#------------------Example No.2------------------#

In [16]:
# Case No.2 data: 14 observations stored as parallel lists
# (entry k of every list belongs to observation k).

# A1: is the person an office worker ('yes' / 'no')
A1 = ['no', 'no', 'no', 'no', 'yes', 'yes', 'yes', 
      'no', 'yes', 'yes', 'yes', 'no', 'yes', 'no']

# A2: age bracket ('<=40', '41~50', '>50')
A2 = ['<=40', '<=40', '41~50', '>50', '>50', '>50', '41~50', 
      '<=40', '<=40', '>50', '<=40', '41~50', '41~50', '>50']

# A3: income level ('high', 'mid', 'low')
A3 = ['high', 'high', 'high', 'mid', 'low', 'low', 'low',
      'mid', 'low', 'mid', 'mid', 'mid', 'high', 'mid']

# A4: credit history ('ok', 'good')
A4 = ['ok', 'good', 'ok', 'ok', 'ok', 'good', 'good', 
      'ok', 'ok', 'ok', 'good', 'good', 'ok', 'good']

# C: insurance type, the target class ('c1', 'c2')
C = ['c2', 'c2', 'c1', 'c1', 'c1', 'c2', 'c1', 
     'c2', 'c1', 'c1', 'c1', 'c1', 'c1', 'c2']

In [17]:
# Combine the parallel attribute lists into one pandas DataFrame.
# The dict's key order fixes the column order; the class column ('insurance') is last.
dataset2 = {
    'Office_worker': A1,
    'age': A2,
    'income': A3,
    'credit': A4,
    'insurance': C,
}
df2 = pd.DataFrame(dataset2, columns=list(dataset2))
df2

Unnamed: 0,Office_worker,age,income,credit,insurance
0,no,<=40,high,ok,c2
1,no,<=40,high,good,c2
2,no,41~50,high,ok,c1
3,no,>50,mid,ok,c1
4,yes,>50,low,ok,c1
5,yes,>50,low,good,c2
6,yes,41~50,low,good,c1
7,no,<=40,mid,ok,c2
8,yes,<=40,low,ok,c1
9,yes,>50,mid,ok,c1


In [18]:
# Re-point the module-level class column name at the second data set's
# target (its last column). A plain assignment is enough here: `global`
# has no effect outside a function.
Class = df2.columns[-1]

In [19]:
# Build the tree for the second example, printing the statistics of every
# split, then pretty-print the resulting nested dictionary.
t2 = buildTree(df2, show_stat=True)
pprint.pprint(t2)

#-------------AGE-------------#
Office_worker
 info = 0.7885, Info Gain = 0.1518

age
 info = 0.6935, Info Gain = 0.2467

income
 info = 0.9111, Info Gain = 0.0292

credit
 info = 0.8922, Info Gain = 0.0481

#-------------OFFICE_WORKER-------------#
Office_worker
 info = 0.0000, Info Gain = 0.9710

age
 info = 0.9710, Info Gain = 0.0000

income
 info = 0.4000, Info Gain = 0.5710

credit
 info = 0.9510, Info Gain = 0.0200

#-------------CREDIT-------------#
Office_worker
 info = 0.9510, Info Gain = 0.0200

age
 info = 0.9710, Info Gain = 0.0000

income
 info = 0.9510, Info Gain = 0.0200

credit
 info = 0.0000, Info Gain = 0.9710

{'age': {'41~50': 'c1',
         '<=40': {'Office_worker': {'no': 'c2', 'yes': 'c1'}},
         '>50': {'credit': {'good': 'c2', 'ok': 'c1'}}}}
