In [7]:
import numpy as np
import pandas as pd


def calc_ent(df, target_label):
    """ Calculate the entropy of df with respect to one column.

    The entropy is ``-sum(p * log2(p))`` over the distinct values found in
    the ``target_label`` column, where ``p`` is the fraction of labelled rows
    holding that value. Rows whose target value is missing (NaN/None) are
    ignored, so they cannot turn the result into NaN.

    :param df: the data frame to calculate
    :param target_label: the column name of target
    :return ent: the entropy of df in bits (0.0 if there are no labelled rows)
    """
    # value_counts() tallies every distinct label in one vectorised pass and
    # skips missing values by default.
    counts = df[target_label].value_counts()

    # A categorical column reports unobserved categories with a count of 0;
    # drop them because log2(0) is undefined.
    counts = counts[counts > 0]

    total = counts.sum()
    if total == 0:
        return 0.0

    probs = counts.to_numpy(dtype=float) / total

    # Subtract from 0.0 instead of negating so a pure column yields 0.0
    # rather than -0.0.
    return float(0.0 - np.sum(probs * np.log2(probs)))


def calc_info_gain(df, a, target_label):
    """ Calculate the information gain Gain(df, a, target_label).

    The gain is the entropy of the whole frame minus the weighted sum of the
    entropies of the subsets obtained by splitting on each distinct value of
    attribute ``a``. Each subset is weighted by its share of the rows of df.
    Rows whose attribute value is missing belong to no subset, so they add
    nothing to the weighted sum.

    :param df: the data frame to calculate
    :param a: the attribute (column name) to split on
    :param target_label: the column name of target
    :return: the information gain of df by the given attribute a
    """
    n_rows = df.shape[0]

    # Start from the entropy of the unsplit data.
    info_gain = calc_ent(df, target_label)

    # Subtract the weighted entropy of every partition induced by a.
    attribute_column = df[a]
    for a_value in attribute_column.dropna().unique():
        df_a_value = df[attribute_column == a_value]
        weight = df_a_value.shape[0] / n_rows
        info_gain -= weight * calc_ent(df_a_value, target_label)

    return info_gain


# Read the file. Only columns 1-5 are loaded; column 0 is skipped.
tennis = pd.read_csv('tennis.csv', header=0, usecols=[1, 2, 3, 4, 5])

TARGET = 'PlayTennis'
ATTRIBUTES = ('Outlook', 'Temperature', 'Humidity', 'Wind')

# (1) Test of (1)
print('(1) Calculate the entropy of S')
print('\nEntropy(tennis) = ', calc_ent(tennis, TARGET))

# Compute every gain once; sections (2) and (3) both report these values.
gains = [(attribute, calc_info_gain(tennis, attribute, TARGET))
         for attribute in ATTRIBUTES]

# (2) Test of (2)
print('\n(2) Calculate the information gain Gain(S,A):\n')
for attribute, gain in gains:
    print('Gain(tennis, {}) = '.format(attribute), gain)

# (3) Estimate the Information Gain of all the attributes.
print('\n(3) Estimate the Information Gain of all the attributes:\n')
for attribute, gain in gains:
    print('Gain(tennis, {}) = '.format(attribute), gain)

# Derive the root node from the computed gains instead of hard-coding it.
best_attribute, _ = max(gains, key=lambda pair: pair[1])
print('\nBecause the attribute {0} has the highest information gain, '
      'so we choose the {0} as the root node.'.format(best_attribute))





(1) Calculate the entropy of S

Entropy(tennis) =  0.9402859586706311

(2) Calculate the information gain Gain(S,A):

Gain(tennis, Outlook) =  0.24674981977443933
Gain(tennis, Temperature) =  0.02922256565895487
Gain(tennis, Humidity) =  0.15183550136234164
Gain(tennis, Wind) =  0.048127030408269544

(3) Estimate the Information Gain of all the attributes:

Gain(tennis, Outlook) =  0.24674981977443933
Gain(tennis, Temperature) =  0.02922256565895487
Gain(tennis, Humidity) =  0.15183550136234164
Gain(tennis, Wind) =  0.048127030408269544

Because the attribute Outlook has the highest information gain, so we choose the Outlook as the root node.
