In [1]:
import pandas as pd
import numpy as np
import warnings
import matplotlib.pyplot as plt
warnings.filterwarnings('ignore')
%matplotlib inline


In [2]:
# Classic 14-row "play tennis" toy data set. Every column, including Wind,
# is stored as strings, so all attributes are treated as categorical labels.
data_dict = {
    'Outlook': [
        'Sunny', 'Sunny', 'Overcast', 'Rainy', 'Rainy', 'Rainy', 'Overcast',
        'Sunny', 'Sunny', 'Rainy', 'Sunny', 'Overcast', 'Overcast', 'Rainy',
    ],
    'Temperature': [
        'Hot', 'Hot', 'Hot', 'Mild', 'Cool', 'Cool', 'Cool',
        'Mild', 'Cool', 'Mild', 'Mild', 'Mild', 'Hot', 'Mild',
    ],
    'Humidity': [
        'High', 'High', 'High', 'High', 'Normal', 'Normal', 'Normal',
        'High', 'Normal', 'Normal', 'Normal', 'High', 'Normal', 'High',
    ],
    'Wind': [
        'False', 'True', 'False', 'False', 'False', 'True', 'True',
        'False', 'False', 'False', 'True', 'True', 'False', 'True',
    ],
    'PlayTennis': [
        'No', 'No', 'Yes', 'Yes', 'Yes', 'No', 'Yes',
        'No', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'No',
    ],
}

# Column order follows the insertion order of the dictionary keys.
tennis_data = pd.DataFrame(data_dict, columns=list(data_dict))
tennis_data


Unnamed: 0,Outlook,Temperature,Humidity,Wind,PlayTennis
0,Sunny,Hot,High,False,No
1,Sunny,Hot,High,True,No
2,Overcast,Hot,High,False,Yes
3,Rainy,Mild,High,False,Yes
4,Rainy,Cool,Normal,False,Yes
5,Rainy,Cool,Normal,True,No
6,Overcast,Cool,Normal,True,Yes
7,Sunny,Mild,High,False,No
8,Sunny,Cool,Normal,False,Yes
9,Rainy,Mild,Normal,False,Yes


In [3]:
%%latex
Entropy = $-\sum_{i=1}^{n} P_i \times \log_b(P_i)$

<IPython.core.display.Latex object>

In [4]:
def entropy_calculate(prob_list):
    """Return the Shannon entropy, in bits, of a discrete distribution.

    Parameters
    ----------
    prob_list : iterable of float
        Probabilities of the individual outcomes.

    Returns
    -------
    float
        ``-sum(p * log2(p))`` over the entries of ``prob_list``. Entries equal
        to zero are skipped; an empty input yields ``0.0``.
    """
    entropy = 0.0
    for prob in prob_list:
        # A zero-probability outcome contributes nothing (p*log2(p) -> 0 as
        # p -> 0). Evaluating it directly would give 0 * -inf = nan and
        # poison the whole sum.
        if prob == 0:
            continue
        entropy -= prob * np.log2(prob)
    return entropy

In [5]:
# Entropy of the target column (PlayTennis) over the whole data set.
# np.unique returns the distinct labels in sorted order with their counts.
cases,counts = np.unique(tennis_data.PlayTennis,return_counts=True)
P = [count/len(tennis_data) for count in counts]
# The message below indexes cases[0]/cases[1], i.e. it assumes a two-class target.
print('Probabilities of %s and %s are %.3f, %.3f respectively'%(cases[0],cases[1],P[0],P[1]))

entropy_entire = entropy_calculate(P)

print('Entire syetems entropy is %.3f bits'%entropy_entire)


Probabilities of No and Yes are 0.357, 0.643 respectively
Entire syetems entropy is 0.940 bits


In [6]:
# Distinct Outlook values (sorted) together with how often each one occurs.
cases_outlook, counts_outlook = np.unique(tennis_data.Outlook, return_counts=True)

# Relative frequency of each Outlook value over all rows.
P_outlook = list(counts_outlook / len(tennis_data))

print('For outlook:')
for case, prob in zip(cases_outlook, P_outlook):
    print('\tProbabality of %s is %.3f' % (case, prob))


For outlook:
	Probabality of Overcast is 0.286
	Probabality of Rainy is 0.357
	Probabality of Sunny is 0.357


In [7]:
# Conditional entropy of PlayTennis given Outlook, and the information gain
# obtained by splitting on Outlook.
entropy_outlook={}
total_entropy_outlook=0
for case, prob in zip(cases_outlook,P_outlook):
    # Target labels of the rows whose Outlook equals this value (filtered once).
    play_subset = tennis_data.PlayTennis[tennis_data.Outlook==case]
    cases,counts = np.unique(play_subset,return_counts=True)
    P = [count/len(play_subset) for count in counts]
    # Compute the branch entropy once and reuse it for the weighted sum.
    entropy_outlook[case]=entropy_calculate(P)
    total_entropy_outlook += entropy_outlook[case]*prob

for case, entropy in entropy_outlook.items():
    print('Entropy for %s is %.2f'%(case,entropy))
print('\nEntropy at Outlook decision level is %.3f'%total_entropy_outlook)
print('\nInformation gain is %.3f'%(entropy_entire- total_entropy_outlook))


Entropy for Overcast is 0.00
Entropy for Rainy is 0.97
Entropy for Sunny is 0.97

Entropy at Outlook decision level is 0.694

Information gain is 0.247


In [8]:
# Distinct Temperature values (sorted) together with how often each one occurs.
cases_temperature, counts_temperature = np.unique(
    tennis_data.Temperature, return_counts=True
)

# Relative frequency of each Temperature value over all rows.
P_temperature = list(counts_temperature / len(tennis_data))

print('For temperature:')
for case, prob in zip(cases_temperature, P_temperature):
    print('\tProbabality of %s is %.3f' % (case, prob))


For temperature:
	Probabality of Cool is 0.286
	Probabality of Hot is 0.286
	Probabality of Mild is 0.429


In [9]:
# Conditional entropy of PlayTennis given Temperature, and the information
# gain obtained by splitting on Temperature.
entropy_temperature={}
total_entropy_temperature=0
for case, prob in zip(cases_temperature,P_temperature):
    # Target labels of the rows whose Temperature equals this value (filtered once).
    play_subset = tennis_data.PlayTennis[tennis_data.Temperature==case]
    cases,counts = np.unique(play_subset,return_counts=True)
    P = [count/len(play_subset) for count in counts]
    # Compute the branch entropy once and reuse it for the weighted sum.
    entropy_temperature[case]=entropy_calculate(P)
    total_entropy_temperature += entropy_temperature[case]*prob

for case, entropy in entropy_temperature.items():
    print('Entropy for %s is %.2f'%(case,entropy))
print('\nEntropy at Temperature decision level is %.3f'%total_entropy_temperature)
print('\nInformation gain is %.3f'%(entropy_entire- total_entropy_temperature))


Entropy for Cool is 0.81
Entropy for Hot is 1.00
Entropy for Mild is 0.92

Entropy at Temperature decision level is 0.911

Information gain is 0.029


In [10]:
# Distinct Wind values (sorted) together with how often each one occurs.
cases_wind, counts_wind = np.unique(tennis_data.Wind, return_counts=True)

# Relative frequency of each Wind value over all rows.
P_wind = list(counts_wind / len(tennis_data))

print('For wind:')
for case, prob in zip(cases_wind, P_wind):
    print('\tProbabality of %s is %.3f' % (case, prob))


For wind:
	Probabality of False is 0.571
	Probabality of True is 0.429


In [11]:
# Conditional entropy of PlayTennis given Wind, and the information gain
# obtained by splitting on Wind.
entropy_wind={}
total_entropy_wind=0
for case, prob in zip(cases_wind,P_wind):
    # Target labels of the rows whose Wind equals this value (filtered once).
    play_subset = tennis_data.PlayTennis[tennis_data.Wind==case]
    cases,counts = np.unique(play_subset,return_counts=True)
    P = [count/len(play_subset) for count in counts]
    # Compute the branch entropy once and reuse it for the weighted sum.
    entropy_wind[case]=entropy_calculate(P)
    total_entropy_wind += entropy_wind[case]*prob

for case, entropy in entropy_wind.items():
    print('Entropy for %s is %.2f'%(case,entropy))
print('\nEntropy at Wind decision level is %.3f'%total_entropy_wind)
print('\nInformation gain is %.3f'%(entropy_entire- total_entropy_wind))


Entropy for False is 0.81
Entropy for True is 1.00

Entropy at Wind decision level is 0.892

Information gain is 0.048


In [12]:
# Distinct Humidity values (sorted) together with how often each one occurs.
cases_humidity, counts_humidity = np.unique(tennis_data.Humidity, return_counts=True)

# Relative frequency of each Humidity value over all rows.
P_humidity = list(counts_humidity / len(tennis_data))

print('For humidity:')
for case, prob in zip(cases_humidity, P_humidity):
    print('\tProbabality of %s is %.3f' % (case, prob))


For humidity:
	Probabality of High is 0.500
	Probabality of Normal is 0.500


In [13]:
# Conditional entropy of PlayTennis given Humidity, and the information gain
# obtained by splitting on Humidity.
entropy_humidity={}
total_entropy_humidity=0
for case, prob in zip(cases_humidity,P_humidity):
    # Target labels of the rows whose Humidity equals this value (filtered once).
    play_subset = tennis_data.PlayTennis[tennis_data.Humidity==case]
    cases,counts = np.unique(play_subset,return_counts=True)
    P = [count/len(play_subset) for count in counts]
    # Compute the branch entropy once and reuse it for the weighted sum.
    entropy_humidity[case]=entropy_calculate(P)
    total_entropy_humidity += entropy_humidity[case]*prob

for case, entropy in entropy_humidity.items():
    print('Entropy for %s is %.2f'%(case,entropy))
print('\nEntropy at Humidity decision level is %.3f'%total_entropy_humidity)
print('\nInformation gain is %.3f'%(entropy_entire- total_entropy_humidity))


Entropy for High is 0.99
Entropy for Normal is 0.59

Entropy at Humidity decision level is 0.788

Information gain is 0.152
