In [136]:
import numpy as np
import pandas as pd
from collections import Counter

In [25]:
# Load the training examples. read_csv already returns a DataFrame, so no
# extra pd.DataFrame(...) wrapper is needed around it.
df = pd.read_csv("PlayTennis.csv")
df.head(6)

Unnamed: 0,Outlook,Temperature,Humidity,Wind,Water,Play tennis
0,sunny,warm,normal,strong,warm,yes
1,sunny,warm,high,strong,warm,yes
2,rain,cold,high,strong,warm,no
3,sunny,warm,high,strong,cool,yes
4,rain,warm,high,weak,cool,no
5,sunny,cold,normal,strong,cool,no


In [142]:
def entropy(probs):
    """Return the Shannon entropy, in bits, of a discrete distribution.

    Parameters
    ----------
    probs : iterable of float
        Probability of each outcome.

    Returns
    -------
    numpy.float64
        Sum of ``-p * log2(p)`` over the non-zero entries of ``probs``;
        ``0.0`` when ``probs`` is empty.
    """
    # By convention 0 * log2(0) contributes nothing to the entropy. Evaluating
    # it literally yields 0 * -inf = nan (plus a RuntimeWarning), which would
    # turn the whole sum into nan, so zero-probability outcomes are skipped.
    return np.sum([-prob * np.log2(prob) for prob in probs if prob != 0])

In [143]:
def entropy_of_list(attr_list):
    """Compute the entropy, in bits, of the value frequencies in ``attr_list``.

    The per-value counts are printed, converted to relative frequencies and
    handed to ``entropy``.
    """
    # Tally each value by plain iteration.
    label_counts = Counter()
    for item in attr_list:
        label_counts[item] += 1
    print("Count of Yes and No",label_counts)

    total = float(len(attr_list))
    frequencies = []
    for count in label_counts.values():
        frequencies.append(count/total)
    return entropy(frequencies)

In [10]:
# Entropy of the "Play tennis" label column over the whole data set; the bare
# name on the last line makes the value the cell's displayed output.
total_entropy = entropy_of_list(df["Play tennis"])
total_entropy

Yes or No: Counter({'yes': 3, 'no': 3})


1.0

In [145]:
def info_gain(df,attr,target,trace=0):
    """Return the information gain obtained by splitting ``df`` on ``attr``.

    Parameters
    ----------
    df : pandas.DataFrame
        The examples to split.
    attr : str
        Column whose distinct values define the partitions.
    target : str
        Column holding the class label.
    trace : int, optional
        When truthy, print every partition (key, then its rows) before the
        gain is computed. Defaults to 0 (no partition dump).

    Returns
    -------
    float
        Entropy of ``df[target]`` minus the row-weighted entropy of
        ``target`` within each partition.
    """
    df_split = df.groupby(attr)
    # The partition dump is debugging output, so it is tied to the otherwise
    # unused ``trace`` flag instead of running on every call.
    if trace:
        for i , rows in df_split:
            print(i)
            print(rows)
    nobs = len(df.index) * 1.0
    # Per partition: entropy of the label column and the share of all rows
    # that fall into the partition.
    df_agg_ent = df_split.agg({target:[entropy_of_list,lambda x: len(x)/nobs]})[target]
    df_agg_ent.columns = ["Entropy","PropObser"]
    # Expected entropy after the split = sum of entropy * row share.
    new_entropy = np.sum(df_agg_ent["Entropy"] * df_agg_ent["PropObser"])
    total_entropy = entropy_of_list(df[target])
    return total_entropy - new_entropy

In [13]:
# Report the information gain of each candidate attribute, one line per
# attribute, in a fixed order.
for column in ["Humidity", "Outlook", "Temperature", "Wind", "Water"]:
    print(column,info_gain(df,column,"Play tennis"))

Yes or No: Counter({'yes': 2, 'no': 2})
Yes or No: Counter({'yes': 1, 'no': 1})
Yes or No: Counter({'yes': 3, 'no': 3})
Humidity 0.0
Yes or No: Counter({'no': 2})
Yes or No: Counter({'yes': 3, 'no': 1})
Yes or No: Counter({'yes': 3, 'no': 3})
Outlook 0.4591479170272448
Yes or No: Counter({'no': 2})
Yes or No: Counter({'yes': 3, 'no': 1})
Yes or No: Counter({'yes': 3, 'no': 3})
Temperature 0.4591479170272448
Yes or No: Counter({'yes': 3, 'no': 2})
Yes or No: Counter({'no': 1})
Yes or No: Counter({'yes': 3, 'no': 3})
Wind 0.19087450462110944
Yes or No: Counter({'no': 2, 'yes': 1})
Yes or No: Counter({'yes': 2, 'no': 1})
Yes or No: Counter({'yes': 3, 'no': 3})
Water 0.08170416594551044


In [1]:
import numpy as np
import pandas as pd
from collections import Counter

In [2]:
def entropy(probs):
    """Shannon entropy (base 2) of the probabilities in ``probs``.

    Parameters
    ----------
    probs : iterable of float
        One probability per outcome.

    Returns
    -------
    numpy.float64
        ``sum(-p * log2(p))`` taken over the non-zero probabilities, or
        ``0.0`` if there are none.
    """
    # A zero probability is skipped: its term is defined as 0, whereas
    # computing -0 * log2(0) directly produces nan and a RuntimeWarning.
    return np.sum([(-prob*np.log2(prob)) for prob in probs if prob != 0])

In [6]:
def entropy_of_list(attr_list):
    """Entropy, in bits, of how often each distinct value occurs in ``attr_list``.

    Prints the value counts, then feeds the relative frequencies to
    ``entropy`` and returns its result.
    """
    # Count occurrences by walking the values one at a time.
    tally = Counter()
    for value in attr_list:
        tally[value] += 1
    print("Yes or No:",tally)

    size = float(len(attr_list))
    shares = [occurrences/size for occurrences in tally.values()]
    return entropy(shares)

In [12]:
def info_gain(df,attr,target,trace = 0):
    """Information gain of ``target`` when ``df`` is partitioned by ``attr``.

    Computes the entropy of ``target`` inside each partition, weights it by
    the partition's share of the rows, and subtracts the weighted sum from
    the entropy of ``target`` over all of ``df``. ``trace`` is accepted but
    not used.
    """
    def share_of_rows(group):
        # Fraction of all rows of df that belong to this partition.
        return len(group)/row_total

    row_total = len(df.index)
    per_value = df.groupby(attr).agg({target:[entropy_of_list,share_of_rows]})
    per_value = per_value[target]
    per_value.columns = ["entropy","propObr"]

    weighted = per_value["entropy"]*per_value["propObr"]
    entropy_after = np.sum(weighted)
    entropy_before = entropy_of_list(df[target])
    return entropy_before - entropy_after

In [65]:
def id3(df,attrs,target,default = None):
    """Build a decision tree with ID3 and return it as nested dicts.

    Parameters
    ----------
    df : pandas.DataFrame
        Examples for the current node. The frame is printed on every call.
    attrs : list of str
        Columns still available for splitting.
    target : str
        Column holding the class label.
    default : optional
        Value returned when ``df`` is empty or ``attrs`` is exhausted while
        the labels are still mixed. Defaults to ``None``.

    Returns
    -------
    object
        * the single class label, when every row of ``df`` shares it;
        * ``default``, when ``df`` is empty or no attributes remain;
        * otherwise ``{best_attr: {value: subtree, ...}}`` where ``best_attr``
          has the highest information gain (first one wins on ties) and each
          subtree is built recursively from the rows having that value.
    """
    print(df)
    cnt = Counter(df[target])

    # All rows agree: this node is a leaf carrying that label.
    if len(cnt) == 1:
        return next(iter(cnt))

    # Nothing to split, or nothing left to split on.
    if df.empty or not attrs:
        return default

    # Majority label of this node. It is handed to the children as their
    # fallback so a branch that runs out of attributes becomes a labelled
    # leaf instead of None. On ties the label seen first in df wins.
    majority = cnt.most_common(1)[0][0]

    gain = [info_gain(df,attr,target) for attr in attrs]
    best_attr = attrs[gain.index(max(gain))]
    rem_attrs = [i for i in attrs if i != best_attr]

    tree = {best_attr : {}}
    for subattr , subData in df.groupby(best_attr):
        tree[best_attr][subattr] = id3(subData,rem_attrs,target,majority)

    print(tree)
    return tree

In [66]:
# Treat the last column as the class label and every earlier column as a
# predictor attribute.
all_columns = df.columns.tolist()
print("Total attributs:",all_columns)
attr_names, target = all_columns[:-1], all_columns[-1]
print("Preicting attr",attr_names)
print("Target attr:",target)

Total attributs: ['Outlook', 'Temperature', 'Humidity', 'Wind', 'Water', 'Play tennis']
Preicting attr ['Outlook', 'Temperature', 'Humidity', 'Wind', 'Water']
Target attr: Play tennis


In [67]:
# Grow the tree over all predictor columns; the returned nested dict is the
# cell's displayed output (id3 also prints each intermediate frame and subtree).
id3(df,attr_names,target)

  Outlook Temperature Humidity    Wind Water Play tennis
0   sunny        warm   normal  strong  warm         yes
1   sunny        warm     high  strong  warm         yes
2    rain        cold     high  strong  warm          no
3   sunny        warm     high  strong  cool         yes
4    rain        warm     high    weak  cool          no
5   sunny        cold   normal  strong  cool          no
Yes or No: Counter({'no': 2})
Yes or No: Counter({'yes': 3, 'no': 1})
Yes or No: Counter({'yes': 3, 'no': 3})
Yes or No: Counter({'no': 2})
Yes or No: Counter({'yes': 3, 'no': 1})
Yes or No: Counter({'yes': 3, 'no': 3})
Yes or No: Counter({'yes': 2, 'no': 2})
Yes or No: Counter({'yes': 1, 'no': 1})
Yes or No: Counter({'yes': 3, 'no': 3})
Yes or No: Counter({'yes': 3, 'no': 2})
Yes or No: Counter({'no': 1})
Yes or No: Counter({'yes': 3, 'no': 3})
Yes or No: Counter({'no': 2, 'yes': 1})
Yes or No: Counter({'yes': 2, 'no': 1})
Yes or No: Counter({'yes': 3, 'no': 3})
  Outlook Temperature Humidity 

{'Outlook': {'rain': 'no',
  'sunny': {'Temperature': {'cold': 'no', 'warm': 'yes'}}}}

In [68]:
# NOTE(review): attr_names is the list of predictor column names, not a label
# column, so this measures the entropy of the name list itself and the
# "Yes or No:" line it prints does not describe class labels — confirm this
# call is intended.
entropy_of_list(attr_names)

Yes or No: Counter({'Outlook': 1, 'Temperature': 1, 'Humidity': 1, 'Wind': 1, 'Water': 1})


2.321928094887362

In [69]:
# Information gain of splitting the full data set on "Outlook" with respect
# to the "Play tennis" label.
info_gain(df,"Outlook","Play tennis")

Yes or No: Counter({'no': 2})
Yes or No: Counter({'yes': 3, 'no': 1})
Yes or No: Counter({'yes': 3, 'no': 3})


0.4591479170272448