Import Libraries

In [39]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

Import Dataset

In [51]:
dataSet = pd.read_csv('Enjoy sports.csv')
dataSet = dataSet.drop(columns=['Day']) 
print('Dataset Length : ', len(dataSet))
print('DataSet Shape : ', dataSet.shape)

Dataset Length :  14
DataSet Shape :  (14, 5)


Entropy Function

In [41]:
def entropy (dataSet):
    d = dataSet.iloc[:, -1]
    d = d.value_counts()
    s = 0
    for v in d.keys():
        p = d[v] / sum(d)
        s -= p * np.log2(p)
    return s

In [42]:
def values (attr):
    return list(set(attr))

Information Gain Function

In [43]:
def informationGain (dataSet, attr):
    e = entropy(dataSet)
    val = values(dataSet[attr])
    s_c = dataSet[attr].value_counts()
    s_v = []
    for v in val:
        ds = dataSet[dataSet[attr] == v]
        s = 0
        for res in values(dataSet.iloc[:, -1]):
            try:
                pi = ds.iloc[:, -1].value_counts()[res] / len(ds)
                s -= pi * np.log2(pi)
            except:
                s = 0
        s_v.append(s)
    
    for i in range (len(val)):
        e = e - s_c[val[i]] * s_v[i] / sum(s_c)
        
    return e

Class of Node

In [44]:
class Node():

    def __init__(self, name=None, attr=None, is_leaf=False, classification=None):
        self.name = name
        self.attr = attr
        self.is_leaf = is_leaf
        self.classification = classification
        self.children = {}



Function for making a Decision Tree Node

In [66]:
def DTNode(dataSet, features_used):
    node = Node()
    IGmax = 0
    vbest = None
    
    valList = [v for v in dataSet.columns[:-1] if v not in features_used]

    if valList:
        for v in valList:
            gain = informationGain(dataSet, v)
            if gain > IGmax:
                IGmax = gain
                vbest = v

        if vbest:
            features_used.append(vbest)
            node.name = vbest
            node.attr = values(dataSet[vbest])
            return node
        else:
            return node
    return None


Function for classifying the Decision Tree

In [69]:
def DTClassifier(dataSet, features_used):
    root = DTNode(dataSet, features_used)
    
    if root is None:
        return None
    
    # If the dataset is pure (entropy is 0), create a leaf node
    if entropy(dataSet) == 0:
        classification = values(dataSet.iloc[:, -1])[0]
        return Node(is_leaf=True, classification=classification)
    
    for attr_val in root.attr:
        subset = dataSet[dataSet[root.name] == attr_val]
        if len(subset) == 0:
            continue
        
        # If the subset is pure, create a leaf node
        if entropy(subset) == 0:
            classification = values(subset.iloc[:, -1])[0]
            root.children[attr_val] = Node(is_leaf=True, classification=classification)
        else:
            root.children[attr_val] = DTClassifier(subset, features_used.copy())
    
    return root


Function for Printing the tree

In [70]:
def print_tree(node, depth=0, prefix=""):
    if node.is_leaf:
        print(f"{prefix}: {node.classification}")
    else:
        print(f"{prefix}{node.name}")
        
        for i, (attr_val, child_node) in enumerate(node.children.items()):
            if i == len(node.children) - 1:
                new_prefix = prefix + "└── "
            else:
                new_prefix = prefix + "├── "
            
            if child_node.is_leaf:
                print(f"{new_prefix}{attr_val}: {child_node.classification}")
            else:
                print(f"{new_prefix}{attr_val}")
                print_tree(child_node, depth + 1, new_prefix + " ")


Driver Code

In [48]:
print(entropy(dataSet))

0.9402859586706311


In [49]:
iGain = [informationGain(dataSet, 'Outlook'), informationGain(dataSet, 'Temp.'), informationGain(dataSet, 'Humidity'), informationGain(dataSet, 'Wind')]
print(iGain)

[0.24674981977443927, 0.029222565658954758, 0.15183550136234164, 0.04812703040826949]


In [71]:
features_used = []
decisionTree = DTClassifier(dataSet, features_used)
print("Decision Tree Structure:")
print_tree(decisionTree)

Decision Tree Structure:
Outlook
├── Rain
├──  Wind
├──  ├── Strong: No
├──  └── Weak: Yes
├── Sunny
├──  Humidity
├──  ├── Normal: Yes
├──  └── High: No
└── Overcast: Yes
