In [1]:
import numpy as np
import pandas as pd

# Machine epsilon for Python floats: the gap between 1.0 and the next
# representable float. It is added to denominators and log arguments below so
# that a zero count never produces a division by zero or log(0).
eps = np.finfo(float).eps

# One tuple per observed meal, in the order given by `columns`.
columns = ['Taste', 'Temperature', 'Texture', 'Eat']
records = [
    ('Salty', 'Hot', 'Soft', 'No'),
    ('Spicy', 'Hot', 'Soft', 'No'),
    ('Spicy', 'Hot', 'Hard', 'Yes'),
    ('Spicy', 'Cold', 'Hard', 'No'),
    ('Spicy', 'Hot', 'Hard', 'Yes'),
    ('Sweet', 'Cold', 'Soft', 'Yes'),
    ('Salty', 'Cold', 'Soft', 'No'),
    ('Sweet', 'Hot', 'Soft', 'Yes'),
    ('Spicy', 'Cold', 'Soft', 'Yes'),
    ('Salty', 'Hot', 'Hard', 'Yes'),
]

# Transpose the records into the column-oriented mapping pandas consumes.
dataset = {name: list(values) for name, values in zip(columns, zip(*records))}

df = pd.DataFrame(dataset, columns=columns)

In [2]:
# Display the data frame
df

Unnamed: 0,Taste,Temperature,Texture,Eat
0,Salty,Hot,Soft,No
1,Spicy,Hot,Soft,No
2,Spicy,Hot,Hard,Yes
3,Spicy,Cold,Hard,No
4,Spicy,Hot,Hard,Yes
5,Sweet,Cold,Soft,Yes
6,Salty,Cold,Soft,No
7,Sweet,Hot,Soft,Yes
8,Spicy,Cold,Soft,Yes
9,Salty,Hot,Hard,Yes


In [3]:
# Entropy of the target column: H = -sum_i p_i * log2(p_i), where p_i is the
# share of rows in `df` that carry class i.
def find_entropy(df, target='Eat'):
    """Return the Shannon entropy (in bits) of the ``target`` column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data set (or subset of it) containing the ``target`` column.
    target : str, default 'Eat'
        Name of the column holding the class labels.

    Returns
    -------
    float
        Entropy of the class distribution. A subset with a single class
        (or with no rows) has entropy 0.
    """
    # Proportions of the classes that actually occur; this works for any set
    # of labels and does not fail when one of them is missing from the subset.
    probs = df[target].value_counts(normalize=True)
    # Zero-count entries (possible for categorical dtypes) contribute nothing
    # and would otherwise yield 0 * log2(0) = nan.
    probs = probs[probs > 0]
    return -(probs * np.log2(probs)).sum()

In [4]:
df['Eat'].value_counts()['Yes']

6

In [5]:
df['Taste'][df['Taste'] == 'Salty']

0    Salty
6    Salty
9    Salty
Name: Taste, dtype: object

In [6]:
len(df[df['Eat'] == 'Yes'])

6

In [7]:
# Compute the entropy of the target over the whole data set; this is the baseline from which the information gain of each split is measured

find_entropy(df)

0.9709505944546686

In [8]:
def find_entropy_attribute(df, attribute, target='Eat'):
    """Return the entropy of ``target`` that remains after splitting on ``attribute``.

    For every distinct value of ``attribute`` the entropy of the target labels
    in the matching rows is computed, and the results are averaged with
    weights equal to each subset's share of the rows in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data set containing both ``attribute`` and ``target`` columns.
    attribute : str
        Name of the column to split on.
    target : str, default 'Eat'
        Name of the column holding the class labels.

    Returns
    -------
    float
        Weighted (conditional) entropy in bits; 0 when every subset is pure
        or when ``df`` has no rows.
    """
    total_rows = len(df)
    weighted_entropy = 0.0
    for value in df[attribute].unique():
        # A single row mask on the frame selects the labels of this subset,
        # without relying on index alignment between stacked boolean masks.
        labels = df.loc[df[attribute] == value, target]
        if labels.empty:
            # Happens for values that never compare equal to themselves (NaN).
            continue
        # Use the classes that are actually present instead of a fixed
        # ('Yes', 'No') pair, and drop zero probabilities rather than nudging
        # them with an epsilon.
        probs = labels.value_counts(normalize=True)
        probs = probs[probs > 0]
        subset_entropy = -(probs * np.log2(probs)).sum()
        weighted_entropy += (len(labels) / total_rows) * subset_entropy
    return weighted_entropy


In [9]:
def build_tree(df):
    """Recursively build an ID3-style decision tree for the 'Eat' column.

    At each level the attribute with the largest information gain
    (``find_entropy(df) - find_entropy_attribute(df, attribute)``) is chosen
    and one branch is created per distinct value of that attribute.

    Parameters
    ----------
    df : pandas.DataFrame
        Training rows; every column other than 'Eat' is treated as a
        categorical attribute.

    Returns
    -------
    dict or label
        ``{attribute: {value: subtree_or_label, ...}}``. A bare class label is
        returned when the rows are already pure, or when no attribute can
        split them any further; in the latter case the most frequent label is
        used (ties follow the order of ``value_counts``).

    Raises
    ------
    ValueError
        If ``df`` has no rows.
    """
    if len(df) == 0:
        raise ValueError("cannot build a decision tree from an empty data set")

    target = 'Eat'
    labels = df[target]

    # Only attributes that still take more than one value can separate rows.
    # Restricting the search to them guarantees every split shrinks the
    # subsets, so the recursion terminates even when identical feature rows
    # carry conflicting labels.
    candidates = [col for col in df.keys() if col != target and df[col].nunique() > 1]

    if labels.nunique() == 1 or not candidates:
        return labels.value_counts().idxmax()

    # The parent entropy is the same for every attribute, so compute it once.
    parent_entropy = find_entropy(df)
    gains = [parent_entropy - find_entropy_attribute(df, col) for col in candidates]
    node = candidates[int(np.argmax(gains))]  # first attribute with maximum gain

    tree = {node: {}}
    for value in np.unique(df[node]):
        sub_table = df[df[node] == value].reset_index(drop=True)
        # Pure or unsplittable subsets come back as a leaf label.
        tree[node][value] = build_tree(sub_table)

    return tree

In [10]:
# Now we call the build_tree function and print the tree we built.
tree = build_tree(df)

import pprint
pprint.pprint(tree)

{'Taste': {'Salty': {'Texture': {'Hard': 'Yes', 'Soft': 'No'}},
           'Spicy': {'Temperature': {'Cold': {'Texture': {'Hard': 'No',
                                                          'Soft': 'Yes'}},
                                     'Hot': {'Texture': {'Hard': 'Yes',
                                                         'Soft': 'No'}}}},
           'Sweet': 'Yes'}}


In [11]:
# Predict by walking the nested-dict tree from the root down to a leaf.
def predict(inst, tree):
    """Return the class label the decision tree assigns to ``inst``.

    Parameters
    ----------
    inst : mapping or pandas.Series
        Attribute values of the instance, indexable by attribute name.
    tree : dict or label
        Tree in the form ``{attribute: {value: subtree_or_label}}``. Each
        internal node is expected to hold a single attribute key (only the
        first key is consulted). A bare label is treated as a one-leaf tree.

    Returns
    -------
    The label stored at the leaf reached by following ``inst``'s values.

    Raises
    ------
    KeyError
        If ``inst`` lacks an attribute the tree tests, or has a value for
        which the tree has no branch.
    ValueError
        If an empty dict is encountered where a node is expected.
    """
    node = tree
    # Descend until something that is not a dict (a leaf label) is reached.
    while isinstance(node, dict):
        if not node:
            raise ValueError("decision tree contains an empty node")
        attribute = next(iter(node))
        node = node[attribute][inst[attribute]]
    return node

In [12]:
# Take the attribute values of the row at position 6 (everything except the label) as the input instance
inst = df.iloc[6][:-1]
inst

Taste          Salty
Temperature     Cold
Texture         Soft
Name: 6, dtype: object

In [13]:
# Look up the true label (last column) of the row at position 6.
# Both indices are passed to .iloc so the lookup is purely positional; plain
# [-1] on the row Series treats an integer key as a position, which pandas
# has deprecated.
df.iloc[6, -1]

'No'

In [14]:
# Get prediction
prediction = predict(inst, tree)
prediction

# Our tree has correctly predicted that the kid is not going to eat this food. Of course, this row is part of the training data, so it is not a meaningful test of the model.

'No'

In [15]:
# We’ll try with new data
data = {'Taste': 'Salty', 'Temperature': 'Cold', 'Texture':'Hard'}

In [16]:
inst = pd.Series (data)

In [17]:
# Get prediction
prediction = predict(inst, tree)
prediction

# Our tree predicted that the kid will eat this food (which is Salty, Cold and Hard).
# This is a simple tree-building algorithm without many control parameters.

'Yes'