In [None]:
from sklearn import datasets
import pandas as pd
import numpy as np
import math
# Silence NumPy's divide-by-zero and invalid-value floating-point warnings
# for the whole session. The entropy / gain-ratio helpers below divide by
# lengths that can be zero when a candidate split leaves one side empty.
np.seterr(divide = 'ignore', invalid = 'ignore')

{'divide': 'warn', 'over': 'warn', 'under': 'ignore', 'invalid': 'warn'}

In [None]:
class binary_tree:
    """Node of a binary decision tree.

    A node only stores the values handed to the constructor; it performs no
    computation of its own. Children are attached after construction through
    the ``left`` and ``right`` attributes.

    Parameters
    ----------
    entropy : entropy value recorded for this node.
    level : depth of the node in the tree.
    feature_name : name of the feature the node splits on (or a descriptive
        message for a leaf).
    gain_ratio : gain ratio recorded for the split.
    class_name : name of the majority class at this node.
    split_val : split threshold, or ``None`` when the node does not split.
    class_counts : optional per-class sample counts for this node, e.g. a
        ``(setosa, versicolor, virginica)`` tuple. Defaults to ``None`` so
        existing six-argument callers keep working.
    """
    def __init__(self,entropy,level,feature_name,gain_ratio,class_name,split_val,class_counts=None):
        self.entropy = entropy
        self.gain_ratio = gain_ratio
        self.split_val = split_val
        self.class_name = class_name
        self.feature_name = feature_name
        self.level = level
        # Per-node class counts; None when the creator did not supply them.
        self.class_counts = class_counts
        # Children start out empty and are linked by the tree builder.
        self.left = None
        self.right = None

In [None]:
def print_tree(root):
    """Print a decision (sub)tree in pre-order: node, left subtree, right subtree.

    For each node the level, the three class counts, the entropy, the split
    feature with its gain ratio, the majority class and (for splitting nodes)
    the threshold are printed, followed by a blank line.

    Class counts are taken from the node's ``class_counts`` attribute when the
    node has one (a 3-item sequence: setosa, versicolor, virginica). Nodes
    without that attribute fall back to counting the global ``output`` labels,
    which reports the same whole-dataset counts for every such node.

    Parameters
    ----------
    root : tree node exposing ``level``, ``entropy``, ``feature_name``,
        ``gain_ratio``, ``class_name``, ``split_val``, ``left`` and ``right``;
        ``None`` prints nothing.
    """
    if root is None:
        return

    counts = getattr(root, "class_counts", None)
    if counts is None:
        # Backward-compatible fallback: the node carries no counts of its own,
        # so report the counts of the global label frame.
        counts = (get_setosa(output), get_versicolor(output), get_virginica(output))
    a, b, c = counts

    print("Level",root.level)
    print("Count of setosa (class 0) =",a)
    print("Count of versicolor (class 1) =",b)
    print("Count of virginica (class 2) =",c)
    print("Entropy =",root.entropy)
    print("Splitting on feature",root.feature_name," with gain ratio ",format(root.gain_ratio,".4f"))
    print("Majority class :",root.class_name)
    # Identity test: a threshold of 0 is a valid split value, only None means "leaf".
    if root.split_val is not None:
        print(root.feature_name,"<=",root.split_val)
    print()

    print_tree(root.left)
    print_tree(root.right)

In [None]:
# Load the Iris dataset shipped with scikit-learn; its `data`, `target` and
# `feature_names` fields are used by the following cells.
dataset = datasets.load_iris()

In [None]:
# Feature values as a DataFrame.
# NOTE(review): the name `input` shadows Python's built-in input(); the rest
# of the notebook refers to this variable by that name.
input = pd.DataFrame(dataset.data)

In [None]:
# Class labels wrapped in a DataFrame. The counting helpers below treat the
# values 0, 1 and 2 as setosa, versicolor and virginica respectively.
output = pd.DataFrame(dataset.target)

In [None]:
# Label the feature columns with the dataset's feature names, and keep the
# same names as the list of candidate features to split on.
input.columns = dataset.feature_names
feature = dataset.feature_names

In [None]:
def get_setosa(output):
    """Return how many entries of `output` equal 0 (the setosa class label)."""
    labels = np.array(output[:])
    is_setosa = labels == 0
    return is_setosa.sum()

In [None]:
def get_versicolor(output):
    """Return how many entries of `output` equal 1 (the versicolor class label)."""
    labels = np.array(output[:])
    is_versicolor = labels == 1
    return is_versicolor.sum()

In [None]:
def get_virginica(output):
    """Return how many entries of `output` equal 2 (the virginica class label)."""
    labels = np.array(output[:])
    is_virginica = labels == 2
    return is_virginica.sum()

In [None]:
def entropy(output):
    """Return the base-2 Shannon entropy of the class labels in `output`.

    Every distinct label value present in `output` is treated as its own
    class, so the function is not tied to the labels 0, 1 and 2. For inputs
    that only contain those three labels the terms are accumulated in the
    same order (ascending label) and with the same arithmetic as before.

    Parameters
    ----------
    output : label container supporting ``len()`` and ``[:]`` slicing
        (for example a pandas DataFrame or a list).

    Returns
    -------
    The entropy in bits. An empty `output` yields ``0.0`` instead of the NaN
    that a 0/0 division would produce.
    """
    total = len(output)
    if total == 0:
        # No samples: avoid dividing by zero and report "no uncertainty".
        return 0.0

    # np.unique returns the distinct labels in ascending order with their counts.
    _, counts = np.unique(np.array(output[:]), return_counts=True)

    op = 0
    for count in counts:
        p = count/total
        # Every count is at least 1 here, so p > 0 and the logarithm is defined.
        op = op+(-p * math.log(p,2))

    return op

In [None]:
def split_ratio(output1,output2,output):
    """Return the split information of dividing `output` into two parts.

    The result is ``-a*log2(a) - b*log2(b)`` where ``a`` and ``b`` are the
    fractions ``len(output1)/len(output)`` and ``len(output2)/len(output)``.
    A fraction of zero contributes nothing to the sum.
    """
    whole = len(output)
    left_frac = len(output1)/whole
    right_frac = len(output2)/whole

    def _term(frac):
        # An empty side contributes 0, which also avoids evaluating log(0).
        return -frac*math.log(frac,2) if frac!=0 else 0

    return _term(left_frac)+_term(right_frac)

In [None]:
def gain_ratio(input,output,val,feature):
    """Return the gain ratio of splitting on ``input[feature] <= val``.

    The gain ratio is the entropy reduction achieved by the split divided by
    the split information of the two resulting parts.

    Parameters
    ----------
    input : DataFrame of feature values; ``input[feature]`` must support
        comparison with `val` and yield a boolean mask usable to index `output`.
    output : labels aligned with `input`.
    val : threshold; rows with ``input[feature] <= val`` go to the first part,
        rows with ``input[feature] > val`` to the second.
    feature : column name in `input`.

    Returns
    -------
    The gain ratio, or ``float('nan')`` when either part is empty. A
    degenerate split has zero split information, so the ratio is undefined;
    NaN compares false against any running maximum, so callers skip it.
    """
    column = input[feature]
    split_output1 = output[column <= val]
    split_output2 = output[column > val]

    size1 = len(split_output1)
    size2 = len(split_output2)
    if size1 == 0 or size2 == 0:
        # Return NaN explicitly instead of relying on NumPy turning 0/0 into
        # NaN under a global np.seterr(...) setting.
        return float('nan')

    total = len(output)
    initial_entropy = entropy(output)
    # Size-weighted average entropy of the two parts.
    final_entropy = (size1/total) * entropy(split_output1) + (size2/total) * entropy(split_output2)

    entropy_gain = initial_entropy - final_entropy
    split_gain = split_ratio(split_output1,split_output2,output)
    return (entropy_gain)/(split_gain)

In [None]:
def splitOn(input,output,feature):
    """Find the best threshold for splitting on a single feature.

    Candidate thresholds are the midpoints between consecutive *distinct,
    sorted* values of ``input[feature]``. This covers every way a threshold
    can divide the rows into two non-empty parts, evaluates each one once,
    and does not depend on the row order of `input`.

    Parameters
    ----------
    input : DataFrame of feature values.
    output : labels aligned with `input`.
    feature : column name in `input`.

    Returns
    -------
    ``(max_gain_ratio, val)``: the best gain ratio found and its threshold.
    ``(-1, -1)`` is returned when there is no candidate threshold (fewer than
    two distinct values) or no candidate has a gain ratio above -1. Ties keep
    the smallest threshold.
    """
    # `v == v` drops NaN entries, which would otherwise break the sort order.
    values = sorted({v for v in input[feature] if v == v})
    val = -1
    max_gain_ratio = -1
    for low, high in zip(values, values[1:]):
        mid = (low+high)/2
        temp_gr = gain_ratio(input,output,mid,feature)
        if(temp_gr > max_gain_ratio):
            max_gain_ratio = temp_gr
            val = mid
    return max_gain_ratio,val

In [None]:
def splitting(input,output,feature):
    """Pick the feature and threshold with the highest gain ratio.

    Each name in `feature` is evaluated with ``splitOn``; a candidate replaces
    the current best only when its gain ratio is strictly greater.

    Returns
    -------
    ``(split_feature, max_gain_ratio, split_val)``. When no feature has a gain
    ratio above 0 (or `feature` is empty) the result is ``(None, 0, 0)``.
    """
    best = (None, 0, 0)
    for name in feature:
        ratio, threshold = splitOn(input,output,name)
        if not ratio > best[1]:
            continue
        best = (name, ratio, threshold)
    return best

In [None]:
def print_dt(input,output,feature,level):
    """Recursively build the decision tree, printing each node as it is created.

    For the current node the level, the three class counts and the entropy
    are printed. The node becomes a leaf when all samples share one class,
    when `feature` is empty, or when no feature offers a split with a gain
    ratio above 0. Otherwise the best split is printed and both halves are
    built recursively with ``level + 1``.

    Parameters
    ----------
    input : DataFrame of feature values for the samples at this node.
    output : labels aligned with `input`.
    feature : list of feature (column) names that may be split on.
    level : depth of the current node.

    Returns
    -------
    The ``binary_tree`` node for this subtree. Each node additionally gets a
    ``class_counts`` attribute holding the (setosa, versicolor, virginica)
    counts of the samples that reached it.
    """
    no_of_setosa = get_setosa(output)
    no_of_versicolor = get_versicolor(output)
    no_of_virginica = get_virginica(output)
    class_counts = (int(no_of_setosa), int(no_of_versicolor), int(no_of_virginica))
    feature_left = len(feature)
    total_elements = len(input)

    print("Level",level)
    print("Count of Setosa (class 0) =",no_of_setosa)
    print("Count of Versicolor (class 1) =",no_of_versicolor)
    print("Count of virginica (class 2) =",no_of_virginica)
    e_val = entropy(output)
    print("Current Entropy =",e_val)

    # Majority class; ties resolve in the order setosa, versicolor, virginica.
    maximum=max(no_of_setosa,no_of_versicolor,no_of_virginica)
    if maximum==no_of_setosa:
        cls_name='Setosa'
    elif maximum==no_of_versicolor:
        cls_name='Versicolor'
    else :
        cls_name='Virginica'

    is_pure = (no_of_setosa == total_elements or no_of_versicolor == total_elements or no_of_virginica == total_elements)

    feature_name = None
    if not is_pure and feature_left != 0:
        feature_name,gain_val,val = splitting(input,output,feature)

    if feature_name is None:
        # Pure node, no features, or no split with positive gain ratio:
        # stop here instead of indexing the frame with a missing feature name.
        print("Reached Leaf Node")
        root = binary_tree(e_val,level,"cannot split reached at leaf node",0.0,cls_name,None)
        root.class_counts = class_counts
        print()
        return root

    print("Splitting on feature",feature_name," with gain ratio",format(gain_val,".3f"))
    print(feature_name,"<=",val)
    root = binary_tree(e_val,level,feature_name,gain_val,cls_name,val)
    root.class_counts = class_counts
    print()

    # Compute each row mask once and reuse it for features and labels.
    goes_left = input[feature_name] <= val
    goes_right = input[feature_name] > val

    root.left = print_dt(input[goes_left],output[goes_left],feature,level+1)
    root.right = print_dt(input[goes_right],output[goes_right],feature,level+1)
    return root

In [None]:
# Build the tree from the full dataset starting at level 0; print_dt prints
# every node while it is being constructed and returns the root node.
root = print_dt(input,output,feature,0)
print("Now the printing of binary tree as of decision tree :")
print()
# Walk the finished tree and print each stored node again.
print_tree(root)

Level 0
Count of Setosa (class 0) = 50
Count of Versicolor (class 1) = 50
Count of virginica (class 2) = 50
Current Entropy = 1.584962500721156
Splitting on feature petal width (cm)  with gain ratio 1.000
petal width (cm) <= 0.7999999999999999

Level 1
Count of Setosa (class 0) = 50
Count of Versicolor (class 1) = 0
Count of virginica (class 2) = 0
Current Entropy = 0.0
Reached Leaf Node

Level 1
Count of Setosa (class 0) = 0
Count of Versicolor (class 1) = 50
Count of virginica (class 2) = 50
Current Entropy = 1.0
Splitting on feature petal width (cm)  with gain ratio 0.693
petal width (cm) <= 1.75

Level 2
Count of Setosa (class 0) = 0
Count of Versicolor (class 1) = 49
Count of virginica (class 2) = 5
Current Entropy = 0.44506485705083865
Splitting on feature petal length (cm)  with gain ratio 0.607
petal length (cm) <= 5.4

Level 3
Count of Setosa (class 0) = 0
Count of Versicolor (class 1) = 49
Count of virginica (class 2) = 3
Current Entropy = 0.31821529768323314
Splitting on fea