In [1]:
import numpy as np
import pandas as pd
import math
import random

# ID3 algorithm implement



In [2]:
class ID3_Algorithm :
    
    
    def __init__(self,train_data,test_data,label):
        self.trian_data=train_data
        self.test_data=test_data               
        self.label=label
    
    def __calc_total_entropy(self,train_data,class_list):
        total_row = train_data.shape[0] 
        total_entr = 0
        if total_row!=0:            
            for c in class_list: 
                total_class_count = train_data[train_data[self.label] == c].shape[0]       
                if total_class_count!=0 :                        
                    total_class_entr = - (total_class_count/total_row)*np.log2(total_class_count/total_row) 
                    total_entr += total_class_entr
    
        return total_entr
    
    def __calc_entropy(self,feature_value_data, class_list):
        class_count = feature_value_data.shape[0]
        entropy = 0
        if class_count!=0:   
            for c in class_list:
                label_class_count = feature_value_data[feature_value_data[self.label] == c].shape[0]  
                entropy_class = 0
                if label_class_count != 0:
                    probability_class = label_class_count/class_count 
                    entropy_class = - probability_class * np.log2(probability_class)  
                entropy += entropy_class
        return entropy
    
    def __calc_info_gain(self,feature_name, train_data, class_list):
        feature_value_list = train_data[feature_name].unique() 
        total_row = train_data.shape[0]
        feature_info = 0.0
        if total_row!=0:
            for feature_value in feature_value_list:
                feature_value_data = train_data[train_data[feature_name] == feature_value] 
                feature_value_count = feature_value_data.shape[0]
                feature_value_entropy = self.__calc_entropy(feature_value_data, class_list) 
                feature_value_probability = feature_value_count/total_row
                feature_info += feature_value_probability * feature_value_entropy 
            
        return self.__calc_total_entropy(train_data, class_list) - feature_info 

    def __find_most_informative_feature(self,train_data, class_list):
        feature_list = train_data.columns.drop(self.label) 
                                                
        max_info_gain = -1
        max_info_feature = None
        
        for feature in feature_list:  
            feature_info_gain = self.__calc_info_gain(feature, train_data, class_list)
            if max_info_gain < feature_info_gain: 
                max_info_gain = feature_info_gain
                max_info_feature = feature
                
        return max_info_feature
    
    def __generate_sub_tree(self,feature_name, train_data, class_list):
        feature_value_count_dict = train_data[feature_name].value_counts(sort=False) 
        tree = {} 
    
        for feature_value, count in feature_value_count_dict.iteritems():
            feature_value_data = train_data[train_data[feature_name] == feature_value]         
            assigned_to_node = False 
            for c in class_list: 
                class_count = feature_value_data[feature_value_data[self.label] == c].shape[0] 
                if class_count == count: 
                    tree[feature_value] = c 
                    train_data = train_data[train_data[feature_name] != feature_value] 
                    assigned_to_node = True
            if not assigned_to_node: 
                tree[feature_value] = "?" 
                
        return tree, train_data
    

    def __make_tree(self,root, prev_feature_value, train_data, class_list):
        if train_data.shape[0] != 0: 
            max_info_feature =  self.__find_most_informative_feature(train_data, class_list) 
            tree, train_data = self.__generate_sub_tree(max_info_feature, train_data, class_list)  
            next_root = None
        
            if prev_feature_value != None:
                root[prev_feature_value] = dict()
                root[prev_feature_value][max_info_feature] = tree
                next_root = root[prev_feature_value][max_info_feature]
            else: 
                root[max_info_feature] = tree
                next_root = root[max_info_feature]
        
            for node, branch in list(next_root.items()): 
                if branch == "?": 
                    feature_value_data = train_data[train_data[max_info_feature] == node] 
                    self.__make_tree(next_root, node, feature_value_data, class_list) 
    

    def Decision_Tree(self):
        train_data_m = self.trian_data.copy() 
        tree = {} 
        class_list = train_data_m[self.label].unique() 
        self.__make_tree(tree, None, self.trian_data,class_list) 
        return tree    
    
    def __predict(self,tree, instance):
        if not isinstance(tree, dict): 
            return tree 
        else:
            root_node = next(iter(tree)) 
            feature_value = instance[root_node] 
            if feature_value in tree[root_node]: 
                return self.__predict(tree[root_node][feature_value], instance)
            else:
                return None 
            

    def evaluate(self,tree):
        correct_preditct = 0
        wrong_preditct = 0
        for index,row in self.test_data.iterrows(): 
            result = self.__predict(tree,self.test_data.loc[index]) 
            if result == self.test_data[self.label].loc[index]: 
                correct_preditct += 1 
            else:
                wrong_preditct += 1 
        accuracy = correct_preditct / (correct_preditct + wrong_preditct)
        return accuracy                                 
    

# test-train split


In [3]:
def train_test_split(df, test_size):
    
    if isinstance(test_size, float):
        test_size = round(test_size * len(df))

    indices = df.index.tolist()
    test_indices = random.sample(population=indices, k=test_size)

    test_df = df.loc[test_indices]
    train_df = df.drop(test_indices)
    
    return train_df, test_df

# Heart Disease .... 

# Transform the datasets into a discrete state.


In [4]:
df=pd.read_csv('./heart.csv')
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [5]:
age_group=pd.cut(x=df.age,bins=[30,34,39,44,49,54,59,64,69,74,77 ],labels=['30 to 34','35 to 39', '40 to 44', '45 to 49','50 to 54','55 to 59','60 to 64','65 to 69','70 to 74','75 to 79'])
df.age=age_group
trestbps=pd.cut(x=df.trestbps,bins=[94,105,116,126,137,147,158,169,179,190,201],labels=['94 to 105','106 to 116', '117 to 126','127 to 137','138 to 147','148 to 158','159 to 169','170 to 179','180 to 190','191 to 200'])
df.trestbps=trestbps
chol=pd.cut(x=df.chol , bins=[126,170,214,258,302,345,389,520,565] , labels=['126 to 170','171 to 214','215 to 258','259 to 302','303 to 345','346 to 389','390 to 520','521 to 565'])
df.chol=chol
thalach=pd.cut(x=df.thalach,bins=[70,85,98,111,124,137,150,163,176,190,203] , labels=['70 to 85','86 to 98','99 to 111','112 to 124','125 to 137','138 to 150',' 151 to 163','164 to 176','177 to 190','191 to 203'])
df.thalach=thalach
oldpeak=pd.cut(x=df.oldpeak , bins=[-1,0.62,1.24,1.86,2.48,3.10,3.72,4.34,5.58,6.3] , labels=['0 to 0.62','0.63 to 1.24','1.25 to 1.86','1.87 to 2.48','2.49 to 3.10','3.11 to 3.72','3.73 to 4.34','4.35 to 5.58','5.59 to 6.3'])
df.oldpeak=oldpeak
df

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,60 to 64,1,3,138 to 147,215 to 258,1,0,138 to 150,0,1.87 to 2.48,0,0,1,1
1,35 to 39,1,2,127 to 137,215 to 258,0,1,177 to 190,0,3.11 to 3.72,0,0,2,1
2,40 to 44,0,1,127 to 137,171 to 214,0,0,164 to 176,0,1.25 to 1.86,2,0,2,1
3,55 to 59,1,1,117 to 126,215 to 258,0,1,177 to 190,0,0.63 to 1.24,2,0,2,1
4,55 to 59,0,0,117 to 126,346 to 389,0,1,151 to 163,1,0 to 0.62,2,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,55 to 59,0,0,138 to 147,215 to 258,0,1,112 to 124,1,0 to 0.62,1,0,3,0
299,45 to 49,1,3,106 to 116,259 to 302,0,1,125 to 137,0,0.63 to 1.24,1,0,3,0
300,65 to 69,1,0,138 to 147,171 to 214,1,1,138 to 150,0,3.11 to 3.72,1,2,3,0
301,55 to 59,1,0,127 to 137,126 to 170,0,1,112 to 124,1,0.63 to 1.24,1,1,3,0


In [6]:
train_df , test_df = train_test_split(df , test_size=0.3)

In [8]:
model=ID3_Algorithm(train_df,test_df,'target')
tree=model.Decision_Tree()
#model.evaluate(tree)
tree

{'thal': {1: {'trestbps': {'94 to 105': 0,
    '106 to 116': 0,
    '117 to 126': {'age': {'30 to 34': 0,
      '35 to 39': 0,
      '40 to 44': 0,
      '45 to 49': 0,
      '50 to 54': 1,
      '55 to 59': 0,
      '60 to 64': 0,
      '65 to 69': 0,
      '70 to 74': 0,
      '75 to 79': 0}},
    '127 to 137': 0,
    '138 to 147': 1,
    '148 to 158': 0,
    '159 to 169': 0,
    '170 to 179': 0,
    '180 to 190': 0,
    '191 to 200': 0}},
  2: {'ca': {0: {'age': {'30 to 34': 1,
      '35 to 39': 1,
      '40 to 44': 1,
      '45 to 49': {'trestbps': {'94 to 105': 1,
        '106 to 116': 1,
        '117 to 126': 1,
        '127 to 137': 1,
        '138 to 147': 1,
        '148 to 158': 0,
        '159 to 169': 0,
        '170 to 179': 0,
        '180 to 190': 0,
        '191 to 200': 0}},
      '50 to 54': 1,
      '55 to 59': {'trestbps': {'94 to 105': 1,
        '106 to 116': 0,
        '117 to 126': {'chol': {'126 to 170': 0,
          '171 to 214': 0,
          '215 to 258': 1,


#  Glass Classification


In [8]:
df2=pd.read_csv('./glass.csv')
df2

Unnamed: 0,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe,Type
0,1.52101,13.64,4.49,1.10,71.78,0.06,8.75,0.00,0.0,1
1,1.51761,13.89,3.60,1.36,72.73,0.48,7.83,0.00,0.0,1
2,1.51618,13.53,3.55,1.54,72.99,0.39,7.78,0.00,0.0,1
3,1.51766,13.21,3.69,1.29,72.61,0.57,8.22,0.00,0.0,1
4,1.51742,13.27,3.62,1.24,73.08,0.55,8.07,0.00,0.0,1
...,...,...,...,...,...,...,...,...,...,...
209,1.51623,14.14,0.00,2.88,72.61,0.08,9.18,1.06,0.0,7
210,1.51685,14.92,0.00,1.99,73.06,0.00,8.40,1.59,0.0,7
211,1.52065,14.36,0.00,2.02,73.42,0.00,8.44,1.64,0.0,7
212,1.51651,14.38,0.00,1.94,73.61,0.00,8.48,1.57,0.0,7


In [9]:
RI=pd.cut(df2.RI,20)
df2.RI=RI
Na=pd.cut(df2.Na,25)
df2.Na=Na
Mg=pd.cut(df2.Mg,15)
df2.Mg=Mg
Al=pd.cut(df2.Al,15)
df2.Al=Al
Si=pd.cut(df2.Si,25)
df2.Si=Si
K=pd.cut(df2.K,30)
df2.K=K
Ca=pd.cut(df2.Ca,20)
df2.Ca=Ca
Ba=pd.cut(df2.Ba,20)
df2.Ba=Ba
Fe=pd.cut(df2.Fe,20)
df2.Fe=Fe

df2



Unnamed: 0,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe,Type
0,"(1.52, 1.521]","(13.39, 13.656]","(4.191, 4.49]","(0.932, 1.146]","(71.602, 71.826]","(-0.00621, 0.207]","(8.658, 9.196]","(-0.00315, 0.158]","(-0.00051, 0.0255]",1
1,"(1.517, 1.518]","(13.656, 13.922]","(3.592, 3.891]","(1.146, 1.36]","(72.722, 72.946]","(0.414, 0.621]","(7.582, 8.12]","(-0.00315, 0.158]","(-0.00051, 0.0255]",1
2,"(1.516, 1.517]","(13.39, 13.656]","(3.293, 3.592]","(1.36, 1.574]","(72.946, 73.17]","(0.207, 0.414]","(7.582, 8.12]","(-0.00315, 0.158]","(-0.00051, 0.0255]",1
3,"(1.517, 1.518]","(13.124, 13.39]","(3.592, 3.891]","(1.146, 1.36]","(72.498, 72.722]","(0.414, 0.621]","(8.12, 8.658]","(-0.00315, 0.158]","(-0.00051, 0.0255]",1
4,"(1.517, 1.518]","(13.124, 13.39]","(3.592, 3.891]","(1.146, 1.36]","(72.946, 73.17]","(0.414, 0.621]","(7.582, 8.12]","(-0.00315, 0.158]","(-0.00051, 0.0255]",1
...,...,...,...,...,...,...,...,...,...,...
209,"(1.516, 1.517]","(13.922, 14.188]","(-0.00449, 0.299]","(2.858, 3.072]","(72.498, 72.722]","(-0.00621, 0.207]","(8.658, 9.196]","(0.945, 1.102]","(-0.00051, 0.0255]",7
210,"(1.517, 1.518]","(14.72, 14.986]","(-0.00449, 0.299]","(1.788, 2.002]","(72.946, 73.17]","(-0.00621, 0.207]","(8.12, 8.658]","(1.575, 1.732]","(-0.00051, 0.0255]",7
211,"(1.52, 1.521]","(14.188, 14.454]","(-0.00449, 0.299]","(2.002, 2.216]","(73.394, 73.618]","(-0.00621, 0.207]","(8.12, 8.658]","(1.575, 1.732]","(-0.00051, 0.0255]",7
212,"(1.516, 1.517]","(14.188, 14.454]","(-0.00449, 0.299]","(1.788, 2.002]","(73.394, 73.618]","(-0.00621, 0.207]","(8.12, 8.658]","(1.418, 1.575]","(-0.00051, 0.0255]",7


In [10]:
train_df2 , test_df2 = train_test_split(df2, test_size=0.3)


In [11]:
m=ID3_Algorithm(train_df2,test_df2,'Type')
x=m.Decision_Tree()
m.evaluate(x)



0.609375