In [2]:
import pandas as pd  
import numpy as np  
from pprint import pprint  
# Read the Play Tennis CSV into a DataFrame; the column labels are supplied
# explicitly through `names` rather than taken from the file.
# NOTE(review): the path below is an absolute, machine-specific location, so this
# cell only runs where that exact file exists.
column_names = ['Outlook','Temperature','humidity','Wind','PlayTennis']
dataset = pd.read_csv('C:\\Users\\y19it41\\Downloads\\play_tennisDT1.csv', names=column_names)
dataset


Unnamed: 0,Outlook,Temperature,humidity,Wind,PlayTennis
0,Sunny,Hot,High,Weak,No
1,Sunny,Hot,High,Strong,No
2,Overcast,Hot,High,Weak,Yes
3,Rain,Mild,High,Weak,Yes
4,Rain,Cool,Normal,Weak,Yes
5,Rain,Cool,Normal,Strong,No
6,Overcast,Cool,Normal,Strong,Yes
7,Sunny,Mild,High,Weak,No
8,Sunny,Cool,Normal,Weak,Yes
9,Rain,Mild,Normal,Weak,Yes


In [4]:
def entropy(target_col):
    """Return the base-2 Shannon entropy of the values in ``target_col``.

    The distinct values are tallied with ``np.unique``; every distinct value
    contributes ``-p * log2(p)``, where ``p`` is its share of all entries.
    """
    _, frequencies = np.unique(target_col, return_counts=True)
    total = np.sum(frequencies)
    # One term per distinct value, evaluated for all values at once.
    terms = (-frequencies / total) * np.log2(frequencies / total)
    return np.sum(terms)

def InfoGain(data,split_attribute_name,target_name="PlayTennis"):
    """Return the information gain obtained by splitting ``data`` on one attribute.

    The gain is the entropy of the target column minus the weighted entropy of
    the target column inside each partition induced by the distinct values of
    ``split_attribute_name``. Each partition is weighted by its share of rows.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows to evaluate; must contain both named columns.
    split_attribute_name : str
        Column whose distinct values define the partitions.
    target_name : str, default "PlayTennis"
        Column holding the class labels.

    Returns
    -------
    The information gain as returned by ``total_entropy - weighted_entropy``.

    Side effects
    ------------
    Prints the attribute name, its distinct values, their counts and the gain.
    """
    # Entropy of the target column over all rows of this node
    total_entropy = entropy(data[target_name])

    # Distinct values of the split attribute and how many rows carry each one
    vals,counts = np.unique(data[split_attribute_name],return_counts=True)
    total_count = np.sum(counts)

    # Weighted entropy of the partitions. Rows are selected with a boolean mask
    # so that exactly the rows counted in counts[i] are used: the earlier
    # where(...).dropna() combination also discarded rows that merely had a
    # missing value in some unrelated column.
    Weighted_Entropy = np.sum([
        (counts[i]/total_count)*entropy(data.loc[data[split_attribute_name]==vals[i], target_name])
        for i in range(len(vals))
    ])

    # Information gain = entropy before the split - weighted entropy after it
    Information_Gain = total_entropy - Weighted_Entropy
    print(split_attribute_name,vals,counts,Information_Gain)
    return Information_Gain

def ID3(data,originaldata,features,target_attribute_name="PlayTennis",parent_node_class = None):
    """Grow an ID3 decision tree and return it as nested dictionaries.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows belonging to the node currently being built.
    originaldata : pandas.DataFrame
        Data set used as the fallback when ``data`` is empty; it is passed
        unchanged to every recursive call.
    features : sequence of str
        Candidate column names still available for splitting.
    target_attribute_name : str, default "PlayTennis"
        Column holding the class labels.
    parent_node_class : optional
        Most frequent class label of the calling node; returned when no
        features are left.

    Returns
    -------
    Either a class label (leaf) or a dict of the form
    ``{feature: {feature_value: subtree_or_label, ...}}``.

    Side effects
    ------------
    ``InfoGain`` prints one line per evaluated feature, and a blank separator
    is printed after each recursive call.
    """
    # Stopping criteria --> if one of these holds, return a leaf.

    # An empty partition has to be tested FIRST: np.unique of an empty column is
    # empty too, so the "all labels equal" test below would match and then fail
    # on [0]. Fall back to the most frequent label of the original data set.
    if len(data) == 0:
        classes, class_counts = np.unique(originaldata[target_attribute_name], return_counts=True)
        if len(classes) == 0:
            # Nothing to vote with either; the parent's label is all we have.
            return parent_node_class
        return classes[np.argmax(class_counts)]

    classes, class_counts = np.unique(data[target_attribute_name], return_counts=True)

    # All rows share one target value --> return that value
    if len(classes) <= 1:
        return classes[0]

    # No feature left to split on --> return the majority label of the parent node
    if len(features) == 0:
        return parent_node_class

    # Otherwise grow the tree.

    # Majority label of this node; handed to the children as their fallback
    parent_node_class = classes[np.argmax(class_counts)]

    # Select the feature with the largest information gain
    item_values = [InfoGain(data,feature,target_attribute_name) for feature in features]
    best_feature_index = np.argmax(item_values)
    best_feature = features[best_feature_index]

    # The node is keyed by the chosen feature; its branches are filled in below
    tree = {best_feature:{}}

    # Remove the chosen feature from the feature space used further down
    features = [i for i in features if i != best_feature]

    # Grow one branch per distinct value of the chosen feature
    for value in np.unique(data[best_feature]):
        # Boolean-mask selection keeps every matching row; where(...).dropna()
        # would additionally drop rows with a missing value in any other column.
        sub_data = data[data[best_feature] == value]

        # Recurse with the caller-supplied originaldata (not a module-level
        # global), so the function does not depend on notebook state.
        subtree = ID3(sub_data,originaldata,features,target_attribute_name,parent_node_class)
        print("\n ")

        # Attach the subtree under the branch for this value
        tree[best_feature][value] = subtree

    return tree

def predict(query,tree,default = 1):
    """Classify ``query`` by walking the nested-dict decision ``tree``.

    Parameters
    ----------
    query : dict
        Maps feature names to the feature values of the instance.
    tree : dict
        Tree of the form ``{feature: {value: subtree_or_label, ...}}``.
    default : optional, default 1
        Returned whenever the query cannot be routed to a leaf: the query's
        value for a node's feature has no branch, or none of the query's
        features appears at the current node.

    Returns
    -------
    The label stored at the reached leaf, or ``default``.
    """
    for key in query:
        if key in tree:
            try:
                result = tree[key][query[key]]
            except (KeyError, TypeError):
                # KeyError: feature value has no branch in the tree.
                # TypeError: unhashable query value or a malformed node.
                return default

            if isinstance(result,dict):
                # Pass default along so misses deeper in the tree honour the
                # caller's choice instead of silently reverting to 1.
                return predict(query,result,default)
            return result

    # No feature of the query matches this node: fall back explicitly rather
    # than returning None by falling off the end of the function.
    return default
  
def train_test_split(dataset, train_size=14):
    """Split ``dataset`` by row position into a training and a testing frame.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Data to split; row order is preserved, nothing is shuffled.
    train_size : int, default 14
        Number of leading rows that go into the training frame. The default
        keeps the split the notebook has always used.

    Returns
    -------
    (training_data, testing_data) : tuple of pandas.DataFrame
        The first ``train_size`` rows and the remaining rows. Both frames get a
        fresh 0..n-1 index so later positional/label lookups do not depend on
        the row labels of the source frame.
    """
    training_data = dataset.iloc[:train_size].reset_index(drop=True)
    testing_data = dataset.iloc[train_size:].reset_index(drop=True)
    return training_data,testing_data
  
# Split once and unpack the (training, testing) pair
training_data, testing_data = train_test_split(dataset)
  
def test(data,tree,target_attribute_name="PlayTennis"):
    """Print the percentage of rows in ``data`` that ``tree`` classifies correctly.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows to evaluate; must contain the target column.
    tree : dict
        Decision tree in the nested-dict form consumed by ``predict``.
    target_attribute_name : str, default "PlayTennis"
        Column holding the true class labels.

    Side effects
    ------------
    Prints one line with the accuracy in percent. Nothing is returned.
    """
    # One query dict per row, built from every column except the target
    queries = data.drop(columns=[target_attribute_name]).to_dict(orient = "records")

    # Queries the tree cannot route get 1.0 as their prediction.
    # NOTE(review): this counts as a miss only as long as 1.0 is not itself a
    # class label - confirm for other data sets.
    predictions = [predict(query,tree,1.0) for query in queries]

    # Compare by position rather than by row label, so the result does not
    # depend on ``data`` carrying a 0..n-1 index.
    actual = data[target_attribute_name].to_numpy(dtype=object)
    matches = np.asarray(predictions, dtype=object) == actual

    if len(data) == 0:
        # No rows: the accuracy is undefined; report NaN instead of dividing by zero
        accuracy = np.nan
    else:
        accuracy = (np.sum(matches)/len(data))*100
    print('\n The prediction accuracy is: ',accuracy,'%')


# Every column except the last one is offered to ID3 as a candidate feature
feature_columns = training_data.columns[:-1]
tree = ID3(data=training_data, originaldata=training_data, features=feature_columns)

print("\n\n\nThe final Resultant Decision Tree")
pprint(tree)

# Report the accuracy of the learned tree on the held-out rows
test(testing_data, tree)


Outlook ['Overcast' 'Rain' 'Sunny'] [4 5 5] 0.24674981977443933
Temperature ['Cool' 'Hot' 'Mild'] [4 4 6] 0.02922256565895487
humidity ['High' 'Normal'] [7 7] 0.15183550136234159
Wind ['Strong' 'Weak'] [6 8] 0.04812703040826949

 
Temperature ['Cool' 'Mild'] [2 3] 0.01997309402197489
humidity ['High' 'Normal'] [2 3] 0.01997309402197489
Wind ['Strong' 'Weak'] [2 3] 0.9709505944546686

 

 

 
Temperature ['Cool' 'Hot' 'Mild'] [1 2 2] 0.5709505944546686
humidity ['High' 'Normal'] [3 2] 0.9709505944546686
Wind ['Strong' 'Weak'] [2 3] 0.01997309402197489

 

 

 



The final Resultant Decision Tree
{'Outlook': {'Overcast': 'Yes',
             'Rain': {'Wind': {'Strong': 'No', 'Weak': 'Yes'}},
             'Sunny': {'humidity': {'High': 'No', 'Normal': 'Yes'}}}}

 The prediction accuracy is:  88.88888888888889 %
