In [10]:
# source https://www.python-course.eu/Decision_Trees.php

In [11]:
import pandas as pd
import numpy as np
from pprint import pprint
from sklearn.model_selection import train_test_split

In [12]:
def entropy(target_col):
    """
    Compute the Shannon entropy (log base 2) of a column of labels.

    target_col: the target column, i.e. the labels whose distribution is measured.

    Returns the entropy of the label distribution; an empty column yields 0.0.
    """
    # Only the frequencies matter for the entropy, not the label values themselves
    _, label_counts = np.unique(target_col, return_counts=True)
    probabilities = label_counts / np.sum(label_counts)
    return np.sum(-probabilities * np.log2(probabilities))

def InfoGain(data,split_attribute_name,target_name="class"):
    """
    Calculate the information gain obtained by splitting a dataset on one feature.
    This function takes three parameters:
    1. data = The dataset for whose feature the IG should be calculated
    2. split_attribute_name = the name of the feature for which the information gain should be calculated
    3. target_name = the name of the target feature. The default for this example is "class"

    Returns the entropy of the whole target column minus the weighted entropy of the
    target column inside each partition induced by the split feature.
    """
    #Entropy of the target before splitting
    total_entropy = entropy(data[target_name])

    #Distinct values of the split attribute and how many rows carry each of them
    vals, counts = np.unique(data[split_attribute_name], return_counts=True)
    total_count = np.sum(counts)

    #Weighted entropy of the partitions. Rows are selected with a boolean mask so that
    #only the split attribute decides partition membership; the previous where().dropna()
    #also discarded rows that had a missing value in any unrelated column, which made the
    #partition entropies inconsistent with the weights computed from `counts`.
    weighted_entropy = np.sum([
        (count / total_count) * entropy(data.loc[data[split_attribute_name] == val, target_name])
        for val, count in zip(vals, counts)
    ])

    #Information gain = entropy before the split - weighted entropy after the split
    return total_entropy - weighted_entropy



In [13]:
def ID3(data,originaldata,features,target_attribute_name="class",parent_node_class = None):
    """
    ID3 Algorithm: This function takes five parameters:
    1. data = the data for which the ID3 algorithm should be run --> In the first run this equals the total dataset

    2. originaldata = the original dataset, needed to calculate the mode target feature value of the original dataset
    in the case the dataset delivered by the first parameter is empty

    3. features = the feature space of the dataset. This is needed for the recursive call since during the tree growing
    process we have to remove features from our dataset --> Splitting at each node

    4. target_attribute_name = the name of the target attribute

    5. parent_node_class = the mode target feature value of the parent node. It is returned when there are no more
    features left in the feature space.

    Returns either a leaf (a single target value) or a nested dictionary of the form
    {feature_name: {feature_value: subtree_or_leaf, ...}}.
    """
    #Stopping criteria --> if one of these is satisfied, return a leaf node

    #If the dataset is empty, return the mode target feature value of the original dataset.
    #This has to be tested first: indexing np.unique(...)[0] on an empty column raises IndexError.
    if len(data) == 0:
        original_values, original_counts = np.unique(originaldata[target_attribute_name], return_counts=True)
        return original_values[np.argmax(original_counts)]

    target_values, target_counts = np.unique(data[target_attribute_name], return_counts=True)

    #If all target values are the same, return this value
    if len(target_values) <= 1:
        return target_values[0]

    #If the feature space is empty, return the mode target feature value of the direct parent node,
    #i.e. of the node that issued this call
    if len(features) == 0:
        return parent_node_class

    #None of the above holds true --> grow the tree

    #Default value for the children of this node: the mode target feature value of the current node
    parent_node_class = target_values[np.argmax(target_counts)]

    #Select the feature which best splits the dataset (np.argmax keeps the first one on ties)
    item_values = [InfoGain(data, feature, target_attribute_name) for feature in features]
    best_feature = features[np.argmax(item_values)]

    #The root of this (sub)tree is named after the feature with the maximum information gain
    tree = {best_feature: {}}

    #Remove the chosen feature from the feature space handed to the children
    remaining_features = [feature for feature in features if feature != best_feature]

    #Grow a branch under the root node for each value of the chosen feature present in the data
    for value in np.unique(data[best_feature]):
        #Boolean-mask selection keeps every row carrying this value; where().dropna() would also
        #discard rows that have a missing value in any other column
        sub_data = data[data[best_feature] == value]

        #Recurse with the caller-supplied originaldata rather than a notebook-level global
        tree[best_feature][value] = ID3(sub_data, originaldata, remaining_features,
                                        target_attribute_name, parent_node_class)

    return tree

    
def predict(query,tree,default):
    """
    Classify a single example by walking a decision tree.

    query   = dictionary mapping feature names to the feature values of the example
    tree    = nested dictionary of the form {feature: {value: subtree_or_leaf}}, or a bare leaf value
    default = value returned whenever the tree cannot be followed for this query

    Returns the leaf reached by following the query through the tree. `default` is returned
    when the query has no feature matching the current node, or when the query's value for
    that feature has no branch in the tree.
    """
    node = tree
    #Descend until a leaf (anything that is not a dictionary) is reached
    while isinstance(node, dict):
        #Find the first feature of the query that this node splits on
        for key in query:
            if key in node:
                break
        else:
            #No feature of the query matches this node --> fall back instead of returning None
            return default

        try:
            node = node[key][query[key]]
        except (KeyError, TypeError):
            #Unseen (or unhashable) feature value --> there is no branch to follow
            return default

    return node


In [14]:
def dt_get_pred_true(test_examples,tree,default):
    """
    Run the tree over every test example and collect predictions next to the true labels.

    test_examples = DataFrame whose last column holds the class and whose other columns are features
    tree          = decision tree handed to predict
    default       = fallback value handed to predict

    Returns the pair (y_pred, y_true), two lists with one entry per row of test_examples.
    """
    # Every column but the last one, one {column: value} dictionary per row
    feature_rows = test_examples.iloc[:, :-1].to_dict(orient='records')

    y_pred = [predict(feature_row, tree, default) for feature_row in feature_rows]
    # The last column carries the true class of each row
    y_true = [test_examples.iloc[row, -1] for row in range(len(test_examples))]

    return y_pred, y_true



In [15]:
# performance measure
def confusion_matrix_2(y_pred,y_true,pos,neg):
    """
    Tally a two-class confusion matrix.

    input:
        y_pred - predicted values
        y_true - true values
        pos    - label counted as the positive class
        neg    - label counted as the negative class

    output:
        a dictionary with the counts 'TP', 'FP', 'FN' and 'TN'.
        Pairs in which either value is neither pos nor neg are not counted.
    """
    tally = {'TP': 0 ,'FP' : 0,'FN': 0 ,'TN' : 0}

    for predicted, actual in zip(y_pred, y_true):
        if predicted == pos:
            # Predicted positive: right if the truth is positive, wrong if it is negative
            if actual == pos:
                tally['TP'] += 1
            elif actual == neg:
                tally['FP'] += 1
        elif predicted == neg:
            # Predicted negative: wrong if the truth is positive, right if it is negative
            if actual == pos:
                tally['FN'] += 1
            elif actual == neg:
                tally['TN'] += 1

    return tally
        

# get stat
def _percentage(numerator, denominator):
    """Return numerator/denominator as a percentage, or NaN when the denominator is zero."""
    if denominator == 0:
        return float('nan')
    return numerator / denominator * 100


def get_stat(confusion_matrix):
    """
    Print performance statistics derived from a two-class confusion matrix.

    input:
        confusion_matrix - dictionary with the counts 'TP', 'TN', 'FP' and 'FN'

    output:
        nothing is returned; accuracy, sensitivity, specificity, precision,
        false discovery rate and F1 score are printed as percentages.
        A statistic whose denominator is zero (e.g. precision when nothing was
        predicted positive) is printed as nan instead of raising ZeroDivisionError.
    """
    TP = confusion_matrix['TP']
    TN = confusion_matrix['TN']
    FP = confusion_matrix['FP']
    FN = confusion_matrix['FN']

    accuracy = _percentage(TP + TN, TP + TN + FP + FN)
    print(f'Accuracy\t\t{accuracy}%')

    sensitivity = _percentage(TP, TP + FN)
    print(f'Sensitivity\t\t{sensitivity}%')

    specificity = _percentage(TN, TN + FP)
    print(f'Specificity\t\t{specificity}%')

    precision = _percentage(TP, TP + FP)
    print(f'precision\t\t{precision}%')

    false_discovery_rate = _percentage(FP, FP + TP)
    print(f'False Discovery Rate\t{false_discovery_rate}%')

    f1_score = _percentage(2*TP, 2*TP + FP + FN)
    print(f'F1 score\t\t{f1_score}%')

    
def plurality_value(examples,target):
    """
        input:
                examples - the examples, indexable by column name
                target   - the name of the target column
        output:
                the most frequently occurring value of the target column
                (on a tie, the first value in np.unique's sorted order)
    """
    labels, frequencies = np.unique(examples[target], return_counts=True)
    return labels[frequencies.argmax()]

In [None]:
# Load the dataset; the last column is used as the target, every other column as a feature
dataset = pd.read_csv('data/telco.csv')

# Hold out 20% of the rows for testing; the fixed random_state keeps the split reproducible
train_examples, test_examples = train_test_split(dataset, test_size=0.2, random_state=1)

feature_columns = train_examples.columns[:-1]
target_column = train_examples.columns[-1]

print("****Start Training****")
tree = ID3(train_examples, train_examples, feature_columns, target_column)
print("****End Training****")

****Start Training****


In [None]:
# Fallback prediction for queries the tree cannot follow: the most frequent class of the training set
target_column = train_examples.columns[-1]
default = plurality_value(train_examples, target_column)

print("****Start Testing****")
y_pred, y_true = dt_get_pred_true(test_examples, tree, default)

# 'Yes' is treated as the positive class and 'No' as the negative class
confusion_matrix = confusion_matrix_2(y_pred, y_true, pos='Yes', neg='No')
print("****End of Testing****")

get_stat(confusion_matrix)