| NAME   |      SECTION      |  B.N. |
|----------|:-------------:|------:|
| Mohamed Khaled Galloul |  2 | 15 |

## Import Necessary Packages

In [1]:
import pandas as pd
import numpy as np

from collections import Counter

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, cross_val_score

In [2]:
df = pd.read_csv('cardio_train.csv', sep=';')
df.drop(columns=['id'], inplace=True)

df.head(10)

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,17474,1,156,56.0,100,60,1,1,0,0,0,0
5,21914,1,151,67.0,120,80,2,2,0,0,0,0
6,22113,1,157,93.0,130,80,3,1,0,0,1,0
7,22584,2,178,95.0,130,90,3,3,0,0,1,1
8,17668,1,158,71.0,110,70,1,1,0,0,1,0
9,19834,1,164,68.0,110,60,1,1,0,0,0,0


In [3]:
# Dividing columns into numerical and categorical columns
cat_columns_names = ['gender', 'cholesterol', 'gluc', 'smoke', 'alco', 'active', 'cardio']
num_columns_names = [column for column in df.columns if column not in cat_columns_names]

In [4]:
print(cat_columns_names, '\n', num_columns_names)

['gender', 'cholesterol', 'gluc', 'smoke', 'alco', 'active', 'cardio'] 
 ['age', 'height', 'weight', 'ap_hi', 'ap_lo']


## Preprocessing

### Replace numeric value with the corresponding category for the Categorical Columns

In [5]:
df.gender.replace({1: "female", 2: "male"}, inplace=True)
df.cholesterol.replace({1: "normal", 2: "above normal", 3: "well above normal"}, inplace=True)
df.gluc.replace({1: "normal", 2: "above normal", 3: "well above normal"}, inplace=True)
df.smoke.replace({0: "doesn't smoke", 1: "smokes"}, inplace=True)
df.alco.replace({0: "doesn't drink", 1: "drinks"}, inplace=True)
df.active.replace({0: "not active", 1: "active"}, inplace=True)
df.cardio.replace({0: "absence", 1: "presence"}, inplace=True)

In [6]:
df.sample(10)

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
67161,20332,female,161,90.0,150,100,normal,normal,doesn't smoke,doesn't drink,active,presence
49469,20634,female,159,67.0,140,90,normal,normal,doesn't smoke,doesn't drink,active,presence
13339,18319,female,173,108.0,145,90,well above normal,normal,doesn't smoke,doesn't drink,active,absence
49306,15516,male,170,68.0,140,80,above normal,normal,doesn't smoke,doesn't drink,active,presence
36470,18165,male,168,65.0,140,90,normal,normal,doesn't smoke,doesn't drink,active,presence
25360,19755,male,162,67.0,120,80,normal,normal,smokes,drinks,active,absence
183,23312,female,148,50.0,120,80,above normal,normal,doesn't smoke,doesn't drink,active,absence
6855,22916,male,160,80.0,120,80,normal,normal,doesn't smoke,doesn't drink,active,presence
69951,15254,female,158,55.0,110,70,normal,normal,doesn't smoke,doesn't drink,active,absence
10262,14714,male,175,92.0,140,100,normal,normal,doesn't smoke,doesn't drink,active,presence


### Discretizing continuous (numeric) columns into bins  
> By using a supervised method typically use target information in order to create bins or intervals, 
we would try different depths and select the more convenient depth based on the highest **roc-auc** score.

#### *First:* Let's select the optimal depth using only one numerical column for simplicity 
> Using the height column would do it.

In [7]:
depths = [i for i in range(1,6,1)]
scores_mean = [] 
scores_std = [] 

for tree_depth in depths:
    tree_model = DecisionTreeClassifier(max_depth=tree_depth)
    
    scores = cross_val_score(tree_model, df['height'].values.reshape(-1, 1),
                                         df['cardio'].values.reshape(-1, 1), cv=3, scoring='roc_auc')   
    
    scores_mean.append(np.mean(scores))
    
    scores_std.append(np.std(scores))
    
depths_results = pd.concat([pd.Series(depths), pd.Series(scores_mean), pd.Series(scores_std)], axis=1)
depths_results.columns = ['depth', 'roc_auc_mean', 'roc_auc_std']

depths_results

Unnamed: 0,depth,roc_auc_mean,roc_auc_std
0,1,0.506663,0.002139
1,2,0.508401,0.002032
2,3,0.508889,0.000847
3,4,0.509117,0.000847
4,5,0.510573,0.002736


> So as the depth of the tree the higher roc_auc score which is quite expected moreover, deeper depth increases overfitting so depth=3 would be enough

#### *Second :* Apply discretization to all numerical columns using a decision tree of depth 3 to set the bins


In [8]:
for column in num_columns_names:
    tree_model = DecisionTreeClassifier(max_depth=3)
    tree_model.fit(df[column].values.reshape(-1, 1), df['cardio'].values.reshape(-1, 1))
    
    df[column + '_DIS'] = tree_model.predict_proba(df[column].values.reshape(-1, 1))[:,1]

In [9]:
df

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,age_DIS,height_DIS,weight_DIS,ap_hi_DIS,ap_lo_DIS
0,18393,male,168,62.0,110,80,normal,normal,doesn't smoke,doesn't drink,active,absence,0.423170,0.489104,0.402208,0.233271,0.424469
1,20228,female,156,85.0,140,90,well above normal,normal,doesn't smoke,doesn't drink,active,presence,0.553199,0.521270,0.596821,0.815514,0.749846
2,18857,female,165,64.0,130,70,well above normal,normal,doesn't smoke,doesn't drink,not active,presence,0.482771,0.489104,0.451893,0.596359,0.287170
3,17623,male,169,82.0,150,100,normal,normal,doesn't smoke,doesn't drink,active,presence,0.423170,0.489104,0.554014,0.861431,0.837689
4,17474,female,156,56.0,100,60,normal,normal,doesn't smoke,doesn't drink,not active,absence,0.423170,0.521270,0.402208,0.233271,0.250476
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69995,19240,male,168,76.0,120,80,normal,normal,smokes,doesn't drink,active,absence,0.482771,0.489104,0.508247,0.355663,0.424469
69996,22601,female,158,126.0,140,90,above normal,above normal,doesn't smoke,doesn't drink,active,presence,0.676122,0.498480,0.706201,0.815514,0.749846
69997,19066,male,183,105.0,180,90,well above normal,normal,doesn't smoke,drinks,not active,presence,0.482771,0.499635,0.655925,0.861431,0.749846
69998,22431,female,163,72.0,135,80,normal,above normal,doesn't smoke,doesn't drink,not active,presence,0.676122,0.498480,0.508247,0.674510,0.424469


In [10]:
df.age_DIS.value_counts()

0.553199    14615
0.423170    14467
0.482771    14075
0.600177     6793
0.332477     6614
0.720032     4947
0.676122     4900
0.230148     3589
Name: age_DIS, dtype: int64

> Since we choose to discretize each column using a decision tree of a depth of 3 we would get 8 bins for each numerical column which is (2^3).
For more details on the discretization method used here, check the following [POST](https://towardsdatascience.com/discretisation-using-decision-trees-21910483fa4b)

### Split data into train and test sets

In [11]:
#Create a second dataframe without the old numerical columns
df2 = pd.DataFrame([])

for column in df.columns:
    if column in num_columns_names:
        pass
    else:
        df2[column] = df[column]

df2.head()

Unnamed: 0,gender,cholesterol,gluc,smoke,alco,active,cardio,age_DIS,height_DIS,weight_DIS,ap_hi_DIS,ap_lo_DIS
0,male,normal,normal,doesn't smoke,doesn't drink,active,absence,0.42317,0.489104,0.402208,0.233271,0.424469
1,female,well above normal,normal,doesn't smoke,doesn't drink,active,presence,0.553199,0.52127,0.596821,0.815514,0.749846
2,female,well above normal,normal,doesn't smoke,doesn't drink,not active,presence,0.482771,0.489104,0.451893,0.596359,0.28717
3,male,normal,normal,doesn't smoke,doesn't drink,active,presence,0.42317,0.489104,0.554014,0.861431,0.837689
4,female,normal,normal,doesn't smoke,doesn't drink,not active,absence,0.42317,0.52127,0.402208,0.233271,0.250476


In [12]:
# Get all columns names
columns_names = df2.columns

# Rearrange columns names to make `cardio` the last column
rearranged_columns_names = np.array(Counter(columns_names) - Counter(['cardio']) + Counter(['cardio']))

df2 = df2[rearranged_columns_names]
df2.sample(10)

Unnamed: 0,gender,cholesterol,gluc,smoke,alco,active,age_DIS,height_DIS,weight_DIS,ap_hi_DIS,ap_lo_DIS,cardio
44355,male,normal,normal,smokes,doesn't drink,active,0.553199,0.524045,0.554014,0.355663,0.424469,absence
53925,female,normal,normal,doesn't smoke,doesn't drink,active,0.332477,0.489104,0.451893,0.355663,0.424469,absence
21795,female,normal,normal,doesn't smoke,doesn't drink,active,0.720032,0.52127,0.655925,0.355663,0.424469,presence
17731,female,above normal,normal,doesn't smoke,doesn't drink,active,0.482771,0.49848,0.655925,0.355663,0.424469,presence
67819,female,above normal,normal,doesn't smoke,doesn't drink,active,0.42317,0.52127,0.402208,0.355663,0.424469,absence
29126,male,well above normal,normal,smokes,drinks,active,0.230148,0.524045,0.655925,0.596359,0.424469,presence
29105,female,normal,normal,doesn't smoke,doesn't drink,active,0.553199,0.49848,0.596821,0.355663,0.837689,presence
40139,male,normal,above normal,doesn't smoke,doesn't drink,not active,0.42317,0.49848,0.451893,0.355663,0.28717,absence
26719,female,normal,normal,doesn't smoke,doesn't drink,active,0.42317,0.49848,0.451893,0.355663,0.424469,presence
59440,female,well above normal,normal,doesn't smoke,doesn't drink,active,0.600177,0.538807,0.554014,0.355663,0.424469,presence


In [13]:
# Now split into train and test sets
train_data, test_data = train_test_split(np.array(df2), test_size=0.1, shuffle=True)
train_data.shape

(63000, 12)

## Utility Functions

In [14]:
def is_numeric(value):
    '''
    Checks whether the value is int or float
    
    Parameters:
        value : The value to be checked
        
    Returns:
        True if the value is `int` or `float`, False otherwise. 
    '''
    return isinstance(value, int) or isinstance(value, float)


class Question():
    '''
    Asks a question based on the feature column ID and a feature value
    
    Attributes:
        column_id (int) : The ID for the feature column.
        value : The value used to compare the values of the feature with.
        
    Methods:
    match(self, example)
        compare between the given value and the value from the example
        
        Parameters:
            example (1D array): The array of features of a sample.
        
        Returns:
            if the example value is numeric: True when the given value is greater than or equal the value from the example
            else the example value is categorical : True when the given value is equals the value from the example

    __repr__(self)
        Repreasenting the class by returning the question asked
            
            Returns:
                The question asked based on type of the given value when the class was initialized.
            
    
    '''
    def __init__(self, column_id, value):
        self.column_id = column_id
        self.value = value
        
    def match(self, example):
        val = example[self.column_id]
        if is_numeric(val):
            return val >= self.value
        else:
            return val == self.value      
        
    def __repr__(self):
        condition = '==';
        if is_numeric(self.value):
            condition = '>=';
        return "Is %s %s %s" % (columns_names[self.column_id], condition, self.value)
        
        
def split(rows, question):
    '''
    Splits the data to true rows and false rows based on the passed question.
    
    Parameters:
        rows (2D array or Pandas DataFrame): The data to be splitted
        question (class): The question to split the data based on.
        
    Returns:
        true_rows(2D array or Pandas DataFrame) : The true rows based on the asked question.
        false_rows(2D array or Pandas DataFrame) : The false rows based on asked the asked question.
        
    '''
    
    true_rows, false_rows = [], [];
    
    for i, row in enumerate(rows):
        if question.match(row):
            true_rows.append(row)
        else:
            false_rows.append(row)
    
    return true_rows, false_rows




def class_counts(rows):
    '''
    Counts the total number of samples for each class in a given data.
    
    Parameters:
        rows (2D array or Pandas DataFrame): The data to be counted.
        
    Returns:
        counts_dict(dict) : A Dictionay contains every unique class as a key and its total count as the value.
        
    '''
    
    
    counts_dict = {}
    
    for row in rows:
        label = row[-1] #get the label as its the last column
        
        if label not in counts_dict: # if label is not seen before
            counts_dict[label] = 1
        else: 
            counts_dict[label] += 1
    
    return counts_dict


def calculate_impurity(rows, criterion = 'gini'):
    '''
   Calculates the impurity for a given data.
    
    Parameters:
        rows (2D array or Pandas DataFrame): The data to be counted.
        criterion(str, default = 'gini'): The criterion to calculate the impurity based. 
                                           Either 'gini' or 'entropy'.
        
    Returns:
        impurity(float) : The impurity calculated for the given data based on the selected criterion.
        
    '''
    
    # get the count for each class
    classes_count = class_counts(rows)

    # calculate the probability for the two classes 'presence' and 'absence' 
    prob_class_list = []
    for class_, count in classes_count.items():
        prob_class_list.append(classes_count[class_] / sum(classes_count.values()))
    
    
    if criterion == 'gini':
        impurity = 1
        for prob_class in prob_class_list:
            impurity -= prob_class ** 2
    
    elif criterion == 'entropy':
        impurity = 0
        for prob_class in prob_class_list:
            impurity += -1 * (prob_class * np.log2(prob_class))
    
    return impurity



def information_gain(left_child_branch, right_child_branch, parent_impurity, criterion = 'gini'):
    '''
   Calculates the information gain for a parent node.
    
    Parameters:
        left_child_branch (2D array or Pandas DataFrame): The false(left) branch of the parent node.
        true_child_branch (2D array or Pandas DataFrame): The true(right) branch of the parent node.
        parent_impurity(float) : The impurity calculated for the parent node.
        criterion(str, default = 'gini'): The criterion to calculate the impurity for both branches based on.
                                           Either 'gini' or 'entropy'.
        
    Returns:
        IG(float) : The information gain at a given parent node based on the selected criterion.
        
    ''' 
    
    # calculate the probability for the left branch (p) so the probability for the right branch is going to be (1-p)
    sum_ = float(len(left_child_branch) + len(right_child_branch))
    p = len(left_child_branch) / sum_

    impurity_left_child = calculate_impurity(left_child_branch, criterion = criterion)
    impurity_right_child = calculate_impurity(right_child_branch, criterion = criterion)

    IG = parent_impurity - p * impurity_left_child - (1-p) * impurity_right_child
    
    return IG


def find_best_split(rows, criterion = 'gini'):
    '''
   Splits the given data (features) based on the optimal question which is the one with the least impurity in other words,
   the one with the highest information gain. 
    
    Parameters:
        rows (2D array or Pandas DataFrame): The data to be splitted.
        criterion(str, default = 'gini'): The criterion to calculate the impurity hence, the information gain.
                                           Either 'gini' or 'entropy'.
        
    Returns:
        best_gain(float) : The highest information gain obtained from the data.
        best_question(str) : The optimal question to split the data based on.
    ''' 
    
    
    best_gain = 0  
    best_question = None  
    current_impurity = calculate_impurity(rows, criterion=criterion)
    n_features = len(columns_names) - 1  # number of features without the target column

    for column in range(n_features):  

        values = set([row[column] for row in rows])  

        for value in values: 

            question = Question(column, value)

            # try splitting the dataset
            true_rows, false_rows = split(rows, question)

            # Skip this split if it doesn't divide the dataset.
            if len(true_rows) == 0 or len(false_rows) == 0:
                continue

            # Calculate the information gain from this split
            IG = information_gain(true_rows, false_rows, current_impurity)

            # Save the IG if it's higher than the best gain so far and the corresponding question
            if IG > best_gain:
                best_gain, best_question = IG, question

    return best_gain, best_question


In [15]:
# Creating a test question with the gender column
q =Question(0, "female")
q

Is gender == female

In [16]:
example = train_data[1]
q.match(example) 

True

In [17]:
# test splitting of data if the gender is `female` or not.
true_rows, false_rows = split(train_data, Question(0, "female"))
true_rows[:3]

[array(['female', 'normal', 'normal', "doesn't smoke", "doesn't drink",
        'not active', 0.5531987683886418, 0.4891036321226258,
        0.45189332547517314, 0.35566268527845735, 0.42446871227646615,
        'absence'], dtype=object),
 array(['female', 'normal', 'normal', "doesn't smoke", "doesn't drink",
        'active', 0.7200323428340408, 0.4891036321226258,
        0.45189332547517314, 0.35566268527845735, 0.42446871227646615,
        'absence'], dtype=object),
 array(['female', 'well above normal', 'well above normal',
        "doesn't smoke", "doesn't drink", 'active', 0.7200323428340408,
        0.5212695613841746, 0.5968211327512872, 0.8614306630412047,
        0.8376891334250344, 'presence'], dtype=object)]

In [18]:
# test splitting of data if the ap_lo_DIS is greater than or equal to `0.287170` .
true_rows, false_rows = split(train_data, Question(10, 0.287170))
true_rows[:3]

[array(['male', 'normal', 'well above normal', "doesn't smoke",
        "doesn't drink", 'active', 0.5531987683886418, 0.4891036321226258,
        0.45189332547517314, 0.5963587921847247, 0.2871700310559006,
        'absence'], dtype=object),
 array(['female', 'normal', 'normal', "doesn't smoke", "doesn't drink",
        'not active', 0.5531987683886418, 0.4891036321226258,
        0.45189332547517314, 0.35566268527845735, 0.42446871227646615,
        'absence'], dtype=object),
 array(['female', 'normal', 'normal', "doesn't smoke", "doesn't drink",
        'active', 0.7200323428340408, 0.4891036321226258,
        0.45189332547517314, 0.35566268527845735, 0.42446871227646615,
        'absence'], dtype=object)]

In [19]:
calculate_impurity(train_data, criterion = 'entropy')

0.9999990578316049

In [20]:
calculate_impurity(train_data, criterion = 'gini')

0.49999934693877557

In [21]:
# Calculate the IG for the train data if we used `gini` as our criterion for the gender parent node.
root_impurity = calculate_impurity(train_data, criterion='gini')
true_rows, false_rows = split(train_data, Question(0, 'male'))
information_gain(true_rows, false_rows, root_impurity, criterion='gini')

2.6130886971953426e-05

In [22]:
# Calculate the IG for the train data if we used `entropy` as our criterion for the gender parent node.
root_impurity = calculate_impurity(train_data, criterion='entropy')
true_rows, false_rows = split(train_data, Question(0, 'male'))
information_gain(true_rows, false_rows, root_impurity, criterion='entropy')

3.7699278389591306e-05

In [23]:
# The root node to start our tree with since it has the highest gain.
best_gain, best_question = find_best_split(train_data, criterion='gini')
best_question

Is weight_DIS >= 0.5963587921847247

In [24]:
best_gain

0.09410482130799064

# Building the tree

In [25]:
class Leaf:
    '''
    This class is to represent the leaf nodes since we can't split the tree further.
    
    Attributes:
        predictions (2D array or Pandas DataFrame): The classes counts at a given leaf node.
    '''
    def __init__(self, rows):
        self.predictions = class_counts(rows)
        

class Decision_Node:
    """
    This class is to represent the internal(decision) nodes.
    
    Attributes:
        question (class): The question used to split the data in the internal node to left and right branches.
        true_branch (2D array or Pandas DataFrame): The right branch that achieved the question asked at the internal node.
        false_branch (2D array or Pandas DataFrame): The left branch that didn't achieve the question asked at the internal node.
    """

    def __init__(self, question, true_branch, false_branch):
        
        self.question = question
        self.true_branch = true_branch
        self.false_branch = false_branch
        
        
def build_tree(rows, criterion = 'gini'):
    """
    Builds the Binary Dicision Tree by recursion.
    
    Parameters:
        rows (2D array or Pandas DataFrame): The data to be splitted to build the tree.
        criterion(str, default = 'gini'): The criterion to calculate the impurity hence, the information gain.
                                           Either 'gini' or 'entropy'.    
    
    """

    # Splits the data based on the highest gain among all possibilities and get the corresponding question
    # in the next call we will choose the second highest gain and its question and so on.
    gain, question = find_best_split(rows, criterion=criterion)

    # Stop when the gain is zero so we reached a leaf node.
    if gain == 0:
        return Leaf(rows)

    # Split the data based on the current best quetion.
    true_rows, false_rows = split(rows, question)

    # Recursively build the true branch.
    true_branch = build_tree(true_rows)

    # Recursively build the false branch.
    false_branch = build_tree(false_rows)

    # Return a Decision(internal) node and the corresponding question, left_branch and right_branch corrosponding to it.
    return Decision_Node(question, true_branch, false_branch)



    
def print_tree(node, spacing=""):
    """Prints out the created Dicision Tree and its branches."""

    # Base case: we've reached a leaf
    if isinstance(node, Leaf):
        print (spacing + "Predict", node.predictions)
        return

    # Print the question at this node
    print (spacing + str(node.question))

    # Call this function recursively on the true branch
    print (spacing + '--> True:')
    print_tree(node.true_branch, spacing + "  ")

    # Call this function recursively on the false branch
    print (spacing + '--> False:')
    print_tree(node.false_branch, spacing + "  ")


    
    
def classify(row, node):
    """
    Predicts the target by passing the sample of data at each node recursively.
    
    Arguments:
        row (1D array) : The sample that is being classified.
        node (class) : The node at which we split the data on.
        
    Returns:
        node.predictions (dict): The classes counts at all leaf nodes.
    """

    # we have reached a dead end which is a Leaf node.
    if isinstance(node, Leaf):
        return node.predictions

    # Decide whether to follow the true-branch or the false-branch.
    # Compare the feature / value stored in the node,
    # to the example we're considering.
    if node.question.match(row):
        return classify(row, node.true_branch)
    else:
        return classify(row, node.false_branch)



In [26]:
def print_leaf(counts):
    """
    A nicer way to print the predictions at a leaf node by normalizing the counts.
    
    Parameters:
        counts(dict): the total counts of each class at the leaf nodes for a given sample.
    
    """
    total = sum(counts.values()) * 1.0
    probs = {}
    for lbl in counts.keys():
        probs[lbl] = counts[lbl] / total
    return probs

In [28]:
gini_tree = build_tree(train_data)
print_tree(gini_tree)

Is weight_DIS >= 0.5963587921847247
--> True:
  Is weight_DIS >= 0.8155142654076556
  --> True:
    Is weight_DIS >= 0.8614306630412047
    --> True:
      Is ap_hi_DIS >= 0.2871700310559006
      --> True:
        Is gluc == normal
        --> True:
          Is age_DIS >= 0.5083932853717026
          --> True:
            Is cardio >= 0.33247656486241306
            --> True:
              Is smoke == doesn't smoke
              --> True:
                Is height_DIS >= 0.554014360313316
                --> True:
                  Is cardio >= 0.4231699730420958
                  --> True:
                    Is height_DIS >= 0.6559249786871271
                    --> True:
                      Is cholesterol == well above normal
                      --> True:
                        Is cardio >= 0.6001766524363316
                        --> True:
                          Is age_DIS >= 0.5240452616690241
                          --> True:
                            Predict {'p

                                    Is cardio >= 0.7200323428340408
                                    --> True:
                                      Is cholesterol == well above normal
                                      --> True:
                                        Predict {'presence': 1}
                                      --> False:
                                        Predict {'absence': 2, 'presence': 2}
                                    --> False:
                                      Is height_DIS >= 0.5082472637582858
                                      --> True:
                                        Is smoke == doesn't smoke
                                        --> True:
                                          Predict {'absence': 1, 'presence': 1}
                                        --> False:
                                          Predict {'presence': 1}
                                      --> False:
                                        Pr

                                  Predict {'presence': 7}
                              --> False:
                                Is ap_hi_DIS >= 0.8376891334250344
                                --> True:
                                  Predict {'absence': 1}
                                --> False:
                                  Predict {'presence': 1}
                      --> False:
                        Is ap_hi_DIS >= 0.8376891334250344
                        --> True:
                          Predict {'presence': 6}
                        --> False:
                          Is cardio >= 0.6761224489795918
                          --> True:
                            Predict {'absence': 2}
                          --> False:
                            Is gender == female
                            --> True:
                              Predict {'presence': 1}
                            --> False:
                              Is smoke == smokes
             

                                    Predict {'presence': 4}
                                  --> False:
                                    Is gender == female
                                    --> True:
                                      Predict {'absence': 1}
                                    --> False:
                                      Predict {'presence': 1}
                              --> False:
                                Predict {'presence': 5}
                            --> False:
                              Is height_DIS >= 0.554014360313316
                              --> True:
                                Is cardio >= 0.4231699730420958
                                --> True:
                                  Is height_DIS >= 0.6559249786871271
                                  --> True:
                                    Is ap_hi_DIS >= 0.8376891334250344
                                    --> True:
                                      Predict

                          Is smoke == doesn't smoke
                          --> True:
                            Is cardio >= 0.4827708703374778
                            --> True:
                              Is cardio >= 0.5531987683886418
                              --> True:
                                Is ap_hi_DIS >= 0.8376891334250344
                                --> True:
                                  Predict {'presence': 6}
                                --> False:
                                  Is gender == female
                                  --> True:
                                    Is ap_hi_DIS >= 0.7498457953533
                                    --> True:
                                      Is active == not active
                                      --> True:
                                        Is height_DIS >= 0.554014360313316
                                        --> True:
                                          Is height_DIS

                                    --> False:
                                      Is cardio >= 0.4231699730420958
                                      --> True:
                                        Is gender == female
                                        --> True:
                                          Predict {'presence': 6, 'absence': 1}
                                        --> False:
                                          Predict {'absence': 1}
                                      --> False:
                                        Predict {'absence': 2}
                                --> False:
                                  Is cardio >= 0.4827708703374778
                                  --> True:
                                    Is ap_hi_DIS >= 0.8376891334250344
                                    --> True:
                                      Predict {'presence': 1}
                                    --> False:
                                      

                          Is gluc == normal
                          --> True:
                            Is cardio >= 0.6001766524363316
                            --> True:
                              Is smoke == smokes
                              --> True:
                                Predict {'absence': 1}
                              --> False:
                                Is ap_hi_DIS >= 0.42446871227646615
                                --> True:
                                  Is height_DIS >= 0.554014360313316
                                  --> True:
                                    Is age_DIS >= 0.4996345029239766
                                    --> True:
                                      Predict {'presence': 1}
                                    --> False:
                                      Predict {'absence': 1}
                                  --> False:
                                    Predict {'presence': 2}
                        

                                                        Predict {'absence': 1}
                                                      --> False:
                                                        Is height_DIS >= 0.6559249786871271
                                                        --> True:
                                                          Predict {'presence': 1}
                                                        --> False:
                                                          Predict {'presence': 1, 'absence': 1}
                                            --> False:
                                              Predict {'presence': 4}
                                          --> False:
                                            Is cardio >= 0.4827708703374778
                                            --> True:
                                              Predict {'presence': 10}
                                            --> False:
                   

                                      Predict {'presence': 4}
                                  --> False:
                                    Predict {'absence': 1, 'presence': 1}
                                --> False:
                                  Is gender == female
                                  --> True:
                                    Predict {'absence': 4}
                                  --> False:
                                    Predict {'presence': 1}
                          --> False:
                            Is gender == female
                            --> True:
                              Is ap_hi_DIS >= 0.42446871227646615
                              --> True:
                                Is height_DIS >= 0.554014360313316
                                --> True:
                                  Is cardio >= 0.4827708703374778
                                  --> True:
                                    Predict {'absence': 1}
       

                              Predict {'absence': 1}
                            --> False:
                              Is gender == female
                              --> True:
                                Predict {'presence': 2}
                              --> False:
                                Is cholesterol == normal
                                --> True:
                                  Predict {'absence': 1, 'presence': 1}
                                --> False:
                                  Predict {'presence': 1}
                        --> False:
                          Is height_DIS >= 0.554014360313316
                          --> True:
                            Predict {'absence': 4}
                          --> False:
                            Is gender == male
                            --> True:
                              Predict {'presence': 1, 'absence': 1}
                            --> False:
                              Predict 

                                        --> True:
                                          Predict {'presence': 3}
                                        --> False:
                                          Is active == not active
                                          --> True:
                                            Is height_DIS >= 0.554014360313316
                                            --> True:
                                              Predict {'absence': 2}
                                            --> False:
                                              Is gender == female
                                              --> True:
                                                Is height_DIS >= 0.5082472637582858
                                                --> True:
                                                  Predict {'absence': 1, 'presence': 2}
                                                --> False:
                                               

                                      --> False:
                                        Predict {'absence': 1, 'presence': 5}
                                    --> False:
                                      Predict {'presence': 2}
                                  --> False:
                                    Predict {'absence': 1}
                                --> False:
                                  Predict {'presence': 4}
                              --> False:
                                Predict {'absence': 1}
                    --> False:
                      Is ap_hi_DIS >= 0.6395939086294417
                      --> True:
                        Is cardio >= 0.4231699730420958
                        --> True:
                          Predict {'presence': 7}
                        --> False:
                          Predict {'absence': 1}
                      --> False:
                        Is height_DIS >= 0.40220787335971675
                        -

                  --> True:
                    Predict {'absence': 11}
                  --> False:
                    Is gender == female
                    --> True:
                      Predict {'absence': 3}
                    --> False:
                      Is smoke == smokes
                      --> True:
                        Is active == active
                        --> True:
                          Predict {'presence': 1}
                        --> False:
                          Predict {'absence': 1}
                      --> False:
                        Predict {'absence': 1}
          --> False:
            Is gender == female
            --> True:
              Is height_DIS >= 0.6559249786871271
              --> True:
                Is age_DIS >= 0.5388073828679603
                --> True:
                  Is active == active
                  --> True:
                    Predict {'absence': 1}
                  --> False:
                    Predic

                              --> False:
                                Predict {'absence': 1}
                        --> False:
                          Is weight_DIS >= 0.5
                          --> True:
                            Predict {'presence': 2}
                          --> False:
                            Is height_DIS >= 0.554014360313316
                            --> True:
                              Is smoke == smokes
                              --> True:
                                Predict {'presence': 6}
                              --> False:
                                Is ap_hi_DIS >= 0.5413793103448276
                                --> True:
                                  Predict {'presence': 3}
                                --> False:
                                  Is cholesterol == above normal
                                  --> True:
                                    Is height_DIS >= 0.6559249786871271
                   

                                          --> False:
                                            Is ap_hi_DIS >= 0.7498457953533
                                            --> True:
                                              Predict {'presence': 1}
                                            --> False:
                                              Is ap_hi_DIS >= 0.42446871227646615
                                              --> True:
                                                Predict {'absence': 5, 'presence': 1}
                                              --> False:
                                                Predict {'presence': 3, 'absence': 3}
                                      --> False:
                                        Is height_DIS >= 0.45189332547517314
                                        --> True:
                                          Predict {'absence': 5}
                                        --> False:
                                 

                                    --> False:
                                      Is age_DIS >= 0.5083932853717026
                                      --> True:
                                        Is height_DIS >= 0.554014360313316
                                        --> True:
                                          Predict {'presence': 1, 'absence': 1}
                                        --> False:
                                          Predict {'absence': 5, 'presence': 1}
                                      --> False:
                                        Is height_DIS >= 0.554014360313316
                                        --> True:
                                          Is gender == female
                                          --> True:
                                            Is gluc == normal
                                            --> True:
                                              Predict {'absence': 5, 'presence': 1}
          

                              Predict {'presence': 1}
                            --> False:
                              Is height_DIS >= 0.45189332547517314
                              --> True:
                                Is age_DIS >= 0.4984798471160528
                                --> True:
                                  Is age_DIS >= 0.5212695613841746
                                  --> True:
                                    Predict {'absence': 1}
                                  --> False:
                                    Predict {'presence': 1, 'absence': 2}
                                --> False:
                                  Predict {'absence': 2}
                              --> False:
                                Predict {'absence': 4}
                          --> False:
                            Is age_DIS >= 0.5388073828679603
                            --> True:
                              Is height_DIS >= 0.40220787335971675
     

                                  Is gluc == above normal
                                  --> True:
                                    Predict {'presence': 1}
                                  --> False:
                                    Is alco == drinks
                                    --> True:
                                      Predict {'presence': 1}
                                    --> False:
                                      Predict {'presence': 4, 'absence': 4}
                                --> False:
                                  Predict {'absence': 2}
                            --> False:
                              Is gender == female
                              --> True:
                                Predict {'absence': 8}
                              --> False:
                                Is ap_hi_DIS >= 0.2871700310559006
                                --> True:
                                  Is smoke == smokes
                     

                    --> False:
                      Predict {'absence': 2}
                  --> False:
                    Predict {'absence': 4}
        --> False:
          Is age_DIS >= 0.4984798471160528
          --> True:
            Is active == active
            --> True:
              Is cardio >= 0.33247656486241306
              --> True:
                Is height_DIS >= 0.554014360313316
                --> True:
                  Is age_DIS >= 0.5083932853717026
                  --> True:
                    Is cardio >= 0.4827708703374778
                    --> True:
                      Is ap_hi_DIS >= 0.2871700310559006
                      --> True:
                        Predict {'absence': 2}
                      --> False:
                        Predict {'presence': 1}
                    --> False:
                      Predict {'presence': 2}
                  --> False:
                    Is weight_DIS >= 0.35566268527845735
                    --> Tru

                                          --> True:
                                            Is height_DIS >= 0.5082472637582858
                                            --> True:
                                              Predict {'absence': 2, 'presence': 1}
                                            --> False:
                                              Predict {'absence': 1}
                                          --> False:
                                            Is height_DIS >= 0.5082472637582858
                                            --> True:
                                              Predict {'absence': 3, 'presence': 4}
                                            --> False:
                                              Predict {'presence': 7, 'absence': 9}
                                      --> False:
                                        Is age_DIS >= 0.4996345029239766
                                        --> True:
                        

                                  Is height_DIS >= 0.5968211327512872
                                  --> True:
                                    Predict {'absence': 3, 'presence': 1}
                                  --> False:
                                    Predict {'absence': 1}
                                --> False:
                                  Predict {'presence': 1, 'absence': 2}
                            --> False:
                              Is ap_hi_DIS >= 0.2871700310559006
                              --> True:
                                Predict {'presence': 3}
                              --> False:
                                Predict {'absence': 1}
                        --> False:
                          Is height_DIS >= 0.6559249786871271
                          --> True:
                            Is gluc == above normal
                            --> True:
                              Predict {'absence': 1}
                     

                                          Predict {'absence': 4, 'presence': 1}
                                      --> False:
                                        Predict {'absence': 6}
                                    --> False:
                                      Is age_DIS >= 0.5083932853717026
                                      --> True:
                                        Is height_DIS >= 0.5082472637582858
                                        --> True:
                                          Is age_DIS >= 0.5212695613841746
                                          --> True:
                                            Is gluc == above normal
                                            --> True:
                                              Predict {'absence': 1}
                                            --> False:
                                              Is ap_hi_DIS >= 0.42446871227646615
                                              --> True:
     

                                        Predict {'absence': 1}
                                  --> False:
                                    Predict {'absence': 15}
                                --> False:
                                  Is gender == female
                                  --> True:
                                    Is active == active
                                    --> True:
                                      Predict {'absence': 2, 'presence': 2}
                                    --> False:
                                      Predict {'absence': 2}
                                  --> False:
                                    Predict {'absence': 2}
                        --> False:
                          Predict {'absence': 11}
                  --> False:
                    Is active == not active
                    --> True:
                      Is ap_hi_DIS >= 0.7498457953533
                      --> True:
                        Pre

                                    Predict {'absence': 1}
                                --> False:
                                  Predict {'absence': 9}
                              --> False:
                                Is active == active
                                --> True:
                                  Predict {'absence': 24, 'presence': 5}
                                --> False:
                                  Predict {'absence': 2}
                            --> False:
                              Predict {'absence': 10}
                          --> False:
                            Is active == active
                            --> True:
                              Is age_DIS >= 0.4984798471160528
                              --> True:
                                Is age_DIS >= 0.5212695613841746
                                --> True:
                                  Predict {'absence': 4, 'presence': 1}
                                -->

                                        Predict {'absence': 2}
                                      --> False:
                                        Is gluc == well above normal
                                        --> True:
                                          Predict {'absence': 1}
                                        --> False:
                                          Is ap_hi_DIS >= 0.42446871227646615
                                          --> True:
                                            Is height_DIS >= 0.45189332547517314
                                            --> True:
                                              Predict {'absence': 9, 'presence': 1}
                                            --> False:
                                              Predict {'absence': 2}
                                          --> False:
                                            Is height_DIS >= 0.45189332547517314
                                            --

In [29]:
print(classify(train_data[10], gini_tree))
print(print_leaf(classify(train_data[10], gini_tree)))

{'absence': 255, 'presence': 144}
{'absence': 0.6390977443609023, 'presence': 0.3609022556390977}


In [30]:
def accuracy(data, model):
    '''
    Calculates the predicted targets and the overall accuracy.
    
    Parameters:
        data (2D array or Pandas DataFrame): The data to be predicted.
        model (class): The decision tree model to generate predictions.
        
    Returns:
        predicted_target (1D array, list): The predicted labels for the given data passed to the given model.
        accuracy (float): The accuracy of the given model based on the true targets and predicted ones.
    '''
    
    matched_count =0
    predicted_target = []

    for i, row in enumerate(data):
        probs = print_leaf(classify(row, model))
        
        
        check_presence_exists = probs.get('presence', 0)
        
        if check_presence_exists >= 0.5:
            predicted_target.append('presence')

        elif check_presence_exists == 0:
            predicted_target.append('absence')

        else:
            predicted_target.append('absence')
            
        if predicted_target[i] == row[-1]:
            matched_count += 1

    accuracy = matched_count / len(data)

    return predicted_target, accuracy

In [31]:
predicted_target, train_accuracy = accuracy(train_data, gini_tree)

In [32]:
train_accuracy

0.8388730158730159

In [33]:
predicted_target, test_accuracy = accuracy(test_data, gini_tree)

In [34]:
test_accuracy

0.6891428571428572

In [35]:
entropy_tree = build_tree(train_data)
predicted_target, test_accuracy = accuracy(test_data, entropy_tree)
test_accuracy

0.6891428571428572

> **We can clearly see that the chosen impurity criterion doesn't change the the structure of the tree nor its performance.**