In [1]:
import pandas as pd
import sys

In [2]:
#Save data to dataframes
df_train = pd.read_csv("./part2/hepatitis-training", delimiter=r"\s+");
df_test = pd.read_csv("./part2/hepatitis-test", delimiter=r"\s+");

In [3]:
class Node:
    def __init__(self, attribute, depth, probability = None, true_node = None, false_node = None, class_node=None):
        self.attribute = attribute
        self.true_node = true_node
        self.false_node = false_node 
        self.depth = depth   
        self.probability = probability
        self.class_node = class_node

In [4]:
def get_prob(data):

    try:
        die = data.Class.value_counts().die
    except:
        die = 0

    try:
        live = data.Class.value_counts().live
    except:
        live = 0

    best = die if die > live else live

    return  "die" if die > live else "live" , best / len(data)    


In [5]:
def build_tree(instances, attributes, depth):

    #Increase depth for printing
    depth += 1

    if len(instances) == 0:

        #Create a leaf node using the most probable class
        class_val, prob = get_prob(df_train)
        return Node("Class", depth, probability=prob, class_node=class_val)
        
    elif calc_impurity(instances, len(instances)) == 0:

        #Return a pure node
        class_val, prob = get_prob(instances)
        return Node("Class", depth, probability=prob, class_node=class_val)

    elif len(attributes) == 1:
        #Return a leaf with amjority class
        class_val, prob = get_prob(instances)
        return Node("Class", depth, probability=prob, class_node=class_val)
    else:

        best_impurity = sys.maxsize
        best_att = None
        best_true = None
        best_false = None


        for attribute in attributes:

                if attribute == "Class":
                    continue

                true_instance = instances.loc[instances[attribute] == True]
                false_instance = instances.loc[instances[attribute] == False]

                true_impurity = calc_impurity(true_instance, len(instances))
                false_impurity = calc_impurity(false_instance, len(instances))
                
                true_weighted = len(true_instance) / len(instances) * true_impurity
                false_weighted = len(false_instance) / len(instances) * false_impurity

                if(true_weighted + false_weighted < best_impurity):
                    best_impurity = true_weighted + false_weighted

                    best_att = attribute
                    best_true = true_instance
                    best_false = false_instance

        attributes.remove(best_att)

        true = build_tree(best_true, attributes, depth)
        false = build_tree(best_false, attributes, depth)

        return Node(best_att, depth, true_node=true, false_node=false)
    



In [6]:
def calc_impurity(instances, total):

    try:
        die = instances.Class.value_counts().die
    except:
        return 0

    try:
        live = instances.Class.value_counts().live
    except:
        return 0

    return (die / total) * (live / total)

In [7]:
root_node = build_tree(df_train, list(df_train.columns.values), 0)

In [8]:
#Print Tree
def print_tree(node, outcome=None):


    if(node.class_node != None):
        print("  " * node.depth + f"Class {node.class_node} prob = {node.probability}")
        return
    else:
        print("  " * node.depth + f"{node.attribute} = {outcome}:")

    true = node.true_node
    false = node.false_node

    if true != None:
         print_tree(true, outcome="True")
    if false != None:
         print_tree(false, outcome="False")


   
print_tree(root_node)


  HISTOLOGY = None:
    MALAISE = True:
      FATIGUE = True:
        BILIRUBIN = True:
          VARICES = True:
            Class live prob = 1.0
            Class die prob = 1.0
          STEROID = False:
            Class live prob = 1.0
            Class die prob = 1.0
        ASCITES = False:
          Class live prob = 1.0
          BIGLIVER = False:
            Class die prob = 1.0
            Class live prob = 1.0
      ANOREXIA = False:
        AGE = True:
          Class live prob = 1.0
          FIRMLIVER = False:
            Class die prob = 1.0
            SGOT = False:
              FEMALE = True:
                Class live prob = 0.8125
                ANTIVIRALS = False:
                  SPLEENPALPABLE = True:
                    SPIDERS = True:
                      Class live prob = 0.8125
                      Class live prob = 0.5
                    Class live prob = 0.8125
                  Class live prob = 0.8125
              Class die prob = 0.66666666666666