# Assignment 3:  Decision Tree Implementation
*Margaret Thomann - February 17, 2018 *

In this assignment, I will construct a decision tree from the data provided about heart disease.

### Reading the data and assigning counts to arrays and Data class

#### Data Class 
A Data class will be instantiated for each line of the data.  It will then be added to one of two arrays (explained later).

In [98]:
from collections import OrderedDict

class Data:
    def __init__(self, has_heart_disease_value):
        self.has_heart_disease = has_heart_disease_value
        self.data_vars = OrderedDict()
        self.data_vars["age"] = 0
        self.data_vars["sex"] = 0
        self.data_vars["chest_pain_type"] = 0
        self.data_vars["resting_blood_pressure"] = 0
        self.data_vars["serum_cholesterol"] = 0
        self.data_vars["fasting_blood_sugar"] = 0
        self.data_vars["resting_electrocardiographic_results"] = 0
        self.data_vars["maximum_heart_rate_achieved"] = 0
        self.data_vars["exercise_induced_angina"] = 0
        self.data_vars["oldpeak"] = 0
        self.data_vars["slope_peak_exercise"] = 0
        self.data_vars["number_of_major_vessels"] = 0
        self.data_vars["thal"] = 0
        self.data_vars["has_heart_disease"] = self.has_heart_disease

In [99]:
# Arrays for the Data instances
#     absence_heart_array  : contains all Data instantiations where heart disease is absent
#     presence_heart_array : contains all Data instantiations where heart disease is absent
absence_heart_array = []
presence_heart_array = []
total_data_array = []

data = open('heart.data.txt')
for line in data.readlines():
    feature_value_list = line.split()
    has_heart_disease = int(feature_value_list[-1])
    data = Data(has_heart_disease)
    counter = 0
    for feature in data.data_vars.keys():
        data.data_vars[feature] = float(feature_value_list[counter])
        counter += 1
    if has_heart_disease == 2:
        presence_heart_array.append(data)
    elif has_heart_disease == 1:
        absence_heart_array.append(data)
    total_data_array.append(data)

presence_heart_array_num = len(presence_heart_array)
absence_heart_array_num = len(absence_heart_array)
print str(presence_heart_array_num) + " = # Of People with Heart Disease"
print str(absence_heart_array_num) + " = # Of People without Heart Disease"

120 = # Of People with Heart Disease
150 = # Of People without Heart Disease


### Calculate Information Gain for Each Feature
The below function can be used to determine the information gain for a given data and hypothesis (passed in as a string - x and y).  Information Gain can be represented as: Infgain(Y|X_K) = H(Y) - H(Y|X_K)

In [151]:
import math

def info_gain_buckets(y, x, buckets, bucket_num):
    print "info_gain_bucket called with "+ str(len(buckets)) + " buckets"
    
    # Define dicts for the counts
    positive_y_counts = {}
    positive_x_counts = {}
    negative_y_counts = {}
    negative_x_counts = {}
    
    # Get the bucket values
    for bucket in buckets:
        # Convert to string
        s = ""
        for num in list(set(bucket)):
            s += (str(num)+ " ")
        positive_x_counts[s] = 0
        negative_x_counts[s] = 0        

    
    y_denom = 0
    for data in presence_heart_array:
        y_denom += 1
        value = data.data_vars[y]
        
        # Value is not in dictionary yet
        # so set the occurence for that value to 1
        if value not in positive_y_counts.keys():
            positive_y_counts[value] = 1 
        # Value is already in dictionary
        # so increase the occurence count for that value by 1
        else:
            current_count_for_value = positive_y_counts[value]
            positive_y_counts.update({value:current_count_for_value+1})
            
        # Same thing is done for processing x:
        x_value = data.data_vars[x]
        for key in negative_x_counts.keys():
            if str(x_value)+" " in key:
                current_count_for_value = positive_x_counts[key]
                positive_x_counts.update({key:current_count_for_value+1})
            
    
    for data in absence_heart_array:
        y_denom += 1
        value = data.data_vars[y]
        
        # Value is not in dictionary yet
        # so set the occurence for that value to 1
        if value not in negative_y_counts.keys():
            negative_y_counts[value] = 1 
        # Value is already in dictionary
        # so increase the occurence count for that value by 1
        else:
            current_count_for_value = negative_y_counts[value]
            negative_y_counts.update({value:current_count_for_value+1})
            
        # Same thing is done for processing x:
        x_value = data.data_vars[x]
        for key in negative_x_counts.keys():
            if str(x_value)+" " in key:
                current_count_for_value = negative_x_counts[key]
                negative_x_counts.update({key:current_count_for_value+1})
            
            
    # Calculate H(Y)     
    h_of_y = 0
    for count in positive_y_counts.values():
        p = float(float(count)/float(y_denom))
        entropy = -1 * p * (math.log(p, 2))
        h_of_y += entropy
    for count in negative_y_counts.values():
        p = float(float(count)/float(y_denom))
        entropy = -1 * p * (math.log(p, 2))
        h_of_y += entropy
    
    h_of_y_given_x = 0
    for feature_value in positive_x_counts.keys():
        sum_of_values = positive_x_counts[feature_value] + negative_x_counts[feature_value]
        fraction = float(float(sum_of_values)/float(y_denom))
        p_positive = float(float(positive_x_counts[feature_value])/float(sum_of_values)) 
        p_negative = float(float(negative_x_counts[feature_value])/float(sum_of_values)) 
        entropy_positive = -p_positive * (math.log(p_positive, 2))
        entropy_negative = -p_negative * (math.log(p_negative, 2))
        h_of_y_given_x -= fraction*(entropy_positive+entropy_negative)
    
    info_gain = h_of_y - h_of_y_given_x
    return info_gain
    

### Determine possible splits
These will be used for the information gain

In [101]:
def get_splits(feature):
    splits = {}
    already_split_on = []
    for data in total_data_array:
        feature_value = float(data.data_vars[feature])
        if feature_value not in already_split_on:
            already_split_on.append(feature_value)
            splits[feature_value] = [[],[]]
    
    for split in splits:
        less_than_array = []
        greater_than_or_equal_to_array = []
        for data in total_data_array:
            feature_value = float(data.data_vars[feature])
            # Compare it to the split
            if feature_value < split:
                less_than_array.append(feature_value)
            else:
                greater_than_or_equal_to_array.append(feature_value)
        splits[split] = [less_than_array,greater_than_or_equal_to_array]
    
    # Remove any splits where the less than array is 0
    
    split_dict = {}
    for data in total_data_array: 
        feature_value = float(data.data_vars[feature])
        if feature_value not in split_dict.keys():
            new_list = []
            new_list.append(feature_value)
            split_dict[feature_value] = new_list
        else:
            split_dict[feature_value].append(feature_value)
    
    

    for split in splits.keys():
        if len(splits[split][0]) == 0:
            del splits[split]
            break
            
    separate_buckets = []
    for l in split_dict.values():
        separate_buckets.append(l)
    
    splits["separate_values"] = separate_buckets

    
    return splits

In [153]:
def compute_best_split(feature):
    print "Computing best split for: "+feature
    splits = get_splits(feature)
    info_gains = {}
    max_split_info = 0
    split_on_value = 0
    for split in splits.keys():
        info_gains[split] = info_gain_buckets("has_heart_disease", feature, splits[split], len(splits[split]))
        print str(info_gains[split])+" = info gain for splitting on "+str(split)
        if info_gains[split] > max_split_info:
            max_split_info = info_gains[split]
            split_on_value = split
    return {"Greatest Info Gain":max_split_info,
           "from splitting on value":split_on_value}


### Recurse to build the tree
Used the information gain function to determine the best splits for each node of the tree.

In [105]:
def build_tree():
    for feature in data.data_vars.keys():
        print compute_best_split("thal")
        #information_gain = info_gain("has_heart_disease", feature)
        break



In [154]:
build_tree()

Computing best split for: thal
info_gain_bucket called with 3 buckets
1.77359644136 = info gain for splitting on separate_values
info_gain_bucket called with 2 buckets
1.7791218833 = info gain for splitting on 6.0
info_gain_bucket called with 2 buckets
1.79338911416 = info gain for splitting on 7.0
{'Greatest Info Gain': 1.793389114163413, 'from splitting on value': 7.0}
