# Q4

Tree Based Models - Q04 - 05/July
===================================
Consider the following data set. What could be a splitting rule (on F1 or F2) for predicting `Outcome`?

---------------------------
|F1 | F2 | Outcome|
|---| ---| -------|
|4 | 1 | 0 |
|6 | 6 | 0 |
|9 | 5 | 1 |
|1 | 2 | 0 |
|7 | 3 | 1 |
|5 | 4 | 1 |


In [1]:
import numpy as np
import pandas as pd

In [2]:
def calculate_entropy(dep_var):
    """Return the Shannon entropy, in bits, of the class labels in ``dep_var``.

    Every distinct value in ``dep_var`` is treated as its own class, so the
    function works for binary 0/1 labels as well as for any number of classes.

    Parameters
    ----------
    dep_var : array-like
        Class labels (NumPy array, pandas Series, list, ...).

    Returns
    -------
    int or numpy.float64
        ``0`` when ``dep_var`` is empty or holds a single class, otherwise
        ``-sum(p * log2(p))`` over the observed class proportions ``p``.
    """
    labels = np.asarray(dep_var)

    # No observations: define the entropy as 0 instead of dividing 0 by 0.
    if labels.size == 0:
        return 0

    _, class_counts = np.unique(labels, return_counts=True)

    # A pure set carries no uncertainty.
    if class_counts.size == 1:
        return 0

    # Only observed classes are counted, so every proportion is > 0 and
    # log2 is always well defined.
    class_probs = class_counts / class_counts.sum()
    return -np.sum(class_probs * np.log2(class_probs))

# create a function to calculate the information gain for a given variable and split level
def info_gain(dep_var, ind_var, split_level):
    """Information gain from splitting ``dep_var`` on ``ind_var <= split_level``.

    Observations whose ``ind_var`` value is at most ``split_level`` form one
    child set and the remaining observations form the other. The gain is the
    entropy of the full set minus the size-weighted average entropy of the two
    child sets. When either child set is empty, the weighted entropy is taken
    to be the entropy of the full set, so the gain is zero.

    Parameters
    ----------
    dep_var : array-like
        Class labels; must support boolean-mask indexing and expose ``.shape``.
    ind_var : array-like
        Feature values compared against ``split_level``.
    split_level : scalar
        Threshold used for the ``<=`` comparison.

    Returns
    -------
    Entropy of ``dep_var`` minus the weighted entropy of the two child sets.
    """
    goes_left = ind_var <= split_level
    left_labels, right_labels = dep_var[goes_left], dep_var[~goes_left]
    n_left, n_right = left_labels.shape[0], right_labels.shape[0]

    parent_entropy = calculate_entropy(dep_var)

    if n_left and n_right:
        # Both children are non-empty: weight each child's entropy by its size.
        left_part = n_left * calculate_entropy(left_labels)
        right_part = n_right * calculate_entropy(right_labels)
        children_entropy = (left_part + right_part) / (n_left + n_right)
    else:
        # Nothing was actually separated, so the split changes nothing.
        children_entropy = parent_entropy

    return parent_entropy - children_entropy

In [3]:
# Data set from the question: features F1 and F2 plus a 0/1 Outcome column.
d = {
    'F1': [4, 6, 9, 1, 7, 5],
    'F2': [1, 6, 5, 2, 3, 4],
    'Outcome': [0, 0, 1, 0, 1, 1],
}
df = pd.DataFrame(d)
df

Unnamed: 0,F1,F2,Outcome
0,4,1,0
1,6,6,0
2,9,5,1
3,1,2,0
4,7,3,1
5,5,4,1


In [4]:
# Try every observed value of F1 and F2 as a candidate threshold
# (rule: feature <= level) and record the information gain of each split.
information_gain_dict = {}   # feature name -> {str(level): information gain}
info_gain_records = []       # one record per (feature, level) candidate
for vrb in ['F1', 'F2']:
    info_gain_dict = {}
    unique_levels = df[vrb].unique().tolist()
    for level in unique_levels:
        gain_value = info_gain(df['Outcome'], df[vrb], level)
        info_gain_dict[str(level)] = gain_value
        info_gain_records.append(
            {'vrb': vrb, 'split_level': level, 'information_gain': gain_value}
        )

    information_gain_dict[vrb] = info_gain_dict

# Build the summary table once from the collected records rather than
# enlarging a DataFrame row by row; column dtypes are inferred from the
# values, so integer split levels stay integers. Passing `columns` keeps the
# three columns present even if no record was collected.
df_info_gain = pd.DataFrame(
    info_gain_records,
    columns=['vrb', 'split_level', 'information_gain'],
)

In [5]:
# Rank the candidate splits, highest information gain first.
df_info_gain.sort_values('information_gain', ascending=False)

Unnamed: 0,vrb,split_level,information_gain
0,F1,4.0,0.459148
1,F1,6.0,0.459148
9,F2,2.0,0.459148
3,F1,1.0,0.190875
4,F1,7.0,0.190875
6,F2,1.0,0.190875
8,F2,5.0,0.190875
5,F1,5.0,0.081704
10,F2,3.0,0.081704
2,F1,9.0,0.0
