# Q6

Tree Based Models - Q06 - 07/July

Continuing with the same dataset as yesterday, after the first split on Age, on which variable(s) should we do the second split ?


In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the dataset from q5_data.csv and preview its first rows.
df = pd.read_csv("q5_data.csv")
df.head()

Unnamed: 0,age,income,student,credit_rating,buys_laptop
0,<=30,high,no,fair,no
1,<=30,high,no,excellent,no
2,31..40,high,no,fair,yes
3,>40,medium,no,fair,yes
4,>40,low,yes,fair,yes


In [3]:
def calculate_entropy(dep_var):
    """Return the binary entropy (in bits) of a "yes"/"no" target.

    Parameters
    ----------
    dep_var : pandas.Series or numpy.ndarray
        Target labels. Must support element-wise comparison with a string.
        Only the values "no" and "yes" are counted; any other value is
        ignored.

    Returns
    -------
    float
        Entropy in bits. 0.0 when the input holds no "yes"/"no" labels at
        all, or only one of the two classes.
    """
    num_a = np.sum(dep_var == "no")
    num_b = np.sum(dep_var == "yes")

    # Check the counts *before* dividing: a pure (or empty) set has zero
    # entropy, and computing the probabilities first would divide by zero
    # when no labels are present.
    if num_a == 0 or num_b == 0:
        return 0.0

    total = num_a + num_b
    prob_a = num_a / total
    prob_b = num_b / total
    return float(-(prob_a * np.log2(prob_a) + prob_b * np.log2(prob_b)))

def info_gain(dep_var, ind_var, split_level):
    """Information gain of the binary split ``ind_var == split_level``.

    The rows are partitioned into those where ``ind_var`` equals
    ``split_level`` and all remaining rows. The gain is the entropy of
    ``dep_var`` minus the size-weighted entropy of the two partitions.

    Parameters
    ----------
    dep_var : target labels, indexable by a boolean mask
    ind_var : candidate split variable, aligned with ``dep_var``
    split_level : value of ``ind_var`` that defines the first partition

    Returns
    -------
    The information gain; zero when either partition is empty.
    """
    in_level = ind_var == split_level
    matched_targets = dep_var[in_level]
    other_targets = dep_var[~in_level]
    n_matched = matched_targets.shape[0]
    n_other = other_targets.shape[0]

    parent_entropy = calculate_entropy(dep_var)

    if n_matched == 0 or n_other == 0:
        # Nothing was actually separated, so the split keeps the parent entropy.
        children_entropy = parent_entropy
    else:
        weighted_sum = (
            n_matched * calculate_entropy(matched_targets)
            + n_other * calculate_entropy(other_targets)
        )
        children_entropy = weighted_sum / (n_matched + n_other)

    return parent_entropy - children_entropy

In [4]:
# 1st split on age

# Score the one-vs-rest split for every unique level of `age`.
# Rows are collected in a list and the DataFrame is constructed once,
# instead of enlarging an empty frame row by row with .loc.
gain_rows = []
for vrb in ['age']:
    unique_levels = df[vrb].unique().tolist()
    for level in unique_levels:
        gain_value = info_gain(df['buys_laptop'], df[vrb], level)
        gain_rows.append({'vrb': vrb, 'split_level': level, 'information_gain': gain_value})
df_info_gain = pd.DataFrame(gain_rows, columns=['vrb', 'split_level', 'information_gain'])
df_info_gain

Unnamed: 0,vrb,split_level,information_gain
0,age,<=30,0.102244
1,age,31..40,0.226
2,age,>40,0.003185


In [5]:
# age == '31..40' has the highest information gain above, so it defines the
# first cut: one partition with that level, one with every other row.
index = df['age'].eq('31..40')
df_1st = df.loc[index]
df_1st_rest = df.loc[~index]

In [6]:
# Rows on the age == '31..40' side of the first cut.
df_1st

Unnamed: 0,age,income,student,credit_rating,buys_laptop
2,31..40,high,no,fair,yes
6,31..40,low,yes,excellent,yes
11,31..40,medium,no,excellent,yes
12,31..40,high,yes,fair,yes


In [7]:
# Rows on the other side of the first cut (age != '31..40').
df_1st_rest

Unnamed: 0,age,income,student,credit_rating,buys_laptop
0,<=30,high,no,fair,no
1,<=30,high,no,excellent,no
3,>40,medium,no,fair,yes
4,>40,low,yes,fair,yes
5,>40,low,yes,excellent,no
7,<=30,medium,no,fair,no
8,<=30,low,yes,fair,yes
9,>40,medium,yes,fair,yes
10,<=30,medium,yes,excellent,yes
13,>40,medium,no,excellent,no


In [8]:
# Calculate info gain for every candidate variable/level within df_1st
# (the age == '31..40' partition).
# Rows are collected in a list and the DataFrame is constructed once,
# instead of enlarging an empty frame row by row with .loc.
gain_rows = []
for vrb in ['income', 'student', 'credit_rating', 'age']:
    unique_levels = df_1st[vrb].unique().tolist()
    for level in unique_levels:
        gain_value = info_gain(df_1st['buys_laptop'], df_1st[vrb], level)
        gain_rows.append({'vrb': vrb, 'split_level': level, 'information_gain': gain_value})
df_info_gain = pd.DataFrame(gain_rows, columns=['vrb', 'split_level', 'information_gain'])
df_info_gain

Unnamed: 0,vrb,split_level,information_gain
0,income,high,0.0
1,income,low,0.0
2,income,medium,0.0
3,student,no,0.0
4,student,yes,0.0
5,credit_rating,fair,0.0
6,credit_rating,excellent,0.0
7,age,31..40,0.0


So no further split is needed for the partition where age == '31..40': buys_laptop is "yes" for every row in it, so every candidate split has zero information gain.

In [9]:
# Calculate info gain for every candidate variable/level within df_1st_rest
# (the rows where age != '31..40').
# Rows are collected in a list and the DataFrame is constructed once,
# instead of enlarging an empty frame row by row with .loc.
gain_rows = []
for vrb in ['income', 'student', 'credit_rating', 'age']:
    unique_levels = df_1st_rest[vrb].unique().tolist()
    for level in unique_levels:
        gain_value = info_gain(df_1st_rest['buys_laptop'], df_1st_rest[vrb], level)
        gain_rows.append({'vrb': vrb, 'split_level': level, 'information_gain': gain_value})
df_info_gain = pd.DataFrame(gain_rows, columns=['vrb', 'split_level', 'information_gain'])
df_info_gain

Unnamed: 0,vrb,split_level,information_gain
0,income,high,0.236453
1,income,medium,0.029049
2,income,low,0.034852
3,student,no,0.278072
4,student,yes,0.278072
5,credit_rating,fair,0.124511
6,credit_rating,excellent,0.124511
7,age,<=30,0.029049
8,age,>40,0.029049


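For the other partition, where age is in ('<=30', '>40'), the highest information gain in the table above is for `student` (0.278072 for both student = 'no' and student = 'yes', which describe the same binary split). The second split should therefore be done on student = 'yes' vs. student = 'no'. Splitting on income = 'high' vs. income in ('medium', 'low') is the next best option, with a gain of 0.236453.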

# Answer

Assuming age as the variable for 1st split, data should be split in two sets as below:

    - 1. One where age == '31..40'
            - for this split, no further split is possible, since the buys_laptop variable has only one value ("yes")
    - 2. One where age is in ('<=30', '>40')
            - for this set, the second split should be on student ('yes' vs. 'no'), which has the highest information gain (0.278072); income = 'high' vs. income in ('medium', 'low') is second best (0.236453)