# Q9

Tree Based Models - Q09- 11/July

An optometrist prescribes a type of contact lens based on 4 features (age, prescription, astigmatism and tear production rate). The data is available in the 02_lens_prescription.csv file here: https://drive.google.com/drive/folders/1Jl8iDu7nGmrqCECbrLqmVafgwE5PYfiU

Calculate the Gini impurity index for each of the features. Note that age is an ordinal variable.
Which feature indicates that contact lens should not be used?

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the lens prescription data set and preview its first two rows.
df = pd.read_csv("02_lens_prescription.csv")
df.head(2)

Unnamed: 0,S No,Age,Prescription,Astigmatism,Tear production rate,Contact_lens
0,1,1,myope,no,reduced,no lens
1,2,1,myope,no,normal,soft


In [3]:
# Number of rows for each Contact_lens value.
df.Contact_lens.value_counts()

no lens    15
soft        5
hard        4
Name: Contact_lens, dtype: int64

In [4]:
# Number of rows for each Age level.
df['Age'].value_counts()

1    8
2    8
3    8
Name: Age, dtype: int64

In [5]:
# Dimensions of the data frame as (rows, columns).
df.shape

(24, 6)

In [6]:
def calculate_gini_impurity(ind_var):
    """Return the Gini impurity of the labels in ``ind_var``.

    The impurity is ``1 - sum(p_k ** 2)``, where ``p_k`` is the share of
    observations that carry label ``k``.

    Args:
        ind_var: pandas Series of class labels.

    Returns:
        The impurity as a float; ``0.0`` for a single-label or empty Series.

    Note:
        ``value_counts`` skips missing values by default while the denominator
        counts every row, so NaN labels make the shares sum to less than one.
    """
    n_obs = ind_var.shape[0]
    if n_obs == 0:
        # No observations means no label mix. Without this guard the formula
        # below evaluates to 1 on an empty Series.
        return 0.0

    prob = ind_var.value_counts() / n_obs
    return 1 - np.sum(prob * prob)
    
def calculate_gini_impurity_reduction(dep_var, ind_var, split_info=None):
    """Return the drop in Gini impurity obtained by splitting on a feature.

    Despite the parameter names, ``dep_var`` is the feature that defines the
    split and ``ind_var`` holds the class labels whose impurity is measured.
    The names are kept so that existing callers keep working.

    Args:
        dep_var: pandas Series with the feature to split on.
        ind_var: pandas Series with the class labels, aligned with ``dep_var``.
        split_info: Optional threshold. When given, rows are grouped into
            ``dep_var <= split_info`` versus the rest; when ``None``, one
            group is formed per distinct value of ``dep_var``.

    Returns:
        Impurity of ``ind_var`` before the split minus the size-weighted sum
        of the impurities of the groups after the split.
    """
    # Impurity of the labels before any split.
    gini_impurity_before_split = calculate_gini_impurity(ind_var)

    if split_info is not None:
        # Binarise the feature at the threshold. The feature's own index is
        # kept: a fresh 0..n-1 index would not align with ``ind_var`` in the
        # boolean selection below when the data is a filtered subset.
        dep_var = pd.Series(
            np.where(dep_var <= split_info, 1, 0),
            index=getattr(dep_var, "index", None),
        )

    n_total = ind_var.shape[0]
    gini_impurity_after_split = 0
    for level in dep_var.unique().tolist():
        # Labels of the observations that fall into this group.
        labels_in_group = ind_var[dep_var == level]
        # Weight the group's impurity by its share of the observations.
        weight = labels_in_group.shape[0] / n_total
        gini_impurity_after_split += calculate_gini_impurity(labels_in_group) * weight

    return gini_impurity_before_split - gini_impurity_after_split

# Approach 1
Without converting the categorical variables to dummy variables, i.e. calculating the Gini reduction at the level of each categorical variable.

In [7]:
# Score every candidate split by its Gini reduction.
# Age is ordinal, so it is split at the thresholds 1 and 2 (Age <= threshold);
# every other feature is split into one group per category.
df_gini_reduction = pd.DataFrame(data={'vrb': [], 'split_level': [], 'gini_reduction':[]})
for vrb in ['Prescription', 'Astigmatism', 'Tear production rate', 'Age']:
    # A split level of None means "no threshold": group by the raw categories.
    for split_info in ([1, 2] if vrb == 'Age' else [None]):
        gini_reduction = calculate_gini_impurity_reduction(df[vrb], df['Contact_lens'], split_info)
        # Append the result as the next row of the summary table.
        df_gini_reduction.loc[len(df_gini_reduction.index)] = [vrb, split_info, gini_reduction]
df_gini_reduction

Unnamed: 0,vrb,split_level,gini_reduction
0,Prescription,,0.010417
1,Astigmatism,,0.072917
2,Tear production rate,,0.211806
3,Age,1.0,0.012153
4,Age,2.0,0.012153


# Approach 2
After converting the categorical variables to dummy variables.

In [8]:
# One-hot encode the three categorical features into dummy columns named
# "<feature>_<category>", then preview the encoded frame.
cat_var_list = ['Prescription', 'Astigmatism', 'Tear production rate']
df_encoded = pd.get_dummies(df, columns = cat_var_list, prefix_sep='_')
df_encoded.head()

Unnamed: 0,S No,Age,Contact_lens,Prescription_hypermetrope,Prescription_myope,Astigmatism_no,Astigmatism_yes,Tear production rate_normal,Tear production rate_reduced
0,1,1,no lens,0,1,1,0,0,1
1,2,1,soft,0,1,1,0,1,0
2,3,1,no lens,0,1,0,1,0,1
3,4,1,hard,0,1,0,1,1,0
4,5,1,no lens,1,0,1,0,0,1


In [9]:
# Score every dummy column, plus Age, by its Gini reduction.
df_gini_reduction = pd.DataFrame(data={'vrb': [], 'split_level': [], 'gini_reduction':[]})
vrb_list = ['Prescription_hypermetrope', 'Prescription_myope', 'Astigmatism_no', 'Astigmatism_yes', 
            'Tear production rate_normal', 'Tear production rate_reduced', 'Age']
for vrb in vrb_list:
    # Age is split at the thresholds 1 and 2; the other columns are grouped
    # by their distinct values (split level None).
    for split_info in ([1, 2] if vrb == 'Age' else [None]):
        gini_reduction = calculate_gini_impurity_reduction(df_encoded[vrb], df_encoded['Contact_lens'], split_info)
        # Append the result as the next row of the summary table.
        df_gini_reduction.loc[len(df_gini_reduction.index)] = [vrb, split_info, gini_reduction]
df_gini_reduction

Unnamed: 0,vrb,split_level,gini_reduction
0,Prescription_hypermetrope,,0.010417
1,Prescription_myope,,0.010417
2,Astigmatism_no,,0.072917
3,Astigmatism_yes,,0.072917
4,Tear production rate_normal,,0.211806
5,Tear production rate_reduced,,0.211806
6,Age,1.0,0.012153
7,Age,2.0,0.012153


In [10]:
# With a cross-tabulation, we can find the variable for which a split yields a group containing only 'no lens'

In [11]:
# Count the rows for every (Tear production rate, Contact_lens) combination.
pd.crosstab(df['Tear production rate'], df['Contact_lens'])

Contact_lens,hard,no lens,soft
Tear production rate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
normal,4,3,5
reduced,0,12,0


With `Tear production rate`, we see that when it is equal to `reduced`, all entries in the data are "no lens".