In [1]:
import pandas as pd 
import numpy as np 

In [20]:
data = pd.read_csv("iris.csv")
mapping = {'setosa': 0, 'versicolor': 1, 'virginica': 2}
data['species'] = data['species'].map(mapping)
data

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2
146,6.3,2.5,5.0,1.9,2
147,6.5,3.0,5.2,2.0,2
148,6.2,3.4,5.4,2.3,2


In [None]:
# Now we first compute gini index and entropy at a given node 
# Entropy at a given node is E(S) = - sum_i -p_i log2(p_i)
def entropy(y):
    # y is a panda series on the categorical variable (dependent variable)
    class_labels = np.unique(y.species)
    entropy = 0 
    for cls in class_labels: 
        # probability for each class, essentially its counting the frequencies of the class 
        p_cls = len(y.species[y.species == cls])/len(y.species)
        entropy += -p_cls * np.log2(p_cls)



In [25]:
# We also define another metric, gini index 
def gini_index(y):
    class_labels = np.unique(y)
    gini_index = 0 
    for cls in class_labels:
        p_cls = len(y[y == cls])/len(y)
        gini_index += p_cls **2 
    gini_index = 1 - gini_index 
    return gini_index 

In [None]:
# Now we define information gain, 
def information_gain(parent,l_child,r_child,mode)
    # parent: a panda series on the parent node (the dependent variable)
    # l(r) child: the child nodes under the parent node (the dependent variable after the cut or threshold)
    weight_l = len(l_child)/len(parent)
    weight_r = len(r_child)/len(parent)
    ig = 0 

    if mode == 'gini_index':
        ig = gini_index(parent) - weight_l * gini_index(l_child) - weight_r * gini_index(r_child)
    else:
        ig = entropy(parent) - weight_l * entropy(l_child) - weight_r * entropy(r_child)

    return ig 

In [53]:
def split(dataset,feature_index,threshold):
    # dataset: the data that we are going to cut 
    # feature_index: which feature we are going to take the cut 
    # threshold: where to cut on the feature
    dataset_left = dataset[dataset.iloc[:, feature_index] <= threshold]
    dataset_right = dataset[dataset.iloc[:, feature_index] > threshold]
    return dataset_left, dataset_right 


In [63]:
## now we loop over the split to see which is the best split 
num_samples = data.shape[0]
num_features = data.shape[1] - 1 


5

In [80]:
feature_index = 0 
feature_values = data.iloc[:,feature_index].values
possible_thresholds = np.unique(feature_values)
for threshold in possible_thresholds:
    # get current split
    print(threshold)
    dataset_left, dataset_right = split(data, feature_index, threshold)


4.3
4.4
4.5
4.6
4.7
4.8
4.9
5.0
5.1
5.2
5.3
5.4
5.5
5.6
5.7
5.8
5.9
6.0
6.1
6.2
6.3
6.4
6.5
6.6
6.7
6.8
6.9
7.0
7.1
7.2
7.3
7.4
7.6
7.7
7.9
