In [None]:
!/opt/venv/bin/python -m pip install --upgrade pip
# Handling pip upgrades

import pandas as pd
import numpy as np
import math

# Use plotly as the plotting backend for pandas
!pip install plotly
pd.options.plotting.backend = "plotly"

# Setting the theme
import plotly.io as pio
import plotly.express as px
pio.templates.default = "plotly_white"

from sklearn.model_selection import train_test_split
import pprint

Requirement already up-to-date: pip in /opt/venv/lib/python3.7/site-packages (20.2.3)


In [None]:
# iris.data ships without a header row. Reading it with the default
# header inference makes pandas consume the first sample as column names,
# silently dropping one observation (149 rows instead of 150). Passing
# header=None together with the column names keeps every sample.
attributes = ["sepal_length", "sepal_width", "petal_length", "petal_width", "species"]
iris_set = pd.read_csv("./iris/iris.data", header=None, names=attributes)
iris_set

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,4.9,3.0,1.4,0.2,Iris-setosa
1,4.7,3.2,1.3,0.2,Iris-setosa
2,4.6,3.1,1.5,0.2,Iris-setosa
3,5.0,3.6,1.4,0.2,Iris-setosa
4,5.4,3.9,1.7,0.4,Iris-setosa
...,...,...,...,...,...
144,6.7,3.0,5.2,2.3,Iris-virginica
145,6.3,2.5,5.0,1.9,Iris-virginica
146,6.5,3.0,5.2,2.0,Iris-virginica
147,6.2,3.4,5.4,2.3,Iris-virginica


In [None]:
# Preview the first rows to sanity-check the parsed columns.
iris_set.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,4.9,3.0,1.4,0.2,Iris-setosa
1,4.7,3.2,1.3,0.2,Iris-setosa
2,4.6,3.1,1.5,0.2,Iris-setosa
3,5.0,3.6,1.4,0.2,Iris-setosa
4,5.4,3.9,1.7,0.4,Iris-setosa


In [None]:
# List the distinct class labels present in the species column.
iris_set['species'].unique()

array(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'], dtype=object)

In [None]:
# (rows, columns) of the loaded frame.
iris_set.shape

(149, 5)

In [None]:
# Split into training (80%) and testing (20%) partitions.
# A fixed random_state makes the shuffle deterministic, so the partitions and
# every tree built from them are reproducible across kernel restarts.
train, test = train_test_split(iris_set, test_size=0.2, random_state=42)

In [None]:
# Row counts of the two partitions: training first, then testing.
print(len(train), len(test), sep="\n")

119
30


In [None]:
# Goal: select the attribute that is most useful for splitting.
# Information gain measures how well a given attribute separates the
# training examples according to their target classification.
train

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
101,7.1,3.0,5.9,2.1,Iris-virginica
100,5.8,2.7,5.1,1.9,Iris-virginica
124,7.2,3.2,6.0,1.8,Iris-virginica
30,5.4,3.4,1.5,0.4,Iris-setosa
54,5.7,2.8,4.5,1.3,Iris-versicolor
...,...,...,...,...,...
12,4.3,3.0,1.1,0.1,Iris-setosa
103,6.5,3.0,5.8,2.2,Iris-virginica
62,6.1,2.9,4.7,1.4,Iris-versicolor
99,6.3,3.3,6.0,2.5,Iris-virginica


In [None]:
# Raw NumPy views of both partitions (features followed by the label column).
train_m, test_m = train.values, test.values

In [None]:
# Distinct class labels that ended up in the training partition.
train['species'].unique()

array(['Iris-virginica', 'Iris-setosa', 'Iris-versicolor'], dtype=object)

In [None]:
def check_purity(data):
    """Report whether every row of ``data`` carries the same class label.

    Parameters
    ----------
    data : numpy.ndarray
        2-D array whose last column holds the class label.

    Returns
    -------
    bool
        True when the label column contains exactly one distinct value.
    """
    distinct_labels = np.unique(data[:, -1])
    return len(distinct_labels) == 1

In [None]:
def classify_data(data):
    """Return the most frequent class label found in ``data``.

    Parameters
    ----------
    data : numpy.ndarray
        2-D array whose last column holds the class label.

    Returns
    -------
    The label with the highest count. On a tie, the label that sorts first
    wins, because ``np.unique`` returns sorted labels and ``argmax`` picks
    the first maximum.
    """
    labels, frequencies = np.unique(data[:, -1], return_counts=True)
    return labels[np.argmax(frequencies)]

In [None]:
def get_potential_splits(data):
    """Enumerate candidate thresholds for every feature column.

    For each column except the last (the label), the candidates are the
    midpoints between consecutive distinct values of that column.

    Parameters
    ----------
    data : numpy.ndarray
        2-D array; the last column is the class label.

    Returns
    -------
    dict
        Maps each feature column index to a list of midpoint thresholds.
        A column with a single distinct value maps to an empty list.
    """
    _, n_columns = data.shape
    splits_by_column = {}
    for feature in range(n_columns - 1):  # skip the trailing label column
        ordered = np.unique(data[:, feature])
        splits_by_column[feature] = [
            (upper + lower) / 2
            for lower, upper in zip(ordered[:-1], ordered[1:])
        ]
    return splits_by_column

In [None]:
def split_data(data, split_column, split_value):
    """Partition the rows of ``data`` around a threshold on one column.

    Parameters
    ----------
    data : numpy.ndarray
        2-D array of rows to partition.
    split_column : int
        Index of the column to compare.
    split_value : float
        Threshold; rows with a value ``<= split_value`` go to the first part.

    Returns
    -------
    tuple of numpy.ndarray
        ``(data_below, data_above)``: rows at or below the threshold, and
        rows strictly above it.
    """
    column = data[:, split_column]
    below_mask = column <= split_value
    above_mask = column > split_value
    return data[below_mask], data[above_mask]

In [None]:
def calculate_entropy(data):
    """Shannon entropy, in bits, of the class labels in ``data``.

    Parameters
    ----------
    data : numpy.ndarray
        2-D array whose last column holds the class label (the rows of one
        side of a split, or of a whole node).

    Returns
    -------
    Sum over classes of ``p * -log2(p)``, where ``p`` is the fraction of
    rows carrying that class.
    """
    _, label_counts = np.unique(data[:, -1], return_counts=True)
    class_probs = label_counts / label_counts.sum()  # element-wise fractions
    surprisal = -np.log2(class_probs)
    return sum(class_probs * surprisal)

In [None]:
def calculate_attribute_entropy(data_below, data_above):
    """Size-weighted entropy of the two partitions produced by a split.

    Parameters
    ----------
    data_below, data_above : numpy.ndarray
        The two row partitions returned by a split; the last column of each
        holds the class label.

    Returns
    -------
    ``(|below| / n) * H(below) + (|above| / n) * H(above)`` where ``n`` is
    the combined row count and ``H`` is ``calculate_entropy``.
    """
    total_rows = len(data_below) + len(data_above)
    weight_below = len(data_below) / total_rows
    weight_above = len(data_above) / total_rows

    weighted_below = weight_below * calculate_entropy(data_below)
    weighted_above = weight_above * calculate_entropy(data_above)
    return weighted_below + weighted_above

In [None]:
def information_gain(entropy, attribute_entropy):
    """Entropy reduction achieved by a split.

    Parameters
    ----------
    entropy : float
        Entropy of the node before splitting.
    attribute_entropy : float
        Weighted entropy of the partitions after splitting.

    Returns
    -------
    ``entropy - attribute_entropy``.
    """
    gain = entropy - attribute_entropy
    return gain

In [None]:
def intrinsic_info(data, split_column):
    """Split information (intrinsic information) of one attribute.

    Every distinct value found in ``split_column`` is treated as its own
    partition of the data. The result is the entropy, in bits, of the
    resulting partition sizes: the potential information generated by
    dividing the rows according to the values of that attribute.

    Parameters
    ----------
    data : numpy.ndarray
        2-D array of rows.
    split_column : int
        Index of the attribute column to measure.

    Returns
    -------
    Sum over distinct values of ``-(count / n) * log2(count / n)``.
    """
    column_values = data[:, split_column]
    _, partition_sizes = np.unique(column_values, return_counts=True)
    total = len(column_values)
    return sum(
        -(size / total) * np.log2(size / total) for size in partition_sizes
    )

In [None]:
# Trial run: split information of column index 2 over the training rows.
res = intrinsic_info(train.values,2)
print(res)

5.0288673868329905


In [None]:
# Display the training partition.
train

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
45,5.1,3.8,1.6,0.2,Iris-setosa
66,5.8,2.7,4.1,1.0,Iris-versicolor
89,5.5,2.6,4.4,1.2,Iris-versicolor
65,5.6,3.0,4.5,1.5,Iris-versicolor
119,6.9,3.2,5.7,2.3,Iris-virginica
...,...,...,...,...,...
110,6.4,2.7,5.3,1.9,Iris-virginica
13,5.8,4.0,1.2,0.2,Iris-setosa
132,6.3,2.8,5.1,1.5,Iris-virginica
137,6.0,3.0,4.8,1.8,Iris-virginica


In [None]:
def determine_best_split(data, potential_splits):
    """Pick the (column, threshold) pair with the highest gain ratio.

    For every candidate threshold the data is split in two, the information
    gain of that split is computed, and the gain is divided by the split
    information of the column (``intrinsic_info``).

    Parameters
    ----------
    data : numpy.ndarray
        2-D array of rows; the last column is the class label.
    potential_splits : dict
        Maps a column index to a list of candidate thresholds, as produced
        by ``get_potential_splits``.

    Returns
    -------
    tuple
        ``(best_split_column, best_split_value)``.

    Raises
    ------
    ValueError
        If ``potential_splits`` offers no usable candidate threshold.
        Previously this surfaced as an ``UnboundLocalError`` on return.
    """
    best_gain_ratio = float("-inf")
    best_split_column = None
    best_split_value = None

    # Entropy of the unsplit node is identical for every candidate, so
    # compute it once instead of once per threshold.
    entropy = calculate_entropy(data)

    for column_index, candidate_values in potential_splits.items():
        if len(candidate_values) == 0:
            continue

        # Split information depends only on the column, not on the threshold.
        # NOTE(review): this treats every distinct value as its own partition,
        # while the tree performs binary threshold splits — confirm that this
        # normalisation is the intended one.
        intr_info = intrinsic_info(data, column_index)

        for value in candidate_values:
            data_below, data_above = split_data(
                data, split_column=column_index, split_value=value
            )
            attribute_entropy = calculate_attribute_entropy(data_below, data_above)
            # Information gain: entropy before the split minus entropy after it.
            info_gain = information_gain(entropy, attribute_entropy)
            curr_gain_ratio = info_gain / intr_info

            # ">=" keeps the original tie-breaking: among equally good
            # candidates, the last one examined wins.
            if curr_gain_ratio >= best_gain_ratio:
                best_gain_ratio = curr_gain_ratio
                best_split_column = column_index
                best_split_value = value

    if best_split_column is None:
        raise ValueError("no candidate splits available for this data")

    return best_split_column, best_split_value

In [None]:
def decision_tree_algorithm(df, counter=0):
    """Recursively build a decision tree as nested dictionaries.

    Parameters
    ----------
    df : pandas.DataFrame or numpy.ndarray
        On the top-level call (``counter == 0``) a DataFrame whose last
        column is the class label. On recursive calls, the NumPy array of
        rows belonging to the current node.
    counter : int, default 0
        Recursion depth; ``0`` marks the top-level call. Callers normally
        leave this at its default.

    Returns
    -------
    A class label for a leaf, or a dict ``{"<column> <= <value>": [yes, no]}``
    where ``yes`` and ``no`` are the sub-trees for rows at or below and
    above the threshold respectively.

    Raises
    ------
    ValueError
        If the dataset contains no rows.

    Notes
    -----
    The top-level call stores the column names in the module-level ``cols``
    so that recursive calls, which only see arrays, can label questions.
    """
    global cols
    if counter == 0:
        cols = df.columns
        data = df.values
    else:
        data = df

    if len(data) == 0:
        raise ValueError("cannot build a decision tree from an empty dataset")

    # Base case: all rows share a single label, so this node is a leaf.
    if check_purity(data):
        return classify_data(data)

    potential_splits = get_potential_splits(data)

    # Base case: labels differ but every feature column holds a single value
    # (rows with identical features and conflicting labels). No threshold can
    # separate them, so fall back to the majority class instead of crashing.
    if not any(len(values) for values in potential_splits.values()):
        return classify_data(data)

    # Recursive case: split on the best threshold and grow both branches.
    counter += 1
    split_column, split_value = determine_best_split(data, potential_splits)
    data_below, data_above = split_data(data, split_column, split_value)

    question = "{} <= {}".format(cols[split_column], split_value)
    yes_answer = decision_tree_algorithm(data_below, counter)
    no_answer = decision_tree_algorithm(data_above, counter)

    return {question: [yes_answer, no_answer]}

In [None]:
# Pretty-printer used below to render the nested tree dictionaries.
printer = pprint.PrettyPrinter(indent=3)

In [None]:
# Build the decision tree from the training partition and display it.
tree = decision_tree_algorithm(train)
printer.pprint(tree)

{  'petal_width <= 0.8': [  'Iris-setosa',
                            {  'petal_width <= 1.75': [  {  'petal_length <= 4.95': [  'Iris-versicolor',
                                                                                       {  'petal_width <= 1.55': [  'Iris-virginica',
                                                                                                                    {  'petal_length <= 5.449999999999999': [  'Iris-versicolor',
                                                                                                                                                               'Iris-virginica']}]}]},
                                                         {  'petal_length <= 4.85': [  {  'sepal_width <= 3.1': [  'Iris-virginica',
                                                                                                                   'Iris-versicolor']},
                                                                                       

In [None]:
# Build a second, independent tree from the test partition and display it.
# NOTE(review): this fits a new tree on `test`; it does not evaluate `tree`
# on the held-out rows — confirm that this is the intent.
test_tree = decision_tree_algorithm(test)
printer.pprint(test_tree)

{  'petal_width <= 1.6': [  {  'petal_width <= 0.65': [  'Iris-setosa',
                                                         'Iris-versicolor']},
                            'Iris-virginica']}
