In [1]:
!/opt/venv/bin/python -m pip install --upgrade pip
# Handling pip upgrades

import pandas as pd
import numpy as np
import math

# Making plotly as the backend for pandas
!pip install plotly
pd.options.plotting.backend = "plotly"

# Setting the theme
import plotly.io as pio
import plotly.express as px
pio.templates.default = "plotly_white"

from sklearn.model_selection import train_test_split
import pprint

Requirement already up-to-date: pip in /opt/venv/lib/python3.7/site-packages (20.2.4)


In [2]:
iris_set = pd.read_csv("./iris/iris.data")
# add the column names, as it doesn't have it
attributes = ["sepal_length", "sepal_width", "petal_length", "petal_width", "species"]
iris_set.columns = attributes
iris_set

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,4.9,3.0,1.4,0.2,Iris-setosa
1,4.7,3.2,1.3,0.2,Iris-setosa
2,4.6,3.1,1.5,0.2,Iris-setosa
3,5.0,3.6,1.4,0.2,Iris-setosa
4,5.4,3.9,1.7,0.4,Iris-setosa
...,...,...,...,...,...
144,6.7,3.0,5.2,2.3,Iris-virginica
145,6.3,2.5,5.0,1.9,Iris-virginica
146,6.5,3.0,5.2,2.0,Iris-virginica
147,6.2,3.4,5.4,2.3,Iris-virginica


In [3]:
iris_set.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,4.9,3.0,1.4,0.2,Iris-setosa
1,4.7,3.2,1.3,0.2,Iris-setosa
2,4.6,3.1,1.5,0.2,Iris-setosa
3,5.0,3.6,1.4,0.2,Iris-setosa
4,5.4,3.9,1.7,0.4,Iris-setosa


In [4]:
iris_set['species'].unique()

array(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'], dtype=object)

In [5]:
iris_set.shape

(149, 5)

In [6]:
#split training and testing dataset
train, test = train_test_split(iris_set,test_size=0.2)

In [7]:
print(len(train))
print(len(test))
train.reset_index(inplace=True, drop=True)
test.reset_index(inplace=True, drop=True)

119
30


In [9]:
train

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,6.8,3.2,5.9,2.3,Iris-virginica
1,5.5,2.4,3.8,1.1,Iris-versicolor
2,6.1,2.8,4.7,1.2,Iris-versicolor
3,5.8,4.0,1.2,0.2,Iris-setosa
4,6.4,2.8,5.6,2.1,Iris-virginica
...,...,...,...,...,...
114,5.4,3.9,1.3,0.4,Iris-setosa
115,5.1,3.7,1.5,0.4,Iris-setosa
116,5.5,2.4,3.7,1.0,Iris-versicolor
117,5.8,2.7,5.1,1.9,Iris-virginica


In [10]:
train['species'].unique()

array(['Iris-virginica', 'Iris-versicolor', 'Iris-setosa'], dtype=object)

In [11]:
def entropy(target_col):
 
    elements,counts = np.unique(target_col, return_counts = True)
    entropy = np.sum([(-counts[i] / np.sum(counts)) * np.log2(counts[i] / np.sum(counts)) for i in range(len(elements))])
    return entropy

In [12]:
def InfoGain(data, split_attribute_name, target_name = "class"):
    
    #Calculate the entropy of the total dataset
    total_entropy = entropy(data[target_name])
    
    #Calculate the values and the corresponding counts for the split attribute 
    vals,counts= np.unique(data[split_attribute_name], return_counts = True)
    
    #Calculate the weighted entropy
    Weighted_Entropy = np.sum([(counts[i] / np.sum(counts)) * entropy(data.where(data[split_attribute_name] == vals[i]).dropna()[target_name]) for i in range(len(vals))])
    
    #Calculate the information gain
    Information_Gain = total_entropy - Weighted_Entropy
    return Information_Gain

In [13]:
def ID3(data, originaldata, features, target_attribute_name , parent_node_class = None):
    
    #If all target_values have the same value, return this value
    if len(np.unique(data[target_attribute_name])) <= 1:
        return np.unique(data[target_attribute_name])[0]
    
    #If the dataset is empty, return the mode target feature value in the original dataset
    elif len(data)==0:
        return np.unique(originaldata[target_attribute_name])[np.argmax(np.unique(originaldata[target_attribute_name], return_counts = True)[1])]
    
    #If the feature space is empty, return the mode target feature value of the direct parent node --> Note that
    #the direct parent node is that node which has called the current run of the ID3 algorithm and hence
    #the mode target feature value is stored in the parent_node_class variable.
    
    elif len(features) ==0:
        return parent_node_class
    
    #If none of the above holds true, grow the tree!
    
    else:
        #Set the default value for this node --> The mode target feature value of the current node
        parent_node_class = np.unique(data[target_attribute_name])[np.argmax(np.unique(data[target_attribute_name], return_counts = True)[1])]
        
        #Select the feature which best splits the dataset
        item_values = [InfoGain(data, feature, target_attribute_name) for feature in features] #Return the information gain values for the features in the dataset
        best_feature_index = np.argmax(item_values)
        best_feature = features[best_feature_index]
        
        #Create the tree structure. The root gets the name of the feature (best_feature) with the maximum information
        #gain in the first run
        tree = {best_feature:{}}
        
        
        #Remove the feature with the best inforamtion gain from the feature space
        features = [i for i in features if i != best_feature]
        
        #Grow a branch under the root node for each possible value of the root node feature
        
        for value in np.unique(data[best_feature]):
            value = value
            #Split the dataset along the value of the feature with the largest information gain and therwith create sub_datasets
            sub_data = data.where(data[best_feature] == value).dropna()
            
            #Call the ID3 algorithm for each of those sub_datasets with the new parameters --> Here the recursion comes in!
            subtree = ID3(sub_data, data, features, target_attribute_name, parent_node_class)
            
            #Add the sub tree, grown from the sub_dataset to the tree under the root node
            tree[best_feature][value] = subtree
            
        return(tree)

In [14]:
def predict(query,tree,default = 1):
  
    for key in list(query.keys()):
        if key in list(tree.keys()):       
            try:
                result = tree[key][query[key]] 
            except:
                return default
            result = tree[key][query[key]]
            if isinstance(result, dict):
                return predict(query, result)
            else:
                return result

In [15]:
def testit(data, label, tree):
    #Create new query instances by simply removing the target feature column from the original dataset and 
    #convert it to a dictionary
    queries = data.iloc[:,:-1].to_dict(orient = "records")
    
    #Create a empty DataFrame in whose columns the prediction of the tree are stored
    predicted = pd.DataFrame(columns = ["predicted"]) 
    
    #Calculate the prediction accuracy
    for i in range(len(data)):
        predicted.loc[i,"predicted"] = predict(queries[i],tree,1.0) 
    print('The prediction accuracy is: ', (np.sum(predicted["predicted"] == data[label]) / len(data)) * 100, '%')

In [17]:
tree = ID3(train, train, train.columns[:-1], train.columns[-1])
pprint.pprint(tree)

{'petal_width': {0.1: 'Iris-setosa',
                 0.2: 'Iris-setosa',
                 0.3: 'Iris-setosa',
                 0.4: 'Iris-setosa',
                 0.5: 'Iris-setosa',
                 0.6: 'Iris-setosa',
                 1.0: 'Iris-versicolor',
                 1.1: 'Iris-versicolor',
                 1.2: 'Iris-versicolor',
                 1.3: 'Iris-versicolor',
                 1.4: 'Iris-versicolor',
                 1.5: {'petal_length': {4.2: 'Iris-versicolor',
                                        4.5: 'Iris-versicolor',
                                        4.6: 'Iris-versicolor',
                                        4.9: 'Iris-versicolor',
                                        5.0: 'Iris-virginica'}},
                 1.6: 'Iris-versicolor',
                 1.7: {'sepal_length': {4.9: 'Iris-virginica',
                                        6.7: 'Iris-versicolor'}},
                 1.8: {'sepal_length': {5.9: 'Iris-versicolor',
                  

In [18]:
testit(test, "species", tree)

The prediction accuracy is:  80.0 %
