In [5]:
#Importing the necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pprint import pprint

# CSV file that IrisSpeciesDecisionTree loads when it is constructed
FILENAME: str = 'iris.csv'

In [6]:
# Defining the class IrisSpeciesDecisionTree
class IrisSpeciesDecisionTree:
    """
    Ranks the measurement columns of a species dataset by discriminative power.

    The data is loaded into a DataFrame, each feature column (name ending in 'Cm')
    is sorted and cut into `groupsCount` consecutive groups, and the entropy of the
    species distribution inside those groups is compared with the entropy of the
    whole dataset.
    """

    def __init__(self, groupsCount: int = 3, filename=None):
        """
        Initializes the IrisSpeciesDecisionTree class with the specified number of groups.

        Parameters:
        - groupsCount: Number of groups to divide the data into.
        - filename: Optional path of the CSV file to load. When omitted, the
          module-level FILENAME constant is used.
        """
        self.df = IrisSpeciesDecisionTree.importData(FILENAME if filename is None else filename)
        # Distinct species labels; used as the row labels of the per-feature tables.
        self.speciesNames = set(self.df['Species'])
        self.groupsCount = groupsCount

    @staticmethod
    def importData(filename: str) -> pd.DataFrame:
        """
        Static method to import data from a CSV file into a pandas DataFrame.

        Parameters:
        - filename: Name of the CSV file containing the data.

        Returns:
        - Pandas DataFrame containing the imported data, with the first five
          characters removed from every value of the 'Species' column.
        """
        dataframe: pd.DataFrame = pd.read_csv(filename)
        # NOTE(review): drops the first 5 characters of every label - presumably an
        # 'Iris-' prefix (e.g. 'Iris-setosa' -> 'setosa'); confirm against the CSV.
        dataframe['Species'] = np.array([x[5:] for x in dataframe['Species']])
        return dataframe

    @staticmethod
    def _entropy(counts, total) -> float:
        """
        Base-2 Shannon entropy of class counts taken as fractions of `total`.

        Parameters:
        - counts: Iterable with the number of samples of each class.
        - total: Denominator used to turn every count into a proportion.

        Returns:
        - Entropy value; 0 when `counts` is empty.
        """
        # Zero counts are skipped so that 0 * log2(0) contributes 0 instead of NaN.
        return sum(-1 * (part := count / total) * np.log2(part)
                   for count in counts if count > 0)

    def getCountBySpecies(self) -> pd.Series:
        """
        Calculates the count of each species in the dataset.

        Returns:
        - Pandas Series indexed by species name containing the count of each species.
        """
        return self.df.groupby('Species')['Species'].count()

    def calculateEntropy(self) -> float:
        """
        Calculates the entropy of the dataset.

        Returns:
        - Entropy value of the species distribution over all rows.
        """
        return self._entropy(self.getCountBySpecies(), len(self.df))

    def getGroupSizes(self) -> list:
        """
        Calculates the size of each group based on the number of groups specified.

        The rows are spread as evenly as possible: when the row count is not a
        multiple of groupsCount, the first groups receive one extra row each.

        Returns:
        - List containing the size of each group.
        """
        baseSize, remainder = divmod(len(self.df), self.groupsCount)
        return [baseSize + (1 if i < remainder else 0) for i in range(self.groupsCount)]

    def divideInGroups(self) -> dict:
        """
        Divides the dataset into groups based on feature values.

        For every column whose name ends with 'Cm', the rows are sorted by that
        column and sliced by position into consecutive groups of the sizes given
        by getGroupSizes. Rows with equal feature values can therefore end up on
        either side of a group boundary.

        Returns:
        - Dictionary mapping each feature name to a list with one Series per group,
          holding the species counts of that group.
        """
        # Positional slice boundaries, computed once: group i spans
        # boundaries[i]:boundaries[i + 1] of the sorted rows.
        boundaries = [0]
        for size in self.getGroupSizes():
            boundaries.append(boundaries[-1] + size)

        featureNames = [columnName for columnName in self.df.columns if columnName.endswith('Cm')]
        groupSets = dict()
        for featureName in featureNames:
            dataframeSorted = self.df.sort_values(by=featureName)
            groupSets[featureName] = [
                dataframeSorted.iloc[start:stop].groupby('Species')['Species'].count()
                for start, stop in zip(boundaries, boundaries[1:])
            ]

        return groupSets

    def evaluateGroups(self, sets, entropyGlobal, groupSizes, decisionTree):
        """
        Evaluates the groups and calculates discriminative power.

        The discriminative power of a feature is the global entropy minus the
        size-weighted sum of the entropies of its groups.

        Parameters:
        - sets: Dictionary containing groups for each feature.
        - entropyGlobal: Entropy value of the entire dataset.
        - groupSizes: Size of each group; groupSizes[i] weights the entropy of group i.
        - decisionTree: Instance of the IrisSpeciesDecisionTree class whose
          speciesNames and df are read.

        Returns:
        - Tuple (discriminativePowers, attrs): a list of (feature name, power) pairs
          sorted by power in descending order, and a list of DataFrames (one per
          feature, in the iteration order of `sets`) with species as rows and
          'Group N' columns holding the species counts.
        """
        totalCount = len(decisionTree.df)
        speciesOrder = list(decisionTree.speciesNames)

        discriminativePowers = []
        attrs = []
        for featureName, groups in sets.items():
            weightedEntropy = 0
            columns = {}
            for i, group in enumerate(groups):
                # Species missing from a group are reported with a count of 0.
                columns[f'Group {i + 1}'] = [group.get(name, 0) for name in speciesOrder]
                # Each group is weighted by its own share of the rows; a single
                # common weight is only valid when all groups have the same size.
                weightedEntropy += groupSizes[i] / totalCount * self._entropy(group, group.sum())

            discriminativePowers.append((featureName, entropyGlobal - weightedEntropy))
            attrs.append(pd.DataFrame(columns, index=speciesOrder))

        discriminativePowers.sort(key=lambda x: x[1], reverse=True)
        return (discriminativePowers, attrs)

In [7]:
# Build the helper with its default settings (three groups per feature)
decisionTree = IrisSpeciesDecisionTree()

# Entropy of the species distribution over the whole dataset
entropyGlobal = decisionTree.calculateEntropy()
print(entropyGlobal)

1.584962500721156


In [8]:
# For every feature column (name ending in 'Cm'): sort the rows by that feature,
# cut them into consecutive groups and count the species inside each group.
# `sets` maps feature name -> list of per-group species counts.
sets = decisionTree.divideInGroups()
pprint(sets)

{'PetalLengthCm': [Species
setosa    50
Name: Species, dtype: int64,
                   Species
versicolor    47
virginica      3
Name: Species, dtype: int64,
                   Species
versicolor     3
virginica     47
Name: Species, dtype: int64],
 'PetalWidthCm': [Species
setosa    50
Name: Species, dtype: int64,
                  Species
versicolor    46
virginica      4
Name: Species, dtype: int64,
                  Species
versicolor     4
virginica     46
Name: Species, dtype: int64],
 'SepalLengthCm': [Species
setosa        43
versicolor     6
virginica      1
Name: Species, dtype: int64,
                   Species
setosa         7
versicolor    31
virginica     12
Name: Species, dtype: int64,
                   Species
versicolor    13
virginica     37
Name: Species, dtype: int64],
 'SepalWidthCm': [Species
setosa         1
versicolor    30
virginica     19
Name: Species, dtype: int64,
                  Species
setosa        13
versicolor    16
virginica     21
Name: Species, 

In [9]:
# Group sizes are used to weight the group entropies during the evaluation
groupSizes = decisionTree.getGroupSizes()
# Score every feature; the result is sorted by discriminative power, highest first
discriminativePowers, attrs = decisionTree.evaluateGroups(sets, entropyGlobal, groupSizes, decisionTree)

# Report one line per feature, in ranked order
for featureName, power in discriminativePowers:
    print(f"Discr(Iris/{featureName}) = {power}")

Discr(Iris/PetalLengthCm) = 1.3666658879515052
Discr(Iris/PetalWidthCm) = 1.3168430405863076
Discr(Iris/SepalLengthCm) = 0.6474114469842496
Discr(Iris/SepalWidthCm) = 0.338428336537556


In [10]:
# Print the species-per-group table of every feature, labelled with the feature name.
# evaluateGroups appends one table per entry of `sets`, in the iteration order of
# `sets`, so zipping the two pairs each table with the feature it belongs to
# (this order differs from the ranking printed above, hence the label).
for featureName, atr in zip(sets, attrs):
    print(featureName)
    pprint(atr)
    print("")

            Group 1  Group 2  Group 3
virginica         1       12       37
versicolor        6       31       13
setosa           43        7        0

            Group 1  Group 2  Group 3
virginica        19       21       10
versicolor       30       16        4
setosa            1       13       36

            Group 1  Group 2  Group 3
virginica         0        3       47
versicolor        0       47        3
setosa           50        0        0

            Group 1  Group 2  Group 3
virginica         0        4       46
versicolor        0       46        4
setosa           50        0        0

