# ID3 Learner

## Setup

In [183]:
import numpy as np

# CSV file holding the data set; ReadData uses the last column as the target.
_dataSetFileName = 'PlayTennis.csv'

# Load the feature names, the feature matrix and the target column
# (ReadData also prints the raw table).
_features, _data, _target = ReadData(_dataSetFileName)



raw data : 

     Outlook Temperature Humidity    Wind Play Tennis
0      Sunny         Hot     High    Weak          No
1      Sunny         Hot     High  Strong          No
2   Overcast         Hot     High    Weak         Yes
3       Rain        Mild     High    Weak         Yes
4       Rain        Cool   Normal    Weak         Yes
5       Rain        Cool   Normal  Strong          No
6   Overcast        Cool   Normal  Strong         Yes
7      Sunny        Mild     High    Weak          No
8      Sunny        Cool   Normal    Weak         Yes
9       Rain        Mild   Normal    Weak         Yes
10     Sunny        Mild   Normal  Strong         Yes
11  Overcast        Mild     High  Strong         Yes
12  Overcast         Hot   Normal    Weak         Yes
13      Rain        Mild     High  Strong          No


In [186]:
# Build and train a decision tree with the information-gain criterion.
trainer = ID3_Trainer(_features, _data, _target, ID3_Algorythm.INFO_GAIN)
# Other selectable criteria: ID3_Algorythm.GAIN_RATIO, ID3_Algorythm.GINI_INDEX

# Entropy of the full target column, as stored on the root node by Train.
print(f'Total Entropy : {trainer.ID3.Entropy}')
# Bare expression so the notebook displays the trained tree (via its repr).
trainer.ID3



Total Entropy : 0.9402859586706311


ID3{Outlook,['Overcast' 'Rain' 'Sunny'],['Yes' ID3{Wind,['Strong' 'Weak'],['No' 'Yes']}
 ID3{Humidity,['High' 'Normal'],['No' 'Yes']}]}

## ID3 :

### Entities

In [185]:
from enum import Enum

class ID3_Algorythm(Enum):
    """Split criterion used to pick the best feature at each tree node."""

    # Maximise information gain.
    INFO_GAIN = 0
    # Maximise gain ratio.
    GAIN_RATIO = 1
    # Minimise the weighted Gini index.
    GINI_INDEX = 2

class ID3_Trainer:
    """Create and train the decision tree matching a chosen split criterion.

    The trained root node is exposed through the read-only ``ID3`` property.
    """

    @property
    def ID3(self):
        """Trained root node, or ``None`` when the algorithm was invalid."""
        return self.__ID3

    def __init__(self, featurs, data, target, id3_Algorythm):
        """Instantiate the tree for ``id3_Algorythm`` and train it.

        Args:
            featurs: Feature (column) names, one per column of ``data``.
            data: 2-D table of feature values, one row per sample.
            target: Class label of every row in ``data``.
            id3_Algorythm: ``ID3_Algorythm`` member selecting the criterion.

        An unknown ``id3_Algorythm`` is reported with a printed message and
        no tree is built.
        """
        # Set first so the ``ID3`` property is always readable; previously the
        # invalid-algorithm path left the attribute undefined and reading the
        # property raised AttributeError.
        self.__ID3 = None

        if id3_Algorythm == ID3_Algorythm.INFO_GAIN:
            self.__ID3 = IG_ID3()
        elif id3_Algorythm == ID3_Algorythm.GAIN_RATIO:
            self.__ID3 = GR_ID3()
        elif id3_Algorythm == ID3_Algorythm.GINI_INDEX:
            self.__ID3 = GI_ID3()
        else:
            print('invalid Algorythm')
            return

        self.__ID3.Train(featurs, data, target)
        
class ID3:
    """Base node of an ID3 decision tree.

    A node splits on one feature (``Name``). ``Keys`` holds the values that
    feature takes in the training data and ``Values`` holds, position by
    position, either a class label (leaf) or a child node (sub-tree).

    Subclasses provide the split criterion by overriding
    ``_CalculateBestFeature`` and ``_Instantiate``.
    """

    # Class-level defaults keep the properties readable before Train runs,
    # even for subclasses that never call ID3.__init__. They are only ever
    # rebound per instance by Train, never mutated in place.
    __Name = ''
    @property
    def Name(self):
        """Name of the feature this node splits on."""
        return self.__Name

    __Keys = []
    @property
    def Keys(self):
        """Values of the split feature seen in the training data."""
        return self.__Keys

    __Values = []
    @property
    def Values(self):
        """Outcome for each key: a class label or a child node."""
        return self.__Values

    __TotalEntropy = 0
    @property
    def Entropy(self):
        """Entropy of the target labels this node was trained on."""
        return self.__TotalEntropy

    def __str__(self):
        return f'ID3{{{self.__Name},{self.__Keys},{self.__Values}}}'

    def __repr__(self):
        return str(self)

    def _CalculateEntropy(self, target):
        """Return the Shannon entropy (base 2) of the labels in ``target``.

        Args:
            target: Sequence or array of class labels.

        Returns:
            The entropy in bits; ``0`` for an empty ``target``.
        """
        # Counting through np.unique works for plain sequences as well as
        # arrays and avoids one full comparison pass per class.
        _, counts = np.unique(np.asarray(target), return_counts=True)
        totalEntries = counts.sum()
        entropy = 0
        for count in counts:
            probability = count / totalEntries
            entropy -= probability * np.log2(probability)
        return entropy

    def _SliceDataByFeatureIndex(self, data, featureIndex):
        """Return one feature column and its sorted unique values.

        Args:
            data: 2-D table of feature values.
            featureIndex: Column index of the feature.

        Returns:
            Tuple ``(column, unique values of the column)``.
        """
        featureData = np.asarray(data)[:, featureIndex]
        return featureData, np.unique(featureData)

    def _CalculateBestFeature(self, entropy, features, data, target):
        """Select the feature to split on; must be overridden.

        Returns:
            Tuple ``(feature name, feature column index)``.

        Raises:
            NotImplementedError: Always, in this base class.
        """
        raise NotImplementedError('subclasses must implement _CalculateBestFeature')

    def _Instantiate(self):
        """Return a new, untrained node of the same kind; must be overridden.

        Raises:
            NotImplementedError: Always, in this base class.
        """
        raise NotImplementedError('subclasses must implement _Instantiate')

    def Train(self, features, data, target):
        """Fit this node, and recursively its children, to the data.

        Args:
            features: Feature names, one per column of ``data``.
            data: 2-D table of feature values, one row per sample.
            target: Class label of every row in ``data``.
        """
        targetArray = np.asarray(target)
        self.__TotalEntropy = self._CalculateEntropy(target)
        self.__Name, featureIndex = self._CalculateBestFeature(
            self.__TotalEntropy, features, data, target)

        featureData, featurePossibleValues = self._SliceDataByFeatureIndex(data, featureIndex)
        self.__Keys = featurePossibleValues

        # Children only see the features that have not been used yet.
        newFeatures = np.delete(features, featureIndex, 0)
        newData = np.delete(data, featureIndex, 1)

        tmpValues = []
        for value in self.__Keys:
            valueIndecies = np.where(featureData == value)
            newTarget = targetArray[valueIndecies]
            labels, counts = np.unique(newTarget, return_counts=True)
            if len(labels) == 1:
                # Pure subset: store the label itself as a leaf.
                tmpValues.append(labels[0])
            elif len(newFeatures) == 0:
                # No feature left to split on (rows with identical features
                # but different labels): fall back to the majority label
                # instead of recursing into an empty feature set. Ties go to
                # the first label in sorted order.
                tmpValues.append(labels[counts.argmax()])
            else:
                child = self._Instantiate()
                child.Train(newFeatures, newData[valueIndecies], newTarget)
                tmpValues.append(child)

        self.__Values = np.array(tmpValues)

    def Resolve(self, key):
        """Return the outcome stored for ``key``.

        Args:
            key: A value of this node's split feature.

        Returns:
            The class label or child node associated with ``key``.

        Raises:
            ValueError: If ``key`` is not one of ``Keys``.
        """
        # Keys is a numpy array after training and has no .index method.
        return self.__Values[list(self.__Keys).index(key)]
    
        
class IG_ID3(ID3):
    """ID3 node that splits on the feature with the highest information gain."""

    def __init__(self):
        # A bare ``super()`` only builds a proxy object; call the initialiser.
        super().__init__()

    def _Instantiate(self):
        """Return a new, untrained information-gain node."""
        return IG_ID3()

    def _CalculateBestFeature(self, entropy, features, data, target):
        """Pick the feature whose split yields the largest information gain.

        Args:
            entropy: Entropy of ``target`` before splitting.
            features: Feature names, one per column of ``data``.
            data: 2-D table of feature values.
            target: Class label of every row in ``data``.

        Returns:
            Tuple ``(feature name, feature column index)``.
        """
        # Converted once instead of once per feature value.
        targetArray = np.asarray(target)
        totalEntries = len(data)

        # gain = entropy - sum_v (|S_v| / |S|) * entropy(S_v)
        gains = np.full(len(features), entropy, dtype=float)
        for featureIndex in range(len(features)):
            featureData, featurePossibleValues = self._SliceDataByFeatureIndex(data, featureIndex)
            for value in featurePossibleValues:
                rows = featureData == value
                weight = np.count_nonzero(rows) / totalEntries
                gains[featureIndex] -= weight * self._CalculateEntropy(targetArray[rows])

        featureIndex = gains.argmax()
        return features[featureIndex], featureIndex
    
class GR_ID3(ID3):
    """ID3 node that splits on the feature with the highest gain ratio."""

    def __init__(self):
        # A bare ``super()`` only builds a proxy object; call the initialiser.
        super().__init__()

    def _Instantiate(self):
        """Return a new, untrained gain-ratio node."""
        return GR_ID3()

    def _CalculateBestFeature(self, entropy, features, data, target):
        """Pick the feature with the largest gain ratio.

        Gain ratio is the information gain of a split divided by the split
        information, i.e. the entropy of the feature's own value
        distribution.

        Args:
            entropy: Entropy of ``target`` before splitting.
            features: Feature names, one per column of ``data``.
            data: 2-D table of feature values.
            target: Class label of every row in ``data``.

        Returns:
            Tuple ``(feature name, feature column index)``.
        """
        targetArray = np.asarray(target)
        totalEntries = len(data)

        ratios = np.zeros(len(features), dtype=float)
        for featureIndex in range(len(features)):
            featureData, featurePossibleValues = self._SliceDataByFeatureIndex(data, featureIndex)

            # Entropy that remains after splitting on this feature.
            conditionalEntropy = 0.0
            for value in featurePossibleValues:
                rows = featureData == value
                weight = np.count_nonzero(rows) / totalEntries
                conditionalEntropy += weight * self._CalculateEntropy(targetArray[rows])

            informationGain = entropy - conditionalEntropy
            splitInformation = self._CalculateEntropy(featureData)

            # A single-valued feature has zero split information (and zero
            # gain); leave its ratio at 0 rather than dividing by zero.
            if splitInformation > 0:
                ratios[featureIndex] = informationGain / splitInformation

        featureIndex = ratios.argmax()
        return features[featureIndex], featureIndex
  

class GI_ID3(ID3):
    """ID3 node that splits on the feature with the lowest weighted Gini index."""

    def __init__(self):
        # A bare ``super()`` only builds a proxy object; call the initialiser.
        super().__init__()

    def _Instantiate(self):
        """Return a new, untrained Gini-index node."""
        return GI_ID3()

    def __CalculateGINI(self, target):
        """Return the Gini impurity ``1 - sum(p_i ** 2)`` of ``target``.

        Args:
            target: Sequence or array of class labels.
        """
        _, counts = np.unique(np.asarray(target), return_counts=True)
        totalEntries = counts.sum()
        gini = 1
        for count in counts:
            probability = count / totalEntries
            gini -= probability * probability
        return gini

    def _CalculateBestFeature(self, entropy, features, data, target):
        """Pick the feature whose split has the smallest weighted Gini index.

        Args:
            entropy: Entropy of ``target``; not used by this criterion.
            features: Feature names, one per column of ``data``.
            data: 2-D table of feature values.
            target: Class label of every row in ``data``.

        Returns:
            Tuple ``(feature name, feature column index)``.
        """
        targetArray = np.asarray(target)
        totalEntries = len(data)

        # Weighted impurity of the subsets produced by each feature.
        ginis = np.zeros(len(features), dtype=float)
        for featureIndex in range(len(features)):
            featureData, featurePossibleValues = self._SliceDataByFeatureIndex(data, featureIndex)
            for value in featurePossibleValues:
                rows = featureData == value
                weight = np.count_nonzero(rows) / totalEntries
                ginis[featureIndex] += weight * self.__CalculateGINI(targetArray[rows])

        featureIndex = ginis.argmin()
        return features[featureIndex], featureIndex


## Reading Dataset

In [133]:
from pandas import read_csv

def ReadData(fileName):
    """Load a CSV data set and split it into features and target.

    The raw table is printed. The last column is taken as the target; every
    other column is a feature.

    Args:
        fileName: Path of the CSV file to read.

    Returns:
        Tuple ``(feature names, feature value table, target column)``, each
        a numpy array.
    """
    frame = read_csv(fileName)
    print(f'\nraw data : \n\n{frame}')

    header = np.array(frame.columns)
    table = np.array(frame)

    # Everything but the last column is a feature; the last is the target.
    return header[:-1], table[:, :-1], table[:, -1]