In [1]:
import pandas as pd
import math

# convert values of the dataset from str to double
def strToDouble(dataSet):
    """Convert string cells of a row-major table to floats, in place.

    Only the first ``len(dataSet[0])`` cells of every row are examined;
    cells that are not ``str`` instances are left untouched.

    Args:
        dataSet: list of mutable rows (lists of cells).

    Returns:
        The same ``dataSet`` object, after the in-place conversion.

    Raises:
        ValueError: if a string cell cannot be parsed by ``float``.
    """
    for record in dataSet:
        # The column count is taken from the first row, not from each row.
        for col in range(len(dataSet[0])):
            cell = record[col]
            if isinstance(cell, str):
                record[col] = float(cell)
    return dataSet

# find the min and max value for each column
def findMinMax(dataSet):
    """Return the ``[min, max]`` pair of every column of a row-major table.

    The number of columns is taken from the first row.

    Args:
        dataSet: non-empty list of rows with mutually comparable cells.

    Returns:
        A list with one ``[colMin, colMax]`` entry per column.
    """
    def _bounds(values):
        # max is evaluated before min, then the pair is stored as [min, max].
        hi = max(values)
        lo = min(values)
        return [lo, hi]

    return [
        _bounds([record[col] for record in dataSet])
        for col in range(len(dataSet[0]))
    ]

# normalize dataset
def normalizeData(dataSet, minMax):
    """Min-max scale every column of a row-major table, in place.

    Each cell becomes ``(value - colMin) / (colMax - colMin)``. A column
    whose minimum equals its maximum carries no spread, so all of its cells
    are mapped to ``0.0`` instead of raising ``ZeroDivisionError``.

    Args:
        dataSet: list of mutable rows holding numeric cells.
        minMax: one ``[colMin, colMax]`` entry per column, e.g. the result
            of ``findMinMax(dataSet)``.

    Returns:
        The same ``dataSet`` object, after the in-place scaling.
    """
    for row in dataSet:
        # The column count is taken from the first row.
        for i in range(len(dataSet[0])):
            colMin = minMax[i][0]
            span = minMax[i][1] - colMin
            if span == 0:
                # Constant column: avoid dividing by zero.
                row[i] = 0.0
            else:
                row[i] = (row[i] - colMin) / span
    return dataSet

# extract column from a matrix
def extractCol(dataSet, i):
    """Return column ``i`` of a row-major table as a new list.

    Args:
        dataSet: list of indexable rows.
        i: index of the column to collect.

    Returns:
        A new list holding ``row[i]`` for every row, in row order.
    """
    column = []
    for record in dataSet:
        column.append(record[i])
    return column

# define information entropy
def entropy(classCount):
    """Shannon entropy (in bits) of a class-count vector.

    Works for any number of classes, not only two. Classes with a zero
    count contribute nothing (the ``0 * log(0)`` term is taken as 0).

    Args:
        classCount: sequence of non-negative observation counts, one per class.

    Returns:
        The entropy as a float. An empty node (all counts zero, or no
        counts at all) has entropy ``0.0`` instead of raising
        ``ZeroDivisionError``.
    """
    total = sum(classCount)
    if total == 0:
        # Empty node, e.g. a split that sends every row to the other child.
        return 0.0

    result = 0.0
    for count in classCount:
        if count == 0:
            continue
        ratio = count / total
        # Subtract term by term so a pure node yields 0.0 rather than -0.0.
        result = result - ratio * math.log2(ratio)
    return result

# define Gini Index
def giniIndex(classCount):
    """Gini impurity of a class-count vector.

    Computed as ``1 - sum(p_k ** 2)`` over the class proportions ``p_k``.
    Works for any number of classes, not only two.

    Args:
        classCount: sequence of non-negative observation counts, one per class.

    Returns:
        The Gini impurity as a float. An empty node (all counts zero, or no
        counts at all) yields ``0.0`` instead of raising
        ``ZeroDivisionError``.
    """
    total = sum(classCount)
    if total == 0:
        # Empty node, e.g. a split that sends every row to the other child.
        return 0.0

    gini = 1.0
    for count in classCount:
        ratio = count / total
        gini = gini - ratio ** 2
    return gini

In [None]:
# Load the raw observations. Column 0 is used as the class label further
# down; the remaining columns are treated as numeric features.
data = pd.read_csv('DecTreeAssign1.csv')   # read_csv already returns a DataFrame
featureMat = data.columns.tolist()
vals = data.values
dataList = vals.tolist()

# Detach the class label and build an independent copy of every row.
# list.copy() is shallow: the inner row lists would still be shared with
# dataList, so deleting the label and normalizing would overwrite the raw data.
firstCol = extractCol(dataList, 0)
dataNormalized = [row[1:] for row in dataList]

# normalize the input data (feature columns only; labels are re-attached later)
dataNormalized = strToDouble(dataNormalized)
dataNormalized = normalizeData(dataNormalized, findMinMax(dataNormalized))

# Mean of every feature column of the normalized data. The class label is
# still detached at this point, so meanVec[k] belongs to feature column k.
numRows = len(dataNormalized)
meanVec = [0.0] * len(dataNormalized[0])
for featureIdx in range(len(meanVec)):
    running = 0
    # Accumulate left to right, one value at a time.
    for value in extractCol(dataNormalized, featureIdx):
        running = running + value
    meanVec[featureIdx] = running / numRows

# Put the class label back at the front of every row.
for label, row in zip(firstCol, dataNormalized):
    row.insert(0, label)
    
# Tally the observations labelled 'B' and 'M'; any other label is ignored.
numB = firstCol.count('B')
numM = firstCol.count('M')

# Class-count vector for the root node, ordered [B, M].
classCount = [numB, numM]

# Driver Code: report class balance and the impurity of the unsplit root.
print("Class B observations: " + str(numB))
print("Class M observations: " + str(numM))
print('Total observations: ' + str(len(dataNormalized)))
print(f'Entropy for the root: {entropy(classCount) : .4f}')
print(f'Gini for the root: {giniIndex(classCount) : .4f}')

# calculate the gini and entropy for the child nodes after splitting by the feature mean
# Column 0 of dataNormalized is the class label, so feature k lives in
# column k + 1 and its split threshold is meanVec[k].
# The number of features is derived from the data rather than hard-coded,
# so the cell also works for tables that do not have exactly 30 features.
numFeatures = len(dataNormalized[0]) - 1
numObs = len(dataNormalized)

child1BCount = [0] * numFeatures
child1MCount = [0] * numFeatures
child1Entropy = [0.0] * numFeatures
child1Gini = [0.0] * numFeatures
child2BCount = [0] * numFeatures
child2MCount = [0] * numFeatures
child2Entropy = [0.0] * numFeatures
child2Gini = [0.0] * numFeatures
combinedEntropy = [0.0] * numFeatures
combinedGini = [0.0] * numFeatures
rootEntropy = entropy(classCount)

for i in range(1, numFeatures + 1):
    # child 1: rows with value < mean; child 2: rows with value >= mean
    child1 = 0
    child2 = 0
    child1B = 0
    child1M = 0
    child2B = 0
    child2M = 0
    for row in dataNormalized:
        if row[i] < meanVec[i-1]:
            child1 = child1 + 1
            # any label other than 'B' is counted as M
            if row[0] == 'B':
                child1B = child1B + 1
            else:
                child1M = child1M + 1
        else:
            child2 = child2 + 1
            if row[0] == 'B':
                child2B = child2B + 1
            else:
                child2M = child2M + 1

    child1BCount[i-1] = child1B
    child1MCount[i-1] = child1M
    child2BCount[i-1] = child2B
    child2MCount[i-1] = child2M

    BMCount1 = [child1B, child1M]
    BMCount2 = [child2B, child2M]
    child1Gini[i-1] = giniIndex(BMCount1)
    child1Entropy[i-1] = entropy(BMCount1)
    child2Gini[i-1] = giniIndex(BMCount2)
    child2Entropy[i-1] = entropy(BMCount2)

    # Weighted impurity of the two children (weights = share of observations).
    combinedGini[i-1] = (child1 / numObs) * child1Gini[i-1] + (child2 / numObs) * child2Gini[i-1]
    # Stored value is the entropy reduction: root entropy minus weighted child entropy.
    combinedEntropy[i-1] = rootEntropy - ((child1 / numObs) * child1Entropy[i-1] + (child2 / numObs) * child2Entropy[i-1])

# print child's gini in assigned format
# featureName, feature mean value (after scaling), numChild1B, numChild1M, child1Gini, child1Entropy,
# numChild2B, numChild2M, child2Gini, child2Entropy, combinedGini, combinedEntropy
res = []
for k in range(numFeatures):
    res.append([
        featureMat[k+1],              # skip the label column's header
        round(meanVec[k], 4),
        child1BCount[k],
        child1MCount[k],
        round(child1Gini[k], 4),
        round(child1Entropy[k], 4),
        child2BCount[k],
        child2MCount[k],
        round(child2Gini[k], 4),
        round(child2Entropy[k], 4),
        round(combinedGini[k], 4),
        round(combinedEntropy[k], 4),
    ])

print(res)
    
    
    


            
            
            
            
            
            
            
            
            
            
            
            






Class B observations: 357
Class M observations: 212
Total observations: 569
Entropy for the root:  0.9526
Gini for the root:  0.4675
[['F00', 0.3382, 311, 32, 0.1692, 0.4474, 46, 180, 0.3242, 0.729, 0.2308, 0.3934], ['F01', 0.324, 254, 52, 0.2821, 0.6576, 103, 160, 0.4765, 0.9658, 0.372, 0.1526], ['F02', 0.3329, 313, 30, 0.1596, 0.4279, 44, 182, 0.3136, 0.7112, 0.2208, 0.4122], ['F03', 0.2169, 324, 41, 0.1994, 0.5069, 33, 171, 0.2712, 0.6385, 0.2252, 0.3985], ['F04', 0.3948, 222, 67, 0.3562, 0.7812, 135, 145, 0.4994, 0.9991, 0.4266, 0.0642], ['F05', 0.2606, 282, 44, 0.2335, 0.5709, 75, 168, 0.4268, 0.8916, 0.316, 0.2448], ['F06', 0.2081, 319, 28, 0.1484, 0.4046, 38, 184, 0.2837, 0.6604, 0.2012, 0.4482], ['F07', 0.2431, 325, 18, 0.0994, 0.2968, 32, 194, 0.2431, 0.5884, 0.1565, 0.54], ['F08', 0.3796, 227, 78, 0.3807, 0.8202, 130, 134, 0.4999, 0.9998, 0.436, 0.0491], ['F09', 0.2704, 205, 123, 0.4688, 0.9544, 152, 89, 0.4658, 0.9501, 0.4675, 0.0], ['F10', 0.1063, 313, 61, 0.273, 0.6417, 44

In [None]:
# Persist the per-feature split summary held in res (one row per feature).
# Default to_csv settings are used, so the index and the integer column
# labels are written as well.
outMat = pd.DataFrame(data=res)
outMat.to_csv(path_or_buf='DecisionTree.csv')