In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from collections import Counter

import functools

import math

import matplotlib.pyplot as plt

In [2]:
inputFile = "ID_data_mass_18122012_prepared.csv"

# The file is semicolon-separated and its first line holds the column names.
dataFrame = pd.read_csv(inputFile, sep = ';', header = 0)
print(dataFrame.shape)

# Plain array view of the frame; the tree code below iterates over its rows.
data = dataFrame.values

# shape is (rows, columns); the column count still includes the last two
# columns, which are used as the class designation.
numSamples, numFeatures = dataFrame.shape
print("Num samples == " + str(numSamples))
print("Num features == " + str(numFeatures))

# Positions of the second-to-last and last columns (the class designation).
gtotalColumnId = numFeatures - 2
kgfColumnId = numFeatures - 1

def getClassMarker(row):
    """Return the class label of a sample.

    The label is the string form of the row's last two values joined by '/',
    e.g. a row ending in (1, 0) yields '1/0'.
    """
    secondToLast, last = row[-2], row[-1]
    return '/'.join((str(secondToLast), str(last)))

(185, 33)
Num samples == 185
Num features == 33


In [3]:
def filterDataWithKnowledgeFromLab2(dataFrame):
    """Return a copy of dataFrame without a fixed set of columns.

    The columns are selected by position (0-based) in the incoming frame and
    dropped by label; the input frame itself is not modified.
    NOTE(review): judging by the function name the positions come from an
    earlier lab's analysis -- confirm they still match the CSV layout.
    """
    droppedPositions = [27, 28, 29, 30,
                        5, 7, 8, 3,
                        10, 11, 13, 14, 15,
                        17, 18, 19, 20,
                        23, 24, 26]
    droppedLabels = [dataFrame.columns[position] for position in droppedPositions]
    return dataFrame.drop(droppedLabels, axis='columns')

In [4]:
def giniCoefficient(groups, classes):
    """Compute the size-weighted Gini index of a split.

    Args:
        groups: iterable of groups, each a sequence of rows; a row's class is
            obtained with getClassMarker(row).
        classes: class markers to score each group against.

    Returns:
        The sum over non-empty groups of (1 - sum(p_class ** 2)) weighted by
        the group's share of all samples. 0.0 means every group is pure.
    """
    numOfSamples = float(sum(len(group) for group in groups))

    # Total Gini index
    result = 0.0

    for group in groups:
        size = float(len(group))
        if size == 0:
            # An empty group contributes nothing (and would divide by zero).
            continue

        # Tally the group's markers once instead of rebuilding and scanning
        # the marker list for every class.
        markerCounts = Counter(getClassMarker(row) for row in group)

        score = 0.0
        for class_val in classes:
            # Counter yields 0 for classes absent from this group.
            p = markerCounts[class_val] / size
            score += p * p

        groupWeight = size / numOfSamples
        result += (1.0 - score) * groupWeight
    return result

# Sanity checks for giniCoefficient on splits that can be verified by hand.
# The values are still printed, but are now also asserted so a regression
# fails loudly instead of relying on someone reading the output.

# Each group holds one sample of each class -> coefficient should be 0.5
mixedSplitGini = giniCoefficient([[[1, 1, 1], [1, 1, 0]],
                                  [[1, 1, 1], [1, 1, 0]]],
                                 ['1/0', '1/1'])
print(mixedSplitGini)
assert math.isclose(mixedSplitGini, 0.5, abs_tol=1e-12), mixedSplitGini

# Each group holds a single class -> coefficient should be 0
pureSplitGini = giniCoefficient([[[1, 1, 0], [1, 1, 0]],
                                 [[1, 1, 1], [1, 1, 1]]],
                                ['1/0', '1/1'])
print(pureSplitGini)
assert math.isclose(pureSplitGini, 0.0, abs_tol=1e-12), pureSplitGini

0.5
0.0


In [5]:
def splitData(featureIndex, splitValue, data):
    left = list()
    right = list()
    
    for row in data:
        if (row[featureIndex] < splitValue):
            left.append(row)
        else:
            right.append(row)
            
    return left, right

# Quick check: splitting on the first feature at 1 separates the two rows.
print(splitData(featureIndex=0, splitValue=1, data=[[0, 1], [2, 0]]))

([[0, 1]], [[2, 0]])


In [6]:
# print (list(set(classMarker(row) for row in data)))

def findBestSplit(data, numNonClassMarkerFeatures):
    """Search for the (feature, threshold) pair with the lowest Gini index.

    Every distinct value that occurs in a feature column is tried as a
    threshold for that feature; rows are split with splitData and the split is
    scored with giniCoefficient. The first candidate reaching the lowest score
    wins (later ties do not replace it).

    Args:
        data: sequence of rows; the last two values of each row form its class
            marker (see getClassMarker). Must be iterable more than once.
        numNonClassMarkerFeatures: number of leading columns to consider as
            features. Feature values must be hashable.

    Returns:
        A dict with keys 'featureId', 'splitValue' and 'dataGroups', the last
        being the (left, right) pair of row lists for the chosen split.
    """
    # Sorted so the summation order inside giniCoefficient -- and therefore
    # floating-point tie-breaking between splits -- does not depend on the
    # per-process string hash seed that governs set iteration order.
    classMarkers = sorted(set(getClassMarker(row) for row in data))

    bestFeatureId = 0
    bestSplitValue = 0
    bestGiniCoeff = float('inf')
    # Fallback for the degenerate cases (no rows or no features to test):
    # a one-sided split that the caller can still unpack into two groups.
    bestSplit = ([], list(data))

    for featureId in range(numNonClassMarkerFeatures):
        # A threshold equal to one already tried yields the same partition and
        # the same score, which can never beat the stored best under the
        # strict '<' below, so it is skipped.
        triedValues = set()
        for row in data:
            candidate = row[featureId]
            if candidate in triedValues:
                continue
            triedValues.add(candidate)

            splittedData = splitData(featureId, candidate, data)
            coeff = giniCoefficient(splittedData, classMarkers)

            if coeff < bestGiniCoeff:
                bestFeatureId = featureId
                bestSplitValue = candidate
                bestGiniCoeff = coeff
                bestSplit = splittedData

    return {'featureId' : bestFeatureId,
            'splitValue' : bestSplitValue,
            'dataGroups' : bestSplit}

# Four samples, one feature; the last two columns are the class marker.
testDataset = [
    [0.5, 1, 0],
    [1, 1, 0],
    [1.5, 2, 0],
    [2, 2, 0],
]
split = findBestSplit(testDataset, 1)

# Features are reported 1-based (X1, X2, ...).
splitDescription = (split['featureId'] + 1, split['splitValue'])
print('Split: [X%d < %.3f]' % splitDescription)

Split: [X1 < 1.500]


In [7]:
def createTerminalNode(group):
    """Return the most frequent class marker among the rows of group.

    Markers are tallied in a single pass. When several markers share the
    highest count, the one that occurs first in group is returned, so the
    result no longer depends on set iteration order (which varies with the
    string hash seed between interpreter runs).

    Raises:
        ValueError: if group is empty.
    """
    markerCounts = Counter(getClassMarker(row) for row in group)
    # max() returns the first maximal key; Counter iterates in first-seen order.
    return max(markerCounts, key=markerCounts.get)

def splitNode(node, maxDepth, minNodeSize, numNonClassMarkerFeatures, depth):
    """Recursively grow the subtree below node, in place.

    node is a dict produced by findBestSplit. Its 'dataGroups' entry is
    removed and replaced by 'left' and 'right' entries, each either a class
    marker (leaf) or another node dict.

    Args:
        node: split node to expand.
        maxDepth: once depth reaches this value both children become leaves.
        minNodeSize: a group with at most this many rows becomes a leaf.
        numNonClassMarkerFeatures: forwarded to findBestSplit.
        depth: depth of node.
    """
    left, right = node['dataGroups']
    node.pop('dataGroups')

    # One side is empty: the split separated nothing, so both children share
    # a single leaf built from all rows.
    if not (left and right):
        sharedLeaf = createTerminalNode(left + right)
        node['left'] = sharedLeaf
        node['right'] = sharedLeaf
        return

    children = (('left', left), ('right', right))

    # Depth limit reached: stop splitting and label both sides.
    if depth >= maxDepth:
        for side, group in children:
            node[side] = createTerminalNode(group)
        return

    # Otherwise handle the left child first, then the right one.
    for side, group in children:
        if len(group) <= minNodeSize:
            node[side] = createTerminalNode(group)
            continue
        node[side] = findBestSplit(group, numNonClassMarkerFeatures)
        splitNode(node[side], maxDepth, minNodeSize, numNonClassMarkerFeatures, depth + 1)
        
def buildDecisionTree(data, maxDepth, minNodeSize, numNonClassMarkerFeatures):
    """Build a decision tree for data and return its root node.

    The root split is chosen with findBestSplit and then expanded in place by
    splitNode, starting at depth 1.
    """
    tree = findBestSplit(data, numNonClassMarkerFeatures)
    splitNode(tree, maxDepth, minNodeSize, numNonClassMarkerFeatures, depth=1)
    return tree

In [8]:
def printTree(node, depth):
    """Print the tree rooted at node, indenting one space per level.

    Split nodes (dicts) are shown as '[X<feature number> < <threshold>]' with
    1-based feature numbers; anything else is treated as a leaf and shown as
    '[<value>]'.
    """
    indent = depth * ' '

    if not isinstance(node, dict):
        print('{0}[{1}]'.format(indent, node))
        return

    print('{0}[X{1} < {2}]'.format(indent, node['featureId'] + 1, node['splitValue']))
    for side in ('left', 'right'):
        printTree(node[side], depth + 1)
        
# Build from dataFrame.values rather than the global `data`: a later cell
# rebinds `data` to the column-filtered array, for which gtotalColumnId is no
# longer a valid feature count, so re-running this cell out of order would
# otherwise fail or silently use the wrong columns.
allDataTree = buildDecisionTree(dataFrame.values,
                                maxDepth=9999,   # effectively unlimited depth
                                minNodeSize=1,   # split until a group has one row
                                numNonClassMarkerFeatures=gtotalColumnId)
printTree(allDataTree, 0)

[X31 < 0.6847]
 [X15 < 145.72]
  [X2 < 18.06.07]
   [X1 < 22501]
    [nan/180.0]
    [nan/180.0]
   [nan/241.0]
  [X17 < 0.32]
   [X2 < 18.05.07]
    [X2 < 10.05.07]
     [nan/188.0]
     [X1 < 807]
      [nan/178.0]
      [nan/178.0]
    [X4 < 9.53]
     [nan/270.0]
     [nan/251.0]
   [X2 < 16.01.09]
    [X16 < 98.2]
     [X11 < 65.0]
      [X13 < 104.85]
       [X10 < 95.88]
        [X1 < 21303]
         [nan/139.0]
         [nan/139.0]
        [X4 < 11.11]
         [X1 < 807]
          [2.78/311.91]
          [nan/172.0]
         [X1 < 807]
          [nan/141.0]
          [nan/141.0]
       [X19 < 2868.6]
        [X10 < 100.2]
         [X1 < 21303]
          [nan/180.0]
          [X1 < 22501]
           [X2 < 09.12.08]
            [nan/198.0]
            [nan/174.0]
           [nan/219.0]
         [X2 < 09.02.08]
          [nan/218.0]
          [X4 < 11.11]
           [X1 < 21002]
            [nan/200.0]
            [nan/200.0]
           [nan/161.0]
        [X10 < 101.6]
         

In [9]:
# From here on `data` holds only the columns kept by the Lab 2 filter.
data = filterDataWithKnowledgeFromLab2(dataFrame).values

# Every column except the trailing two (the class designation) is a feature.
optimizedTree = buildDecisionTree(data,
                                  maxDepth=9999,
                                  minNodeSize=1,
                                  numNonClassMarkerFeatures=len(data[0]) - 2)
printTree(optimizedTree, depth=0)

[X1 < 21202]
 [X5 < 121.68]
  [X10 < 41.22]
   [X2 < 14.07.08]
    [X4 < 250.0]
     [X4 < 232.0]
      [nan/162.0]
      [X1 < 807]
       [nan/169.0]
       [nan/169.0]
     [X1 < 21002]
      [X2 < 10.05.07]
       [nan/188.0]
       [nan/178.0]
      [X2 < 12.07.08]
       [nan/200.0]
       [nan/180.0]
    [X2 < 19.05.07]
     [X2 < 18.05.07]
      [X2 < 15.07.08]
       [nan/160.0]
       [X2 < 16.07.08]
        [nan/156.0]
        [nan/153.0]
      [X4 < 251.29]
       [nan/251.0]
       [nan/270.0]
     [X2 < 20.05.07]
      [nan/236.0]
      [nan/217.0]
   [X1 < 20503]
    [X7 < 104.6]
     [X1 < 807]
      [X2 < 06.06.08]
       [2.78/311.91]
       [X2 < 07.06.08]
        [3.7/288.6]
        [X2 < 08.06.08]
         [4.52/248.79]
         [X2 < 09.06.08]
          [5.22/223.56]
          [X2 < 17.06.11]
           [5.77/215.15]
           [3.08/241.13]
      [X2 < 30.08.09]
       [nan/141.0]
       [nan/157.0]
     [X4 < 239.42]
      [X1 < 807]
       [nan/172.0]
       [n

In [12]:
def predict(node, dataForPrediction):
    """Return the leaf value the tree assigns to one sample.

    Starting at node, follow 'left' when the sample's value at the node's
    'featureId' is strictly below 'splitValue' and 'right' otherwise, until a
    child that is not a dict (a leaf) is reached.
    """
    while True:
        goesLeft = dataForPrediction[node['featureId']] < node['splitValue']
        node = node['left'] if goesLeft else node['right']
        if not isinstance(node, dict):
            return node
        
# Classify the first row of `data` with the optimized tree.
firstSample = data[0]
print(predict(optimizedTree, firstSample))

2.78/311.91
