# Testing

In [1]:
import pandas as pd
import numpy as np
from math import log

In [2]:
df = pd.read_csv("nursery.data")

In [3]:
df

Unnamed: 0,usual,proper,complete,1,convenient,convenient.1,nonprob,recommended,recommend
0,usual,proper,complete,1,convenient,convenient,nonprob,priority,priority
1,usual,proper,complete,1,convenient,convenient,nonprob,not_recom,not_recom
2,usual,proper,complete,1,convenient,convenient,slightly_prob,recommended,recommend
3,usual,proper,complete,1,convenient,convenient,slightly_prob,priority,priority
4,usual,proper,complete,1,convenient,convenient,slightly_prob,not_recom,not_recom
...,...,...,...,...,...,...,...,...,...
12954,great_pret,very_crit,foster,more,critical,inconv,slightly_prob,priority,spec_prior
12955,great_pret,very_crit,foster,more,critical,inconv,slightly_prob,not_recom,not_recom
12956,great_pret,very_crit,foster,more,critical,inconv,problematic,recommended,spec_prior
12957,great_pret,very_crit,foster,more,critical,inconv,problematic,priority,spec_prior


In [4]:
def calcShannonEnt(dataSet):
    numEntries = len(dataSet)
    labelCounts = {}
    for featVec in dataSet: #the the number of unique elements and their occurance
        currentLabel = featVec[-1]
        if currentLabel not in labelCounts.keys(): labelCounts[currentLabel] = 0
        labelCounts[currentLabel] += 1
    shannonEnt = 0.0
    for key in labelCounts:
        prob = float(labelCounts[key])/numEntries
        shannonEnt -= prob * log(prob,2) #log base 2
    return shannonEnt

In [5]:
def splitDataSet(dataSet, axis, value):
    retDataSet = []
    for featVec in dataSet:
        if featVec[axis] == value:
            reducedFeatVec = featVec[:axis]     #chop out axis used for splitting
            reducedFeatVec.extend(featVec[axis+1:])
            retDataSet.append(reducedFeatVec)
    return retDataSet

In [6]:
def chooseBestFeatureToSplit(dataSet):
    numFeatures = len(dataSet[0]) - 1      #the last column is used for the labels
    baseEntropy = calcShannonEnt(dataSet)
    bestInfoGain = 0.0; bestFeature = -1
    for i in range(numFeatures):        #iterate over all the features
        featList = [example[i] for example in dataSet]#create a list of all the examples of this feature
        uniqueVals = set(featList)       #get a set of unique values
        newEntropy = 0.0
        for value in uniqueVals:
            subDataSet = splitDataSet(dataSet, i, value)
            prob = len(subDataSet)/float(len(dataSet))
            newEntropy += prob * calcShannonEnt(subDataSet)     
        infoGain = baseEntropy - newEntropy     #calculate the info gain; ie reduction in entropy
        if (infoGain > bestInfoGain):       #compare this to the best gain so far
            bestInfoGain = infoGain         #if better than current best, set to best
            bestFeature = i
    return bestFeature                      #returns an integer


In [7]:
def majorityCnt(classList):
    classCount={}
    for vote in classList:
        if vote not in classCount.keys(): classCount[vote] = 0
        classCount[vote] += 1
    sortedClassCount = sorted(classCount.iteritems(), key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]

In [8]:
def createTree(dataSet,labels):
    classList = [example[-1] for example in dataSet]
    if classList.count(classList[0]) == len(classList): 
        return classList[0]#stop splitting when all of the classes are equal
    if len(dataSet[0]) == 1: #stop splitting when there are no more features in dataSet
        return majorityCnt(classList)
    bestFeat = chooseBestFeatureToSplit(dataSet)
    bestFeatLabel = labels[bestFeat]
    myTree = {bestFeatLabel:{}}
    del(labels[bestFeat])
    featValues = [example[bestFeat] for example in dataSet]
    uniqueVals = set(featValues)
    for value in uniqueVals:
        subLabels = labels[:]       #copy all of labels, so trees don't mess up existing labels
        myTree[bestFeatLabel][value] = createTree(splitDataSet(dataSet, bestFeat, value),subLabels)
    return myTree 

In [9]:
def classify(inputTree,featLabels,testVec):
    firstStr=list(inputTree.keys())[0]
    secondDict=inputTree[firstStr]
    featIndex=featLabels.index(firstStr)
    for key in secondDict:
        if testVec[featIndex]==key:
            if type(secondDict[key]).__name__=='dict':
                classLabel=classify(secondDict[key],featLabels,testVec)
            else:
                classLabel=secondDict[key]
    return classLabel

In [10]:
column_labels = df.columns.values

In [11]:
labels = []
for element in column_labels:
    labels.append(element)
labels

['usual',
 'proper',
 'complete',
 '1',
 'convenient',
 'convenient.1',
 'nonprob',
 'recommended',
 'recommend']

In [12]:
#labels.remove("recommend")

In [13]:
labelsbackup = labels.copy()
labelsbackup

['usual',
 'proper',
 'complete',
 '1',
 'convenient',
 'convenient.1',
 'nonprob',
 'recommended',
 'recommend']

In [14]:
data = df.to_numpy()
data = data.tolist()
data

[['usual',
  'proper',
  'complete',
  '1',
  'convenient',
  'convenient',
  'nonprob',
  'priority',
  'priority'],
 ['usual',
  'proper',
  'complete',
  '1',
  'convenient',
  'convenient',
  'nonprob',
  'not_recom',
  'not_recom'],
 ['usual',
  'proper',
  'complete',
  '1',
  'convenient',
  'convenient',
  'slightly_prob',
  'recommended',
  'recommend'],
 ['usual',
  'proper',
  'complete',
  '1',
  'convenient',
  'convenient',
  'slightly_prob',
  'priority',
  'priority'],
 ['usual',
  'proper',
  'complete',
  '1',
  'convenient',
  'convenient',
  'slightly_prob',
  'not_recom',
  'not_recom'],
 ['usual',
  'proper',
  'complete',
  '1',
  'convenient',
  'convenient',
  'problematic',
  'recommended',
  'priority'],
 ['usual',
  'proper',
  'complete',
  '1',
  'convenient',
  'convenient',
  'problematic',
  'priority',
  'priority'],
 ['usual',
  'proper',
  'complete',
  '1',
  'convenient',
  'convenient',
  'problematic',
  'not_recom',
  'not_recom'],
 ['usual',
  

In [15]:
labels

['usual',
 'proper',
 'complete',
 '1',
 'convenient',
 'convenient.1',
 'nonprob',
 'recommended',
 'recommend']

In [16]:
mytree = createTree(data, labels)

In [17]:
labels

['usual',
 'proper',
 'complete',
 '1',
 'convenient',
 'convenient.1',
 'nonprob',
 'recommend']

In [18]:
mytree

{'recommended': {'priority': {'proper': {'very_crit': {'complete': {'completed': 'spec_prior',
      'incomplete': 'spec_prior',
      'complete': {'1': {'more': 'spec_prior',
        '3': 'spec_prior',
        '1': {'convenient': {'convenient': {'convenient.1': {'convenient': 'priority',
            'inconv': 'spec_prior'}},
          'less_conv': 'spec_prior',
          'critical': 'spec_prior'}},
        '2': 'spec_prior'}},
      'foster': 'spec_prior'}},
    'less_proper': {'usual': {'great_pret': {'convenient': {'convenient': {'convenient.1': {'convenient': 'priority',
          'inconv': {'1': {'more': 'spec_prior',
            '3': 'spec_prior',
            '1': {'complete': {'completed': 'priority',
              'incomplete': 'priority',
              'complete': 'priority',
              'foster': 'spec_prior'}},
            '2': {'complete': {'completed': 'priority',
              'incomplete': 'spec_prior',
              'complete': 'priority',
              'foster': 'spe

In [21]:
labelsbackup

['usual',
 'proper',
 'complete',
 '1',
 'convenient',
 'convenient.1',
 'nonprob',
 'recommended',
 'recommend']

In [23]:
classify(mytree, labelsbackup, ['usual','proper','complete','1','convenient','convenient','nonprob','priority','priority'])

'priority'

In [24]:
mytree.keys()

dict_keys(['recommended'])

In [25]:
test = list(mytree.keys())[0]
test

'recommended'

In [26]:
test2 = mytree[test]
test2

{'priority': {'proper': {'very_crit': {'complete': {'completed': 'spec_prior',
     'incomplete': 'spec_prior',
     'complete': {'1': {'more': 'spec_prior',
       '3': 'spec_prior',
       '1': {'convenient': {'convenient': {'convenient.1': {'convenient': 'priority',
           'inconv': 'spec_prior'}},
         'less_conv': 'spec_prior',
         'critical': 'spec_prior'}},
       '2': 'spec_prior'}},
     'foster': 'spec_prior'}},
   'less_proper': {'usual': {'great_pret': {'convenient': {'convenient': {'convenient.1': {'convenient': 'priority',
         'inconv': {'1': {'more': 'spec_prior',
           '3': 'spec_prior',
           '1': {'complete': {'completed': 'priority',
             'incomplete': 'priority',
             'complete': 'priority',
             'foster': 'spec_prior'}},
           '2': {'complete': {'completed': 'priority',
             'incomplete': 'spec_prior',
             'complete': 'priority',
             'foster': 'spec_prior'}}}}}},
       'less_conv': 

In [28]:
test2.keys()

dict_keys(['priority', 'not_recom', 'recommended'])

In [27]:
testindex = labelsbackup.index(test)