In [None]:
import sys
!{sys.executable} -m pip install numpy

In [144]:
import numpy as np
from anytree import AnyNode, RenderTree

In [412]:
debug = False
# debug = True

In [416]:
def impurity(array):
    total = np.shape(array)[0]
    zero = np.shape(array[array == 0])[0]
    one = np.shape(array[array == 1])[0]
    result = (zero / total) * (one / total)
    return result

def bestsplit(x,y):
    iSorted = np.sort(np.unique(x))
    iSCount = np.shape(iSorted)[0]

    splitpoints = (iSorted[0:iSCount-1]+iSorted[1:iSCount])/2
    bestSplit  = 0
    bestSplitImpurity = 1.0
    
    
    for s in splitpoints:
        left = y[x <= s]
        right = y[x > s]
        splitImpurity = (impurity(left) + impurity(right))/2
        if (splitImpurity < bestSplitImpurity):
            bestSplit = s
            bestSplitImpurity = splitImpurity
            
    return bestSplit, bestSplitImpurity

def multiFeatureSplit(x, y):
    
    bestSplit  = 0
    bestSplitImpurity = 1.0
    
    nFeatures = np.shape(x)[1]
    for i in range(nFeatures):
        split, splitImpurity = bestsplit(x[:,i],y)
        if debug:
            print("split for col ", i, ":", bestsplit(x[:,i],y))
        if (splitImpurity < bestSplitImpurity):
            splitFeature = i
            bestSplit = split
            bestSplitImpurity = splitImpurity
    if debug:
        print("best split at col:", splitFeature, "bestSplit:", bestSplit, "impurity:", bestSplitImpurity)
    return splitFeature, bestSplit, bestSplitImpurity

In [417]:
#-------MULTIFEATURESPLIT TEST-------
credit_data = np.genfromtxt("credit.txt", delimiter=',',skip_header=True)
multiFeatureSplit(credit_data[:,np.arange(0,5)],credit_data[:,5])


DIMENSION: (10, 5)


(3, 36.0, 0.10204081632653061)

In [432]:
#x data-matrix, 
#y vector of class labels, binary 0 or 1
#nmin minimum observations each node should contain
#minleaf minimum observations for each leaf
#nfeat number of features considered for each split, draw random number of features from which best split is to be selected

def tree_grow(x, y, nmin, minleaf, nfeat):

    root = AnyNode(indices = np.arange(0,np.shape(x)[0]))
    
    nodeStack = [root]
    
#     while len(nodeStack) >0:
    for i in range(8):
        n = nodeStack[0]
        nodeStack = np.delete(nodeStack, 0)
        ny = np.take(y, n.indices)
        nx = x[n.indices,:]
        if impurity(ny)>0:
            f, s, imp = multiFeatureSplit(nx,ny) #feature, splitValue, impurity
            n.f = f
            n.s = s

            
            leftIndices =  np.where(x[:,f] < s)[0]
            rightIndices = np.where(x[:,f] >= s)[0]
#             leftIndices =  list(set(np.where(x[:,f] < s)[0]) & set(n.indices))
#             rightIndices = list(set(np.where(x[:,f] >= s)[0]) & set(n.indices))
            
            nodeStack = np.append(nodeStack, AnyNode(indices = np.asarray(leftIndices), parent = n))
            nodeStack = np.append(nodeStack, AnyNode(indices = np.asarray(rightIndices), parent = n))
            print(leftIndices, "|", rightIndices)
        print("SubTree", RenderTree(n))
    print()
    print("finalTree", RenderTree(root))
    return root


        

In [433]:
#TEST
credit_data = np.genfromtxt("credit.txt", delimiter=',',skip_header=True)
data = credit_data[:,np.arange(0,5)]
classification = credit_data[:,5]

tree_grow(data, classification, 2, 1, 1)

DIMENSION: (10, 5)
[0 1 2 3 4 5 9] | [6 7 8]
SubTree AnyNode(f=3, indices=array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]), s=36.0)
├── AnyNode(indices=array([0, 1, 2, 3, 4, 5, 9]))
└── AnyNode(indices=array([6, 7, 8]))
DIMENSION: (7, 5)
[0 1 2 3 4 5 7 8] | [6 9]
SubTree AnyNode(f=0, indices=array([0, 1, 2, 3, 4, 5, 9]), s=48.0)
├── AnyNode(indices=array([0, 1, 2, 3, 4, 5, 7, 8]))
└── AnyNode(indices=array([6, 9]))
SubTree AnyNode(indices=array([6, 7, 8]))
DIMENSION: (8, 5)
[0 1 2 3 4 5 9] | [6 7 8]
SubTree AnyNode(f=3, indices=array([0, 1, 2, 3, 4, 5, 7, 8]), s=36.0)
├── AnyNode(indices=array([0, 1, 2, 3, 4, 5, 9]))
└── AnyNode(indices=array([6, 7, 8]))
SubTree AnyNode(indices=array([6, 9]))
DIMENSION: (7, 5)
[0 1 2 3 4 5 7 8] | [6 9]
SubTree AnyNode(f=0, indices=array([0, 1, 2, 3, 4, 5, 9]), s=48.0)
├── AnyNode(indices=array([0, 1, 2, 3, 4, 5, 7, 8]))
└── AnyNode(indices=array([6, 9]))
SubTree AnyNode(indices=array([6, 7, 8]))
DIMENSION: (8, 5)
[0 1 2 3 4 5 9] | [6 7 8]
SubTree AnyNode(f=3, ind

AnyNode(f=3, indices=array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]), s=36.0)

In [426]:
np.where(data[:,3] < 36)
np.where(data[:,3] > 36)

(array([6, 7, 8]),)

In [275]:
indicesRange = [0, 1, 2, 3, 4, 5, 9]

print(np.take(subset,indicesRange))

subset = credit_data[[0, 1, 2, 3, 4, 5, 9],:]
subsetA = subset[:,np.arange(0,5)]
subsetB = subset[:, 5]
multiFeatureSplit(subset[:,np.arange(0,5)],subset[:,5])
    
# print(subset)


[22.  0.  0. 28.  1.  0. 32.]


(0, 48.0, 0.06944444444444445)

In [150]:
def tree_pred(x, tr):
    y= "vector"
    return y

In [427]:
print(credit_data)
print("-----")
print(np.delete(credit_data,0, axis=0))
print("-----")
print(credit_data)

[[22.  0.  0. 28.  1.  0.]
 [46.  0.  1. 32.  0.  0.]
 [24.  1.  1. 24.  1.  0.]
 [25.  0.  0. 27.  1.  0.]
 [29.  1.  1. 32.  0.  0.]
 [45.  1.  1. 30.  0.  1.]
 [63.  1.  1. 58.  1.  1.]
 [36.  1.  0. 52.  1.  1.]
 [23.  0.  1. 40.  0.  1.]
 [50.  1.  1. 28.  0.  1.]]
-----
[[46.  0.  1. 32.  0.  0.]
 [24.  1.  1. 24.  1.  0.]
 [25.  0.  0. 27.  1.  0.]
 [29.  1.  1. 32.  0.  0.]
 [45.  1.  1. 30.  0.  1.]
 [63.  1.  1. 58.  1.  1.]
 [36.  1.  0. 52.  1.  1.]
 [23.  0.  1. 40.  0.  1.]
 [50.  1.  1. 28.  0.  1.]]
-----
[[22.  0.  0. 28.  1.  0.]
 [46.  0.  1. 32.  0.  0.]
 [24.  1.  1. 24.  1.  0.]
 [25.  0.  0. 27.  1.  0.]
 [29.  1.  1. 32.  0.  0.]
 [45.  1.  1. 30.  0.  1.]
 [63.  1.  1. 58.  1.  1.]
 [36.  1.  0. 52.  1.  1.]
 [23.  0.  1. 40.  0.  1.]
 [50.  1.  1. 28.  0.  1.]]


In [44]:
list = np.arange(0,10)
list = np.delete(list,0)
print(list)

[1 2 3 4 5 6 7 8 9]


In [50]:
np.take(credit_data, [0,1,2,9], 0)

array([[22.,  0.,  0., 28.,  1.,  0.],
       [46.,  0.,  1., 32.,  0.,  0.],
       [24.,  1.,  1., 24.,  1.,  0.],
       [50.,  1.,  1., 28.,  0.,  1.]])

In [66]:
print(credit_data[:,5])
print(credit_data)

[0. 0. 0. 0. 0. 1. 1. 1. 1. 1.]
[[22.  0.  0. 28.  1.  0.]
 [46.  0.  1. 32.  0.  0.]
 [24.  1.  1. 24.  1.  0.]
 [25.  0.  0. 27.  1.  0.]
 [29.  1.  1. 32.  0.  0.]
 [45.  1.  1. 30.  0.  1.]
 [63.  1.  1. 58.  1.  1.]
 [36.  1.  0. 52.  1.  1.]
 [23.  0.  1. 40.  0.  1.]
 [50.  1.  1. 28.  0.  1.]]


In [337]:
x=[[22,0,0,28,1,],[46,0,1,32,0,],[24,1,1,24,1,],[25,0,0,27,1,],[29,1,1,32,0,],[45,1,1,30,0,],[50,1,1,28,0,]]


TypeError: list indices must be integers or slices, not tuple