In [1]:
#toy training set: each row is [color, diameter, label]; the column names live in `header` (next cell)
#NOTE(review): 'Mongo' is presumably a misspelling of 'Mango' (the testing data in the last cell uses 'Mango');
#labels are compared as plain strings, so the two spellings are treated as different classes -- confirm before renaming
trainingData=[['Green',3,'Mongo'],['Yellow',3,'Mongo'],['Red',1,'Grape'],['Red',1,'Grape'],['Yellow',3,'Lemon']]
trainingData

[['Green', 3, 'Mongo'],
 ['Yellow', 3, 'Mongo'],
 ['Red', 1, 'Grape'],
 ['Red', 1, 'Grape'],
 ['Yellow', 3, 'Lemon']]

In [2]:
#column names for the rows of trainingData; read by Question.__repr__ to print questions in a readable form
header = ['color','diameter','label']

In [3]:
def uniqueVals(rows,col):
    """Return the set of distinct values that appear in column `col` of `rows`."""
    return {record[col] for record in rows}

In [4]:
#demo of uniqueVals: distinct values of column 0 (color)
uniqueVals(trainingData,0)  #the result is a set (braces around bare elements), not a dictionary

{'Green', 'Red', 'Yellow'}

In [5]:
def classCounts(rows):
    """Tally the rows per label, where the label is the last element of each row.

    Returns a dict mapping label -> number of rows carrying it, with keys in
    order of first appearance.
    """
    tally = {}
    for record in rows:
        key = record[-1]
        tally[key] = tally.get(key, 0) + 1
    return tally

In [6]:
#demo of classCounts
classCounts(trainingData)

{'Mongo': 2, 'Grape': 2, 'Lemon': 1}

In [7]:
def isNumeric(value):
    """Return True when `value` is an int or a float instance, False otherwise."""
    return isinstance(value, (int, float))

In [8]:
#demo of isNumeric
isNumeric(6)

True

In [9]:
isNumeric('dsd')

False

In [10]:
class Question:
    """A yes/no test on a single column of a row, used to partition a dataset.

    column -- index of the column the question looks at
    value  -- the value a row's entry is compared against

    A numeric `value` is asked as "entry >= value"; any other value is asked
    as "entry == value".
    """

    def __init__(self,column,value):
        self.column=column
        self.value=value

    def match(self,emample):
        """Return True when the row `emample` answers this question with "yes".

        The parameter keeps its original (misspelt) name so existing keyword
        callers continue to work.
        """
        val=emample[self.column]
        # Use the ordering test only when BOTH sides are numeric. Deciding on the
        # row's value alone made a numeric entry compared against a string question
        # evaluate e.g. 3 >= 'Red', which raises TypeError on Python 3.
        if isNumeric(val) and isNumeric(self.value):
            return val>=self.value
        return val==self.value

    def __repr__(self):
        """Render the question as text, naming the column via the module-level `header`."""
        condition="=="
        if isNumeric(self.value):
            condition=">="
        return "Is %s %s %s?" % (header[self.column],condition,str(self.value))


In [11]:
def partition(rows,question):
    """Split `rows` by `question`.

    Returns a pair (rows for which question.match(row) is truthy, all other
    rows); each list keeps the rows in their original order.
    """
    matched, unmatched = [], []
    for record in rows:
        bucket = matched if question.match(record) else unmatched
        bucket.append(record)
    return matched, unmatched

In [12]:
#demo: split on the label column itself, purely to illustrate partition
#(findBestSplit never proposes the last column as a feature)
trueRows,falseRows=partition(trainingData,Question(2,'Mongo')) #does the 3rd column (label) equal 'Mongo'?

In [13]:
trueRows

[['Green', 3, 'Mongo'], ['Yellow', 3, 'Mongo']]

In [14]:
falseRows

[['Red', 1, 'Grape'], ['Red', 1, 'Grape'], ['Yellow', 3, 'Lemon']]

In [15]:
#demo: a numeric question is asked with >= instead of ==
trueRows,falseRows=partition(trainingData,Question(1,2)) #is the 2nd column (diameter) >=2?

In [16]:
trueRows

[['Green', 3, 'Mongo'], ['Yellow', 3, 'Mongo'], ['Yellow', 3, 'Lemon']]

In [17]:
falseRows

[['Red', 1, 'Grape'], ['Red', 1, 'Grape']]

In [18]:
#demo: repeat the 'Mongo' split so trueRows/falseRows hold it again;
#the infoGain demo further down reads these two variables
trueRows,falseRows=partition(trainingData,Question(2,'Mongo'))

In [19]:
trueRows

[['Green', 3, 'Mongo'], ['Yellow', 3, 'Mongo']]

In [20]:
falseRows

[['Red', 1, 'Grape'], ['Red', 1, 'Grape'], ['Yellow', 3, 'Lemon']]

In [21]:
def gini(rows):
    """Gini impurity of `rows`: 1 minus the sum of squared label frequencies.

    Labels are tallied with classCounts (last element of each row). With no
    rows there is nothing to subtract and the result is 1.
    """
    total = float(len(rows))
    impurity = 1
    # subtract one label at a time, in tally order, so the floating-point result is stable
    for count in classCounts(rows).values():
        impurity -= (count / total) ** 2
    return impurity
#demo: impurity of the whole training set (three labels, counted 2/2/1)
gini(trainingData)

0.6399999999999999

In [22]:
def infoGain(left, right, currentUncertainty):
    """Information gain of a split.

    Computed as `currentUncertainty` minus the Gini impurity of `left` and
    `right`, each weighted by its share of the combined row count.
    """
    nLeft = len(left)
    weightLeft = float(nLeft) / (nLeft + len(right))
    weightedLeft = weightLeft * gini(left)
    weightedRight = (1 - weightLeft) * gini(right)
    return currentUncertainty - weightedLeft - weightedRight
#demo: gain of the split currently held in trueRows/falseRows
#(when the cells are run in order, that is the Question(2,'Mongo') partition)
infoGain(trueRows,falseRows,gini(trainingData))

0.37333333333333324

In [23]:
def findBestSplit(rows):
    """Find the question with the highest information gain for `rows`.

    Every column except the last (treated as the label) is a feature, and every
    distinct value in a feature column is tried as a Question.

    Returns a pair (bestGain, bestQuestion). It is (0, None) when `rows` is
    empty or when no question separates the rows into two non-empty groups.
    """
    bestGain=0
    bestQuestion=None
    # nothing to split: previously rows[0] raised IndexError on an empty dataset
    if len(rows)==0:
        return bestGain,bestQuestion
    currentUncertainty=gini(rows)
    nFeatures=len(rows[0])-1 #every column but the last one (the label)
    for col in range(nFeatures): #for each feature
        values={row[col] for row in rows} #unique values in the column
        for value in values: #for each value
            question=Question(col,value)
            trueRows,falseRows=partition(rows,question) #try splitting the dataset
            if len(trueRows)==0 or len(falseRows)==0: #skip a split that does not divide the dataset
                continue
            gain=infoGain(trueRows,falseRows,currentUncertainty) #gain obtained from this split
            # '>=' is kept on purpose: on a tie the candidate evaluated later wins
            if gain>=bestGain:
                bestGain,bestQuestion=gain,question
    return bestGain,bestQuestion
#demo: best first question for the whole training set, and the gain it yields
bestGain,bestQuestion=findBestSplit(trainingData)
print(bestGain)
print(bestQuestion)

0.37333333333333324
Is diameter >= 3?


In [24]:
class Leaf:
    """Terminal node of the tree.

    `predictions` maps each label to the number of training rows with that
    label among the rows handed to this leaf.
    """

    def __init__(self,rows):
        labelTally = classCounts(rows)
        self.predictions = labelTally

In [25]:
class DecisionNode:
    """Internal node of the tree: a question plus the two subtrees it leads to.

    question    -- the question asked at this node
    trueBranch  -- child followed when the question matches a row
    falseBranch -- child followed otherwise
    """

    def __init__(self, question, trueBranch, falseBranch):
        self.question, self.trueBranch, self.falseBranch = question, trueBranch, falseBranch

In [26]:
def buildTree(rows):
    """Recursively grow a decision tree for `rows`.

    Returns a Leaf when findBestSplit reports a gain of 0 (no question helps
    any further); otherwise a DecisionNode whose children are the trees built
    from the matching and the non-matching rows.
    """
    bestGain, bestQuestion = findBestSplit(rows)
    if bestGain == 0:
        # base case: no useful question left to ask
        return Leaf(rows)
    matched, unmatched = partition(rows, bestQuestion)
    # the true branch is built first, then the false branch
    return DecisionNode(bestQuestion, buildTree(matched), buildTree(unmatched))

In [27]:
def printTree(node, spacing=""):
    """Print the tree rooted at `node`, indenting one extra space per level.

    A Leaf is printed as "Predict" followed by its predictions; any other node
    prints its question and then its true and false subtrees.
    """
    if not isinstance(node, Leaf):
        print(spacing+str(node.question))
        # same two steps for each branch: announce it, then recurse one level deeper
        for tag, branchName in (('--> True', 'trueBranch'), ('--> False', 'falseBranch')):
            print(spacing+tag)
            printTree(getattr(node, branchName), spacing+" ")
        return
    print(spacing+"Predict",node.predictions)

In [28]:
def classify(row,node):
    """Walk `row` down the tree starting at `node` and return the predictions
    dict of the Leaf it ends up in."""
    current = node
    while not isinstance(current, Leaf):
        # follow the branch selected by this node's question
        current = current.trueBranch if current.question.match(row) else current.falseBranch
    return current.predictions
#demo: build the tree from the training set, then classify the first training row
myTree=buildTree(trainingData)
classify(trainingData[0],myTree)

{'Mongo': 1}

In [29]:
def printLeaf(counts):
    """Turn a leaf's label counts into whole-number percentage strings.

    counts -- dict mapping label -> count, as stored in Leaf.predictions

    Returns a dict mapping label -> text such as "50%". Percentages are rounded
    down to a whole number. An empty `counts` gives an empty dict.
    """
    total=sum(counts.values())
    probabilities={}
    for label,count in counts.items():
        # Scale first and floor-divide, so the percentage is floored without float error.
        # The previous int(count/total*100) turned 29 out of 100 into
        # 28.999999999999996 and reported "28%".
        probabilities[label]=str(int(count*100//total))+"%"
    return probabilities
#demo: the prediction for the first training row, shown as percentages
printLeaf(classify(trainingData[0],myTree))

{'Mongo': '100%'}

In [30]:
if __name__=='__main__':
    myTree=buildTree(trainingData)
    printTree(myTree)
    testingData=[['Green',3,'Mango'],['Yellow',4,'Mango'],['Red',2,'Grape'],['Red',1,'Grape'],['Yellow',3,'Lemon']]
    for row in testingData:
        print("Actual: %s. Predicted: %s" % (row[-1], printLeaf(classify(row,myTree))))

Is diameter >= 3?
--> True
 Is color == Yellow?
 --> True
  Predict {'Mongo': 1, 'Lemon': 1}
 --> False
  Predict {'Mongo': 1}
--> False
 Predict {'Grape': 2}
Actual: Mango. Predicted: {'Mongo': '100%'}
Actual: Mango. Predicted: {'Mongo': '50%', 'Lemon': '50%'}
Actual: Grape. Predicted: {'Grape': '100%'}
Actual: Grape. Predicted: {'Grape': '100%'}
Actual: Lemon. Predicted: {'Mongo': '50%', 'Lemon': '50%'}
