In [1]:
import numpy as np
import math
import matplotlib.pyplot as plt 
import pandas as pd
import common as cm
import graphviz
import copy as copy

# Part 1: Information Gain

Important note: this exercise uses Pandas (for data manipulation and analysis) and Graphviz (for graph-drawing) libraries. 

This exercise consists of 3 parts. Complete the first part to get a mark of 3.0, the first two parts to get 4.0, complete all assignments to get 5.0. 

1.1 ) There are 10 objects (data) characterized with 5 binary attributes:

In [2]:
attributeNames = ["attr 1", "attr 2", "attr 3", "attr 4", "attr 5"]

data = pd.DataFrame(
    [
        [1, 0, 1, 1, 1],
        [1, 1, 0, 0, 1],
        [0, 1, 1, 1, 1],
        [1, 0, 1, 0, 1],
        [1, 0, 0, 1, 1],
        [0, 0, 1, 1, 1],
        [1, 1, 1, 1, 1],
        [1, 0, 0, 1, 1],
        [0, 1, 0, 0, 1],
        [0, 0, 0, 1, 1],
    ],
    columns=attributeNames,
)

1.2) Each object is assigned to either a class "0" or "1". The assignments are as follows (cl):

In [3]:
data["cl"] = [1, 1, 1, 0, 0, 1, 1, 1, 0, 0]

Hint: How one can read data (columns) in Pandas:

In [4]:
print(data["cl"])
print(list(data["cl"]))
print(set(data["cl"]))
print(data["attr 1"][0])
#print (data[0])

0    1
1    1
2    1
3    0
4    0
5    1
6    1
7    1
8    0
9    0
Name: cl, dtype: int64
[1, 1, 1, 0, 0, 1, 1, 1, 0, 0]
{0, 1}
1


Hint: How split data (Pandas DataFrame) base on column:

In [22]:
data[data['cl']==0]

Unnamed: 0,attr 1,attr 2,attr 3,attr 4,attr 5,cl
3,1,0,1,0,1,0
4,1,0,0,1,1,0
8,0,1,0,0,1,0
9,0,0,0,1,1,0


Hint: How take values from column (Pandas Series):

In [23]:
for id, row in data['cl'].items():
    print(id,row)

0 1
1 1
2 1
3 0
4 0
5 1
6 1
7 1
8 0
9 0


1.3 )  Finish the below function for calculating entropy. $H(CL) = - \sum_{y \in CL}p(y)log_2p(y)$ It should return a value of entropy for an input vector CL. Assume that $log_2(0)$ is equal to 0.

In [5]:
def getEntropy(cl, data):
    sums=[0, 0]
    entropy=0
    for i in range (len(data["cl"])):
        sums[cl[i]]+=1
    sums=[sums[a]/(sums[0]+sums[1]) for a in range(2)]
    for i in range (2):
        if (sums[i]>0):
            entropy+=-sums[i]*math.log(sums[i], 2)
    return entropy

1.4 ) Calculate the entropy for the CL vector  (the result should be 0.97095...):

In [6]:
print(getEntropy(list(data["cl"]), data))

0.9709505944546686


1.5) Finish the below function for calculating a conditional entropy: $H(CL|X) = - \sum_{x \in X} \sum_{y \in CL} p(x,y) log_2 \frac{p(x,y)}{p(x)}$. Assume that $log_2(0)$ is equal to 0 and if $p(x) = 0$, $\frac{p(x,y)}{p(x)}$ is equal to 0 as well.

In [7]:
def getConditionalEntropy(cl, attr, data):
    sums=[[0]*2 for a in range (2)]
    probs=[[0]*2 for a in range (2)]
    entropies=[0]*2
    for i in range (len(cl)):
        sums[attr[i]][cl[i]]+=1
    for x in range (2):
        for y in range (2):
            if (sums[x][0]+sums[x][1]==0):
                probs[x][y]=0
            else:
                probs[x][y]=sums[x][y]/(sums[x][0]+sums[x][1])
    cond_entropy=0
    for i in range (2):
        for j in range (2):
            if (probs[i][j]>0):
                entropies[i]-=probs[i][j]*math.log(probs[i][j], 2)
    for i in range (2):
        cond_entropy+=entropies[i]*(sums[i][0]+sums[i][1])
    return cond_entropy/len(cl)

1.6 ) Calculate conditional entropies for given attribiutes.

In [8]:
print(getConditionalEntropy(list(data["cl"]), list(data["attr 1"]), data)) ### the result should be 0.95097...
print(getConditionalEntropy(list(data["cl"]), list(data["attr 5"]), data)) ### the result should be 0.97095...

0.9509775004326937
0.9709505944546686


1.7 ) **Question: Which entropy is lesser and why?**

In [28]:
#Because fo attribute 1, information is better distributed and better ordered

1.8) Finish the below function for calculating information gain (use getEntropy() and getConditionalEntropy() functions):

In [9]:
def getInformationGain(cl, attr, data):
    IG=getEntropy(cl, data)-getConditionalEntropy(cl, attr, data)
    return IG

In [10]:
print(getInformationGain(data["cl"], data["attr 1"], data))
print(getInformationGain(data["cl"], data["attr 5"], data))

0.01997309402197489
0.0


1.9) **Question: Which IG is lesser and why?**

In [31]:
#IG for attribute 5 is lesser, because higher is conditional entropy for attribute 5 (so difference is smaller and IG is smaller)

# Part 2: ID3 algorithm

Decision tree consists of decision nodes and leaves. Nodes split data while leaves classify objects. Consider the class "Node" provided below. It consists of 4 fields:
- attr - attribute ID (use the names in attributeNames vector)
- left - left branch, i.e., a reference to other node
- right - right branch, i.e., a reference to other node
- value - a decision. If node = None, then the node is not a leaf. If value is not None, then a node is considered a leaf. 

Method __call__ returns the decision if the node is a leaf (i.e., when value is not None). 
Otherwise, it calls either the left or the right branch of an input object, based on the attribute value (0 -> left children; 1 -> right children). In this way, we can traverse the decision tree in order to find the final decision.

In [12]:
class Node:
    def __init__(self, attr, left, right, value):
        self.attr = attr
        self.left = left
        self.right = right
        self.value = value

    def __call__(self, obj):
        if self.value is None:
            if obj[self.attr] == 0:
                return self.left(obj)
            else:
                return self.right(obj)
        else:
            return self.value
        
### EXAMPLE
def example(obj):
    root = Node(0, None, None, None) ###  IN ROOT SPLIT ON 1ST (0) ATTRIBUTE
    lChildren = Node(1, None, None, None) ### IN ROOT's LEFT CHILDREN SPLIT ON 2ND (1) ATTRIBUTE
    rChildren = Node(None, None, None, 2) ### IN ROOT's RIGHT CHILDREN -> DECISION = 2
    root.left = lChildren
    root.right = rChildren
    llChildren = Node(None, None, None, 3) ### IN ROOT's LEFT-LEFT CHILDREN -> DECISION = 3
    lrChildren = Node(None, None, None, 4) ### IN ROOT's LEFT-RIGHT CHILDREN -> DECISION = 4
    lChildren.left = llChildren
    lChildren.right = lrChildren
    print(root(obj))
    
example([0, 0]) ### ROOT : FIRST ATTRIBUTE = 0 SO WE GO TO LEFT CHILDREN.
### IT IS A LEAF WITH THE DECISION = 3
### THEN, IN THE CORRESPONDING CHILDREN, THE SECOND ATTRIBUTE = 0, SO WE GO TO LEFT-LEFT CHILDREN

example([0, 1]) 
example([1, 0])
example([1, 1])

3
4
2
2


2.1) Create an initial root. Set the value (decision) to 1. 

In [13]:
root = Node(None, None, None, 1)
print (root)

<__main__.Node object at 0x0000023444A7AFB0>


2.2) Use a getErrorRate method in common.py auxiliary file to calculate the error rate. The decision is made based on the majority rule. In case of tie, the method takes 0 as the default class.

In [14]:
print(round(cm.getErrorRate(root, data), 2))
## SHOULD BE 0.4

0.4


2.3) Use printGraph method (see the common.py file) to draw the decision tree and save it in a png file.

In [15]:
cm.printGraph(root)

2.4) Calculate information gain for all attribiutes.

In [16]:
def printInformationGain(data):
    IG=[]
    for attribute_name in attributeNames:
        IG.append([attribute_name, getInformationGain(list(data["cl"]), list(data[attribute_name]), data)])
    for i in range (5):
        print(IG[i])
        
printInformationGain(data)

['attr 1', 0.01997309402197489]
['attr 2', 0.0464393446710154]
['attr 3', 0.12451124978365313]
['attr 4', 0.09127744624168]
['attr 5', 0.0]


2.5) Choose the best attribute to split the data (HINT, it should be the third attribute :)). Construct two new nodes: one for $x_i$ = 0 decision and the second for $x_i$ = 1; connect them with the root (left and right branch). Remember to update the root. 

In [19]:
root = Node(2, None, None, None)
lChildren = Node(None, None, None, 0)
rChildren = Node(None, None, None, 1)
root.left = lChildren
root.right = rChildren

2.6) Print the graph and calculate the error rate. What happened with the error rate?

In [20]:
cm.printGraph(root)
print(round(cm.getErrorRate(root, data), 2))

0.3


2.7) Split the 'data' (table) based on the selected attribiute, i.e., create two new tables.

In [23]:
left_data = data[data['attr 3']==0]
right_data = data[data['attr 3']==1]
print (left_data)
print (right_data)

   attr 1  attr 2  attr 3  attr 4  attr 5  cl
1       1       1       0       0       1   1
4       1       0       0       1       1   0
7       1       0       0       1       1   1
8       0       1       0       0       1   0
9       0       0       0       1       1   0
   attr 1  attr 2  attr 3  attr 4  attr 5  cl
0       1       0       1       1       1   1
2       0       1       1       1       1   1
3       1       0       1       0       1   0
5       0       0       1       1       1   1
6       1       1       1       1       1   1


2.8) Let us start with the left node. Firstly, calculate information gain for this node.

In [24]:
#print(getEntropy(list(left_data["cl"]), left_data))
printInformationGain(left_data)

['attr 1', 0.4199730940219748]
['attr 2', 0.01997309402197489]
['attr 3', 0.0]
['attr 4', 0.01997309402197489]
['attr 5', 0.0]


2.9) Choose the best attribute to split the data and then update the decision tree.

In [29]:
root = Node(2, None, None, None)
root.left = lChildren
root.right=rChildren
lChildren = Node(0, None, None, None)
llChildren = Node(None, None, None, 0) ### IN ROOT's LEFT-LEFT CHILDREN -> DECISION = 0
lrChildren = Node(None, None, None, 1) ### IN ROOT's LEFT-RIGHT CHILDREN -> DECISION = 1
lChildren.left=llChildren
lChildren.right = lrChildren

2.10) Print the graph and calculate the error rate (HINT: should be 0.2 :). What happened with the error rate?

In [30]:
cm.printGraph(root)
print(round(cm.getErrorRate(root, data), 2))

0.2


2.11) Split data (remember that we split left_data, not data).

In [31]:
leftLeft_data = left_data[left_data['attr 1']==0]
leftRight_data = left_data[left_data['attr 1']==1]
print (leftLeft_data)
print (leftRight_data)

   attr 1  attr 2  attr 3  attr 4  attr 5  cl
8       0       1       0       0       1   0
9       0       0       0       1       1   0
   attr 1  attr 2  attr 3  attr 4  attr 5  cl
1       1       1       0       0       1   1
4       1       0       0       1       1   0
7       1       0       0       1       1   1


2.12) Repeat the whole process for the right node.

In [32]:
printInformationGain(right_data)

['attr 1', 0.17095059445466854]
['attr 2', 0.17095059445466854]
['attr 3', 0.0]
['attr 4', 0.7219280948873623]
['attr 5', 0.0]


In [43]:
root = Node(2, None, None, None)
root.left = lChildren
root.right = rChildren
rChildren = Node(3, None, None, None)
rlChildren = Node(None, None, None, 0) ### IN ROOT's LEFT-LEFT CHILDREN -> DECISION = 0
rrChildren = Node(None, None, None, 1) ### IN ROOT's LEFT-RIGHT CHILDREN -> DECISION = 1
rChildren.left = rlChildren
rChildren.right = rrChildren

In [44]:
cm.printGraph(root)
print(round(cm.getErrorRate(root, data), 2))

0.1


In [38]:
rightLeft_data = right_data[right_data['attr 4']==0]
rightRight_data = right_data[right_data['attr 4']==1]
print(rightLeft_data)
print(rightRight_data)

   attr 1  attr 2  attr 3  attr 4  attr 5  cl
3       1       0       1       0       1   0
   attr 1  attr 2  attr 3  attr 4  attr 5  cl
0       1       0       1       1       1   1
2       0       1       1       1       1   1
5       0       0       1       1       1   1
6       1       1       1       1       1   1


2.13) Let's consider left-left node. Calculate information gain for it.

In [39]:
printInformationGain(leftLeft_data)

['attr 1', 0.0]
['attr 2', 0.0]
['attr 3', 0.0]
['attr 4', 0.0]
['attr 5', 0.0]


2.14) Will adding a new node to the tree improve its effectiveness? Why? Why not?

In [None]:
#No, it is unnecessary to overfit our tree to data set. Adding too much new branches won't improve quality of classification, because left-left is well classified

2.15) Calculate information gain for the left-right node.

In [241]:
printInformationGain(leftRight_data)

['attr 1', 0.6428070838381427]
['attr 2', 0.7182958340544896]
['attr 3', 0.6428070838381427]
['attr 4', 0.7182958340544896]
['attr 5', 0.6428070838381427]


In [308]:
root = Node(2, lChildren, rChildren, None)
root.left = lChildren
root.right = rChildren
lChildren.left = llChildren
lChildren.right = lrChildren
lrChildren = Node (1, lrlChildren, lrrChildren, None)
lrlChildren = Node(None, None, None, 0)
lrrChildren = Node(None, None, None, 1)

In [310]:
cm.printGraph(root)
print(round(cm.getErrorRate(root, data), 2))

0.1


2.16) What happened with the error rate? Is it necessary to keep these two newly added leaves?

In [None]:
#Nothing changed - it is not necessary to keep those two leaves. Better is to have one node instead

2.17) Finish creating the right side of the tree

In [313]:
leftRightleft_data = leftRight_data[leftRight_data['attr 2']==0]
leftRightright_data = leftRight_data[leftRight_data['attr 2']==1]
print(leftRightleft_data)
print(leftRightright_data)

   attr 1  attr 2  attr 3  attr 4  attr 5  cl
4       1       0       0       1       1   0
7       1       0       0       1       1   1
   attr 1  attr 2  attr 3  attr 4  attr 5  cl
1       1       1       0       0       1   1


# Part 3: automated construction of decision trees

3.1 Complete the following function for automated construct of decision trees, so that it returns a decision tree for the given data and attribute list. Note that this is a recusive method, i.e., calls itself.

In [46]:
max_depth = 10

def createTree(data, attributeNames, depth=0):
    data = data.reset_index().drop("index", axis=1)
    #choosing majority class for the node
    #choosing attribute to split based on Inf Gain
    sums=[0, 0]
    for i in range (len(data)):
        sums[data['cl'][i]]+=1
    if (sums[1]<sums[0]):
        dec=0
    else:
        dec=1
    maxi=[0, 0] #index, IG
    for i in range (len(attributeNames)):
        if (getInformationGain(data["cl"], data[attributeNames[i]], data)>maxi[1]):
            maxi[0]=i
            maxi[1]=getInformationGain(list(data["cl"]), list(data[attributeNames[i]]), data)
    #splitting tree
    root = Node(maxi[0], None, None, None)
    lChildren = Node(maxi[0], None, None, 0)
    left_data = data[data[attributeNames[maxi[0]]]==0]
    rChildren = Node(maxi[0], None, None, 1)
    right_data = data[data[attributeNames[maxi[0]]]==1]
    root.left=lChildren
    root.right=rChildren
    if (len(left_data)==len(data) or len(right_data)==len(data) or maxi[1]==0 or depth==max_depth):
        #print (depth)
        root = Node(None, None, None, dec)
    else:
        root.left=createTree(left_data, attributeNames, depth+1)
        root.right=createTree(right_data, attributeNames, depth+1)
    return root

3.2) Build a decision tree for a training dataset in the common.py auxiliary file, for diffrent values of max_depth.  Calculate & compare the error rates for training and validation datasets.

In [515]:
max_depth = 10

In [47]:
# Training dataset
train_attributeNames, train_data = cm.getTrainingDataSet()
cm.printGraph(createTree(train_data, train_attributeNames, 0))
print (round(cm.getErrorRate(createTree(train_data, train_attributeNames, 0), train_data), 2))

0.2


In [520]:
# Validation dataset
valid_attributesName, valid_data = cm.getValidationDataSet()
cm.printGraph(createTree(valid_data, valid_attributesName, 0))
print (cm.getErrorRate(createTree(valid_data, valid_attributesName, 0), valid_data))

0.0


In [None]:
#Error rate for validation data = 0, error rate for training set = 0.2 - tree is well built, because it performs better on validation data than on training set

3.3) Consider only the training data set and answer the following questions:
* What is the miximum depth of the tree (consider only the training data set)?
* The tree building process should stop when there is no improvement in error rate (why?). Check for which value of "max_dept" there is no improvement in error rate. 

In [527]:
for i in range(10):
    max_depth = i
    cm.printGraph(createTree(train_data, train_attributeNames, 0))
    print (round(cm.getErrorRate(createTree(train_data, train_attributeNames, 0), train_data), 2))

0.4
0.35
0.3
0.25
0.25
0.2
0.2
0.2
0.2
0.2


In [523]:
#maximum depth for training data is 5, even when we increase maximum possible depth more, nothing will change 

In [None]:
#we should stop after fourth iteration, when maximum depth is 3 (there is no improvement in error rate - increasing 
#number of branches won't change anything and better is to keep our tree as simple as possible)