## Data Description : 
### For simplicity, we assume a dataset with two binary attributes (each can be viewed as an attribute with two unique values) and one binary output class_id. Dimensions: 1800x3

In [45]:
# Importing required libraries
import numpy as np

In [46]:
# Class which represent Internal H-Tree Nodes
class VFDT_Node():
    """Internal (non-leaf) node of the H-Tree for one binary attribute.

    The node keeps a per-class counter of the examples sorted through it and
    up to two children: ``left`` receives examples whose attribute value is 0
    and ``right`` receives examples whose attribute value is 1.
    """

    # Constructor for H_Node Node
    def __init__(self,no_of_class=2):
        self.attribute_id = None # attribute id this node belongs to
        self.left = None # Left child (attribute value 0)
        self.right = None # Right child (attribute value 1)
        self.num_class = no_of_class # Number of classes
        self.statistic = np.zeros(no_of_class,int) # per-class example counts
        self.example_seen = 0 # number of examples seen

    # Calculating Entropy based on statistic collected so far
    def Entropy(self):
        """Return the entropy (base 2) of the class counts in ``statistic``.

        Returns 0.0 when no example has been counted yet, instead of
        dividing by zero.
        """
        total = float(np.sum(self.statistic))
        if total == 0:
            return 0.0
        result = 0
        for i in range(self.num_class):
            ratio = self.statistic[i]/total
            if ratio != 0: # skip empty classes: log2(0) is undefined
                result = result + ratio*np.log2(ratio)
        return -result

    # To get the Child Entropy
    def child_Entropy(self,childId):
        """Return the entropy of the left (0) or right (1) child.

        Returns 0 when that child does not exist and None for any other
        ``childId``.
        """
        if childId == 0:
            return self.left.Entropy() if self.left is not None else 0
        if childId == 1:
            return self.right.Entropy() if self.right is not None else 0
        return None

    # To get the number of examples seen by the childs
    def child_Examples(self,childId):
        """Return the summed class counts of the left (0) or right (1) child.

        Returns 0 when that child does not exist and None for any other
        ``childId``.
        """
        if childId == 0:
            return np.sum(self.left.statistic) if self.left is not None else 0
        if childId == 1:
            return np.sum(self.right.statistic) if self.right is not None else 0
        return None

    def Gain(self):
        """Return this node's entropy minus the weighted entropy of its children.

        Each child is weighted by its share of this node's total class count.
        Returns 0.0 when the node has not counted any example.
        """
        total = float(np.sum(self.statistic))
        if total == 0:
            return 0.0
        weighted = 0
        # A node has exactly two children (ids 0 and 1), independent of the
        # number of classes.
        for childId in (0, 1):
            ratio = self.child_Examples(childId)/total # Weight of this child
            weighted = weighted + ratio*self.child_Entropy(childId)
        return self.Entropy()-weighted

    # Sort Example till Leaf
    def Example_Sorting(self,attribValue,classId):
        """Count one example on this node and on the child selected by ``attribValue``.

        A missing child is created as a new ``VFDT_Leaf`` first. For an
        ``attribValue`` other than 0 or 1 only this node's counters change.
        """
        self.statistic[classId] = self.statistic[classId] + 1 # Increment the counter of the corresponding class_id
        self.example_seen = self.example_seen + 1 # Increment the number of examples seen so far
        if attribValue == 0: # Value 0 goes to the left child
            if self.left is None:
                self.left = VFDT_Leaf() # Create the left child as a new leaf
            self.left.Example_Sorting(classId)
        elif attribValue == 1: # Value 1 goes to the right child
            if self.right is None:
                self.right = VFDT_Leaf() # Create the right child as a new leaf
            self.right.Example_Sorting(classId)

    # Create a left child
    def set_LeftChild(self,leaf,newNode):
        """Set the left child.

        When ``leaf`` is True a new ``VFDT_Leaf`` is built with ``newNode``
        passed as its constructor argument; otherwise ``newNode`` itself
        becomes the left child.
        """
        if leaf == True:
            self.left = VFDT_Leaf(newNode)
        else:
            self.left = newNode

    # Create a right child
    def set_RightChild(self,leaf,newNode):
        """Set the right child.

        When ``leaf`` is True a new ``VFDT_Leaf`` is built with ``newNode``
        passed as its constructor argument; otherwise ``newNode`` itself
        becomes the right child.
        """
        if leaf == True:
            self.right = VFDT_Leaf(newNode)
        else:
            self.right = newNode

## Every Node (Not a leaf) in the H-Tree can be described using :-

<table class='table table-striped'> <thead> <tr> 
    <th><h1 align="center">Attribute</h1></th> <th><h1 align="center">Description</h1></th> </tr> </thead> <tbody> <tr> 

<th scope='row'><h2 align="left" >attribute_id</h2></th><td><h3 align="left">Attribute id(0 or 1) which differentiate this attribute from two attribute</h3></td> </tr> <tr>     
<th scope='row'><h2 align="left" >left</h2></th> <td><h3 align="left">left child if have any</h3></td> </tr> <tr>
<th scope='row'><h2 align="left" >right</h2></th> <td><h3 align="left">right child if have any</h3></td> </tr> <tr>
<th scope='row'><h2 align="left" >statistic</h2></th> <td><h3 align="left">no. of 0's or 1's seen by this leaf</h3></td> </tr> <tr>
<th scope='row'><h2 align="left" >example_seen</h2></th> <td><h3 align="left">example seen so far by this attribute</h3></td> </tr> <tr>
<th scope='row'><h2 align="left" >num_class</h2></th> <td><h3 align="left">Num of distinct classes that attribute have</h3></td> </tr> </tbody> </table>


## List of Methods used in the H-Tree :-

<table class='table table-striped'> <thead> <tr> 
    <th><h1 align="center">Method</h1></th> <th><h1 align="center">Description</h1></th> </tr> </thead> <tbody> <tr> 

<th scope='row'><h2 align="left">__init__ </h2></th><td><h3 align="left">Constructor for H-node</h3></td> </tr> <tr>     
    <th scope='row'><h2 align="left">set_LeftChild</h2></th> <td><h3 align="left">Create Left Child</h3></td> </tr> <tr>
    <th scope='row'><h2 align="left">set_RightChild</h2></th> <td><h3 align="left">Create Right Child</h3></td> </tr> <tr>

    
<th scope='row'><h2 align="left">Example_Sorting</h2></th> <td><h3 align="left">After seeing new example update statistic and update number of examples</h3></td> </tr> <tr>
<th scope='row'><h2 align="left">check_SameClass</h2></th> <td><h3 align="left">Check whether uptill now any different class-example is seen or not</h3></td> </tr> <tr>
    
 <th scope='row'><h2 align="left">child_Examples</h2></th> <td><h3 align="left">Return the statistic of the child</h3></td> </tr> <tr>
<th scope='row'><h2 align="left">child_Entropy</h2></th> <td><h3 align="left">Return the entropy of the child</h3></td> </tr> <tr>
    <th scope='row'><h2 align="left">Gain</h2></th> <td><h3 align="left">Calculating Gain based on the node's own Entropy and the example-weighted Entropy of its children</h3></td> </tr> <tr>
<th scope='row'><h2 align="left">Entropy</h2></th> <td><h3 align="left">Calculating Entropy based on statistic that is collected so far by this leaf</h3></td> </tr> </tbody> </table>

## Formula for Gain : 
 ![title](gain.png)

# Formula to calculate H-Bound
![title](hbound.png)

In [47]:
# Class which represent the leaf node of H-Tree
class VFDT_Leaf():
    """Leaf of the H-Tree: a per-class counter of the examples that reached it."""

    # Constructor for H-leaf
    def __init__(self,no_of_class=2):
        self.attribute_id = None # attribute id leaf belongs to (None = Root HT)
        self.attribute_val = None # attribute value leaf represents (None = Root HT)
        self.num_class = no_of_class # Number of classes
        self.example_seen = 0 # number of examples seen in leaf
        self.statistic = np.zeros(no_of_class,int) # per-class example counts
        self.value = None # value for prediction

    # Calculating Entropy based on statistic that is collected so far
    def Entropy(self):
        """Return the entropy (base 2) of the class counts in ``statistic``.

        Returns 0.0 when no example has been counted yet, instead of
        dividing by zero.
        """
        total = float(np.sum(self.statistic))
        if total == 0:
            return 0.0
        ans = 0
        for i in range(self.num_class):
            ratio = self.statistic[i]/total
            if ratio != 0: # skip empty classes: log2(0) is undefined
                ans = ans + ratio*np.log2(ratio)
        return -ans

    # After seeing new example update statistic and update number of examples
    def Example_Sorting(self,classId):
        """Count one example of class ``classId`` on this leaf."""
        self.statistic[classId] = self.statistic[classId] + 1
        self.example_seen = self.example_seen + 1

## Every Leaf in the H-Tree can be described using :-

<table class='table table-striped'> <thead> <tr> 
    <th><h1 align="center">Attribute</h1></th> <th><h1 align="center">Description</h1></th> </tr> </thead> <tbody> <tr> 

<th scope='row'><h2 align="left" >attribute_id</h2></th><td><h3 align="left">Attribute id(0 or 1) which differentiate this attribute from two attribute</h3></td> </tr> <tr>     
<th scope='row'><h2 align="left" >attribute_val</h2></th> <td><h3 align="left">Attribute value(0 or 1) which is the value of that attribute</h3></td> </tr> <tr>
<th scope='row'><h2 align="left" >example_seen</h2></th> <td><h3 align="left">example seen so far by this attribute</h3></td> </tr> <tr>
<th scope='row'><h2 align="left" >statistic</h2></th> <td><h3 align="left">no. of 0's or 1's seen by this leaf</h3></td> </tr> <tr>
<th scope='row'><h2 align="left" >num_class</h2></th> <td><h3 align="left">Num of distinct classes that attribute have</h3></td> </tr> </tbody> </table>


## List of Methods used in the H-Tree :-

<table class='table table-striped'> <thead> <tr> 
    <th><h1 align="center">Method</h1></th> <th><h1 align="center">Description</h1></th> </tr> </thead> <tbody> <tr> 

<th scope='row'><h2 align="left">__init__ </h2></th><td><h3 align="left">Constructor for H-leaf</h3></td> </tr> <tr>     
<th scope='row'><h2 align="left">Example_Sorting</h2></th> <td><h3 align="left">After seeing new example update statistic and update number of examples</h3></td> </tr> <tr>
<th scope='row'><h2 align="left">check_SameClass</h2></th> <td><h3 align="left">Check whether uptill now any different class-example is seen or not</h3></td> </tr> <tr>
<th scope='row'><h2 align="left">Entropy</h2></th> <td><h3 align="left">Calculating Entropy based on statistic that is collected so far by this leaf</h3></td> </tr> </tbody> </table>


## Formula for Entropy : 
 ![title](images.png)

# Main Algorithm

In [48]:
# Streams 'stream.txt' row by row (attribute-1 value, attribute-2 value,
# class_id) and grows the tree in two stages:
#   level 0 - collect statistics for both attributes and pick the root once
#             the difference of their gains exceeds the H-bound;
#   level 1 - collect statistics for the other attribute below each branch of
#             the root and attach one of the two branch nodes to the root once
#             the difference of their gains exceeds the H-bound.
R = 2 # number of classes
delta = 1e-07 # constant = 1 - probability that correct attribute is chosen
nmin = 100 # minimum number of examples for learning one attribute


def H_Bound(R, delta, n):
    """Return the H-bound epsilon = sqrt(R^2 * ln(1/delta) / (2*n))."""
    return np.sqrt((R*R*np.log(1/float(delta)))/float(2*n))


VFDTleaf = VFDT_Leaf() # create a root leaf
Attribute1 = VFDT_Node() # create a attribute 1
Attribute2 = VFDT_Node() # create a attribute 2

level = 0 # Currently we are at level zero
root_is_attribute1 = None # Set once the root is chosen: True = attribute 1, False = attribute 2

# The with-block closes the file even if processing a row raises
with open('stream.txt', 'r') as file: # Reading the dataset from stream.txt
    for dataset in file:
        data = dataset.split(',') # Split the data row seperated by comma
        # Parse the whole row before touching any counter, so that a malformed
        # row can never update the statistics only partially
        try:
            attrib1_val = int(data[0])
            attrib2_val = int(data[1])
            class_id = int(data[2])
        except (ValueError, IndexError): # Non-numeric field or fewer than three fields
            print("Input is not Proper", data)
            continue

        if level == 0: # Since we are at level zero
            VFDTleaf.Example_Sorting(class_id) # Increment class_id counter and no. of examples seen
            Attribute1.Example_Sorting(attrib1_val,class_id) # Sorting the example based on value of attribute-1
            Attribute2.Example_Sorting(attrib2_val,class_id) # Sorting the example based on value of attribute-2
            n = VFDTleaf.example_seen # examples seen so far

            # Evaluate only every nmin examples and only when every class has been seen
            if (np.mod(n,nmin) == 0) and (np.min(VFDTleaf.statistic[:]) != 0):
                Gain_Attribute1 = Attribute1.Gain() # Calculating Gain of Attribute-1
                Gain_Attribute2 = Attribute2.Gain() # Calculating Gain of Attribute-2
                epsilon = H_Bound(R,delta,n) # Calculating H_bound
                print("No. of example seen: ",n," Gain of attribute 1: ",Gain_Attribute1," Gain of attribute 2: ",Gain_Attribute2," epsilon: ",epsilon)
                if np.abs(Gain_Attribute1 - Gain_Attribute2) > epsilon: # When Difference between Gains is greater then epsilon
                    print("After taking: ",n," We can decide our Root")
                    root_is_attribute1 = bool(Gain_Attribute1 >= Gain_Attribute2)
                    if root_is_attribute1:
                        print("Gain_Attribute1>=Gain_Attribute2 (Attribute 1 is Selected as Root)")
                        VFDTleaf = Attribute1 # Replacing Root Node by Attribute-1
                        # Creating two childs because Attribute-1 has two child
                        Attribute2_1 = VFDT_Node() # attribute 2 for left branch of attribute 1
                        Attribute2_2 = VFDT_Node() # attribute 2 for right branch of attribute 1
                    else:
                        print("Gain_Attribute1<Gain_Attribute2 (Attribute 2 is Selected as Root)")
                        VFDTleaf = Attribute2 # Replacing Root Node by Attribute-2
                        # Creating two childs because Attribute-2 has two child
                        Attribute1_1 = VFDT_Node() # attribute 1 for left branch of attribute 2
                        Attribute1_2 = VFDT_Node() # attribute 1 for right branch of attribute 2
                    # After Selecting the Root reset the Root Statistic
                    VFDTleaf.example_seen = 0
                    VFDTleaf.statistic = np.zeros(VFDTleaf.num_class,int)
                    level = level + 1 # Incrementing level

        elif level == 1: # Now we are at level one
            if root_is_attribute1: # When Attribute-1 is root
                VFDTleaf.Example_Sorting(attrib1_val,class_id) # Sorting Example of attribute 1
                if attrib1_val == 0: # When Value of A1 is 0
                    Attribute2_1.Example_Sorting(attrib2_val,class_id) # Do sorting on A21
                elif attrib1_val == 1: # When Value of A1 is 1
                    Attribute2_2.Example_Sorting(attrib2_val,class_id) # Do sorting on A22
            else: # When Attribute-2 is root
                VFDTleaf.Example_Sorting(attrib2_val,class_id) # Sorting Example of attribute 2
                if attrib2_val == 0: # When Value of A2 is 0
                    Attribute1_1.Example_Sorting(attrib1_val,class_id) # Do sorting on A11
                elif attrib2_val == 1: # When Value of A2 is 1
                    Attribute1_2.Example_Sorting(attrib1_val,class_id) # Do sorting on A12

            n = VFDTleaf.example_seen # examples seen so far (since the root was reset)
            # Evaluate only every nmin examples and only when every class has been seen
            if (np.mod(n,nmin) == 0) and (np.min(VFDTleaf.statistic[:]) != 0):
                if root_is_attribute1: # When A1 is Root
                    Gain2_1 = Attribute2_1.Gain() # Calculating GAIN of left child of Root
                    Gain2_2 = Attribute2_2.Gain() # Calculating GAIN of right child of Root
                    epsilon = H_Bound(R,delta,n) # Calculating H_bound
                    print("No. of example seen: ",n," G21: ",Gain2_1," G22: ",Gain2_2," epsilon: ",epsilon)
                    if np.abs(Gain2_1 - Gain2_2) > epsilon: # When Difference between G21 and G22 is greater than epsilon
                        print(n)
                        if Gain2_1 >= Gain2_2: # When Gain of left child is greater than right child
                            VFDTleaf.set_LeftChild(False,Attribute2_1)
                            print("Gain2_1>=Gain2_2 (left branch of Attribute1 is ready for splitting)")
                        else: # When Gain of right child is greater than left child
                            VFDTleaf.set_RightChild(False,Attribute2_2)
                            print("Gain2_1<Gain2_2 (right branch of Attribute1 is ready for splitting)")
                        # After Selecting the Second node reset the nodes Statistic
                        VFDTleaf.example_seen = 0
                        VFDTleaf.statistic = np.zeros(VFDTleaf.num_class,int)
                        level = level + 1
                else: # When A2 is root
                    Gain1_1 = Attribute1_1.Gain() # Calculating GAIN of left child of Root
                    Gain1_2 = Attribute1_2.Gain() # Calculating GAIN of right child of Root
                    epsilon = H_Bound(R,delta,n) # Calculating H_Bound
                    print("No. of example seen: ",n," G11: ",Gain1_1," G12: ",Gain1_2," epsilon: ",epsilon)
                    if np.abs(Gain1_1 - Gain1_2) > epsilon: # When Difference between G11 and G12 is greater than epsilon
                        print(n)
                        if Gain1_1 >= Gain1_2: # When Gain of left child is greater than right child
                            VFDTleaf.set_LeftChild(False,Attribute1_1)
                            print("Gain1_1>=Gain1_2 (left branch of Attribute2 is ready for splitting)")
                        else: # When Gain of right child is greater than left child
                            VFDTleaf.set_RightChild(False,Attribute1_2)
                            print("Gain1_1<Gain1_2 (right branch of Attribute2 is ready for splitting)")
                        # After Selecting the Second node reset the nodes Statistic
                        VFDTleaf.example_seen = 0
                        VFDTleaf.statistic = np.zeros(VFDTleaf.num_class,int)
                        level = level + 1

No. of example seen:  100  Gain of attribute 1:  0.3958156020033584  Gain of attribute 2:  0.2812908992306927  epsilon:  0.567769242755511
No. of example seen:  200  Gain of attribute 1:  0.46805777390617237  Gain of attribute 2:  0.23406805537549114  epsilon:  0.40147348170157293
No. of example seen:  300  Gain of attribute 1:  0.4591479170272448  Gain of attribute 2:  0.2516291673878229  epsilon:  0.3278017251424843
No. of example seen:  400  Gain of attribute 1:  0.4427020948434207  Gain of attribute 2:  0.2597361225311662  epsilon:  0.2838846213777555
No. of example seen:  500  Gain of attribute 1:  0.4635994573783388  Gain of attribute 2:  0.24481870497302982  epsilon:  0.25391412446698053
No. of example seen:  600  Gain of attribute 1:  0.4591479170272448  Gain of attribute 2:  0.2516291673878229  epsilon:  0.23179082273289942
No. of example seen:  700  Gain of attribute 1:  0.4496968510411249  Gain of attribute 2:  0.2563172881141057  epsilon:  0.21459660262893474
No. of example