In [53]:
# HYPER-PARAMETERS
# Seed passed to np.random.seed so the simulated data set (and every random
# draw made while building the trees) is reproducible from run to run.
SEED = 7
# The Isolation Forest algorithm is simulated on a randomly generated data set
# of X rows (observations) by Y columns (attributes).
X = 7 # the number of observations for simulation
Y = 7 # the number of attributes for simulation
# Layer cut-off for the anomaly vote: each time an observation ends up in a
# leaf whose layer is <= max_layer, its entry in the counter list is
# incremented by 1 (a shallow leaf means the observation was isolated quickly).
max_layer = 2
# The number of trees built (and displayed) by the driver loop.
tree_num = 20

import numpy as np

class Random_Tree:
    """Binary tree node: a payload tuple plus optional left and right children.

    Attributes:
        val: tuple describing the node (its layout is decided by the code
            that builds the tree).
        LHS: left child node, or None when there is no left subtree.
        RHS: right child node, or None when there is no right subtree.
    """

    def __init__(self, val, LHS, RHS):
        # Store the payload and both child links exactly as given.
        self.val, self.LHS, self.RHS = val, LHS, RHS
    
class Isolation_Forest:
    """One randomly grown isolation tree over a 2D numeric data set.

    The tree is built eagerly in the constructor and stored in ``tree`` as
    nested ``Random_Tree`` nodes. Node payloads (``val``) come in two shapes:

    * internal node (and root): ``(n_obs, layer, att_index, threshold, flag)``
    * leaf node: ``(observation_index, layer, flag)``

    ``flag`` is ``""`` for the root (displayed as ROOT), otherwise ``"LHS"``
    or ``"RHS"`` depending on which side of its parent the node hangs on.
    The root sits on layer 0.

    Attributes:
        data_set: the observations as a NumPy array (rows are observations,
            columns are attributes).
        data_size: number of observations.
        num_attributes: number of attributes per observation.
        tree: root ``Random_Tree`` node, or None for an empty data set.
    """

    def __init__(self, data_set):
        # np.asarray hands back an ndarray input unchanged and additionally
        # accepts plain nested lists; the splitting code below relies on
        # NumPy fancy indexing (data_set[index][:, column]).
        self.data_set = np.asarray(data_set)
        self.data_size = 0
        self.num_attributes = 0
        self.tree = None
        self.__initialization()

    def __initialization(self):
        """Record the data set dimensions and grow the tree."""
        self.data_size = len(self.data_set)
        # An empty data set has no first row to measure; it yields an empty
        # tree (tree is None) instead of an IndexError.
        self.num_attributes = len(self.data_set[0]) if self.data_size else 0
        # Start at height -1 so that the root ends up on layer 0.
        self.tree = self.__get_isolation_forest(np.arange(self.data_size), -1, "")

    def __get_isolation_forest(self, index, tree_height, flag):
        """Recursively build the subtree holding the observations in ``index``.

        Args:
            index: sequence of row indices of the observations in this node.
            tree_height: layer of the parent node (-1 when building the root).
            flag: "" for the root, otherwise "LHS" or "RHS".

        Returns:
            A ``Random_Tree`` node, or None when ``index`` is empty.
        """
        if len(index) == 0:
            return None

        layer = tree_height + 1
        if len(index) == 1:
            # A single observation is isolated: emit a leaf.
            return Random_Tree((index[0], layer, flag), None, None)

        # The random draws happen before either child is built, so the
        # sequence of random numbers consumed is depth-first, left to right.
        att_index, threshold, lhs_index, rhs_index = self.__choose_split(index)
        return Random_Tree(
            (len(index), layer, att_index, threshold, flag),
            self.__get_isolation_forest(lhs_index, layer, "LHS"),
            self.__get_isolation_forest(rhs_index, layer, "RHS"),
        )

    def __choose_split(self, index):
        """Pick a split for the observations in ``index`` (two or more rows).

        Returns:
            ``(att_index, threshold, lhs_index, rhs_index)`` where the two
            index lists partition ``index``.
        """
        rows = self.data_set[index]

        # Observations that agree on every attribute can never be separated
        # by "value < threshold": everything would land on the RHS forever
        # and the recursion would not terminate. Peel one observation off per
        # level instead, reporting attribute 0 and its shared value as the
        # (nominal) split. No random numbers are consumed on this path.
        if (rows == rows[0]).all():
            return 0, rows[0][0], list(index[:1]), list(index[1:])

        rand_att_index = np.random.choice(self.num_attributes)
        rand_att = rows[:, rand_att_index]
        # The threshold is drawn from the observed values; when it happens to
        # be the smallest one the LHS side is empty and the same observations
        # simply move one layer down on the RHS.
        threshold = np.random.choice(rand_att)
        lhs_index, rhs_index = self.__node_split(index, rand_att, threshold)
        return rand_att_index, threshold, lhs_index, rhs_index

    def __node_split(self, index, att, threshold):
        """Partition ``index`` by comparing the matching ``att`` values.

        Args:
            index: row indices of the observations being split.
            att: attribute values, aligned position by position with ``index``.
            threshold: split value.

        Returns:
            ``(LHS_list, RHS_list)``: indices whose value is strictly below
            ``threshold``, and all remaining indices, both in input order.
        """
        LHS_list = []
        RHS_list = []
        for observation, value in zip(index, att):
            if value < threshold:
                LHS_list.append(observation)
            else:
                RHS_list.append(observation)
        return LHS_list, RHS_list

    def display_tree(self, counter, layer_limit=None):
        """Print the tree breadth-first and tally shallow leaves in ``counter``.

        Printed per node:

        * root: number of observations, layer, index of the attribute used
          for the split, threshold, and the flag shown as ROOT;
        * internal node: number of observations routed to the node, layer,
          index of the attribute used for the split, threshold, and the flag;
        * leaf: index of the observation, layer, and the flag.

        A blank line is printed after the last node.

        Args:
            counter: mutable sequence indexed by observation; the entry of
                every observation whose leaf layer is <= the limit is
                incremented by 1 (modified in place).
            layer_limit: deepest layer that still counts as a shallow leaf.
                Defaults to the module-level ``max_layer`` setting.
        """
        if layer_limit is None:
            layer_limit = max_layer

        # Walk the tree one layer at a time; within a layer the nodes keep
        # their left-to-right order, which is plain breadth-first order.
        current_level = [] if self.tree is None else [self.tree]
        while current_level:
            next_level = []
            for node in current_level:
                val = node.val
                if len(val) == 5:
                    if val[-1] == "":
                        print("number obs in curr_node: %d; layer: %d; att_index: %d; threshold: %.3f; flag: ROOT" % val[:-1]) # root node
                    else:
                        print("number obs in curr_node: %d; layer: %d; att_index: %d; threshold: %.3f; flag: %s" % val) # internal node
                else:
                    print("Observation: %d; layer: %d; flag: %s" % val) # leaf node
                    observation, layer = val[0], val[1]
                    if layer <= layer_limit:
                        counter[observation] += 1
                next_level.extend(
                    child for child in (node.LHS, node.RHS) if child is not None
                )
            current_level = next_level
        print()
        return

    
# DRIVER FUNCTION
# Seed NumPy first so the simulated data set and all trees are reproducible.
np.random.seed(SEED)
my_data = np.random.rand(X, Y)

# Show the generated observations, one row of attribute values per observation.
print("\nThe randomly constructed 2D data_set is:")
for obs_id, row in enumerate(my_data):
    print("Observation %d: " % obs_id)
    for value in row:
        print("%.3f" % value, end=" ")
    print()
print()

# counter[k] tallies how often observation k lands in a shallow leaf across
# all trees; a high tally suggests the observation is an anomaly.
counter = [0 for _ in range(X)]

# Build the requested number of trees, printing each one and updating the tally.
for _ in range(tree_num):
    my_model = Isolation_Forest(my_data)
    my_model.display_tree(counter)

# Report the observation with the highest tally (the first one on ties).
top_hits = max(counter)
print("The possible outlier (anomaly) is observation: %d" % counter.index(top_hits))
print("The number of times that this observation ends up at a leaf with short distance to root is %d" % top_hits)
                
            


The randomly constructed 2D data_set is:
Observation 0: 
0.076 0.780 0.438 0.723 0.978 0.538 0.501 
Observation 1: 
0.072 0.268 0.500 0.679 0.804 0.381 0.066 
Observation 2: 
0.288 0.910 0.213 0.452 0.931 0.025 0.601 
Observation 3: 
0.950 0.230 0.548 0.909 0.133 0.523 0.750 
Observation 4: 
0.669 0.468 0.205 0.491 0.372 0.477 0.366 
Observation 5: 
0.838 0.769 0.314 0.573 0.276 0.453 0.353 
Observation 6: 
0.657 0.370 0.459 0.719 0.413 0.906 0.180 

number obs in curr_node: 7; layer: 0; att_index: 2; threshold: 0.548; flag: ROOT
number obs in curr_node: 6; layer: 1; att_index: 2; threshold: 0.213; flag: LHS
Observation: 3; layer: 1; flag: RHS
Observation: 4; layer: 2; flag: LHS
number obs in curr_node: 5; layer: 2; att_index: 5; threshold: 0.453; flag: RHS
number obs in curr_node: 2; layer: 3; att_index: 3; threshold: 0.679; flag: LHS
number obs in curr_node: 3; layer: 3; att_index: 0; threshold: 0.076; flag: RHS
Observation: 2; layer: 4; flag: LHS
Observation: 1; layer: 4; flag: RHS