In [248]:
import import_ipynb
import copy
import time
import random
import pandas as pd
import numpy as np
from numpy import genfromtxt
import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle
from mpl_toolkits.mplot3d import Axes3D

In [23]:
class PartitionNode:
    '''
    A partition node, used for both the internal and the leaf nodes of the partition tree.

    The domain of a node is stored in "plain form": [l1, l2, .., ln, u1, u2, .., un].
    For an irregular shape partition the boundary is the enclosing box; its siblings
    have to be exempted from it by the caller.
    '''
    def __init__(self, num_dims = 0, boundary = None, nid = None, pid = None, is_irregular_shape_parent = False,
                 is_irregular_shape = False, num_children = 0, children_ids = None, is_leaf = True, node_size = 0):
        '''
        num_dims: number of dimensions of the domain
        boundary: the domain in plain form (stored by reference); None means an empty list
        nid / pid: node id / parent id
        is_irregular_shape_parent: whether the last child is an irregular shape partition
        is_irregular_shape: whether this partition itself is an irregular shape
        num_children: number of children, should be 0, 2, or 3
        children_ids: ids of the children; for an irregular shape parent the last one
                      should be the irregular partition. None means no children.
        is_leaf: whether this node is a leaf
        node_size: number of records in this partition
        '''
        self.num_dims = num_dims
        # None instead of a literal [] default: a mutable default would be shared by
        # every node created without an explicit boundary / children list.
        self.boundary = [] if boundary is None else boundary
        self.nid = nid
        self.pid = pid
        self.is_irregular_shape_parent = is_irregular_shape_parent
        self.is_irregular_shape = is_irregular_shape
        self.num_children = num_children
        # children_ids is appended to when children are attached, so every node keeps
        # its own private list instead of aliasing the caller's (or the default) one.
        self.children_ids = [] if children_ids is None else list(children_ids)
        self.is_leaf = is_leaf
        self.node_size = node_size

        self.dataset = None # only used in partition algorithms, temporary

    def is_overlap(self, query):
        '''
        query is in plain form, i.e., [l1,l2,...,ln, u1,u2,...,un]
        !query dimension should match the partition dimensions! i.e., all projected or all not projected
        return -1 if the query length does not match 2 * num_dims (error)
        return 0 if no overlap
        return 1 if overlap (the query sticks out of the partition in at least one dimension)
        return 2 if inside (the query lies within the partition in every dimension)
        Both ends of the boundary are treated as inclusive by the comparisons below.
        '''
        if len(query) != 2 * self.num_dims:
            return -1 # error

        inside = True
        for i in range(self.num_dims):
            query_lower, query_upper = query[i], query[self.num_dims + i]
            node_lower, node_upper = self.boundary[i], self.boundary[self.num_dims + i]

            # disjoint in a single dimension means disjoint overall
            if query_lower > node_upper or query_upper < node_lower:
                return 0
            if query_lower < node_lower or query_upper > node_upper:
                inside = False

        return 2 if inside else 1

    def if_split(self, split_dim, split_value):
        '''
        return 2 child nodes if a split take place on given dimension with given value
        This function is only used to simplify the skip calculation process:
        the returned nodes only carry num_dims and a boundary (no ids, no dataset),
        and this node itself is left untouched.
        '''
        # first child keeps the lower part: its upper bound on split_dim becomes split_value
        lower_boundary = self.boundary.copy()
        lower_boundary[split_dim + self.num_dims] = split_value
        # second child keeps the upper part: its lower bound on split_dim becomes split_value
        upper_boundary = self.boundary.copy()
        upper_boundary[split_dim] = split_value
        return PartitionNode(self.num_dims, lower_boundary), PartitionNode(self.num_dims, upper_boundary)

In [221]:
 class PartitionTree:
        '''
        The data structure that represent the partition layout, which also maintain the parent, children relation info
        Designed to provide efficient online query and serialized ability
        
        The node data structure could be checked from the PartitionNode class
        
        '''   
        def __init__(self, num_dims = 0, boundary = [], nid = 0, pid = -1, is_irregular_shape_parent = False,
                     is_irregular_shape = False, num_children = 0, children_ids = [], is_leaf = True, node_size = 0):
            
            # the node id of root should be 0, its pid should be -1
            self.pt_root = PartitionNode(num_dims, boundary, nid, pid, is_irregular_shape_parent, is_irregular_shape, 
                                    num_children, children_ids, is_leaf, node_size)
            self.nid_node_dict = {nid: self.pt_root} # node id to node dictionary
            self.node_count = 1 # the root node
            
        
        # = = = = = public functions (API) = = = = =
        
        def save_tree(self, path):
            node_list = self.__generate_node_list(self.pt_root) # do we really need this step?
            serialized_node_list = self.__serialize(node_list)
            np.savetxt(path, serialized_node_list, delimiter=',')
            return serialized_node_list
            
        def load_tree(self, path):
            serialized_node_list = genfromtxt(path, delimiter=',')
            self.__build_tree_from_serialized_node_list(serialized_node_list)
        
        def query_single(self, query):
            '''
            query is in plain form, i.e., [l1,l2,...,ln, u1,u2,...,un]
            return the overlapped leaf partitions ids!
            '''
            partition_ids = self.__find_overlapped_partition(self.pt_root, query)
            return partition_ids
        
        def query_batch(self, queries):
            '''
            to be implemented
            '''
            pass
        
        def add_node(self, parent_id, child_node):
            child_node.nid = self.node_count + 1
            child_node.pid = parent_id
            self.nid_node_dict[child_node.nid] = child_node
            self.node_count += 1
            
            self.nid_node_dict[parent_id].children_ids.append(child_node.nid)
            self.nid_node_dict[parent_id].num_children += 1
            self.nid_node_dict[parent_id].is_leaf = False
        
        
        def apply_split(self, parent_nid, split_dim, split_value):
            '''
            split a node into 2 sub-nodes by a given dimension and value
            '''
            parent_node = self.nid_node_dict[parent_nid]
            
            # create sub nodes
            child_node1 = copy.deepcopy(parent_node)
            child_node1.boundary[split_dim + child_node1.num_dims] = split_value   
            
            child_node2 = copy.deepcopy(parent_node)
            child_node2.boundary[split_dim] = split_value
            
            # if parent_node.dataset != None: # The truth value of an array with more than one element is ambiguous.
            # https://stackoverflow.com/questions/36783921/valueerror-when-checking-if-variable-is-none-or-numpy-array
            if parent_node.dataset is not None:
                child_node1.dataset = parent_node.dataset[parent_node.dataset[:,split_dim] < split_value]
                child_node1.node_size = len(child_node1.dataset)
                child_node2.dataset = parent_node.dataset[parent_node.dataset[:,split_dim] >= split_value]
                child_node2.node_size = len(child_node2.dataset)
            
            # update current node
            self.add_node(parent_nid, child_node1)
            self.add_node(parent_nid, child_node2)
            del self.nid_node_dict[parent_nid].dataset
            
            return child_node1, child_node2
        
        def get_leaves(self):
            nodes = []
            for nid, node in self.nid_node_dict.items():
                if node.is_leaf:
                    nodes.append(node)
            return nodes
        
        def visualize(self, dims = [0, 1], queries = []):
            '''
            visualize the partition tree's leaf nodes
            '''
            if len(dims) == 2:
                self.__visualize_2d(dims, queries)
            else:
                self.__visualize_3d(dims[0:3], queries)
            
        
        # = = = = = internal functions = = = = =
        
        def __generate_node_list(self, node):
            '''
            recursively add childrens into the list
            '''
            node_list = [node]
            for nid in node.children_ids:
                node_list += self.__generate_node_list(self.nid_node_dict[nid])
            return node_list
        
        def __serialize(self, node_list):
            '''
            convert object to attributes to save
            '''
            serialized_node_list = []
            for node in node_list:
                # follow the same order of attributes in partition class
                attributes = [node.num_dims]
                attributes += node.boundary
                attributes.append(node.nid) # node id = its ow id
                attributes.append(node.pid) # parent id
                attributes.append(1 if node.is_irregular_shape_parent else 0)
                attributes.append(1 if node.is_irregular_shape else 0)
                attributes.append(node.num_children) # number of children
                attributes += node.children_ids
                attributes.append(1 if node.is_leaf else 0)
                attributes.append(node.node_size)
                
                serialized_node_list.append(attributes)
            return serialized_node_list
        
        def __build_tree_from_serialized_node_list(self, serialized_node_list):
            
            self.pt_root = None
            self.nid_node_dict.clear()
            
            for serialized_node in serialized_node_list:
                num_dims = serialized_node[0]
                boundary = serialized_node[1: 1+2*num_dims]
                nid = serialized_node[1+2*num_dims] # node id
                pid = serialized_node[2+2*num_dims] # parent id
                is_irregular_shape_parent = False if serialized_node[3+2*num_dims] == 0 else True
                is_irregular_shape = False if serialized_node[4+2*num_dims] == 0 else True
                num_children = serialized_node[5+2*num_dims]
                children_ids = []
                if num_children != 0:
                    children_ids = serialized_node[1+5+2*num_dims: 1+num_children+1+5+2*num_dims] # +1 for the end exclusive
                is_leaf = False if serialized_node[1+num_children+5+2*num_dims] == 0 else True
                node_size = serialized_node[2+num_children+5+2*num_dims] # don't use -1 in case of match error
                
                node = PartitionNode(num_dims, boundary, nid, pid, is_irregular_shape_parent, 
                                     is_irregular_shape, num_children, children_ids, is_leaf, node_size)
                self.nid_node_dict[nid] = node # update dict
                
            self.pt_root = self.nid_node_dict[0]
            
        def __find_overlapped_partition(self, node, query):
            
            if node.is_leaf:
                return [node.nid] if node.is_overlap(query) > 0 else []
            
            node_id_list = []
            if node.is_overlap(query) <= 0:
                pass
            elif node.is_irregular_shape_parent: # special process for irregular shape partitions!
                overlap_irregular_shape_node_flag = False
                for nid in node.children_ids[0: -1]: # except the last one, should be the irregular shape partition
                    overlap_case = self.nid_node_dict[nid].is_overlap(query)
                    if overlap_case == 1:
                        overlap_irregular_shape_node_flag = True
                    if overlap_case > 0:
                        node_id_list.append(nid)      
                if overlap_irregular_shape_node_flag:
                    node_id_list.append(node.children_ids[-1])
            else:  
                for nid in node.children_ids:
                    node_id_list += self.__find_overlapped_partition(self.nid_node_dict[nid], query)
            return node_id_list
        
        def __visualize_2d(self, dims, queries = [], path = None):
            fig, ax = plt.subplots(1)
            
            num_dims = self.pt_root.num_dims
            plt.xlim(self.pt_root.boundary[dims[0]], self.pt_root.boundary[dims[0]+num_dims])
            plt.ylim(self.pt_root.boundary[dims[1]], self.pt_root.boundary[dims[1]+num_dims])
            
            leaves = self.get_leaves()
            for leaf in leaves:
                
                lower1 = leaf.boundary[dims[0]]
                lower2 = leaf.boundary[dims[1]]             
                upper1 = leaf.boundary[dims[0]+num_dims]
                upper2 = leaf.boundary[dims[1]+num_dims]
                
                rect = Rectangle((lower1,lower2),upper1-lower1,upper2-lower2,fill=False,edgecolor='g',linewidth=1)
                ax.add_patch(rect)
                   
            for query in queries:

                lower1 = query[dims[0]]
                lower2 = query[dims[1]]  
                upper1 = query[dims[0]+num_dims]
                upper2 = query[dims[1]+num_dims]

                rect = Rectangle((lower1,lower2),upper1-lower1,upper2-lower2,fill=False,edgecolor='r',linewidth=1)
                ax.add_patch(rect)

            ax.set_xlabel('dim 1', fontsize=15)
            ax.set_ylabel('dim 2', fontsize=15)
            #plt.xticks(np.arange(0, 400001, 100000), fontsize=10)
            #plt.yticks(np.arange(0, 20001, 5000), fontsize=10)

            plt.tight_layout() # preventing clipping the labels when save to pdf

            if path != None:
                fig.savefig(path)

            plt.show()
        
        %matplotlib notebook
        def __visualize_3d(self, dims, queries = [], path = None):
            fig = plt.figure()
            ax = Axes3D(fig)
            
            num_dims = self.pt_root.num_dims
            plt.xlim(self.pt_root.boundary[dims[0]], self.pt_root.boundary[dims[0]+num_dims])
            plt.ylim(self.pt_root.boundary[dims[1]], self.pt_root.boundary[dims[1]+num_dims])
            ax.set_zlim(self.pt_root.boundary[dims[2]], self.pt_root.boundary[dims[2]+num_dims])
            
            leaves = self.get_leaves()
            for leaf in leaves:
                
                L1 = leaf.boundary[dims[0]]
                L2 = leaf.boundary[dims[1]]
                L3 = leaf.boundary[dims[2]]      
                U1 = leaf.boundary[dims[0]+num_dims]
                U2 = leaf.boundary[dims[1]+num_dims]
                U3 = leaf.boundary[dims[2]+num_dims]
                
                # the 12 lines to form a rectangle
                x = [L1, U1]
                y = [L2, L2]
                z = [L3, L3]
                ax.plot3D(x,y,z,color="g")
                y = [U2, U2]
                ax.plot3D(x,y,z,color="g")
                z = [U3, U3]
                ax.plot3D(x,y,z,color="g")
                y = [L2, L2]
                ax.plot3D(x,y,z,color="g")

                x = [L1, L1]
                y = [L2, U2]
                z = [L3, L3]
                ax.plot3D(x,y,z,color="g")
                x = [U1, U1]
                ax.plot3D(x,y,z,color="g")
                z = [U3, U3]
                ax.plot3D(x,y,z,color="g")
                x = [L1, L1]
                ax.plot3D(x,y,z,color="g")

                x = [L1, L1]
                y = [L2, L2]
                z = [L3, U3]
                ax.plot3D(x,y,z,color="g")
                x = [U1, U1]
                ax.plot3D(x,y,z,color="g")
                y = [U2, U2]
                ax.plot3D(x,y,z,color="g")
                x = [L1, L1]
                ax.plot3D(x,y,z,color="g")
            
            for query in queries:

                L1 = query[dims[0]]
                L2 = query[dims[1]]
                L3 = query[dims[2]]
                U1 = query[dims[0]+num_dims]
                U2 = query[dims[1]+num_dims]
                U3 = query[dims[2]+num_dims]

                # the 12 lines to form a rectangle
                x = [L1, U1]
                y = [L2, L2]
                z = [L3, L3]
                ax.plot3D(x,y,z,color="r")
                y = [U2, U2]
                ax.plot3D(x,y,z,color="r")
                z = [U3, U3]
                ax.plot3D(x,y,z,color="r")
                y = [L2, L2]
                ax.plot3D(x,y,z,color="r")

                x = [L1, L1]
                y = [L2, U2]
                z = [L3, L3]
                ax.plot3D(x,y,z,color="r")
                x = [U1, U1]
                ax.plot3D(x,y,z,color="r")
                z = [U3, U3]
                ax.plot3D(x,y,z,color="r")
                x = [L1, L1]
                ax.plot3D(x,y,z,color="r")

                x = [L1, L1]
                y = [L2, L2]
                z = [L3, U3]
                ax.plot3D(x,y,z,color="r")
                x = [U1, U1]
                ax.plot3D(x,y,z,color="r")
                y = [U2, U2]
                ax.plot3D(x,y,z,color="r")
                x = [L1, L1]
                ax.plot3D(x,y,z,color="r")

            if path != None:
                fig.savefig(path)

            plt.show()

In [214]:
class PartitionAlgorithm:
    '''
    The partition algorithms, including NORA, QdTree and kd-tree.
    The resulting layout is stored in self.partition_tree.
    '''
    def __init__(self):
        self.partition_tree = None


    # = = = = = public functions (API) = = = = =

    def InitializeWithNORA(self):
        pass

    def InitializeWithQDT(self, queries, boundary, dataset, data_threshold):
        '''
        Build a new partition tree over `dataset` with the QdTree strategy.
        queries: queries in plain form, [l1,..,ln, u1,..,un]
        boundary: domain of the root partition in plain form
        dataset: 2D array of records (one column per dimension)
        data_threshold: minimum number of records allowed in a partition
        The dimension of queries should match the dimension of boundary and dataset!
        '''
        # the boundary holds one lower and one upper value per dimension
        num_dims = len(boundary) // 2

        self.partition_tree = PartitionTree(num_dims, boundary)
        root = self.partition_tree.pt_root
        root.node_size = len(dataset)
        root.dataset = dataset

        # generate candidate cuts: every query bound is a possible split position.
        # Duplicates are dropped (keeping first-seen order) as they cannot change the result.
        candidate_cuts = []
        seen_cuts = set()
        for query in queries:
            for dim in range(num_dims):
                for cut in ((dim, query[dim]), (dim, query[dim + num_dims])):
                    if cut not in seen_cuts:
                        seen_cuts.add(cut)
                        candidate_cuts.append(cut)

        self.__QDT(queries, candidate_cuts, data_threshold, root)


    def InitializeWithKDT(self, num_dims, boundary, dataset, data_threshold):
        '''
        Build a new partition tree over `dataset` with the kd-tree strategy.
        num_dims denotes the (first) number of dimension to split
        calls the recursive __KDT method, starting from the first dimension
        '''
        self.partition_tree = PartitionTree(num_dims, boundary)
        self.partition_tree.pt_root.node_size = len(dataset)
        self.partition_tree.pt_root.dataset = dataset
        # start from the first dimension
        self.__KDT(0, data_threshold, self.partition_tree.pt_root)


    def ContinuePartitionWithKDT(self, existing_partition_tree, data_threshold):
        '''
        pass in a PartitionTree instance
        then keep partitioning its leaf nodes with KDT, if available
        (leaves that do not hold a dataset are left as they are)
        '''
        self.partition_tree = existing_partition_tree
        leaves = existing_partition_tree.get_leaves()
        for leaf in leaves:
            self.__KDT(0, data_threshold, leaf)


    # = = = = = internal functions = = = = =

    def __try_split(self, current_node, split_dim, split_value, data_threshold, queries):
        '''
        return the skip gain if split a node from a given dimension and split value
        return 0 when one of the two halves would hold fewer than data_threshold records
        The gain is the number of records the queries have to scan before the split
        minus the number they have to scan after it.
        '''
        sub_dataset1_size = np.count_nonzero(current_node.dataset[:,split_dim] < split_value)
        sub_dataset2_size = current_node.node_size - sub_dataset1_size

        if sub_dataset1_size < data_threshold or sub_dataset2_size < data_threshold:
            return 0

        temp_child_node1, temp_child_node2 = current_node.if_split(split_dim, split_value)

        # calculate skip gain
        num_overlap_current, num_overlap_child1, num_overlap_child2 = 0, 0, 0
        for query in queries:
            if current_node.is_overlap(query) > 0:
                num_overlap_current += 1
            if temp_child_node1.is_overlap(query) > 0:
                num_overlap_child1 += 1
            if temp_child_node2.is_overlap(query) > 0:
                num_overlap_child2 += 1
        skip_gain = (num_overlap_current * current_node.node_size
                     - num_overlap_child1 * sub_dataset1_size
                     - num_overlap_child2 * sub_dataset2_size)
        return skip_gain

    def __QDT(self, queries, candidate_cuts, data_threshold, current_node):
        '''
        Greedily split the leaves of self.partition_tree: every leaf that is large
        enough is cut at the candidate position with the largest positive skip gain,
        until no leaf can be improved any more.
        current_node is kept for interface compatibility; all the leaves of the tree
        are considered, whatever node is passed in.
        '''
        can_split = True
        while can_split:
            can_split = False

            for leaf in self.partition_tree.get_leaves():
                if leaf.node_size < 2 * data_threshold:
                    continue # cannot produce two halves of at least data_threshold records
                if getattr(leaf, 'dataset', None) is None:
                    continue # nothing to evaluate the cuts against

                # get best candidate cut position, evaluated on this leaf
                # (the first cut wins in case of ties)
                best_gain, best_dim, best_value = 0, None, None
                for split_dim, split_value in candidate_cuts:
                    gain = self.__try_split(leaf, split_dim, split_value, data_threshold, queries)
                    if gain > best_gain:
                        best_gain, best_dim, best_value = gain, split_dim, split_value

                if best_dim is not None:
                    # the cost becomes smaller: apply the best cut (not the last one tried)
                    self.partition_tree.apply_split(leaf.nid, best_dim, best_value)
                    can_split = True


    def __KDT(self, current_dim, data_threshold, current_node):
        '''
        Recursively split current_node at the median of current_dim, cycling through
        the dimensions, as long as both halves keep at least data_threshold records.
        The dataset is stored in the PartitionNode, but only as a temporary attribute.
        '''
        # cannot be further split
        if current_node.node_size < 2 * data_threshold:
            return

        # e.g. leaves of a tree restored from file carry no dataset
        dataset = getattr(current_node, 'dataset', None)
        if dataset is None:
            return

        # split the node into equal halves by its current split dimension
        median = np.median(dataset[:,current_dim])

        sub_dataset1_size = np.count_nonzero(dataset[:,current_dim] < median)
        sub_dataset2_size = len(dataset) - sub_dataset1_size

        if sub_dataset1_size < data_threshold or sub_dataset2_size < data_threshold:
            return

        child_node1, child_node2 = self.partition_tree.apply_split(current_node.nid, current_dim, median)

        # update next split dimension (wrap around after the last one)
        next_dim = (current_dim + 1) % current_node.num_dims

        # recursive call on sub nodes
        self.__KDT(next_dim, data_threshold, child_node1)
        self.__KDT(next_dim, data_threshold, child_node2)

In [242]:
class DatasetAndQuerysetHelper:
    '''
    naming:
    dataset: [base_path]/dataset/lineitem_[scale_factor]_[prob_threshold].csv
    domain: [base_path]/dataset/lineitem_[scale_factor]_[prob_threshold]_domains.csv
    queryset: [base_path]/queryset/[prob]/[vary_item]/[vary_val]_[used_dimensions]_[distribution/random].csv
    '''    
    def __init__(self, used_dimensions = None, scale_factor = 100, base_path = 'C:/Users/Cloud/iCloudDrive/NORA_experiments',
                prob_id = 1, vary_id = 0, vary_val = 0, train_percent = 0.5, random_percent = 0):
        
        self.used_dimensions = used_dimensions # i.e., [1,2,3,4]
        self.total_dims = 16 # the dimensions of lineitem table
        self.domain_dims = 8 # the dimensions we used for split and maintain min max for
        
        self.scale_factor = scale_factor
        self.prob_threshold = 1 / self.scale_factor # the probability of an original record being sampled into this dataset
        self.block_size = 1000000 // self.scale_factor # in original file, 1M rows take approximately 128MB
        
        self.base_path = base_path
        self.save_path_data = base_path + '/dataset/lineitem_' + str(scale_factor) + '_' + str(self.prob_threshold) + '.csv'
        self.save_path_domain = base_path + '/dataset/lineitem_' + str(scale_factor) + '_' + str(self.prob_threshold) + '_domains.csv'
        
        self.vary_items = ['default', 'alpha', 'num_dims', 'prob_dims', 'num_X']
        self.vary_id = vary_id
        self.vary_val = vary_val
        
        self.query_base_path = self.base_path + '/queryset/prob' + str(prob_id) + '/' + self.vary_items[vary_id] + '/'
        self.query_file_name = str(vary_val) + '_' + str(self.used_dimensions) # dependent on used_dimensions, so change dim first
        
        self.query_distribution_path = self.query_base_path + self.query_file_name + '_distribution.csv'
        self.query_random_path = self.query_base_path + self.query_file_name + '_random.csv'
        
        self.train_percent = train_percent
        
        # the following are default query generation settings
        self.random_percent = random_percent # usef for query generation
        self.cluster_center_amount = 10
        self.maximum_range_percent = 0.1 # 10% of the corresponding domain
        self.sigma_percent = 0.2 # control the differences in a cluster
        
    
    # = = = = = public functions (API) = = = = =
    
    def set_config(self, scale_factor, base_path, used_dimensions, vary_id, vary_val):
        self.used_dimensions = used_dimensions
        self.scale_factor = scale_factor
        self.prob_threshold = 1 / self.scale_factor
        self.block_size = 1000000 // self.scale_factor # in original file, 1M rows take approximately 128MB
        self.base_path = base_path
        self.save_path_data = base_path + '/dataset/lineitem_' + str(scale_factor) + '_' + str(self.prob_threshold) + '.csv'
        self.save_path_domain = base_path + '/dataset/lineitem_' + str(scale_factor) + '_' + str(self.prob_threshold) + '_domains.csv'
        self.vary_id = vary_id
        self.vary_val = vary_val 
        self.query_base_path = base_path + '/queryset/prob' + str(prob_id) + '/' + self.vary_items[vary_id] + '/'
        self.query_file_name = str(vary_val) + '_' + str(self.used_dimensions) + 
        self.query_distribution_path = self.query_base_path + self.query_file_name + '_distribution.csv'
        self.query_random_path = self.query_base_path + self.query_file_name + '_random.csv'    
        self.train_percent = train_percent
    
    def load_dataset(self, used_dimensions = []):
        '''
        the priority of the used_dimensions argument in the function is higher than the saved attribute version
        domains: [[L1, U1], [L2, U2],...]
        return the dataset projected on selected dimensions
        '''
        dataset = np.genfromtxt(save_path_data, delimiter=',') # the sampled subset
        domains = np.genfromtxt(save_path_domain, delimiter=',') # the domain of that scale
        if used_dimension != []:
            dataset = dataset[:,used_dimensions]
            domains = domains[used_dimensions]
        elif self.used_dimension is not None:
            dataset = dataset[:,self.used_dimensions]
            domains = domains[self.used_dimensions]
        return dataset, domains
    
    def load_queryset(self, return_train_test = True, query_distribution_path = None, query_random_path = None):
        '''
        query is in plain form, i.e., [l1,l2,...,ln, u1,u2,...,un]
        how about the used dimension?
        return the saved queryset, should be projected on selected dimensions.
        '''
        # embed used_dimension info into query file's name
        # when load, will auto matically check whether used_dimension is matched!!! or load will failed
        
        distribution_query, random_query = None, None
        
        if query_distribution_path is not None and query_random_path is not None:
            distribution_query = np.genfromtxt(query_distribution_path, delimiter=',')
            random_query = np.genfromtxt(query_random_path, delimiter=',')
        else:
            distribution_query = np.genfromtxt(self.query_distribution_path, delimiter=',')
            random_query = np.genfromtxt(self.query_random_path, delimiter=',')
        
        if return_train_test:
            training_set, testing_set = self.__convert_to_train_test(distribution_query, random_query)
            return training_set, testing_set
        else:
            return distribution_query, random_query
    
    def generate_dataset_and_save(self, original_table_path, chunk_size = 100000):
        '''
        Sample the original '|' delimited table (the .tbl file produced by the TPCH
        tools) into a .csv dataset and record the min / max of the domain dimensions.
        The table is read chunk by chunk (chunk_size rows at a time) because it may
        be large. The sampled rows go to self.save_path_data and the domains to
        self.save_path_domain.
        '''
        column_ids = list(range(self.total_dims))
        column_names = ['_c'+str(column_id) for column_id in column_ids]

        # one [min, max] pair per domain dimension, updated in place while streaming
        domains = [[float('Infinity'), float('-Infinity')] for _ in range(self.domain_dims)]
        sampled_subset = []

        start_time = time.time()

        chunk_reader = pd.read_table(original_table_path, delimiter='|', usecols=column_ids,
                                     names=column_names, chunksize=chunk_size)
        for batch_count, chunk in enumerate(chunk_reader):
            print('current chunk: ', batch_count)
            chunk.apply(lambda row: self.__process_chunk_sampling(row, domains, sampled_subset), axis=1)

        end_time = time.time()
        print('total processing time: ', end_time - start_time)

        np.savetxt(self.save_path_data, np.array(sampled_subset), delimiter=',')
        np.savetxt(self.save_path_domain, np.array(domains), delimiter=',')
    
    def generate_queryset_and_save(self, query_amount, dim_prob = [], prob_id = 1, vary_id = 0, vary_val = 0, return_train_test = True):
        '''
        generate queryset for given dimensions and save it to the queryset files.
        query_amount: total query amount, including distribution queries and random queries
        dim_prob: the probability of using a given dimension (in used_dimensions) in a query;
                  empty means every selected dimension is always used
        prob_id, vary_id, vary_val: select the folder / file name the queryset is saved to
        other configurations are stored in class attributes
        REMEMBER to change the used_dimensions first if not using the previous one !!!
        return (training_set, testing_set) when return_train_test is True,
        else (distribution_query, random_query)
        '''
        import os # only needed here, to create the queryset folder

        num_random_query = int(query_amount * self.random_percent)
        num_distribution_query = query_amount - num_random_query

        domains = np.genfromtxt(self.save_path_domain, delimiter=',')
        if self.used_dimensions is not None: # None means: keep every dimension
            domains = domains[self.used_dimensions]
        if len(dim_prob) == 0: # by default, use all the selected dimensions
            dim_prob = [1 for i in range(len(domains))]
        maximum_range = [(domains[i,1] - domains[i,0]) * self.maximum_range_percent for i in range(len(domains))]

        distribution_query = self.__generate_distribution_query(num_distribution_query, dim_prob, domains, maximum_range)
        random_query = self.__generate_random_query(num_random_query, dim_prob, domains, maximum_range)

        # refresh query related class attributes
        self.prob_id = prob_id
        self.vary_id = vary_id
        self.vary_val = vary_val
        self.query_base_path = self.base_path + '/queryset/prob' + str(prob_id) + '/' + self.vary_items[vary_id] + '/'
        self.query_file_name = str(vary_val) + '_' + str(self.used_dimensions)
        self.query_distribution_path = self.query_base_path + self.query_file_name + '_distribution.csv'
        self.query_random_path = self.query_base_path + self.query_file_name + '_random.csv'

        # save to the QUERY files; the dataset / domain files must not be overwritten
        os.makedirs(self.query_base_path, exist_ok=True)
        np.savetxt(self.query_distribution_path, distribution_query, delimiter=',')
        np.savetxt(self.query_random_path, random_query, delimiter=',')

        if return_train_test:
            training_set, testing_set = self.__convert_to_train_test(distribution_query, random_query)
            return training_set, testing_set
        else:
            return distribution_query, random_query
        
    
    # = = = = = internal functions = = = = =
    
    def __process_chunk_sampling(self, row, domains, sampled_subset):
        prob = random.uniform(0, 1)
        row_numpy = row.to_numpy()  
        for i in range(len(domains)):
            if row_numpy[i] > domains[i][1]:
                domains[i][1] = row_numpy[i]
            if row_numpy[i] < domains[i][0]:
                domains[i][0] = row_numpy[i]
        if prob <= self.prob_threshold:    
            sampled_subset.append(row_numpy[0:self.domain_dims].tolist())
    
    def __convert_to_train_test(self, distribution_query, random_query):
        train_distribution = distribution_query[0:int(self.train_percent*len(distribution_query))]
        test_distribution = distribution_query[int(self.train_percent*len(distribution_query)):]
        train_random = random_query[0:int(self.train_percent*len(random_query))]
        test_random = random_query[int(self.train_percent*len(random_query)):]
        training_set = np.concatenate((train_distribution, train_random), axis=0)
        testing_set = np.concatenate((test_distribution, test_random), axis=0)
        return training_set, testing_set
    
    def __generate_distribution_query(self, query_amount, dim_prob, domains, maximum_range):
        
        # first, generate cluster centers
        centers = []
        for i in range(self.cluster_center_amount):
            center = [] # [D1, D2,..., Dk]
            for k in range(len(domains)):
                ck = random.uniform(domains[k][0], domains[k][1])
                center.append(ck)
            centers.append(center)

        # second, generate expected range for each dimension for each center
        centers_ranges = []
        for i in range(self.cluster_center_amount):
            ranges = [] # the range in all dimensions for a given center
            for k in range(len(domain)):
                ran = random.uniform(0, maximum_range[k])
                ranges.append(ran)
            centers_ranges.append(ranges)

        # third, generate sigma for each dimension for each center
        centers_sigmas = []
        for i in range(self.cluster_center_amount):
            sigmas = []
            for k in range(len(domain)):
                sigma = random.uniform(0, maximum_range[k] * self.sigma_percent)
                sigmas.append(sigma)
            centers_sigmas.append(sigmas)

        # fourth, generate queries
        distribution_query = [] = []
        for i in range(query_amount):
            # choose a center
            center_index = random.randint(0, cluster_center_amount-1) # this is inclusive            
            query_lower, query_upper = [], []
            for k in range(len(domains)):
                # consider whether or not to use this dimension
                L, U = None
                prob = random.uniform(0, 1)
                if prob > dim_probs[k]:
                    L = domains[k][0]
                    U = domains[k][1]
                else:
                    center = centers[center_index]
                    query_range = centers_ranges[center_index][k]
                    L = center[k] - query_range/2
                    U = center[k] + query_range/2
                    L = random.gauss(L, centers_sigmas[center_index][k])
                    U = random.gauss(U, centers_sigmas[center_index][k])
                    if L <= domains[k][0]:
                        L = domains[k][0]
                    if U >= domains[k][1]:
                        U = domains[k][1]
                    if L > U:
                        L, U = U, L
                query_lower.append(L)
                query_upper.append(U)
            distribution_query.append(query_lower + query_upper)
        return distribution_query
    
    def __generate_random_query(self, query_amount, dim_prob, domains, maximum_range):
        random_query = []
        for i in range(query_amount):
            query_lower, query_upper = [], []
            for k in range(len(domains)):     
                # consider whether or not to use this dimension
                L, U = None
                prob = random.uniform(0, 1)
                if prob > dim_prob[k]:
                    L = domains[k][0]
                    U = domains[k][1]
                else:
                    center = random.uniform(domains[k][0], domains[k][1])
                    query_range = random.uniform(0, self.maximum_range[k])
                    L = center - query_range/2
                    U = center + query_range/2
                    if L <= domainS[k][0]:
                        L = domain[k][0]
                    if U >= domain[k][1]:
                        U = domain[k][1]
                query_lower.append(L)
                query_upper.append(U)
            random_query.append(query_lower + query_upper)
        return random_query

In [104]:
# = = = Test = = =
# Smoke test: build a 2-dimensional tree and attach one extra node to it.

# presumably these arguments describe the root node (nid 0) - confirm against PartitionTree.__init__
par_tree = PartitionTree(num_dims = 2, boundary = [0,0,2,4], nid = 0, pid = -1, is_irregular_shape_parent = False,
                         is_irregular_shape = False, num_children = 0, children_ids = [], is_leaf = True, node_size = 100)

# a node with nid 1 whose parent id is 0; boundary is in plain form [l1,l2, u1,u2]
new_node = PartitionNode(num_dims = 2, boundary = [0,0,1,2], nid = 1, pid = 0, is_irregular_shape_parent = False,
                 is_irregular_shape = False, num_children = 0, children_ids = [], is_leaf = True, node_size = 20)

# presumably the first argument is the parent's nid - confirm against PartitionTree.add_node
par_tree.add_node(0, new_node)



In [105]:
# Deep-copy the node and change the copy's boundary list; the two cells below
# display both boundaries to check that the original node is not affected.
another_node = copy.deepcopy(new_node)
another_node.boundary[1] = 3

In [106]:
new_node.boundary

[0, 0, 1, 2]

In [107]:
another_node.boundary

[0, 3, 1, 2]

In [86]:
par_tree.nid_node_dict
# par_tree.nid_node_dict[0].is_leaf

{0: <__main__.PartitionNode at 0x176cf11bda0>,
 1: <__main__.PartitionNode at 0x176cf11bdd8>}

In [87]:
par_tree.get_leaves()

[<__main__.PartitionNode at 0x176cf11bdd8>]

In [88]:
serialized_nodes = par_tree.save_tree("")

In [89]:
par_tree.query_single([0,0,1,1])

[1]

In [90]:
par_tree2 = PartitionTree()

In [91]:
# Call the private (name-mangled) method directly to rebuild par_tree2 from the
# node list returned by save_tree above.
par_tree2._PartitionTree__build_tree_from_serialized_node_list(serialized_nodes)

In [96]:
# par_tree2.pt_root
par_tree2.nid_node_dict[1].boundary

[0, 0, 1, 2]

In [109]:
# = = = = = Test PartitionAlgorithm = = = = =
# Configuration and data loading for the PartitionAlgorithm tests below.

scale_factor = 100
prob_threshold = 1 / scale_factor # presumably the per-row sampling probability used when the CSV was produced - confirm
total_dims = 16 # the dimensions of lineitem table
domain_dims = 8 # the dimensions we used and maintain min max for
chunk_size = 100000 # 0.1M

# base_table_path = 'C:/Users/Cloud/iCloudDrive/HUAWEI_LKD/9a84f6cd-727f-4f10-ae95-10a0214e10a4-tpc-h-tool/2.18.0_rc2/dbgen/lineitem_'
# table_path = base_table_path + str(scale_factor) + '.tbl'

# NOTE(review): absolute, machine-specific path - the cell only runs where these files exist
base_save_path = 'C:/Users/Cloud/iCloudDrive/HUAWEI_LKD/Dataset/Robust/dataset/lineitem_'
save_path_data = base_save_path + str(scale_factor) + '_' + str(prob_threshold) + '.csv'
save_path_domain = base_save_path + str(scale_factor) + '_' + str(prob_threshold) + '_domains.csv'

# by default, the sampled size is always equal to 6M (i.e., using scale factor 1), so for a higher scale factor we need to divide it
block_size = 1000000 // scale_factor # in original file, 1M rows take approximately 128MB

# = = = Data Loading = = =
dataset = np.genfromtxt(save_path_data, delimiter=',') # the sampled subset
domains = np.genfromtxt(save_path_domain, delimiter=',') # the domain of that scale

# Configuration
used_dimensions = [1,2,3,4] # 0-based indices, i.e., the second through fifth dimensions

# keep only the selected dimensions: columns of the dataset, rows of the domains
# (this overwrites the loaded arrays, so re-running needs the loading lines above again)
dataset = dataset[:,used_dimensions]
domains = domains[used_dimensions]

In [209]:
dataset.shape
domains

array([[1.e+00, 2.e+07],
       [1.e+00, 1.e+06],
       [1.e+00, 7.e+00],
       [1.e+00, 5.e+01]])

In [222]:
pa = PartitionAlgorithm()
# flatten the per-dimension [min, max] rows into plain form [l1,..,ln, u1,..,un]
boundary = [interval[0] for interval in domains]+[interval[1] for interval in domains]
# presumably the first argument is the number of dimensions (4 used dimensions) - confirm against InitializeWithKDT
pa.InitializeWithKDT(4, boundary, dataset, chunk_size)

In [223]:
pa.partition_tree.visualize([0,1,2])

<IPython.core.display.Javascript object>

In [224]:
pa.partition_tree.node_count

87

In [225]:
pa.partition_tree.pt_root.boundary

[1.0, 1.0, 1.0, 1.0, 20000000.0, 1000000.0, 7.0, 50.0]

In [226]:
leave_nodes = pa.partition_tree.get_leaves()

In [227]:
leave_nodes[-1].nid

87

In [228]:
leave_nodes[0].boundary
# leave_nodes[0].pid

[1.0, 1.0, 1.0, 1.0, 4981078.0, 499904.0, 3.0, 26.0]

In [229]:
leave_nodes[1].boundary
# leave_nodes[1].pid

[4981078.0, 1.0, 1.0, 1.0, 10003198.0, 499904.0, 3.0, 26.0]

In [184]:
leave_nodes[2].boundary
# leave_nodes[2].pid

[1.0, 1.0, 1.0, 26.0, 5006013.0, 499904.0, 3.0, 50.0]

In [202]:
leave_nodes[0].pid
leave_nodes[0].num_dims
leave_nodes[0].node_size

173730

In [164]:
pa.partition_tree.nid_node_dict[8].children_ids

[10, 11]

In [143]:
pa.partition_tree.nid_node_dict[8].boundary

[0, 0, 1, 249824.0]

In [169]:
pa.partition_tree.nid_node_dict[9].boundary
# pa.partition_tree.nid_node_dict[9].children_ids

[0, 249824.0, 1, 2]

In [None]:
# NOTE(review): the node is created with nid = 1 and pid = 0 but is added under
# node 87 below - confirm whether add_node reassigns these ids.
new_node = PartitionNode(num_dims = 4, boundary = [0,0,1,2,3,3,3,3], nid = 1, pid = 0, is_irregular_shape_parent = False,
                 is_irregular_shape = False, num_children = 0, children_ids = [], is_leaf = True, node_size = 20)

pa.partition_tree.add_node(87, new_node)

In [243]:
# Load a queryset through the helper with its default settings; the result is
# unpacked as a (training, testing) pair whose shapes are shown in the next two cells.
helper = DatasetAndQuerysetHelper()
training_set, testing_set = helper.load_queryset()

In [244]:
training_set.shape

(50, 8)

In [245]:
testing_set.shape

(50, 8)

In [None]:
# Same KD-tree initialisation as before, but with data loaded through the helper.
# Relies on `helper` and on `chunk_size` from earlier cells.
dataset, domains = helper.load_dataset([1,2,3,4])
pa = PartitionAlgorithm()
# flatten the per-dimension [min, max] rows into plain form [l1,..,ln, u1,..,un]
boundary = [interval[0] for interval in domains]+[interval[1] for interval in domains]
pa.InitializeWithKDT(4, boundary, dataset, chunk_size)

In [252]:
def FIFO_simulator(pages_str, frame_size = 3):
    '''
    Simulate first-in-first-out page replacement.

    pages_str: page reference sequence as a string; every digit character is one
               page number (so pages are limited to 0-9) and any other character
               is ignored as a separator
    frame_size: number of available page frames, must be at least 1
    return: the number of page faults (also printed together with the parsed sequence)
    raises: ValueError if frame_size is smaller than 1
    '''
    if frame_size < 1:
        raise ValueError("frame_size must be at least 1")

    pages = [int(c) for c in pages_str if '0' <= c <= '9']

    frames = [] # resident pages, oldest arrival first
    page_faults = 0

    for p in pages:
        if p in frames:
            # hit: FIFO eviction order depends on arrival time only, so nothing moves
            continue
        page_faults += 1
        if len(frames) >= frame_size:
            frames.pop(0) # evict the page that has been resident the longest
        frames.append(p)

    print("FIFO page_faluts:", page_faults, "Sequence:", pages)
    return page_faults

def LRU_simulator(pages_str, frame_size = 3):
    '''
    Simulate least-recently-used page replacement.

    pages_str: page reference sequence as a string; every digit character is one
               page number (so pages are limited to 0-9) and any other character
               is ignored as a separator
    frame_size: number of available page frames, must be at least 1
    return: the number of page faults (also printed together with the parsed sequence)
    raises: ValueError if frame_size is smaller than 1
    '''
    if frame_size < 1:
        raise ValueError("frame_size must be at least 1")

    pages = [int(c) for c in pages_str if '0' <= c <= '9']

    frames = [] # resident pages, least recently used first
    page_faults = 0

    for p in pages:
        if p in frames:
            # hit (also while the frames are still filling up): only refresh recency
            frames.remove(p)
        else:
            page_faults += 1
            if len(frames) >= frame_size:
                frames.pop(0) # evict the least recently used page
        frames.append(p) # p is now the most recently used page

    print("LRU page_faluts:", page_faults, "Sequence:", pages)
    return page_faults

In [253]:
LRU_simulator("0 1 2 3 2 4 5 2 4 3 2 0 4 8 1 0 2 4 5 1 2 4 3 2 0")

page_faluts: 20 Sequence: [0, 1, 2, 3, 2, 4, 5, 2, 4, 3, 2, 0, 4, 8, 1, 0, 2, 4, 5, 1, 2, 4, 3, 2, 0]
