In [None]:
import findspark
# findspark.init() has to run before `pyspark` is imported in the next cell; it is what makes the
# local Spark installation importable from this kernel.
findspark.init() # this must be executed before the below import

In [None]:
from pyspark.sql import SparkSession
import pyarrow as pa
import pyarrow.parquet as pq
# Shared Spark session for every query in this notebook: local master, 8g executor heap,
# 8g driver heap and 8g of off-heap memory.
spark = (
    SparkSession.builder
    .master("local")
    .appName("Python Spark SQL Execution")
    .config("spark.executor.memory", "8g")
    .config("spark.driver.memory", "8g")
    .config("spark.memory.offHeap.enabled", True)
    .config("spark.memory.offHeap.size", "8g")
    .getOrCreate()
)

In [None]:
# HDFS client used by query_with_parquets to open partition files and read their parquet metadata.
# NOTE(review): `pa.hdfs.connect` is pyarrow's legacy HDFS interface; newer pyarrow releases
# provide `pyarrow.fs.HadoopFileSystem` instead — confirm the installed version still ships it.
fs = pa.hdfs.connect(host='10.77.110.133', port=9001, user='liupengju')
import numpy as np
import time
import rtree
from rtree import index
from partition_tree import PartitionTree

In [None]:
def find_overlap_parquets(query, partition_index):
    '''
    Return the ids of every partition that the index reports as intersecting the query.

    query: sequence of (lower, upper) pairs, one pair per dimension.
    partition_index: object exposing `intersection(border)`, where border is the
        non-interleaved tuple (l_1, ..., l_n, u_1, ..., u_n).
    '''
    border = []
    # First pass collects all lower bounds, second pass all upper bounds.
    for side in (0, 1):
        border.extend([bounds[side] for bounds in query])
    return list(partition_index.intersection(tuple(border)))


def transform_query_to_sql(query, used_dims, column_name_dict, hdfs_path, querytype=0, pids=None):
    '''
    Build the Spark SQL statement for one range query over parquet files.

    query: flat sequence [l_1, ..., l_n, u_1, ..., u_n] with n == len(used_dims);
        query[i] is the (exclusive) lower bound and query[n + i] the (exclusive)
        upper bound applied to used_dims[i].
    used_dims: dimension ids constrained by the query.
    column_name_dict: maps a dimension id to its column name.
    hdfs_path: location of the parquet data.
    querytype: 0 -> SELECT * with the range filter
               1 -> SELECT COUNT(*) with the range filter
               2 -> SELECT variance(_c0) with the range filter
               3 -> SELECT * without any filter
               4 -> SELECT _c1,_c2,_c3 without any filter
    pids: optional partition ids; when non-empty, only the files
        `<hdfs_path>/partition_{id,id,...}.parquet` are read.

    Returns the SQL string.
    Raises ValueError for an unknown querytype (the previous behaviour was to return
    the bare predicate, which is not a valid statement).
    '''
    num_used = len(used_dims)
    predicates = []
    for i, dim in enumerate(used_dims):
        column = column_name_dict[dim]
        predicates.append(column + '>' + str(query[i]))
        predicates.append(column + '<' + str(query[num_used + i]))
    where_clause = ' and '.join(predicates)

    if pids is not None and len(pids) != 0:
        # Format each id explicitly as an int: relying on str(set(...)) leaks the element
        # repr into the path for NumPy scalars or floats (e.g. '{np.int64(3)}', '{3.0}').
        pid_glob = '{' + ','.join(str(int(pid)) for pid in set(pids)) + '}'  # '{1,2,3}'
        hdfs_path = hdfs_path + '/partition_' + pid_glob + ".parquet"

    source = "parquet.`" + hdfs_path + "`"
    filtered_selects = {
        0: "SELECT * FROM ",
        1: "SELECT COUNT(*) FROM ",
        2: "SELECT variance(_c0) FROM ",
    }
    unfiltered_selects = {
        3: "SELECT * FROM ",
        4: "SELECT _c1,_c2,_c3 FROM ",
    }

    if querytype in filtered_selects:
        sql = filtered_selects[querytype] + source
        # With no constrained dimension there is nothing to filter on; omit the clause
        # instead of emitting a dangling WHERE.
        if where_clause:
            sql += "WHERE " + where_clause
        return sql
    if querytype in unfiltered_selects:
        return unfiltered_selects[querytype] + source
    raise ValueError("unsupported querytype: " + repr(querytype))


def query_with_parquets(query, used_dims, column_name_dict, hdfs_path, querytype=0, partition_tree=None,
                        print_execution_time=False):
    '''
    Run one query through Spark SQL and report timing and data-volume statistics.

    Relies on the notebook-level `spark` session and `fs` HDFS client.

    query, used_dims, column_name_dict, hdfs_path, querytype: forwarded to
        transform_query_to_sql.
    partition_tree: optional tree used to restrict the scan; its `query_single(query)`
        result is taken as the list of partition ids to read. When None, the statement
        targets hdfs_path itself and both partition-size statistics are reported as 0.
    print_execution_time: also print the translation and execution times.

    Returns a tuple
        (query_translation_time, query_execution_time, number_of_result_rows,
         rows_in_scanned_files, node_size_sum)
    where rows_in_scanned_files sums `num_rows` from each scanned file's parquet
    metadata and node_size_sum sums `node_size` of the matching tree nodes.
    '''
    start_time = time.time()

    if partition_tree is None:
        # No layout information: nothing to prune and no per-partition statistics.
        pids = []
        sql = transform_query_to_sql(query, used_dims, column_name_dict, hdfs_path, querytype)
    else:
        # Looked up once and reused for the statistics below.
        pids = partition_tree.query_single(query)
        sql = transform_query_to_sql(query, used_dims, column_name_dict, hdfs_path, querytype, pids)
        print("pids:", pids)

    end_time_1 = time.time()
    query_result = spark.sql(sql).collect()
    end_time_2 = time.time()

    # Size recorded in the partition tree for each matching node.
    node_size_sum = sum(partition_tree.nid_node_dict[pid].node_size for pid in pids)

    # Row count taken from the metadata of each matching parquet file.
    scanned_rows = 0
    for pid in pids:
        parquet_path = hdfs_path + '/partition_' + str(pid) + ".parquet"
        fw = fs.open(parquet_path, 'rb')
        try:
            meta = pa.parquet.read_metadata(fw, memory_map=False).to_dict()
            scanned_rows += meta['num_rows']
        finally:
            # Release the HDFS handle even when the metadata cannot be read.
            fw.close()

    query_translation_time = end_time_1 - start_time
    query_execution_time = end_time_2 - end_time_1
    print('query execution time: ', query_execution_time,' parquet size:', scanned_rows)

    if print_execution_time:
        print('query translation time: ', query_translation_time)
        print('query execution time: ', query_execution_time)

    # Only the result length is returned; keeping the rows themselves takes too much memory.
    return (query_translation_time, query_execution_time, len(query_result), scanned_rows, node_size_sum)

In [None]:
def load_query(path):
    '''
    Read a space-delimited numeric file into a NumPy array.

    The array is returned exactly as loaded; it is not reshaped into (lower, upper) pairs.
    '''
    return np.genfromtxt(path, delimiter=' ')


def kdnode_2_border(kdnode):
    '''
    Flatten a kd-node's per-dimension [lower, upper] domains (stored in kdnode[0]) into the
    non-interleaved tuple (l_1, ..., l_n, u_1, ..., u_n).
    '''
    border = []
    # side 0 gathers every lower bound, side 1 every upper bound
    for side in (0, 1):
        for domain in kdnode[0]:
            border.append(domain[side])
    return tuple(border)


def load_partitions_from_file(path):
    '''
    Load kd-nodes from a comma-separated partition file.

    Each line is a stretched kd-node:
        [num_dims, l1,l2,...,ln, u1,u2,...,un, size, id, pid, left_child_id, right_child_id]
    Qd-tree partition files omit the last 4 attributes.

    Returns a list with one entry per line: [domains, size] or
    [domains, size, id, pid, left_child_id, right_child_id], where domains is a list of
    [lower, upper] pairs, one per dimension. An empty file yields an empty list.
    '''
    # genfromtxt returns a 1-D array for a single-line file; normalise to 2-D so that a
    # layout with exactly one partition can be indexed like any other.
    stretched_kdnodes = np.atleast_2d(np.genfromtxt(path, delimiter=','))
    if stretched_kdnodes.size == 0:
        return []

    num_dims = int(stretched_kdnodes[0, 0])
    size_col = 2 * num_dims + 1
    kdnodes = []
    for node in stretched_kdnodes:
        domains = [[node[k + 1], node[1 + num_dims + k]] for k in range(num_dims)]
        row = [domains, node[size_col]]
        # to be compatible with qd-tree's partition, that do not have the last 4 attributes
        if len(node) > size_col + 1:
            row.extend([node[-4], node[-3], node[-2], node[-1]])
        kdnodes.append(row)
    return kdnodes


# def prepare_partition_index(partition_path):
#     partitions = load_partitions_from_file(partition_path)

#     p = index.Property()
#     p.leaf_capacity = 32
#     p.index_capacity = 32
#     p.NearMinimumOverlaoFactor = 16
#     p.fill_factor = 0.8
#     p.overwrite = True
#     pidx = index.Index(properties = p)

#     partition_index = index.Index(properties = p)
#     for i in range(len(partitions)):
#         partition_index.insert(i, kdnode_2_border(partitions[i]))

#     return partition_index

def batch_query(queryset, used_dims, column_name_dict, hdfs_path, querytype=0, partition_path=""):
    '''
    Run every query of `queryset` against one partition layout and print summary statistics.

    queryset: sequence of queries, each in the format expected by query_with_parquets.
    used_dims, column_name_dict, hdfs_path, querytype: forwarded to query_with_parquets.
    partition_path: file from which the PartitionTree describing the layout is loaded.

    Prints the average result size, the average number of rows in the scanned parquet
    files, and the total and average query execution time. Returns None.
    '''
    num_queries = len(queryset)
    if num_queries == 0:
        # Nothing to run; the averages below would otherwise divide by zero.
        print('empty query set, nothing to run')
        return

    partition_tree = PartitionTree(len(used_dims))
    partition_tree.load_tree(partition_path)

    results = []
    for count, query in enumerate(queryset):
        result = query_with_parquets(query, used_dims, column_name_dict, hdfs_path, querytype, partition_tree)
        print('finish query#', count)
        results.append(result)

    # result layout: (translation_time, execution_time, result_rows, scanned_rows, node_size_sum)
    total_response_time = sum(result[1] for result in results)
    result_size = sum(result[2] for result in results)
    actual_data_size = sum(result[3] for result in results)

    avg_result_size = int(result_size // num_queries)
    avg_actual_data_size = int(actual_data_size // num_queries)
    print('average result size: ', avg_result_size)
    print('average actual data size: ', avg_actual_data_size)
    print('total query response time: ', total_response_time)
    print('average query response time: ', total_response_time / num_queries)

In [None]:
# ==== experiment configuration: dataset scale, workload and storage locations ====
scale_factor = 10
problem_type = 2
query_path = '/home/liupengju/pycharmProjects/NORA_JOIN_SIMULATION/NORA_experiments/queryset/'

# Training workload for the selected problem type and scale: prob<problem_type>_<scale_factor>_train.csv
training_set = np.genfromtxt(
    query_path + "prob{}_{}_train.csv".format(problem_type, scale_factor),
    delimiter=',',
)

# Dimensions constrained by the queries, and the dimension-id -> column-name mapping (_c0 .. _c15).
used_dims = [1, 2, 3]
num_dims = 16
column_names = ['_c{}'.format(dim) for dim in range(num_dims)]
column_name_dict = {}
for i in range(num_dims):
    column_name_dict[i] = column_names[i]

# HDFS directories holding the merged partition files of each layout.
# Every path follows <base><layout>/prob<problem_type>/scale<scale_factor>/merged/, so changing
# scale_factor (10, 50 or 100) above is enough to switch datasets.
hdfs_base_path = 'hdfs://10.77.110.133:9001/par_nora/'
hdfs_path_nora = '{}NORA/prob{}/scale{}/merged/'.format(hdfs_base_path, problem_type, scale_factor)
hdfs_path_qdtree = '{}QdTree/prob{}/scale{}/merged/'.format(hdfs_base_path, problem_type, scale_factor)
hdfs_path_paw = '{}PAW/prob{}/scale{}/merged/'.format(hdfs_base_path, problem_type, scale_factor)

# Statement issued per query, as understood by transform_query_to_sql:
# 0: SELECT *;  1: SELECT COUNT(*);  2: SELECT variance(_c0)
querytype = 2

# Local partition-tree files, named prob<problem_type>_<layout>_scale<scale_factor>.
partition_base_path = '/home/liupengju/pycharmProjects/NORA_JOIN_SIMULATION/PartitionLayout/'
nora_partition_path = '{}prob{}_nora_scale{}'.format(partition_base_path, problem_type, scale_factor)
qdtree_partition_path = '{}prob{}_qdtree_scale{}'.format(partition_base_path, problem_type, scale_factor)
paw_partition_path = '{}prob{}_paw_scale{}'.format(partition_base_path, problem_type, scale_factor)


In [30]:
def load_query(path):
    '''
    Read a space-delimited numeric file into a NumPy array.

    The array is returned exactly as loaded; it is not reshaped into (lower, upper) pairs.
    '''
    return np.genfromtxt(path, delimiter=' ')


def kdnode_2_border(kdnode):
    '''
    Flatten a kd-node's per-dimension [lower, upper] domains (stored in kdnode[0]) into the
    non-interleaved tuple (l_1, ..., l_n, u_1, ..., u_n).
    '''
    border = []
    # side 0 gathers every lower bound, side 1 every upper bound
    for side in (0, 1):
        for domain in kdnode[0]:
            border.append(domain[side])
    return tuple(border)


def load_partitions_from_file(path):
    '''
    Load kd-nodes from a comma-separated partition file.

    Each line is a stretched kd-node:
        [num_dims, l1,l2,...,ln, u1,u2,...,un, size, id, pid, left_child_id, right_child_id]
    Qd-tree partition files omit the last 4 attributes.

    Returns a list with one entry per line: [domains, size] or
    [domains, size, id, pid, left_child_id, right_child_id], where domains is a list of
    [lower, upper] pairs, one per dimension. An empty file yields an empty list.
    '''
    # genfromtxt returns a 1-D array for a single-line file; normalise to 2-D so that a
    # layout with exactly one partition can be indexed like any other.
    stretched_kdnodes = np.atleast_2d(np.genfromtxt(path, delimiter=','))
    if stretched_kdnodes.size == 0:
        return []

    num_dims = int(stretched_kdnodes[0, 0])
    size_col = 2 * num_dims + 1
    kdnodes = []
    for node in stretched_kdnodes:
        domains = [[node[k + 1], node[1 + num_dims + k]] for k in range(num_dims)]
        row = [domains, node[size_col]]
        # to be compatible with qd-tree's partition, that do not have the last 4 attributes
        if len(node) > size_col + 1:
            row.extend([node[-4], node[-3], node[-2], node[-1]])
        kdnodes.append(row)
    return kdnodes


# def prepare_partition_index(partition_path):
#     partitions = load_partitions_from_file(partition_path)

#     p = index.Property()
#     p.leaf_capacity = 32
#     p.index_capacity = 32
#     p.NearMinimumOverlaoFactor = 16
#     p.fill_factor = 0.8
#     p.overwrite = True
#     pidx = index.Index(properties = p)

#     partition_index = index.Index(properties = p)
#     for i in range(len(partitions)):
#         partition_index.insert(i, kdnode_2_border(partitions[i]))

#     return partition_index

def batch_query(queryset, used_dims, column_name_dict, hdfs_path, querytype=0, partition_path=""):
    '''
    Run every query of `queryset` against one partition layout and print summary statistics.

    queryset: sequence of queries, each in the format expected by query_with_parquets.
    used_dims, column_name_dict, hdfs_path, querytype: forwarded to query_with_parquets.
    partition_path: file from which the PartitionTree describing the layout is loaded.

    Prints the average result size, the average number of rows in the scanned parquet
    files, and the total and average query execution time. Returns None.
    '''
    num_queries = len(queryset)
    if num_queries == 0:
        # Nothing to run; the averages below would otherwise divide by zero.
        print('empty query set, nothing to run')
        return

    partition_tree = PartitionTree(len(used_dims))
    partition_tree.load_tree(partition_path)

    results = []
    for count, query in enumerate(queryset):
        result = query_with_parquets(query, used_dims, column_name_dict, hdfs_path, querytype, partition_tree)
        print('finish query#', count)
        results.append(result)

    # result layout: (translation_time, execution_time, result_rows, scanned_rows, node_size_sum)
    total_response_time = sum(result[1] for result in results)
    result_size = sum(result[2] for result in results)
    actual_data_size = sum(result[3] for result in results)

    avg_result_size = int(result_size // num_queries)
    avg_actual_data_size = int(actual_data_size // num_queries)
    print('average result size: ', avg_result_size)
    print('average actual data size: ', avg_actual_data_size)
    print('total query response time: ', total_response_time)
    print('average query response time: ', total_response_time / num_queries)

In [31]:
# ==== experiment configuration: dataset scale, workload and storage locations ====
scale_factor = 10
problem_type = 2
query_path = '/home/liupengju/pycharmProjects/NORA_JOIN_SIMULATION/NORA_experiments/queryset/'

# Training workload for the selected problem type and scale: prob<problem_type>_<scale_factor>_train.csv
training_set = np.genfromtxt(
    query_path + "prob{}_{}_train.csv".format(problem_type, scale_factor),
    delimiter=',',
)

# Dimensions constrained by the queries, and the dimension-id -> column-name mapping (_c0 .. _c15).
used_dims = [1, 2, 3]
num_dims = 16
column_names = ['_c{}'.format(dim) for dim in range(num_dims)]
column_name_dict = {}
for i in range(num_dims):
    column_name_dict[i] = column_names[i]

# HDFS directories holding the merged partition files of each layout.
# Every path follows <base><layout>/prob<problem_type>/scale<scale_factor>/merged/, so changing
# scale_factor (10, 50 or 100) above is enough to switch datasets.
hdfs_base_path = 'hdfs://10.77.110.133:9001/par_nora/'
hdfs_path_nora = '{}NORA/prob{}/scale{}/merged/'.format(hdfs_base_path, problem_type, scale_factor)
hdfs_path_qdtree = '{}QdTree/prob{}/scale{}/merged/'.format(hdfs_base_path, problem_type, scale_factor)
hdfs_path_paw = '{}PAW/prob{}/scale{}/merged/'.format(hdfs_base_path, problem_type, scale_factor)

# Statement issued per query, as understood by transform_query_to_sql:
# 0: SELECT *;  1: SELECT COUNT(*);  2: SELECT variance(_c0)
querytype = 2

# Local partition-tree files, named prob<problem_type>_<layout>_scale<scale_factor>.
partition_base_path = '/home/liupengju/pycharmProjects/NORA_JOIN_SIMULATION/PartitionLayout/'
nora_partition_path = '{}prob{}_nora_scale{}'.format(partition_base_path, problem_type, scale_factor)
qdtree_partition_path = '{}prob{}_qdtree_scale{}'.format(partition_base_path, problem_type, scale_factor)
paw_partition_path = '{}prob{}_paw_scale{}'.format(partition_base_path, problem_type, scale_factor)


In [32]:
# Qd-Tree layout: run the training workload against its HDFS files and partition tree.
batch_query(
    training_set,
    used_dims,
    column_name_dict,
    hdfs_path_qdtree,
    querytype=querytype,
    partition_path=qdtree_partition_path,
)

pids: [371]
query execution time:  0.9663307666778564  parquet size: 212842
finish query# 0
pids: [315, 308]
query execution time:  0.1431427001953125  parquet size: 453248
finish query# 1
pids: [329, 403, 404]
query execution time:  4.598523378372192  parquet size: 813850
finish query# 2
pids: [477, 478]
query execution time:  0.14386534690856934  parquet size: 212865
finish query# 3
pids: [441, 442]
query execution time:  0.13976740837097168  parquet size: 256178
finish query# 4
pids: [380, 220]
query execution time:  1.2594013214111328  parquet size: 501172
finish query# 5
pids: [332]
query execution time:  0.13219809532165527  parquet size: 171969
finish query# 6
pids: [298]
query execution time:  0.12680602073669434  parquet size: 115541
finish query# 7
pids: [312, 437]
query execution time:  0.31265807151794434  parquet size: 360642
finish query# 8
pids: [318, 215]
query execution time:  1.7723743915557861  parquet size: 508386
finish query# 9
pids: [423]
query execution time:  0

In [33]:
# PAW layout: run the training workload against its HDFS files and partition tree.
batch_query(
    training_set,
    used_dims,
    column_name_dict,
    hdfs_path_paw,
    querytype=querytype,
    partition_path=paw_partition_path,
)

pids: [491]
query execution time:  0.9224138259887695  parquet size: 111089
finish query# 0
pids: [450, 461]
query execution time:  0.13250017166137695  parquet size: 213663
finish query# 1
pids: [329, 411, 508, 507]
query execution time:  4.47746467590332  parquet size: 813850
finish query# 2
pids: [523]
query execution time:  0.13434457778930664  parquet size: 106523
finish query# 3
pids: [459, 460]
query execution time:  0.13706469535827637  parquet size: 256178
finish query# 4
pids: [499, 359]
query execution time:  1.2031636238098145  parquet size: 201400
finish query# 5
pids: [332]
query execution time:  0.11739301681518555  parquet size: 171969
finish query# 6
pids: [297]
query execution time:  0.11224889755249023  parquet size: 99698
finish query# 7
pids: [457, 453]
query execution time:  0.259533166885376  parquet size: 234329
finish query# 8
pids: [352, 465, 351]
query execution time:  1.927386999130249  parquet size: 343537
finish query# 9
pids: [433]
query execution time:  

In [34]:
# NORA layout: run the training workload against its HDFS files and partition tree.
batch_query(
    training_set,
    used_dims,
    column_name_dict,
    hdfs_path_nora,
    querytype=querytype,
    partition_path=nora_partition_path,
)

pids: [501]
query execution time:  0.888373613357544  parquet size: 114315
finish query# 0
pids: [471]
query execution time:  0.1459202766418457  parquet size: 99876
finish query# 1
pids: [536, 743, 535]
query execution time:  5.798771858215332  parquet size: 463526
finish query# 2
pids: [473]
query execution time:  0.14585280418395996  parquet size: 101521
finish query# 3
pids: [470]
query execution time:  0.13528180122375488  parquet size: 109233
finish query# 4
pids: [373]
query execution time:  1.220435380935669  parquet size: 111687
finish query# 5
pids: [344]
query execution time:  0.11366605758666992  parquet size: 160490
finish query# 6
pids: [303]
query execution time:  0.12602567672729492  parquet size: 113698
finish query# 7
pids: [467]
query execution time:  0.2734949588775635  parquet size: 100834
finish query# 8
pids: [529, 363]
query execution time:  1.914266586303711  parquet size: 223644
finish query# 9
pids: [451]
query execution time:  0.2988474369049072  parquet siz

In [35]:
# check number of row groups
# import pyarrow as pa
# fs = pa.fs.HadoopFileSystem('192.168.6.62', port=9000, user='hdfs', replication=1)
# path1 = 'hdfs://192.168.6.62:9000/user/cloudray/NORA/prob1/merged/partition_94.parquet'
# path2 = 'hdfs://192.168.6.62:9000/user/cloudray/KDTree/prob1/merged/partition_98.parquet'
# fw1 = fs.open_input_file(path1)
# meta1 = pa.parquet.read_metadata(fw1, memory_map=False)
# print(meta1)
# print(meta1.row_group(0))
# fw1.close()
#
# fw2 = fs.open_input_file(path2)
# meta2 = pa.parquet.read_metadata(fw2, memory_map=False)
# print(meta2)
# print(meta2.row_group(0))
# fw2.close()