In [None]:
# ---- Notebook bootstrap: imports, Spark session and HDFS client ----
import time

import findspark
findspark.init()  # this must be executed before the pyspark import below

import numpy as np
import pyarrow as pa
import pyarrow.parquet as pq
from pyspark.sql import SparkSession

# Single local-mode Spark session shared by every query issued from this notebook.
spark = (
    SparkSession.builder
    .master("local")
    .appName("Python Spark SQL Execution")
    .config("spark.executor.memory", "8g")
    .config("spark.driver.memory", "8g")
    .config("spark.memory.offHeap.enabled", True)
    .config("spark.memory.offHeap.size", "8g")
    .getOrCreate()
)

# HDFS client; used further down to open individual parquet files.
fs = pa.hdfs.connect(host='10.77.110.133', port=9001, user='liupengju')

# Project-local modules (kept after the session set-up, as in the original cell).
from partition_tree import PartitionTree
from join_until import JOIN_UNTIL

In [None]:
def query_with_parquets(hdfs_path, querytype=0, partition_tree=None, pids=None, print_execution_time=False):
    """Run one Spark SQL query over a set of partition parquet files and time it.

    Parameters
    ----------
    hdfs_path : str
        Directory that holds files named ``partition_<pid>.parquet``.
    querytype : int
        Selects the statement: 0 -> ``SELECT *``, 1 -> ``SELECT COUNT(*)``,
        2 -> ``SELECT _c0``.
    partition_tree : object, optional
        Not read by this function; kept so existing callers keep working.
    pids : iterable, optional
        Partition ids to read. Duplicates are ignored. ``None`` or an empty
        iterable means "no partitions".
    print_execution_time : bool
        When true, print the translation and execution timings.

    Returns
    -------
    tuple
        ``(query_translation_time, query_execution_time, result_row_count,
        scanned_row_count)``. The times are wall-clock seconds,
        ``result_row_count`` is the number of collected rows and
        ``scanned_row_count`` is the sum of ``num_rows`` from the parquet
        metadata of every requested partition file. With no partitions the
        query is skipped and ``(translation_time, 0.0, 0, 0)`` is returned.

    Raises
    ------
    ValueError
        If ``querytype`` is not 0, 1 or 2.
    """
    start_time = time.time()

    select_lists = {0: "*", 1: "COUNT(*)", 2: "_c0"}
    if querytype not in select_lists:
        # Fail with a clear message instead of an unbound-variable error later on.
        raise ValueError("unsupported querytype: " + repr(querytype))

    # De-duplicate once and keep the ids as a real list. The glob string built
    # below is a separate value, so the metadata loop iterates over ids rather
    # than over the characters of '{1,2,3}'.
    unique_pids = sorted(set(pids)) if pids is not None else []
    if not unique_pids:
        # Nothing to scan: skip Spark entirely rather than querying a bogus path.
        query_translation_time = time.time() - start_time
        if print_execution_time:
            print('query translation time: ', query_translation_time)
            print('query execution time: ', 0.0)
        return (query_translation_time, 0.0, 0, 0)

    # str() on each id keeps the path free of type-specific reprs.
    pid_tokens = [str(pid) for pid in unique_pids]
    # Glob alternation, e.g. <hdfs_path>/partition_{1,2,3}.parquet
    pars_path = hdfs_path + '/partition_{' + ",".join(pid_tokens) + "}.parquet"
    # The path is a back-quoted identifier, so the quote must be closed.
    sql = "SELECT " + select_lists[querytype] + " FROM parquet.`" + pars_path + "`"

    end_time_1 = time.time()
    # collect() forces execution; spark.sql() alone is lazy.
    query_result = spark.sql(sql).collect()
    end_time_2 = time.time()

    # Count the rows stored in the touched partitions from the parquet metadata.
    # This happens after end_time_2, so it is not part of the execution time.
    actual_data_size2 = 0
    for token in pid_tokens:
        par_path = hdfs_path + '/partition_' + token + ".parquet"
        # The context manager closes the handle even if reading the metadata fails.
        with fs.open(par_path, 'rb') as fw:
            actual_data_size2 += pa.parquet.read_metadata(fw, memory_map=False).num_rows

    query_translation_time = end_time_1 - start_time
    query_execution_time = end_time_2 - end_time_1

    if print_execution_time:
        print('query translation time: ', query_translation_time)
        print('query execution time: ', query_execution_time)

    # Only the row count of the result is returned; returning the rows themselves
    # takes too much memory.
    return (query_translation_time, query_execution_time, len(query_result), actual_data_size2)

In [None]:
def batch_query(a_training_set,b_training_set,a_training_set_for_join,b_training_set_for_join,A_partition_path,B_partition_path,group_type,hdfs_path_a,hdfs_path_b):
    """Replay a workload against two partition layouts and print the totals.

    The function loads one partition tree per table, derives the partition ids
    for the join queries through ``JOIN_UNTIL``, runs every normal and every
    join query through ``query_with_parquets`` (always with ``querytype=2``)
    and accumulates the reported execution time and accessed row count.

    Reads the notebook-level globals ``join_attr`` and ``used_dims``.

    Parameters
    ----------
    a_training_set, b_training_set : iterable
        Normal queries for table A / table B.
    a_training_set_for_join, b_training_set_for_join : object
        Inputs handed to ``JOIN_UNTIL.generate_join_queries``.
    A_partition_path, B_partition_path : str
        Locations passed to ``PartitionTree.load_tree``.
    group_type : object
        Forwarded unchanged to ``JOIN_UNTIL.print_shuffle_hyper_blocks``.
    hdfs_path_a, hdfs_path_b : str
        Parquet directories for table A / table B.

    Returns
    -------
    None
        The two totals are printed, not returned.
    """
    dim_count = len(used_dims)

    join_helper = JOIN_UNTIL(None, None, join_attr, dim_count)
    a_join_queries, b_join_queries = join_helper.generate_join_queries(
        a_training_set_for_join, b_training_set_for_join
    )

    tree_a = PartitionTree(dim_count)
    tree_b = PartitionTree(dim_count)
    # NOTE(review): hard-coded 0 rather than the global join_attr - confirm intent.
    tree_a.join_attr = 0
    tree_b.join_attr = 0
    # The value returned by load_tree is what gets queried below.
    layout_a = tree_a.load_tree(A_partition_path)
    layout_b = tree_b.load_tree(B_partition_path)
    join_helper.set_partitioner(layout_a, layout_b)
    total_a_ids, total_b_ids = join_helper.print_shuffle_hyper_blocks(
        a_join_queries, b_join_queries, group_type
    )

    elapsed_total = 0
    rows_total = 0

    # Normal queries: table A's workload first, then table B's.
    per_table = (
        (layout_a, hdfs_path_a, a_training_set),
        (layout_b, hdfs_path_b, b_training_set),
    )
    for layout, table_path, workload in per_table:
        for query in workload:
            touched = layout.query_single(query)
            outcome = query_with_parquets(
                hdfs_path=table_path, querytype=2, partition_tree=layout, pids=touched
            )
            elapsed_total += outcome[1]
            rows_total += outcome[3]

    # Join queries: entry k of total_a_ids is paired with entry k of total_b_ids.
    for position, a_ids in enumerate(total_a_ids):
        b_ids = total_b_ids[position]
        side_a = query_with_parquets(
            hdfs_path=hdfs_path_a, querytype=2, partition_tree=layout_a, pids=a_ids
        )
        side_b = query_with_parquets(
            hdfs_path=hdfs_path_b, querytype=2, partition_tree=layout_b, pids=b_ids
        )
        elapsed_total += side_a[1]
        rows_total += side_a[3]
        elapsed_total += side_b[1]
        rows_total += side_b[3]

    print("Query execution time:",elapsed_total)
    print("Access row count:",rows_total)

In [1]:
# ==== set environment parameters and generate dataset ====
# Every file name and path below is derived from these two values.
scale_factor = 10
problem_type = 2
query_path = '/home/liupengju/pycharmProjects/NORA_JOIN_SIMULATION/NORA_experiments/queryset/join/'

# Query workloads for the selected problem type and scale factor.
a_training_set = np.genfromtxt(f"{query_path}a_prob{problem_type}_{scale_factor}_train.csv", delimiter=',')
# Table B reads its own workload file; the previous version loaded table A's
# file a second time. NOTE(review): assumes b_prob<type>_<scale>_train.csv sits
# next to the a_/join_a_/join_b_ files - confirm on disk.
b_training_set = np.genfromtxt(f"{query_path}b_prob{problem_type}_{scale_factor}_train.csv", delimiter=',')
a_training_set_for_join = np.genfromtxt(f"{query_path}join_a_prob{problem_type}_{scale_factor}_train.csv", delimiter=',')
b_training_set_for_join = np.genfromtxt(f"{query_path}join_b_prob{problem_type}_{scale_factor}_train.csv", delimiter=',')

used_dims = [1, 2, 3]
num_dims = 16
join_attr = 0

# Column names _c0 .. _c<num_dims-1>, plus an index -> name lookup.
column_names = ['_c' + str(i) for i in range(num_dims)]
column_name_dict = dict(enumerate(column_names))

# Non-join layouts at scale 100, kept for reference only (not used below):
# hdfs_path_nora = 'hdfs://10.77.110.133:9001/par_nora/NORA/prob'+str(problem_type)+'/scale100/merged/'
# hdfs_path_qdtree ='hdfs://10.77.110.133:9001/par_nora/QdTree/prob'+str(problem_type)+'/scale100/merged/'
# hdfs_path_paw ='hdfs://10.77.110.133:9001/par_nora/PAW/prob'+str(problem_type)+'/scale100/merged/'

# Parquet directories for the join experiments, keyed by scale_factor.
hdfs_base_path = 'hdfs://10.77.110.133:9001/par_nora/join/'
hdfs_path_nora_a = f"{hdfs_base_path}NORA/prob{problem_type}/scale{scale_factor}/merged_A/"
hdfs_path_nora_b = f"{hdfs_base_path}NORA/prob{problem_type}/scale{scale_factor}/merged_B/"
hdfs_path_adaptdb_a = f"{hdfs_base_path}AdaptDB/prob{problem_type}/scale{scale_factor}/merged_A/"
hdfs_path_adaptdb_b = f"{hdfs_base_path}AdaptDB/prob{problem_type}/scale{scale_factor}/merged_B/"

# Query types understood by query_with_parquets:
#   0: SELECT *;  1: SELECT COUNT(*);  2: SELECT _c0
# batch_query passes querytype=2 explicitly, so this variable is informational.
querytype = 2

# Serialized partition layouts for the selected problem type and scale factor.
partition_base_path = '/home/liupengju/pycharmProjects/NORA_JOIN_SIMULATION/PartitionLayout/join/'
jnora_A_partition_path = f"{partition_base_path}prob{problem_type}_jnora_A_scale{scale_factor}"
jnora_B_partition_path = f"{partition_base_path}prob{problem_type}_jnora_B_scale{scale_factor}"
adaptdb_A_partition_path = f"{partition_base_path}prob{problem_type}_adaptdb_A_scale{scale_factor}"
adaptdb_B_partition_path = f"{partition_base_path}prob{problem_type}_adaptdb_B_scale{scale_factor}"

NameError: name 'np' is not defined

In [None]:
# Replay the workload against the jnora partition layout / NORA parquet files.
batch_query(
    a_training_set=a_training_set,
    b_training_set=b_training_set,
    a_training_set_for_join=a_training_set_for_join,
    b_training_set_for_join=b_training_set_for_join,
    A_partition_path=jnora_A_partition_path,
    B_partition_path=jnora_B_partition_path,
    group_type=3,
    hdfs_path_a=hdfs_path_nora_a,
    hdfs_path_b=hdfs_path_nora_b,
)


In [None]:
# Replay the same workload against the AdaptDB partition layout / parquet files.
batch_query(
    a_training_set=a_training_set,
    b_training_set=b_training_set,
    a_training_set_for_join=a_training_set_for_join,
    b_training_set_for_join=b_training_set_for_join,
    A_partition_path=adaptdb_A_partition_path,
    B_partition_path=adaptdb_B_partition_path,
    group_type=1,
    hdfs_path_a=hdfs_path_adaptdb_a,
    hdfs_path_b=hdfs_path_adaptdb_b,
)