In [None]:
import findspark
findspark.init() # this must be executed before the below import

In [2]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext
from pyspark.sql import SparkSession
from pyspark import SparkFiles

In [3]:
import ray
import time
import rtree
from rtree import index
import pandas as pd
import numpy as np
from numpy import genfromtxt
import threading
import pyarrow as pa
import pyarrow.parquet as pq

In [4]:
from NORAPartitionTree import *

In [5]:
# Spark settings for this session: executor/driver heap sizes, off-heap
# storage, and the cap on results collected back to the driver.
conf = (
    SparkConf()
    .set("spark.executor.memory", "24g")
    .set("spark.driver.memory", "24g")
    .set("spark.memory.offHeap.enabled", True)
    .set("spark.memory.offHeap.size", "16g")
    .set("spark.driver.maxResultSize", "16g")
)

# One SparkContext for the notebook, plus the SQL entry point built on it.
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)

In [6]:
def process_chunk_row(row, used_dims, partition_tree, pid_data_dict, count, k):
    """Route one row to a partition and append it to that partition's bucket.

    Parameters
    ----------
    row : object exposing ``to_numpy()`` (a pandas row when called via
        ``DataFrame.apply(..., axis=1)``).
    used_dims : list of int
        Positions of the row values that form the point handed to the tree.
    partition_tree : object exposing ``get_pid_for_data_point(point)``.
    pid_data_dict : dict
        Mutated in place: maps partition id -> list of full rows (as lists).
    count : list with one int
        Mutable row counter shared across calls; incremented once per call.
    k : identifier of the calling routing process, used only for logging.

    If the tree lookup raises, the offending point is printed and the row is
    routed to partition id 0 so that no row is dropped.
    """
    # Progress line every 100000 rows (message text kept as-is for log parity).
    if count[0] % 100000 == 0:
        print('proces',k,'has routed',count[0],'rows')
    count[0] += 1

    row_numpy = row.to_numpy()
    row_point = row_numpy[used_dims].tolist()

    try:
        pid = partition_tree.get_pid_for_data_point(row_point)
    except Exception:
        # Only ordinary errors are absorbed; KeyboardInterrupt / SystemExit
        # must still be able to stop a long-running routing task.
        print(row_point)
        pid = 0

    pid_data_dict.setdefault(pid, []).append(row_numpy.tolist())

@ray.remote
def process_chunk(chunk, used_dims, partition_path, k, partition_tree):
    """Ray task: route every row of ``chunk`` into per-partition buckets.

    Each row is passed to ``process_chunk_row``, which fills a dict mapping
    partition id -> list of rows. The dict is stored with ``ray.put`` and the
    resulting reference is returned, so the caller has to resolve the task
    result and then the returned reference.

    ``partition_path`` is accepted but not used in this body; ``k`` is only
    used in the log lines.
    """
    print("enter data routing process", k, '..')
    routed_rows = {}
    rows_seen = [0]  # single-element list so the row callback can mutate it

    def _route(row):
        process_chunk_row(row, used_dims, partition_tree, routed_rows, rows_seen, k)

    chunk.apply(_route, axis=1)
    routed_ref = ray.put(routed_rows)
    print("exit data routing process", k, ".")
    return routed_ref

In [7]:
@ray.remote
def merge_epochs(parameters):
    """Ray task: merge each partition's per-epoch parquet files into one file.

    ``parameters`` is a 5-item sequence
    ``(pids, epoch_count, hdfs_path, fs, merge_process)``:

    * ``pids``          - partition ids handled by this task
    * ``epoch_count``   - number of epoch directories to scan (0..epoch_count-1)
    * ``hdfs_path``     - base path holding ``epoch_<n>/`` and ``merged/``
    * ``fs``            - filesystem object providing ``open_output_stream``
    * ``merge_process`` - index of this task, used only for logging

    For every pid, ``<hdfs_path>epoch_<n>/partition_<pid>.parquet`` is read for
    each epoch; epochs whose file cannot be read are skipped. The tables that
    were read are concatenated and written to
    ``<hdfs_path>merged/partition_<pid>.parquet``. Pids with no readable epoch
    file produce no output.
    """
    pids, epoch_count, hdfs_path, fs, merge_process = parameters
    for pid in pids:
        parquets = []
        for epoch in range(epoch_count):
            path = hdfs_path + "epoch_" + str(epoch) + '/partition_' + str(pid)+'.parquet'
            try:
                parquets.append(pq.read_table(path))
            except Exception:
                # Best effort: an epoch file that cannot be read is skipped.
                # NOTE(review): this also hides genuine read errors, not only
                # missing files - consider narrowing once the exact exception
                # type raised for a missing HDFS file is confirmed.
                continue
        print("process", merge_process, "pid", pid, " len parquets (epochs):", len(parquets))
        if not parquets:
            continue
        merged_parquet = pa.concat_tables(parquets)
        merge_path = hdfs_path + 'merged/partition_' + str(pid)+'.parquet'
        # Context manager guarantees the stream is closed even if the write fails.
        with fs.open_output_stream(merge_path) as fw:
            pq.write_table(merged_parquet, fw)
    print('exit merge process', merge_process)

def merge_dict(base_dict, new_dict):
    """Fold ``new_dict`` into ``base_dict`` in place, then empty ``new_dict``.

    Values under a key already present in ``base_dict`` are combined with
    ``+=``; values under a new key are stored as-is (the same object, not a
    copy). Returns ``None``.
    """
    for pid in new_dict:
        incoming = new_dict[pid]
        if pid not in base_dict:
            base_dict[pid] = incoming
            continue
        base_dict[pid] += incoming
    new_dict.clear()

def dump_dict_2_hdfs_epoch(merged_dict, column_names, hdfs_path, fs, epoch):
    """Write each partition's rows to its parquet file for the given epoch.

    Parameters
    ----------
    merged_dict : dict
        Maps partition id -> list of rows; each entry becomes one file.
    column_names : list
        Column labels used when building the DataFrame for every partition.
    hdfs_path : str
        Base path; files go to ``<hdfs_path>epoch_<epoch>/partition_<pid>.parquet``.
    fs : filesystem object providing ``open_output_stream(path)``.
    epoch : value appended to the ``epoch_`` directory name.
    """
    for pid, rows in merged_dict.items():
        path = hdfs_path + 'epoch_'+ str(epoch) +'/partition_' + str(pid) + '.parquet'
        table = pa.Table.from_pandas(pd.DataFrame(rows, columns=column_names))
        # open_output_stream replaces the older fs.open(path, 'wb') call; the
        # context manager closes the stream even when the write raises.
        with fs.open_output_stream(path) as fw:
            pq.write_table(table, fw)

In [8]:
def _drain_and_dump(pending_ids, col_names, hdfs_path, fs, epoch):
    """Wait for every routing task in ``pending_ids``, merge their buckets and dump one epoch."""
    merged = {}
    while pending_ids:
        done_ids, pending_ids = ray.wait(pending_ids)
        # A routing task returns a reference to its dict, so resolve twice.
        dict_id = ray.get(done_ids[0])
        merge_dict(merged, ray.get(dict_id))
    dump_dict_2_hdfs_epoch(merged, col_names, hdfs_path, fs, epoch)


def batch_data_parallel(table_path, partition_path, chunk_size, used_dims, hdfs_path, num_dims, num_process, hdfs_private_ip):
    """Route a '|'-delimited table into per-partition parquet files on HDFS.

    The table is read in chunks of ``chunk_size`` rows and each chunk is
    submitted to a ``process_chunk`` Ray task. Tasks are grouped in batches of
    ``num_process``; while one batch is being routed, the previous batch is
    collected, merged and written out as one "epoch". After the file is
    exhausted the two remaining batches are dumped, and finally
    ``merge_epochs`` tasks combine the per-epoch files of every leaf partition.

    Parameters
    ----------
    table_path : str
        Path of the input table (read with ``pd.read_table``, delimiter '|').
    partition_path : str
        Path handed to ``PartitionTree.load_tree``.
    chunk_size : int
        Rows per chunk.
    used_dims : list of int
        Column positions forming the point that is routed through the tree.
    hdfs_path : str
        Base output path for ``epoch_<n>/`` and ``merged/`` files.
    num_dims : int
        Number of leading columns to read; they are named ``_c0 .. _c<n-1>``.
    num_process : int
        Number of CPUs given to Ray, batch size, and number of merge tasks.
    hdfs_private_ip : str
        Host passed to ``pa.fs.HadoopFileSystem`` (port 9000, user 'hdfs').

    Ray is initialised on entry and always shut down on exit, including when
    an error occurs, so the function can be called again in the same kernel.
    """
    begin_time = time.time()

    ray.init(num_cpus=num_process)
    try:
        # column positions to read and the names given to them
        cols = list(range(num_dims))
        col_names = ['_c'+str(i) for i in cols]

        fs = pa.fs.HadoopFileSystem(hdfs_private_ip, port=9000, user='hdfs', replication=1)

        partition_tree = PartitionTree(len(used_dims))
        partition_tree.load_tree(partition_path)

        chunk_count = 0
        epoch_count = 0

        # result_ids: tasks of the batch being submitted;
        # last_batch_ids: tasks of the previous, not yet dumped batch.
        result_ids = []
        last_batch_ids = []
        first_loop = True

        for chunk in pd.read_table(table_path, delimiter='|', usecols=cols, names=col_names, chunksize=chunk_size):
            print('reading chunk: ', chunk_count)

            chunk_id = ray.put(chunk)
            result_ids.append(process_chunk.remote(chunk_id, used_dims, partition_path, chunk_count, partition_tree))
            # Drop the driver-side handle; presumably this lets Ray reclaim the
            # chunk once its task no longer needs it.
            del chunk_id

            # A full batch has been submitted: dump the previous batch (if any)
            # while this one is still being routed.
            if chunk_count % num_process == num_process - 1:
                if first_loop:
                    first_loop = False
                else:
                    print("= = = Process Dump For Chunk", chunk_count-2*num_process+1, "to", chunk_count-num_process, "= = =")
                    _drain_and_dump(last_batch_ids, col_names, hdfs_path, fs, epoch_count) # consider whether we should use another process
                    epoch_count += 1
                    print("= = = Finish Dump For Chunk", chunk_count-2*num_process+1, "to", chunk_count-num_process, "= = =")

                    time_elapsed = time.time() - begin_time
                    print("= = = TOTAL PROCESSED SO FAR:", (chunk_count-num_process+1) * chunk_size,"ROWS. TIME SPENT:", time_elapsed, "SECONDS = = =")

                last_batch_ids = result_ids
                result_ids = []

            chunk_count += 1

        # Dump what is still in flight: the last full batch, then the
        # trailing partial batch. Each takes its own epoch number, even if
        # it turns out to be empty.
        print("= = = Process Dump For Last Few Chunks = = =")
        for pending_ids in (last_batch_ids, result_ids):
            _drain_and_dump(pending_ids, col_names, hdfs_path, fs, epoch_count)
            epoch_count += 1

        # Merge all the epochs: split the leaf partition ids into num_process
        # slices; the last slice also takes the remainder.
        print("= = = Start Merging the Epochs = = =")
        pids = [leaf.nid for leaf in partition_tree.get_leaves()]
        steps = len(pids) // num_process
        not_ready_ids = []
        for i in range(num_process):
            if i == num_process - 1:
                sub_pids = pids[i*steps:]
            else:
                sub_pids = pids[i*steps:(i+1)*steps]
            not_ready_ids.append(merge_epochs.remote([sub_pids, epoch_count, hdfs_path, fs, i]))

        # NOTE(review): this only waits for completion; the task results are
        # never fetched, so a failed merge task is not re-raised here.
        while not_ready_ids:
            ready_ids, not_ready_ids = ray.wait(not_ready_ids)
    finally:
        # Always release Ray, otherwise the next ray.init() in this kernel fails.
        ray.shutdown()

    finish_time = time.time()
    print('= = = = = TOTAL DATA ROUTING AND PERISITING TIME:', finish_time - begin_time, "= = = = =")

In [9]:
# = = = Configuration (UBDA Cloud Centos) = = =

# Input table: TPC-H lineitem generated at the given scale factor.
scale_factor = 50
table_base_path = '/media/datadrive1/TPCH/dbgen/'
table_path = '{}lineitem_{}.tbl'.format(table_base_path, scale_factor)

# Parallelism and chunking.
num_process = 4
chunk_size = 2000000
# 6M rows = about 1GB raw data

# Columns: how many to read, and which of them are routed through the tree.
num_dims = 16
used_dims = [1,2,3,4]

# base path of HDFS
hdfs_private_ip = '192.168.6.62'
hdfs_base_path = 'hdfs://192.168.6.62:9000/user/cloudray/'

problem_type = 2

# Output roots, one per layout, laid out as <layout>/prob<type>/scale<sf>/.
# (The earlier, unscaled layout stopped at <layout>/prob<type>/.)
nora_hdfs = '{}NORA/prob{}/scale{}/'.format(hdfs_base_path, problem_type, scale_factor)
qdtree_hdfs = '{}QdTree/prob{}/scale{}/'.format(hdfs_base_path, problem_type, scale_factor)
kdtree_hdfs = '{}KDTree/prob{}/scale{}/'.format(hdfs_base_path, problem_type, scale_factor)

# base path of Partition
partition_base_path = '/home/centos/PartitionLayout/'

# Serialized partition trees, named prob<type>_<layout>_scale<sf>.
# (The earlier, unscaled names stopped at prob<type>_<layout>.)
nora_partition = '{}prob{}_nora_scale{}'.format(partition_base_path, problem_type, scale_factor)
qdtree_partition = '{}prob{}_qdtree_scale{}'.format(partition_base_path, problem_type, scale_factor)
kdtree_partition = '{}prob{}_kdtree_scale{}'.format(partition_base_path, problem_type, scale_factor)

In [10]:
# # If the merge phase failed, rerun only the merge step with this cell.
# # First adjust epoch_count, num_process, partition_tree.load_tree(...), and qdtree_hdfs!

# ray.init(num_cpus=1)
# fs = pa.fs.HadoopFileSystem('192.168.6.62', port=9000, user='hdfs', replication=1)
    
# partition_tree = PartitionTree(len(used_dims))
# partition_tree.load_tree(qdtree_partition)

# epoch_count = 51

# leaves = partition_tree.get_leaves()
# pids = [leaf.nid for leaf in leaves]
# steps = len(pids) // num_process
# not_ready_ids = []
# for i in range(num_process):
#     sub_pids = pids[i*steps:(i+1)*steps]
#     if i == num_process - 1:
#         sub_pids = pids[i*steps:]
#     rid = merge_epochs.remote([sub_pids, epoch_count, qdtree_hdfs, fs, i])
#     not_ready_ids.append(rid)

In [10]:
# = = = Execution = = =
# Routes the table with the NORA layout. To route with the QdTree or KDTree
# layout instead, swap in the matching *_partition / *_hdfs pair below.
if __name__ == '__main__':
    batch_data_parallel(
        table_path=table_path,
        partition_path=nora_partition,
        chunk_size=chunk_size,
        used_dims=used_dims,
        hdfs_path=nora_hdfs,
        num_dims=num_dims,
        num_process=num_process,
        hdfs_private_ip=hdfs_private_ip,
    )
    print('finish nora data routing..')
#     batch_data_parallel(table_path, qdtree_partition, chunk_size, used_dims, qdtree_hdfs, num_dims, num_process, hdfs_private_ip)
#     print('finish qdtree data routing..')
#     batch_data_parallel(table_path, kdtree_partition, chunk_size, used_dims, kdtree_hdfs, num_dims, num_process, hdfs_private_ip)
#     print('finish kdtree data routing..')

2021-07-20 20:13:15,299	INFO services.py:1164 -- View the Ray dashboard at [1m[32mhttp://127.0.0.1:8265[39m[22m


reading chunk:  0
[2m[36m(pid=25582)[0m enter data routing process 0 ..
[2m[36m(pid=25582)[0m proces 0 has routed 0 rows
reading chunk:  1
[2m[36m(pid=25582)[0m proces 0 has routed 100000 rows
[2m[36m(pid=25583)[0m enter data routing process 1 ..
[2m[36m(pid=25583)[0m proces 1 has routed 0 rows
reading chunk:  2
[2m[36m(pid=25582)[0m proces 0 has routed 200000 rows
[2m[36m(pid=25583)[0m proces 1 has routed 100000 rows
[2m[36m(pid=25585)[0m enter data routing process 2 ..
[2m[36m(pid=25582)[0m proces 0 has routed 300000 rows
[2m[36m(pid=25585)[0m proces 2 has routed 0 rows
reading chunk:  3
[2m[36m(pid=25583)[0m proces 1 has routed 200000 rows
[2m[36m(pid=25582)[0m proces 0 has routed 400000 rows
[2m[36m(pid=25585)[0m proces 2 has routed 100000 rows
[2m[36m(pid=25584)[0m enter data routing process 3 ..
[2m[36m(pid=25583)[0m proces 1 has routed 300000 rows
[2m[36m(pid=25584)[0m proces 3 has routed 0 rows
reading chunk:  4
[2m[36m(pid=2558

= = = Finish Dump For Chunk 0 to 3[2m[36m(pid=25585)[0m proces 6 has routed 1100000 rows
[2m[36m(pid=25583)[0m proces 5 has routed 1300000 rows
[2m[36m(pid=25584)[0m proces 7 has routed 1000000 rows
 = = =
= = = TOTAL PROCESSED SO FAR: 8000000 ROWS. TIME SPENT: 162.9341585636139 SECONDS = = =
[2m[36m(pid=25582)[0m proces 4 has routed 1500000 rows
[2m[36m(pid=25585)[0m proces 6 has routed 1200000 rows
[2m[36m(pid=25583)[0m proces 5 has routed 1400000 rows
[2m[36m(pid=25584)[0m proces 7 has routed 1100000 rows
reading chunk:  8
[2m[36m(pid=25582)[0m proces 4 has routed 1600000 rows
[2m[36m(pid=25585)[0m proces 6 has routed 1300000 rows
[2m[36m(pid=25583)[0m proces 5 has routed 1500000 rows
[2m[36m(pid=25584)[0m proces 7 has routed 1200000 rows
[2m[36m(pid=25582)[0m proces 4 has routed 1700000 rows
[2m[36m(pid=25585)[0m proces 6 has routed 1400000 rows
reading chunk:  9
[2m[36m(pid=25583)[0m proces 5 has routed 1600000 rows
[2m[36m(pid=25584)[0

[2m[36m(pid=25582)[0m proces 12 has routed 400000 rows
[2m[36m(pid=25584)[0m enter data routing process 15 ..
[2m[36m(pid=25583)[0m proces 13 has routed 300000 rows
[2m[36m(pid=25585)[0m proces 14 has routed 100000 rows
[2m[36m(pid=25584)[0m proces 15 has routed 0 rows
[2m[36m(pid=25582)[0m proces 12 has routed 500000 rows
[2m[36m(pid=25583)[0m proces 13 has routed 400000 rows
[2m[36m(pid=25585)[0m proces 14 has routed 200000 rows
[2m[36m(pid=25584)[0m proces 15 has routed 100000 rows
[2m[36m(pid=25582)[0m proces 12 has routed 600000 rows
[2m[36m(pid=25583)[0m proces 13 has routed 500000 rows
[2m[36m(pid=25585)[0m proces 14 has routed 300000 rows
[2m[36m(pid=25584)[0m proces 15 has routed 200000 rows
[2m[36m(pid=25582)[0m proces 12 has routed 700000 rows
[2m[36m(pid=25583)[0m proces 13 has routed 600000 rows
[2m[36m(pid=25585)[0m proces 14 has routed 400000 rows
[2m[36m(pid=25584)[0m proces 15 has routed 300000 rows
[2m[36m(pid=25582

[2m[36m(pid=25585)[0m proces 18 has routed 1200000 rows
[2m[36m(pid=25583)[0m proces 17 has routed 1400000 rows
[2m[36m(pid=25582)[0m proces 16 has routed 1600000 rows
reading chunk:  20
[2m[36m(pid=25584)[0m proces 19 has routed 1200000 rows
[2m[36m(pid=25585)[0m proces 18 has routed 1300000 rows
[2m[36m(pid=25583)[0m proces 17 has routed 1500000 rows
[2m[36m(pid=25582)[0m proces 16 has routed 1700000 rows
[2m[36m(pid=25584)[0m proces 19 has routed 1300000 rows
[2m[36m(pid=25583)[0m proces 17 has routed 1600000 rows
reading chunk:  21
[2m[36m(pid=25585)[0m proces 18 has routed 1400000 rows
[2m[36m(pid=25582)[0m proces 16 has routed 1800000 rows
[2m[36m(pid=25584)[0m proces 19 has routed 1400000 rows
[2m[36m(pid=25585)[0m proces 18 has routed 1500000 rows
[2m[36m(pid=25582)[0m proces 16 has routed 1900000 rows
[2m[36m(pid=25583)[0m proces 17 has routed 1700000 rows
reading chunk:  22
[2m[36m(pid=25584)[0m proces 19 has routed 1500000 rows

[2m[36m(pid=25584)[0m proces 27 has routed 100000 rows
[2m[36m(pid=25585)[0m proces 26 has routed 200000 rows
[2m[36m(pid=25583)[0m proces 25 has routed 400000 rows
[2m[36m(pid=25582)[0m proces 24 has routed 600000 rows
[2m[36m(pid=25584)[0m proces 27 has routed 200000 rows
[2m[36m(pid=25585)[0m proces 26 has routed 300000 rows
[2m[36m(pid=25583)[0m proces 25 has routed 500000 rows
[2m[36m(pid=25582)[0m proces 24 has routed 700000 rows
[2m[36m(pid=25584)[0m proces 27 has routed 300000 rows
[2m[36m(pid=25585)[0m proces 26 has routed 400000 rows
[2m[36m(pid=25583)[0m proces 25 has routed 600000 rows
[2m[36m(pid=25582)[0m proces 24 has routed 800000 rows
[2m[36m(pid=25584)[0m proces 27 has routed 400000 rows
[2m[36m(pid=25585)[0m proces 26 has routed 500000 rows
[2m[36m(pid=25583)[0m proces 25 has routed 700000 rows
[2m[36m(pid=25582)[0m proces 24 has routed 900000 rows
[2m[36m(pid=25584)[0m proces 27 has routed 500000 rows
[2m[36m(pid=

[2m[36m(pid=25584)[0m proces 31 has routed 1300000 rows
[2m[36m(pid=25583)[0m proces 29 has routed 1600000 rows
[2m[36m(pid=25585)[0m proces 30 has routed 1400000 rows
[2m[36m(pid=25582)[0m proces 28 has routed 1700000 rows
reading chunk:  32
[2m[36m(pid=25584)[0m proces 31 has routed 1400000 rows
[2m[36m(pid=25585)[0m proces 30 has routed 1500000 rows
[2m[36m(pid=25582)[0m proces 28 has routed 1800000 rows
[2m[36m(pid=25583)[0m proces 29 has routed 1700000 rows
[2m[36m(pid=25584)[0m proces 31 has routed 1500000 rows
reading chunk:  33
[2m[36m(pid=25585)[0m proces 30 has routed 1600000 rows
[2m[36m(pid=25582)[0m proces 28 has routed 1900000 rows
[2m[36m(pid=25583)[0m proces 29 has routed 1800000 rows
[2m[36m(pid=25584)[0m proces 31 has routed 1600000 rows
[2m[36m(pid=25583)[0m proces 29 has routed 1900000 rows
[2m[36m(pid=25585)[0m proces 30 has routed 1700000 rows
reading chunk:  34
[2m[36m(pid=25584)[0m proces 31 has routed 1700000 rows

[2m[36m(pid=25585)[0m proces 38 has routed 300000 rows
[2m[36m(pid=25584)[0m proces 39 has routed 300000 rows
[2m[36m(pid=25582)[0m proces 36 has routed 700000 rows
[2m[36m(pid=25583)[0m proces 37 has routed 600000 rows
[2m[36m(pid=25585)[0m proces 38 has routed 400000 rows
[2m[36m(pid=25584)[0m proces 39 has routed 400000 rows
[2m[36m(pid=25583)[0m proces 37 has routed 700000 rows
[2m[36m(pid=25582)[0m proces 36 has routed 800000 rows
[2m[36m(pid=25585)[0m proces 38 has routed 500000 rows
[2m[36m(pid=25584)[0m proces 39 has routed 500000 rows
[2m[36m(pid=25582)[0m proces 36 has routed 900000 rows
[2m[36m(pid=25583)[0m proces 37 has routed 800000 rows
[2m[36m(pid=25585)[0m proces 38 has routed 600000 rows
[2m[36m(pid=25584)[0m proces 39 has routed 600000 rows
[2m[36m(pid=25583)[0m proces 37 has routed 900000 rows
[2m[36m(pid=25582)[0m proces 36 has routed 1000000 rows
[2m[36m(pid=25585)[0m proces 38 has routed 700000 rows
[2m[36m(pid

[2m[36m(pid=25585)[0m proces 42 has routed 1400000 rows
reading chunk:  45
[2m[36m(pid=25584)[0m proces 43 has routed 1500000 rows
[2m[36m(pid=25582)[0m proces 40 has routed 1900000 rows
[2m[36m(pid=25583)[0m proces 41 has routed 1800000 rows
[2m[36m(pid=25585)[0m proces 42 has routed 1500000 rows
[2m[36m(pid=25584)[0m proces 43 has routed 1600000 rows
reading chunk:  46
[2m[36m(pid=25583)[0m proces 41 has routed 1900000 rows
[2m[36m(pid=25585)[0m proces 42 has routed 1600000 rows
[2m[36m(pid=25584)[0m proces 43 has routed 1700000 rows
[2m[36m(pid=25582)[0m exit data routing process 40 .
reading chunk:  47
[2m[36m(pid=25585)[0m proces 42 has routed 1700000 rows
= = = Process Dump For Chunk 40 to 43 = = =
[2m[36m(pid=25584)[0m proces 43 has routed 1800000 rows
[2m[36m(pid=25582)[0m enter data routing process 44 ..
[2m[36m(pid=25583)[0m exit data routing process 41 .
[2m[36m(pid=25585)[0m proces 42 has routed 1800000 rows
[2m[36m(pid=25582)

[2m[36m(pid=25585)[0m proces 51 has routed 400000 rows
[2m[36m(pid=25582)[0m proces 48 has routed 900000 rows
[2m[36m(pid=25583)[0m proces 49 has routed 800000 rows
[2m[36m(pid=25584)[0m proces 50 has routed 500000 rows
[2m[36m(pid=25585)[0m proces 51 has routed 500000 rows
[2m[36m(pid=25583)[0m proces 49 has routed 900000 rows
[2m[36m(pid=25582)[0m proces 48 has routed 1000000 rows
[2m[36m(pid=25584)[0m proces 50 has routed 600000 rows
[2m[36m(pid=25585)[0m proces 51 has routed 600000 rows
[2m[36m(pid=25582)[0m proces 48 has routed 1100000 rows
[2m[36m(pid=25583)[0m proces 49 has routed 1000000 rows
[2m[36m(pid=25584)[0m proces 50 has routed 700000 rows
[2m[36m(pid=25585)[0m proces 51 has routed 700000 rows
[2m[36m(pid=25582)[0m proces 48 has routed 1200000 rows
[2m[36m(pid=25583)[0m proces 49 has routed 1100000 rows
[2m[36m(pid=25584)[0m proces 50 has routed 800000 rows
[2m[36m(pid=25585)[0m proces 51 has routed 800000 rows
[2m[36m

[2m[36m(pid=25584)[0m proces 54 has routed 1600000 rows
reading chunk:  58
[2m[36m(pid=25585)[0m proces 55 has routed 1600000 rows
[2m[36m(pid=25582)[0m exit data routing process 52 .
[2m[36m(pid=25584)[0m proces 54 has routed 1700000 rows
[2m[36m(pid=25582)[0m enter data routing process 56 ..
reading chunk:  59
[2m[36m(pid=25585)[0m proces 55 has routed 1700000 rows
[2m[36m(pid=25582)[0m proces 56 has routed 0 rows
[2m[36m(pid=25583)[0m exit data routing process 53 .
[2m[36m(pid=25584)[0m proces 54 has routed 1800000 rows
= = = Process Dump For Chunk 52 to 55 = = =
[2m[36m(pid=25583)[0m enter data routing process 57 ..
[2m[36m(pid=25585)[0m proces 55 has routed 1800000 rows
[2m[36m(pid=25582)[0m proces 56 has routed 100000 rows
[2m[36m(pid=25584)[0m proces 54 has routed 1900000 rows
[2m[36m(pid=25583)[0m proces 57 has routed 0 rows
[2m[36m(pid=25582)[0m proces 56 has routed 200000 rows
[2m[36m(pid=25585)[0m proces 55 has routed 1900000 r

[2m[36m(pid=25585)[0m proces 63 has routed 500000 rows
[2m[36m(pid=25584)[0m proces 62 has routed 600000 rows
[2m[36m(pid=25582)[0m proces 60 has routed 1100000 rows
[2m[36m(pid=25583)[0m proces 61 has routed 1000000 rows
[2m[36m(pid=25584)[0m proces 62 has routed 700000 rows
[2m[36m(pid=25585)[0m proces 63 has routed 600000 rows
[2m[36m(pid=25582)[0m proces 60 has routed 1200000 rows
[2m[36m(pid=25583)[0m proces 61 has routed 1100000 rows
[2m[36m(pid=25584)[0m proces 62 has routed 800000 rows
[2m[36m(pid=25585)[0m proces 63 has routed 700000 rows
[2m[36m(pid=25582)[0m proces 60 has routed 1300000 rows
[2m[36m(pid=25583)[0m proces 61 has routed 1200000 rows
[2m[36m(pid=25584)[0m proces 62 has routed 900000 rows
[2m[36m(pid=25585)[0m proces 63 has routed 800000 rows
[2m[36m(pid=25582)[0m proces 60 has routed 1400000 rows
[2m[36m(pid=25583)[0m proces 61 has routed 1300000 rows
[2m[36m(pid=25584)[0m proces 62 has routed 1000000 rows
[2m

[2m[36m(pid=25582)[0m enter data routing process 68 ..
reading chunk:  71
[2m[36m(pid=25582)[0m proces 68 has routed 0 rows
[2m[36m(pid=25583)[0m exit data routing process 65 .
= = = Process Dump For Chunk 64 to 67 = = =
[2m[36m(pid=25584)[0m proces 66 has routed 1800000 rows
[2m[36m(pid=25585)[0m proces 67 has routed 1700000 rows
[2m[36m(pid=25583)[0m enter data routing process 69 ..
[2m[36m(pid=25582)[0m proces 68 has routed 100000 rows
[2m[36m(pid=25583)[0m proces 69 has routed 0 rows
[2m[36m(pid=25584)[0m proces 66 has routed 1900000 rows
[2m[36m(pid=25585)[0m proces 67 has routed 1800000 rows
[2m[36m(pid=25582)[0m proces 68 has routed 200000 rows
[2m[36m(pid=25583)[0m proces 69 has routed 100000 rows
[2m[36m(pid=25585)[0m proces 67 has routed 1900000 rows
[2m[36m(pid=25582)[0m proces 68 has routed 300000 rows
[2m[36m(pid=25583)[0m proces 69 has routed 200000 rows
[2m[36m(pid=25584)[0m exit data routing process 66 .
[2m[36m(pid=2558

[2m[36m(pid=25582)[0m proces 72 has routed 1200000 rows
[2m[36m(pid=25584)[0m proces 74 has routed 700000 rows
[2m[36m(pid=25585)[0m proces 75 has routed 600000 rows
[2m[36m(pid=25583)[0m proces 73 has routed 1100000 rows
[2m[36m(pid=25584)[0m proces 74 has routed 800000 rows
[2m[36m(pid=25582)[0m proces 72 has routed 1300000 rows
[2m[36m(pid=25585)[0m proces 75 has routed 700000 rows
[2m[36m(pid=25583)[0m proces 73 has routed 1200000 rows
[2m[36m(pid=25582)[0m proces 72 has routed 1400000 rows
[2m[36m(pid=25584)[0m proces 74 has routed 900000 rows
[2m[36m(pid=25585)[0m proces 75 has routed 800000 rows
[2m[36m(pid=25583)[0m proces 73 has routed 1300000 rows
[2m[36m(pid=25582)[0m proces 72 has routed 1500000 rows
[2m[36m(pid=25584)[0m proces 74 has routed 1000000 rows
[2m[36m(pid=25585)[0m proces 75 has routed 900000 rows
[2m[36m(pid=25583)[0m proces 73 has routed 1400000 rows
[2m[36m(pid=25582)[0m proces 72 has routed 1600000 rows
[2m

= = = Process Dump For Chunk 76 to 79 = = =
[2m[36m(pid=25584)[0m proces 78 has routed 1900000 rows
[2m[36m(pid=25583)[0m proces 81 has routed 0 rows
[2m[36m(pid=25585)[0m proces 79 has routed 1800000 rows
[2m[36m(pid=25582)[0m proces 80 has routed 200000 rows
[2m[36m(pid=25583)[0m proces 81 has routed 100000 rows
[2m[36m(pid=25585)[0m proces 79 has routed 1900000 rows
[2m[36m(pid=25582)[0m proces 80 has routed 300000 rows
[2m[36m(pid=25583)[0m proces 81 has routed 200000 rows
[2m[36m(pid=25584)[0m exit data routing process 78 .
[2m[36m(pid=25582)[0m proces 80 has routed 400000 rows
[2m[36m(pid=25583)[0m proces 81 has routed 300000 rows
[2m[36m(pid=25584)[0m enter data routing process 82 ..
[2m[36m(pid=25584)[0m proces 82 has routed 0 rows
[2m[36m(pid=25585)[0m exit data routing process 79 .
[2m[36m(pid=25582)[0m proces 80 has routed 500000 rows
[2m[36m(pid=25583)[0m proces 81 has routed 400000 rows
[2m[36m(pid=25585)[0m enter data rou

[2m[36m(pid=25583)[0m proces 85 has routed 1400000 rows
[2m[36m(pid=25584)[0m proces 87 has routed 400000 rows
[2m[36m(pid=25582)[0m proces 84 has routed 1600000 rows
[2m[36m(pid=25585)[0m proces 86 has routed 1000000 rows
[2m[36m(pid=25583)[0m proces 85 has routed 1500000 rows
[2m[36m(pid=25584)[0m proces 87 has routed 500000 rows
[2m[36m(pid=25585)[0m proces 86 has routed 1100000 rows
[2m[36m(pid=25582)[0m proces 84 has routed 1700000 rows
[2m[36m(pid=25583)[0m proces 85 has routed 1600000 rows
[2m[36m(pid=25584)[0m proces 87 has routed 600000 rows
[2m[36m(pid=25585)[0m proces 86 has routed 1200000 rows
[2m[36m(pid=25582)[0m proces 84 has routed 1800000 rows
[2m[36m(pid=25583)[0m proces 85 has routed 1700000 rows
[2m[36m(pid=25584)[0m proces 87 has routed 700000 rows
[2m[36m(pid=25582)[0m proces 84 has routed 1900000 rows
[2m[36m(pid=25585)[0m proces 86 has routed 1300000 rows
[2m[36m(pid=25583)[0m proces 85 has routed 1800000 rows


reading chunk:  95
[2m[36m(pid=25585)[0m proces 94 has routed 0 rows
= = = Process Dump For Chunk 88 to 91 = = =
[2m[36m(pid=25583)[0m proces 92 has routed 400000 rows
[2m[36m(pid=25584)[0m proces 91 has routed 1700000 rows
[2m[36m(pid=25582)[0m proces 93 has routed 300000 rows
[2m[36m(pid=25585)[0m proces 94 has routed 100000 rows
[2m[36m(pid=25583)[0m proces 92 has routed 500000 rows
[2m[36m(pid=25584)[0m proces 91 has routed 1800000 rows
[2m[36m(pid=25582)[0m proces 93 has routed 400000 rows
[2m[36m(pid=25585)[0m proces 94 has routed 200000 rows
[2m[36m(pid=25583)[0m proces 92 has routed 600000 rows
[2m[36m(pid=25584)[0m proces 91 has routed 1900000 rows
[2m[36m(pid=25582)[0m proces 93 has routed 500000 rows
[2m[36m(pid=25585)[0m proces 94 has routed 300000 rows
[2m[36m(pid=25583)[0m proces 92 has routed 700000 rows
[2m[36m(pid=25582)[0m proces 93 has routed 600000 rows
[2m[36m(pid=25585)[0m proces 94 has routed 400000 rows
[2m[36m(p

[2m[36m(pid=25583)[0m proces 96 has routed 1600000 rows
[2m[36m(pid=25585)[0m proces 98 has routed 1200000 rows
[2m[36m(pid=25584)[0m proces 99 has routed 700000 rows
[2m[36m(pid=25582)[0m proces 97 has routed 1500000 rows
[2m[36m(pid=25583)[0m proces 96 has routed 1700000 rows
[2m[36m(pid=25585)[0m proces 98 has routed 1300000 rows
[2m[36m(pid=25582)[0m proces 97 has routed 1600000 rows
[2m[36m(pid=25584)[0m proces 99 has routed 800000 rows
[2m[36m(pid=25583)[0m proces 96 has routed 1800000 rows
[2m[36m(pid=25585)[0m proces 98 has routed 1400000 rows
[2m[36m(pid=25584)[0m proces 99 has routed 900000 rows
[2m[36m(pid=25582)[0m proces 97 has routed 1700000 rows
[2m[36m(pid=25583)[0m proces 
[2m[36m(pid=25583)[0m 96 has routed 1900000 rows
[2m[36m(pid=25585)[0m proces 98 has routed 1500000 rows
[2m[36m(pid=25584)[0m proces 99 has routed 1000000 rows
[2m[36m(pid=25582)[0m proces 97 has routed 1800000 rows
[2m[36m(pid=25585)[0m proces 9

[2m[36m(pid=25585)[0m proces 106 has routed 100000 rows
[2m[36m(pid=25583)[0m proces 104 has routed 400000 rows
[2m[36m(pid=25582)[0m proces 105 has routed 300000 rows
[2m[36m(pid=25584)[0m proces 103 has routed 1900000 rows
[2m[36m(pid=25585)[0m proces 106 has routed 200000 rows
[2m[36m(pid=25583)[0m proces 104 has routed 500000 rows
[2m[36m(pid=25582)[0m proces 105 has routed 400000 rows
[2m[36m(pid=25585)[0m proces 106 has routed 300000 rows
[2m[36m(pid=25583)[0m proces 104 has routed 600000 rows
[2m[36m(pid=25582)[0m proces 105 has routed 500000 rows
[2m[36m(pid=25584)[0m exit data routing process 103 .
[2m[36m(pid=25585)[0m proces 106 has routed 400000 rows
[2m[36m(pid=25583)[0m proces 104 has routed 700000 rows
[2m[36m(pid=25582)[0m proces 105 has routed 600000 rows
[2m[36m(pid=25584)[0m enter data routing process 107 ..
[2m[36m(pid=25584)[0m proces 107 has routed 0 rows
[2m[36m(pid=25585)[0m proces 106 has routed 500000 rows
[2

[2m[36m(pid=25584)[0m proces 111 has routed 800000 rows
[2m[36m(pid=25583)[0m proces 108 has routed 1600000 rows
[2m[36m(pid=25585)[0m proces 110 has routed 1300000 rows
[2m[36m(pid=25582)[0m proces 109 has routed 1500000 rows
[2m[36m(pid=25584)[0m proces 111 has routed 900000 rows
[2m[36m(pid=25585)[0m proces 110 has routed 1400000 rows
[2m[36m(pid=25582)[0m proces 109 has routed 1600000 rows
[2m[36m(pid=25583)[0m proces 108 has routed 1700000 rows
[2m[36m(pid=25584)[0m proces 111 has routed 1000000 rows
[2m[36m(pid=25585)[0m proces 110 has routed 1500000 rows
[2m[36m(pid=25583)[0m proces 108 has routed 1800000 rows
= = = Finish Dump For Chunk 104 to 107 = = =
= = = TOTAL PROCESSED SO FAR: 216000000 ROWS. TIME SPENT: 2579.595992088318 SECONDS = = =
[2m[36m(pid=25582)[0m proces 109 has routed 1700000 rows
[2m[36m(pid=25584)[0m proces 111 has routed 1100000 rows
[2m[36m(pid=25585)[0m proces 110 has routed 1600000 rows
[2m[36m(pid=25583)[0m pr

[2m[36m(pid=25585)[0m proces 118 has routed 200000 rows
[2m[36m(pid=25583)[0m proces 116 has routed 500000 rows
[2m[36m(pid=25582)[0m proces 117 has routed 500000 rows
[2m[36m(pid=25585)[0m proces 118 has routed 300000 rows
[2m[36m(pid=25583)[0m proces 116 has routed 600000 rows
[2m[36m(pid=25584)[0m exit data routing process 115 .
[2m[36m(pid=25582)[0m proces 117 has routed 600000 rows
[2m[36m(pid=25583)[0m proces 116 has routed 700000 rows
[2m[36m(pid=25585)[0m proces 118 has routed 400000 rows
[2m[36m(pid=25584)[0m enter data routing process 119 ..
[2m[36m(pid=25584)[0m proces 119 has routed 0 rows
[2m[36m(pid=25582)[0m proces 117 has routed 700000 rows
[2m[36m(pid=25585)[0m proces 118 has routed 500000 rows
[2m[36m(pid=25583)[0m proces 116 has routed 800000 rows
[2m[36m(pid=25584)[0m proces 119 has routed 100000 rows
[2m[36m(pid=25582)[0m proces 117 has routed 800000 rows
[2m[36m(pid=25583)[0m proces 116 has routed 900000 rows
[2m

[2m[36m(pid=25582)[0m proces 121 has routed 1600000 rows
[2m[36m(pid=25585)[0m proces 122 has routed 1400000 rows
[2m[36m(pid=25583)[0m proces 120 has routed 1700000 rows
[2m[36m(pid=25584)[0m proces 123 has routed 1000000 rows
[2m[36m(pid=25582)[0m proces 121 has routed 1700000 rows
[2m[36m(pid=25585)[0m proces 122 has routed 1500000 rows
= = = Finish Dump For Chunk 116 to 119 = = =
= = = TOTAL PROCESSED SO FAR: 240000000[2m[36m(pid=25583)[0m proces 120 has routed 1800000 rows
[2m[36m(pid=25584)[0m proces 123 has routed 1100000 rows
[2m[36m(pid=25582)[0m proces 121 has routed 1800000 rows
 ROWS. TIME SPENT: 2854.165899991989 SECONDS = = =
[2m[36m(pid=25585)[0m proces 122 has routed 1600000 rows
[2m[36m(pid=25583)[0m proces 120 has routed 1900000 rows
[2m[36m(pid=25584)[0m proces 123 has routed 1200000 rows
[2m[36m(pid=25582)[0m proces 121 has routed 1900000 rows
[2m[36m(pid=25585)[0m proces 122 has routed 1700000 rows
reading chunk:  124
[2m

[2m[36m(pid=25583)[0m proces 128 has routed 500000 rows
[2m[36m(pid=25582)[0m proces 129 has routed 500000 rows
[2m[36m(pid=25585)[0m proces 130 has routed 300000 rows
[2m[36m(pid=25584)[0m exit data routing process 127 .
[2m[36m(pid=25583)[0m proces 128 has routed 600000 rows
[2m[36m(pid=25582)[0m proces 129 has routed 600000 rows
[2m[36m(pid=25584)[0m enter data routing process 131 ..
[2m[36m(pid=25585)[0m proces 130 has routed 400000 rows
[2m[36m(pid=25584)[0m proces 131 has routed 0 rows
[2m[36m(pid=25583)[0m proces 128 has routed 700000 rows
[2m[36m(pid=25582)[0m proces 129 has routed 700000 rows
[2m[36m(pid=25585)[0m proces 130 has routed 500000 rows
[2m[36m(pid=25584)[0m proces 131 has routed 100000 rows
[2m[36m(pid=25583)[0m proces 128 has routed 800000 rows
[2m[36m(pid=25582)[0m proces 129 has routed 800000 rows
[2m[36m(pid=25585)[0m proces 130 has routed 600000 rows
[2m[36m(pid=25584)[0m proces 131 has routed 200000 rows
[2m

[2m[36m(pid=25583)[0m proces 132 has routed 1700000 rows= = = Finish Dump For Chunk
[2m[36m(pid=25584)[0m proces 135 has routed 900000 rows
[2m[36m(pid=25582)[0m proces 133 has routed 1700000 rows
[2m[36m(pid=25585)[0m proces 134 has routed 1500000 rows
 128 to 131 = = =
= = = TOTAL PROCESSED SO FAR: 264000000 ROWS. TIME SPENT: 3127.3543446063995 SECONDS = = =
[2m[36m(pid=25583)[0m proces 132 has routed 1800000 rows
[2m[36m(pid=25582)[0m proces 133 has routed 1800000 rows
[2m[36m(pid=25585)[0m proces 134 has routed 1600000 rows
[2m[36m(pid=25584)[0m proces 135 has routed 1000000 rows
reading chunk:  136
[2m[36m(pid=25583)[0m proces 132 has routed 1900000 rows
[2m[36m(pid=25582)[0m proces 133 has routed 1900000 rows
[2m[36m(pid=25584)[0m proces 135 has routed 1100000 rows
[2m[36m(pid=25585)[0m proces 134 has routed 1700000 rows
reading chunk:  137
[2m[36m(pid=25585)[0m proces 134 has routed 1800000 rows
[2m[36m(pid=25584)[0m proces 135 has route

[2m[36m(pid=25583)[0m proces 141 has routed 500000 rows
[2m[36m(pid=25584)[0m proces 139 has routed 1900000 rows
[2m[36m(pid=25582)[0m proces 140 has routed 700000 rows
[2m[36m(pid=25585)[0m proces 142 has routed 400000 rows
[2m[36m(pid=25583)[0m proces 141 has routed 600000 rows
[2m[36m(pid=25582)[0m proces 140 has routed 800000 rows
[2m[36m(pid=25585)[0m proces 142 has routed 500000 rows
[2m[36m(pid=25583)[0m proces 141 has routed 700000 rows
[2m[36m(pid=25584)[0m exit data routing process 139 .
[2m[36m(pid=25582)[0m proces 140 has routed 900000 rows
[2m[36m(pid=25585)[0m proces 142 has routed 600000 rows
[2m[36m(pid=25583)[0m proces 141 has routed 800000 rows
[2m[36m(pid=25584)[0m enter data routing process 143 ..
[2m[36m(pid=25582)[0m proces 140 has routed 1000000 rows
[2m[36m(pid=25584)[0m proces 143 has routed 0 rows
[2m[36m(pid=25585)[0m proces 142 has routed 700000 rows
[2m[36m(pid=25583)[0m proces 141 has routed 900000 rows
[

[2m[36m(pid=25582)[0m proces 144 has routed 1800000 rows
[2m[36m(pid=25584)[0m proces 147 has routed 900000 rows
[2m[36m(pid=25585)[0m proces 146 has routed 1500000 rows
[2m[36m(pid=25583)[0m proces 145 has routed 1700000 rows
[2m[36m(pid=25582)[0m proces 144 has routed 1900000 rows
[2m[36m(pid=25584)[0m proces 147 has routed 1000000 rows
[2m[36m(pid=25585)[0m proces 146 has routed 1600000 rows
= = = Finish Dump For Chunk 140 to 143 = = =
= = = TOTAL PROCESSED SO FAR: 288000000 ROWS. TIME SPENT: 3419.7263247966766 SECONDS = = =
[2m[36m(pid=25583)[0m proces 145 has routed 1800000 rows
[2m[36m(pid=25584)[0m proces 147 has routed 1100000 rows
[2m[36m(pid=25585)[0m proces 146 has routed 1700000 rows
[2m[36m(pid=25583)[0m proces 145 has routed 1900000 rows
[2m[36m(pid=25584)[0m proces 147 has routed 1200000 rows
[2m[36m(pid=25582)[0m exit data routing process 144 .
[2m[36m(pid=25585)[0m proces 146 has routed 1800000 rows
reading chunk:  148
[2m[36

[2m[36m(pid=25585)[0m 2021-07-20 21:12:24,760 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
[2m[36m(pid=25584)[0m 2021-07-20 21:12:24,757 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
[2m[36m(pid=25582)[0m 2021-07-20 21:12:24,789 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
[2m[36m(pid=25583)[0m 2021-07-20 21:12:24,923 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


[2m[36m(pid=25584)[0m process 1 pid 115  len parquets (epochs): 38
[2m[36m(pid=25582)[0m process 0 pid 62  len parquets (epochs): 38
[2m[36m(pid=25585)[0m process 3 pid 94  len parquets (epochs): 38
[2m[36m(pid=25583)[0m process 2 pid 123  len parquets (epochs): 38
[2m[36m(pid=25585)[0m process 3 pid 95  len parquets (epochs): 38
[2m[36m(pid=25583)[0m process 2 pid 124  len parquets (epochs): 38
[2m[36m(pid=25584)[0m process 1 pid 116  len parquets (epochs): 38
[2m[36m(pid=25583)[0m process 2 pid 84  len parquets (epochs): 38
[2m[36m(pid=25584)[0m process 1 pid 117  len parquets (epochs): 38
[2m[36m(pid=25585)[0m process 3 pid 96  len parquets (epochs): 38
[2m[36m(pid=25584)[0m process 1 pid 118  len parquets (epochs): 38
[2m[36m(pid=25583)[0m process 2 pid 85  len parquets (epochs): 38
[2m[36m(pid=25582)[0m process 0 pid 63  len parquets (epochs): 38
[2m[36m(pid=25585)[0m process 3 pid 137  len parquets (epochs): 38
[2m[36m(pid=25584)[0m pr

In [None]:
# num_process = 4
# problem_type = 2

# nora_hdfs = hdfs_base_path + 'NORA/prob' + str(problem_type) + '/'
# qdtree_hdfs = hdfs_base_path + 'QdTree/prob' + str(problem_type) + '/'
# kdtree_hdfs = hdfs_base_path + 'KDTree/prob' + str(problem_type) + '/'

# # Base path of the partition layout files (one per problem type and method)
# partition_base_path = '/home/centos/PartitionLayout/'

# nora_partition = partition_base_path + 'prob' + str(problem_type) + '_nora'
# qdtree_partition = partition_base_path + 'prob' + str(problem_type) + '_qdtree'
# kdtree_partition = partition_base_path + 'prob' + str(problem_type) + '_kdtree'

# if __name__ == '__main__':
#     #batch_data_parallel(table_path, nora_partition, chunk_size, used_dims, nora_hdfs, num_dims, num_process, hdfs_private_ip)
#     #print('finish nora data routing..')
#     #batch_data_parallel(table_path, qdtree_partition, chunk_size, used_dims, qdtree_hdfs, num_dims, num_process, hdfs_private_ip)
#     #print('finish qdtree data routing..')
#     batch_data_parallel(table_path, kdtree_partition, chunk_size, used_dims, kdtree_hdfs, num_dims, num_process, hdfs_private_ip)
#     print('finish kdtree data routing..')

In [11]:
# Shut down the Ray session used by this notebook once the work above is done.
ray.shutdown()

In [None]:
# import pyarrow as pa
# import pyarrow.parquet as pq
# fs = pa.hdfs.connect()

In [None]:
# # Read every partition parquet file and write it back out (using pyarrow),
# # to check whether rewriting the files changes query response time.
# start_time = time.time()
# for i in range(421):
#     print("processing", i, "..")
#     pid = i
#     read_path = nora_hdfs + 'partition_' + str(pid)+'.parquet'
#     save_path = nora_hdfs + 'reorganized/partition_' + str(pid)+'.parquet'
#     adf = pq.read_table(read_path)
#     print('done read parquet from path')
#     fw = fs.open(save_path, 'wb')
#     print("start writing..")
#     pq.write_table(adf, fw)
#     fw.close()
# end_time = time.time()
# print('parquet merge time:',end_time-start_time)

In [None]:
# start_time = time.time()
# for i in range(68):
#     print("processing", i, "..")
#     pid = i
#     read_path = qdtree_hdfs + 'partition_' + str(pid)+'.parquet'
#     save_path = qdtree_hdfs + 'reorganized/partition_' + str(pid)+'.parquet'
#     adf = pq.read_table(read_path)
#     print('done read parquet from path')
#     fw = fs.open(save_path, 'wb')
#     print("start writing..")
#     pq.write_table(adf, fw)
#     fw.close()
# end_time = time.time()
# print('parquet merge time:',end_time-start_time)

In [None]:
# start_time = time.time()
# for i in range(512):
#     print("processing", i, "..")
#     pid = i
#     read_path = kdtree_hdfs + 'partition_' + str(pid)+'.parquet'
#     save_path = kdtree_hdfs + 'reorganized/partition_' + str(pid)+'.parquet'
#     adf = pq.read_table(read_path)
#     print('done read parquet from path')
#     fw = fs.open(save_path, 'wb')
#     print("start writing..")
#     pq.write_table(adf, fw)
#     fw.close()
# end_time = time.time()
# print('parquet merge time:',end_time-start_time)

In [2]:
import findspark
findspark.init() # this must be executed before the below import
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext
from pyspark.sql import SparkSession
from pyspark import SparkFiles
import ray
import time
import rtree
from rtree import index
import pandas as pd
import numpy as np
from numpy import genfromtxt
import threading
import pyarrow as pa
import pyarrow.parquet as pq
from partition_tree import PartitionTree

In [3]:
import os

# Spark configuration: 24g each for the executor and driver, off-heap memory
# enabled with a 16g budget, and a 16g cap on the driver's result size.
conf = SparkConf()
conf.setAll([
    ("spark.executor.memory", "24g"),
    ("spark.driver.memory", "24g"),
    ("spark.memory.offHeap.enabled", True),
    ("spark.memory.offHeap.size", "16g"),
    ("spark.driver.maxResultSize", "16g"),
])

# Start Spark and wrap the context for SQL/DataFrame access.
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)

# Exported only after the SparkContext has been created, so they apply to
# code that reads the environment from this point on.
# NOTE(review): presumably these are for pyarrow's libhdfs-based HDFS client
# (ARROW_LIBHDFS_DIR) used in the following cells -- confirm.
os.environ.update({
    'HADOOP_HOME': '/home/liupengju/hadoop',
    'JAVA_HOME': '/home/liupengju/java/jdk1.8.0_281',
    'ARROW_LIBHDFS_DIR': '/home/liupengju/hadoop/lib/native',
})



In [5]:
# Open an HDFS connection to 10.77.110.133:9001 as user 'liupengju'; `fs` is
# the filesystem handle that later cells use to open files.
# NOTE(review): the truncated warning printed under this cell looks like
# pyarrow's deprecation notice for `pa.hdfs.connect` -- consider migrating to
# `pyarrow.fs.HadoopFileSystem` (the file-open calls would need updating too);
# confirm against the installed pyarrow version.
fs=pa.hdfs.connect(host='10.77.110.133', port=9001, user='liupengju')

  """Entry point for launching an IPython kernel.


In [12]:
# Smoke test of the HDFS write path: build a one-row table with columns
# a1, a2, a3 and write it as a parquet file through the `fs` handle.
path = 'hdfs://10.77.110.133:9001/par_nora/NORA/prob2/scale100/epoch_0/partition_767.parquet'
adf = pa.Table.from_pandas(
    pd.DataFrame(data=np.array([[2, 3, 4]]), columns=['a1', 'a2', 'a3'])
)
# The context manager closes the HDFS file handle once the table is written.
with fs.open(path, 'wb') as f:
    pq.write_table(adf, f)