In [11]:
import numpy as np
from sklearn.model_selection import train_test_split
import xgboost as xgb
import cudf
from cudf.dataframe import DataFrame
from collections import OrderedDict
import gc
from glob import glob
import os
import pyblazing
import pandas as pd
import time
from dask.distributed import Client
from chronometer import Chronometer

from pyblazing import FileSystemType, SchemaFrom, DriverType

def register_hdfs():
    """Register an HDFS file system with pyblazing as authority "myLocalHdfs".

    Prints a banner, performs the registration against 127.0.0.1:54310 using
    the LIBHDFS3 driver, and prints whatever status pyblazing returns.
    """
    print('*** Register a HDFS File System ***')

    # Connection settings handed to pyblazing as the `params` argument.
    connection = {
        "host": "127.0.0.1",
        "port": 54310,
        "user": "hadoop",
        "driverType": DriverType.LIBHDFS3,
        "kerberosTicket": ""
    }
    status = pyblazing.register_file_system(
        authority="myLocalHdfs",
        type=FileSystemType.HDFS,
        root="/",
        params=connection,
    )
    print(status)


def deregister_hdfs():
    """Deregister the "myLocalHdfs" authority and print the returned status."""
    print(pyblazing.deregister_file_system(authority="myLocalHdfs"))

def register_posix():
    """Register a POSIX file system with pyblazing as authority "mortgage".

    The file system is rooted at "/". Prints a banner first and the status
    returned by pyblazing afterwards.
    """
    print('*** Register a POSIX File System ***')
    registration = dict(authority="mortgage", type=FileSystemType.POSIX, root="/")
    status = pyblazing.register_file_system(**registration)
    print(status)

def deregister_posix():
    """Deregister the "mortgage" authority and print the returned status."""
    print(pyblazing.deregister_file_system(authority="mortgage"))

from libgdf_cffi import ffi, libgdf
import os
 
def run_gpu_workflow(params):
    """Run the GPU workflow shell script for `year` and then for `year + 1`.

    Args:
        params: mapping with the keys 'quarter', 'year' and 'perf_file'.
            'year' must support ``+ 1``; the same quarter and perf_file are
            passed to both script invocations.

    Returns:
        Always ``[1, 1]``, regardless of how the script exited.

    Raises:
        KeyError: if one of the expected keys is missing from `params`.
    """
    # Imported locally so the function carries its own dependency wherever
    # it is executed.
    import shlex

    quarter = params['quarter']
    year = params['year']
    perf_file = params['perf_file']

    script_path = "/blazingdb/data/results/start_conda_gpu.sh"

    def _launch(run_year):
        """Invoke the script once for `run_year` and report a failed exit."""
        print("params:", str(quarter), str(run_year), str(perf_file))
        # Quote every argument so that a path containing spaces or shell
        # metacharacters reaches the script as a single, literal argument.
        command = " ".join(
            shlex.quote(str(part))
            for part in (script_path, quarter, run_year, perf_file)
        )
        status = os.system(command)
        if status != 0:
            # Best effort: surface the failure but keep going, as before.
            print("WARNING: command returned non-zero status", status, ":", command)

    _launch(year)
    _launch(year + 1)

    return [1, 1]
 
# File-system selection: HDFS wins when both flags are set, because it is
# tested first below.
use_registered_hdfs = False
use_registered_posix = True

# Input locations for the mortgage data set.
acq_data_path = "/blazingdb/data/tpch/mortgage/acq"
perf_data_path = "/blazingdb/data/tpch/mortgage/perf"
col_names_path = "/blazingdb/data/tpch/mortgage/names.csv"

# Inclusive year and quarter bounds used by the helpers and the driver below.
start_year, end_year = 2000, 2001
start_quarter, end_quarter = 1, 4

if use_registered_hdfs:
    register_hdfs()
elif use_registered_posix:
    register_posix()

import time 
def range1(start, end):
    """Return a range from `start` to `end` with `end` included."""
    inclusive_stop = end + 1
    return range(start, inclusive_stop)

def use_file_type_suffix(year, quarter):
    """Return True only for 2001 Q2 and later quarters of 2001, else False."""
    return bool(year == 2001 and quarter >= 2)

def getChunks(year, quarter):
    """Return the chunk indices for a quarter.

    Quarters for which `use_file_type_suffix` is true have chunks 0 and 1;
    every other quarter has the single chunk 0.
    """
    chunk_count = 2 if use_file_type_suffix(year, quarter) else 1
    return range(0, chunk_count)

def clear_times():
    """Remove the per-quarter result files left in /blazingdb/data/results/.

    Covers every year in start_year..end_year and every quarter in
    start_quarter..end_quarter; files that do not exist are skipped.
    """
    for year in range1(start_year, end_year):
        for quarter in range1(start_quarter, end_quarter):
            stale_path = '/blazingdb/data/results/' + "{}Q{}.txt".format(year, quarter)
            if not os.path.exists(stale_path):
                continue
            os.remove(stale_path)
    
def load_times(nro_nodes):
    """Summarise the load and ETL timings recorded in the per-quarter files.

    Reads the first line of each /blazingdb/data/results/<year>Q<quarter>.txt
    file, which must hold exactly two whitespace-separated floats (load time,
    then ETL time). Prints both sums divided by `nro_nodes`.

    Args:
        nro_nodes: divisor applied to the summed timings.

    Returns:
        A ``(percent_load, percent_etl)`` tuple: each summed timing divided by
        the combined per-node total and then by `nro_nodes`.
    """
    load_samples = []
    etl_samples = []

    periods = [
        (year, quarter)
        for year in range1(start_year, end_year)
        for quarter in range1(start_quarter, end_quarter)
    ]
    for year, quarter in periods:
        result_path = '/blazingdb/data/results/' + "{}Q{}.txt".format(year, quarter)
        with open(result_path, 'r') as handle:
            first_line = next(handle)  # only the first line carries the timings
        load_time, etl_time = map(float, first_line.split())
        load_samples.append(load_time)
        etl_samples.append(etl_time)

    load_sum = sum(load_samples)
    etl_sum = sum(etl_samples)

    print("TIMES SUMMARY Total Elapsed on all machines")
    print('LOAD Time: %fs' % (load_sum / nro_nodes))
    print('ETL Time: %fs' % (etl_sum / nro_nodes))

    per_node_total = (load_sum + etl_sum) / nro_nodes
    load_share = (load_sum / per_node_total) / nro_nodes
    etl_share = (etl_sum / per_node_total) / nro_nodes
    return (load_share, etl_share)

# Module-level placeholders; nothing in this cell assigns to them again.
final_cpu_df_label = None
final_cpu_df_data = None

all_load_times = []
all_etl_times = []
all_xgb_convert_times = []
chunk_parameters = []

# Connect to the Dask scheduler and name the workers that may run the tasks.
client = Client('127.0.0.1:8786')
workers_ips = ['172.18.0.21', '172.18.0.22', '172.18.0.23', '172.18.0.24']


# Drop result files from earlier runs so load_times() only sees fresh ones.
clear_times()
total_start = time.time()
# Only start_year is enumerated here: run_gpu_workflow itself runs the script
# for `year` and for `year + 1`.
for year in range1(start_year, start_year):
    for quarter in range1(start_quarter, end_quarter):
        for chunk in getChunks(year, quarter):
            chunk_sufix = "_{}".format(chunk) if use_file_type_suffix(year, quarter) else ""
            perf_file = perf_data_path + "/Performance_" + str(year) + "Q" + str(quarter) + ".txt" + chunk_sufix
            args = {"quarter" : quarter, "year" : year, "perf_file" : perf_file}
            print("processing: ", args)
            chunk_parameters.append(args)


# One task per chunk, restricted to the listed workers.
futures = client.map(run_gpu_workflow, chunk_parameters, workers=workers_ips)

gather_response = client.gather(futures)
print("gather: ", gather_response)
percent_load, percent_etl = load_times(len(workers_ips))
total_end = time.time()
print('Wall Time %fs' % (total_end - total_start) )
# Split the wall time between load and ETL using the recorded proportions.
print('Wall LOAD Time: %fs' % ((total_end - total_start) * percent_load))
print('Wall ETL Time: %fs' % ((total_end - total_start) * percent_etl))

# Tear down the same file system that was registered at start-up, using the
# same precedence as the registration branch (HDFS first, then POSIX).
if use_registered_hdfs:
    deregister_hdfs()
elif use_registered_posix:
    deregister_posix()

Chronometer.show_resume()

*** Register a POSIX File System ***
1
processing:  {'quarter': 1, 'year': 2000, 'perf_file': '/blazingdb/data/tpch/mortgage/perf/Performance_2000Q1.txt'}
processing:  {'quarter': 2, 'year': 2000, 'perf_file': '/blazingdb/data/tpch/mortgage/perf/Performance_2000Q2.txt'}
processing:  {'quarter': 3, 'year': 2000, 'perf_file': '/blazingdb/data/tpch/mortgage/perf/Performance_2000Q3.txt'}
processing:  {'quarter': 4, 'year': 2000, 'perf_file': '/blazingdb/data/tpch/mortgage/perf/Performance_2000Q4.txt'}
gather:  [[1, 1], [1, 1], [1, 1], [1, 1]]
TIMES SUMMARY Total Elapsed on all machines
LOAD Time: 4.036589s
ETL Time: 10.133855s
Wall Time 21.337518s
Wall LOAD Time: 6.078200s
Wall ETL Time: 15.259319s
1
[32mResume
[0m
