In [52]:
import numpy as np
from sklearn.model_selection import train_test_split
import xgboost as xgb
import cudf
from cudf.dataframe import DataFrame
from collections import OrderedDict
import gc
from glob import glob
import os
import pyblazing
import pandas as pd
import time
from dask.distributed import Client
from chronometer import Chronometer

from pyblazing import FileSystemType, SchemaFrom, DriverType

def register_hdfs():
    """Register an HDFS file system with pyblazing and print the returned status.

    The file system is registered under the authority ``"myLocalHdfs"`` with
    root ``"/"`` and the connection settings listed below.
    """
    print('*** Register a HDFS File System ***')
    # Connection settings handed to pyblazing unchanged.
    hdfs_params = {
        "host": "127.0.0.1",
        "port": 54310,
        "user": "hadoop",
        "driverType": DriverType.LIBHDFS3,
        "kerberosTicket": "",
    }
    status = pyblazing.register_file_system(
        authority="myLocalHdfs",
        type=FileSystemType.HDFS,
        root="/",
        params=hdfs_params,
    )
    print(status)


def deregister_hdfs():
    """Deregister the ``"myLocalHdfs"`` file system and print the returned status."""
    print(pyblazing.deregister_file_system(authority="myLocalHdfs"))

def register_posix():
    """Register a POSIX file system with pyblazing and print the returned status.

    The file system is registered under the authority ``"mortgage"`` with
    root ``"/"``.
    """
    print('*** Register a POSIX File System ***')
    registration = dict(
        authority="mortgage",
        type=FileSystemType.POSIX,
        root="/",
    )
    print(pyblazing.register_file_system(**registration))

def deregister_posix():
    """Deregister the ``"mortgage"`` file system and print the returned status."""
    print(pyblazing.deregister_file_system(authority="mortgage"))

from libgdf_cffi import ffi, libgdf
import os
 
def run_gpu_workflow(params):
    """Run the external GPU workflow script once per performance file.

    Args:
        params: Mapping with a ``'perf_files'`` entry holding the performance
            file names/paths to process. A ``'quarter'`` entry may also be
            present but is not used: the quarter is derived from each file
            name instead.

    Returns:
        The constant list ``[1, 1]``, regardless of how the script runs went.
    """
    script_path = "/blazingdb/notebooks/start_conda_gpu.sh"
    for item in params['perf_files']:
        year = get_year_id(item)
        quarter = get_quarter(item)
        perf_file_id = get_perf_file_id(item)
        print("params:", str(quarter), str(year), perf_file_id)
        # The command line is kept exactly as before (including the double
        # space before the file id) so the script sees the same arguments.
        command = script_path + " " + str(quarter) + " " + str(year) + " " +  " " + perf_file_id
        status = os.system(command)
        if status != 0:
            # Previously the exit status was dropped; surface failures so a
            # broken run is visible in the worker output.
            print("warning: command exited with status", status, "-", command)

    return [1, 1]
 
# Select which file system to register for this run. HDFS takes precedence
# when both flags are set; nothing is registered when both are False.
use_registered_hdfs = False
use_registered_posix = True

if use_registered_hdfs:
    register_hdfs()
elif use_registered_posix:
    register_posix()

# Data locations. Only perf_data_path is referenced by the functions visible
# in this cell; the other two are presumably used elsewhere -- TODO confirm.
acq_data_path = "/blazingdb/data/tpch/mortgage/acq"
perf_data_path = "/blazingdb/data/tpch/mortgage/perf"
col_names_path = "/blazingdb/data/tpch/mortgage/names.csv"

# Inclusive year and quarter bounds; they are iterated with range1() below.
start_year = 2000
end_year = 2007
start_quarter = 1
end_quarter = 4

import time 
def range1(start, end):
    """Return a ``range`` from *start* to *end*, with *end* included."""
    inclusive_stop = end + 1
    return range(start, inclusive_stop)

def use_file_type_suffix(year, quarter):
    """Return True only for year 2001 with quarter 2 or later, else False."""
    return bool(year == 2001 and quarter >= 2)

def getChunks(year, quarter):
    """Return the chunk indices for a year/quarter.

    Yields ``range(0, 2)`` when ``use_file_type_suffix`` is true for the
    pair and ``range(0, 1)`` otherwise.
    """
    last_chunk = 1 if use_file_type_suffix(year, quarter) else 0
    return range(0, last_chunk + 1)

def clear_times(pattern='/blazingdb/data/results/*.txt'):
    """Delete the timing result files left behind by a previous run.

    Args:
        pattern: Glob pattern selecting the files to delete. Defaults to the
            ``*.txt`` files in the results directory, as before.

    NOTE(review): load_times() opens result files with names such as
    ``Q2.txt_0``, which the default ``*.txt`` pattern does not match --
    confirm whether those should be cleared as well.
    """
    import glob
    for file_path in glob.glob(pattern):
        try:
            os.remove(file_path)
        except FileNotFoundError:
            # The file vanished between listing and removal; it is already
            # gone, which is the state we want.
            pass
    
def get_performance_list():
    """Return the hard-coded list of performance file names, 2000Q1 through 2016Q4.

    A fresh list is built on every call. Names are ordered by year and
    quarter; some quarters appear as several entries with ``_0``/``_1``
    style suffixes after ``.txt``.
    """
    return [
        # 2000
        "Performance_2000Q1.txt", "Performance_2000Q2.txt", "Performance_2000Q3.txt", "Performance_2000Q4.txt",
        # 2001
        "Performance_2001Q1.txt",
        "Performance_2001Q2.txt_0", "Performance_2001Q2.txt_1",
        "Performance_2001Q3.txt_0", "Performance_2001Q3.txt_1",
        "Performance_2001Q4.txt_0", "Performance_2001Q4.txt_1",
        # 2002
        "Performance_2002Q1.txt_0", "Performance_2002Q1.txt_1",
        "Performance_2002Q2.txt",
        "Performance_2002Q3.txt_0", "Performance_2002Q3.txt_1",
        "Performance_2002Q4.txt_0_0", "Performance_2002Q4.txt_0_1",
        "Performance_2002Q4.txt_1_0", "Performance_2002Q4.txt_1_1",
        # 2003
        "Performance_2003Q1.txt_0_0", "Performance_2003Q1.txt_0_1",
        "Performance_2003Q1.txt_1_0", "Performance_2003Q1.txt_1_1",
        "Performance_2003Q2.txt_0_0_0", "Performance_2003Q2.txt_0_0_1",
        "Performance_2003Q2.txt_0_1_0", "Performance_2003Q2.txt_0_1_1",
        "Performance_2003Q2.txt_1_0_0", "Performance_2003Q2.txt_1_0_1",
        "Performance_2003Q2.txt_1_1_0", "Performance_2003Q2.txt_1_1_1",
        "Performance_2003Q3.txt_0_0_0", "Performance_2003Q3.txt_0_0_1",
        "Performance_2003Q3.txt_0_1_0", "Performance_2003Q3.txt_0_1_1",
        "Performance_2003Q3.txt_1_0_0", "Performance_2003Q3.txt_1_0_1",
        "Performance_2003Q3.txt_1_1_0", "Performance_2003Q3.txt_1_1_1",
        "Performance_2003Q4.txt_0_0", "Performance_2003Q4.txt_0_1",
        "Performance_2003Q4.txt_1_0", "Performance_2003Q4.txt_1_1",
        # 2004
        "Performance_2004Q1.txt_0", "Performance_2004Q1.txt_1",
        "Performance_2004Q2.txt_0", "Performance_2004Q2.txt_1",
        "Performance_2004Q3.txt", "Performance_2004Q4.txt",
        # 2005
        "Performance_2005Q1.txt", "Performance_2005Q2.txt",
        "Performance_2005Q3.txt_0", "Performance_2005Q3.txt_1",
        "Performance_2005Q4.txt_0", "Performance_2005Q4.txt_1",
        # 2006
        "Performance_2006Q1.txt", "Performance_2006Q2.txt", "Performance_2006Q3.txt", "Performance_2006Q4.txt",
        # 2007
        "Performance_2007Q1.txt", "Performance_2007Q2.txt", "Performance_2007Q3.txt", "Performance_2007Q4.txt",
        # 2008
        "Performance_2008Q1.txt", "Performance_2008Q2.txt", "Performance_2008Q3.txt", "Performance_2008Q4.txt",
        # 2009
        "Performance_2009Q1.txt_0", "Performance_2009Q1.txt_1",
        "Performance_2009Q2.txt_0", "Performance_2009Q2.txt_1",
        "Performance_2009Q3.txt_0", "Performance_2009Q3.txt_1",
        "Performance_2009Q4.txt",
        # 2010
        "Performance_2010Q1.txt", "Performance_2010Q2.txt", "Performance_2010Q3.txt",
        "Performance_2010Q4.txt_0", "Performance_2010Q4.txt_1",
        # 2011
        "Performance_2011Q1.txt", "Performance_2011Q2.txt", "Performance_2011Q3.txt",
        "Performance_2011Q4.txt_0", "Performance_2011Q4.txt_1",
        # 2012
        "Performance_2012Q1.txt_0", "Performance_2012Q1.txt_1",
        "Performance_2012Q2.txt_0", "Performance_2012Q2.txt_1",
        "Performance_2012Q3.txt_0", "Performance_2012Q3.txt_1",
        "Performance_2012Q4.txt_0", "Performance_2012Q4.txt_1",
        # 2013
        "Performance_2013Q1.txt_0", "Performance_2013Q1.txt_1",
        "Performance_2013Q2.txt_0", "Performance_2013Q2.txt_1",
        "Performance_2013Q3.txt_0", "Performance_2013Q3.txt_1",
        "Performance_2013Q4.txt",
        # 2014
        "Performance_2014Q1.txt", "Performance_2014Q2.txt", "Performance_2014Q3.txt", "Performance_2014Q4.txt",
        # 2015
        "Performance_2015Q1.txt", "Performance_2015Q2.txt", "Performance_2015Q3.txt", "Performance_2015Q4.txt",
        # 2016
        "Performance_2016Q1.txt", "Performance_2016Q2.txt", "Performance_2016Q3.txt", "Performance_2016Q4.txt",
    ]

def get_perf_files(year):
    """Return the performance file names that contain *year*.

    A name is kept when ``str(year)`` occurs in it at any position after
    the first character.
    """
    year_text = str(year)
    return [name for name in get_performance_list() if name.find(year_text) > 0]
        
def get_quarter(s):
    """Return the text between the first ``'Q'`` and the following ``'.txt'``.

    For ``"Performance_2003Q2.txt_0"`` this is ``"2"``.
    """
    after_q = s.find('Q') + 1
    return s[after_q:s.find('.txt', after_q)]

def get_perf_file_id(s):
    """Return the tail of *s* starting at its first ``'Q'``.

    For ``"Performance_2001Q2.txt_0"`` this is ``"Q2.txt_0"``.
    """
    q_index = s.find('Q')
    return s[q_index:]

def get_year_id(s):
    """Return the text between the first ``'_'`` and the following ``'Q'``.

    For ``"Performance_2000Q1.txt"`` this is ``"2000"``.
    """
    after_underscore = s.find('_') + 1
    return s[after_underscore:s.find('Q', after_underscore)]
    
def show_quarters(items, year):
    """Print the full performance file path for each entry of *items*.

    Each path is built from ``perf_data_path``, the given *year* and the
    entry's file id (the part of the name from its first ``'Q'`` onwards).
    """
    prefix = perf_data_path + "/Performance_" + str(year)
    for name in items:
        print(prefix + get_perf_file_id(name))

def group_by_quarters(quarter):
    """Collect the performance file paths for one quarter across all years.

    Gathers the file names for every year from ``start_year`` to
    ``end_year`` (inclusive), keeps those containing ``"Q<quarter>"``, and
    turns each into a path under ``perf_data_path``. The quarter header and
    every selected path are printed, followed by a blank line.

    Returns:
        The list of selected paths, in year order.
    """
    candidates = []
    for yr in range1(start_year, end_year):
        candidates.extend(get_perf_files(yr))

    quarter_tag = "Q" + str(quarter)
    print("quarter******", quarter)

    response = []
    for name in candidates:
        if name.find(quarter_tag) > 0:
            path = perf_data_path + "/Performance_" + str(get_year_id(name)) + get_perf_file_id(name)
            print(path)
            response.append(path)
    print()
    return response
        
def load_times(nro_nodes):
    """Compute the load and ETL shares of the recorded task times.

    For every performance file of every year from ``start_year`` to
    ``end_year``, reads the first line of the matching result file under
    ``/blazingdb/data/results/`` and parses it as two floats: load time and
    ETL time.

    Args:
        nro_nodes: Not used by the computation.

    Returns:
        A ``(percent_load, percent_etl)`` tuple: each summed time divided by
        the combined load + ETL total.
    """
    load_samples = []
    etl_samples = []

    for year in range1(start_year, end_year):
        for name in get_perf_files(year):
            result_path = '/blazingdb/data/results/'  +  get_perf_file_id(name)
            with open(result_path, 'r') as handle:
                first_line = next(handle)
            load_time, etl_time = [float(token) for token in first_line.split()]
            load_samples.append(load_time)
            etl_samples.append(etl_time)

    n_tasks = len(load_samples)
    load_total = sum(load_samples)
    etl_total = sum(etl_samples)
    # Mean per-task time; dividing by it and then by n_tasks yields a share.
    total_time_all_machines = (load_total + etl_total) / n_tasks
    percent_load = (load_total / total_time_all_machines) / n_tasks
    percent_etl = (etl_total / total_time_all_machines) / n_tasks
    return (percent_load, percent_etl)

# Placeholders; nothing in the visible code assigns them afterwards.
final_cpu_df_label = None
final_cpu_df_data = None

# Module-level accumulators. load_times() builds its own local lists, so the
# first three stay empty in the visible code.
all_load_times = []
all_etl_times = []
all_xgb_convert_times = []
chunk_parameters = []

# Dask scheduler to connect to and the worker addresses tasks are limited to.
client = Client('127.0.0.1:8786')
workers_ips = ['172.18.0.21', '172.18.0.22', '172.18.0.23', '172.18.0.24']


# Drop result files from earlier runs before timing starts.
clear_times()
total_start = time.time()

# One task per quarter, each carrying that quarter's files for every year.
for quarter in range1(start_quarter, end_quarter):
    print("processing: ", quarter)
    perf_files = group_by_quarters(quarter)
    args = {"quarter" : quarter, "perf_files": perf_files}
    chunk_parameters.append(args)

futures = client.map(run_gpu_workflow, chunk_parameters, workers=workers_ips)

gather_response = client.gather(futures)
print("gather: ", gather_response)
percent_load, percent_etl = load_times(len(workers_ips))
total_end = time.time()
# Split the measured wall time by the load/ETL shares of the recorded times.
print('Wall Time %fs' % (total_end - total_start) )
print('Wall LOAD Time: %fs' % ((total_end - total_start) * percent_load))
print('Wall ETL Time: %fs' % ((total_end - total_start) * percent_etl))

# Tear down whichever file system was registered at start-up, mirroring the
# registration branch instead of always deregistering the POSIX one.
if use_registered_hdfs:
    deregister_hdfs()
elif use_registered_posix:
    deregister_posix()

Chronometer.show_resume()

*** Register a POSIX File System ***
1
processing:  1
quarter****** 1
/blazingdb/data/tpch/mortgage/perf/Performance_2000Q1.txt
/blazingdb/data/tpch/mortgage/perf/Performance_2001Q1.txt
/blazingdb/data/tpch/mortgage/perf/Performance_2002Q1.txt_0
/blazingdb/data/tpch/mortgage/perf/Performance_2002Q1.txt_1
/blazingdb/data/tpch/mortgage/perf/Performance_2003Q1.txt_0_0
/blazingdb/data/tpch/mortgage/perf/Performance_2003Q1.txt_0_1
/blazingdb/data/tpch/mortgage/perf/Performance_2003Q1.txt_1_0
/blazingdb/data/tpch/mortgage/perf/Performance_2003Q1.txt_1_1
/blazingdb/data/tpch/mortgage/perf/Performance_2004Q1.txt_0
/blazingdb/data/tpch/mortgage/perf/Performance_2004Q1.txt_1
/blazingdb/data/tpch/mortgage/perf/Performance_2005Q1.txt
/blazingdb/data/tpch/mortgage/perf/Performance_2006Q1.txt
/blazingdb/data/tpch/mortgage/perf/Performance_2007Q1.txt

processing:  2
quarter****** 2
/blazingdb/data/tpch/mortgage/perf/Performance_2000Q2.txt
/blazingdb/data/tpch/mortgage/perf/Performance_2001Q2.txt_0
/b