diff --git a/.travis.yml b/.travis.yml index 387e9432d..6b266ebb0 100644 --- a/.travis.yml +++ b/.travis.yml @@ -24,6 +24,7 @@ # To use docker later... sudo: required +dist: focal # let's go! language: python @@ -34,6 +35,7 @@ matrix: env: NO_DLG_RUNTIME=1 - python: "3.8" env: NO_DLG_TRANSLATOR=1 + - python: "3.9" # NOTE: The OpenAPI code still needs to be removed # - python: "3.8" # env: TEST_OPENAPI=1 @@ -68,7 +70,7 @@ matrix: python: "3.8" before_install: install: - - pip install sphinx sphinx-rtd-theme + - pip install sphinx sphinx-rtd-theme gputil merklelib script: - READTHEDOCS=True make -C docs html SPHINXOPTS="-W --keep-going" diff --git a/Dockerfile b/Dockerfile index 86dba0c96..2b7b461e8 100644 --- a/Dockerfile +++ b/Dockerfile @@ -12,4 +12,4 @@ RUN cd /home/ray/daliuge/daliuge-common && pip install . \ && sudo apt-get remove cmake gcc -y \ && sudo apt-get clean -CMD ["dlg", "daemon", "-vv", "--no-nm"] +CMD ["dlg", "daemon", "-vv", "--no-nm"] \ No newline at end of file diff --git a/OpenAPI/tests/managers_test_client.py b/OpenAPI/tests/managers_test_client.py index 34608b78a..96b8550cc 100644 --- a/OpenAPI/tests/managers_test_client.py +++ b/OpenAPI/tests/managers_test_client.py @@ -2,7 +2,6 @@ import node_manager_client as nmc from composite_manager_client.api.default_api import DefaultApi - nm_config = nmc.Configuration() nm_config.host = "127.0.0.1:8000" dim_config = cmc.Configuration() diff --git a/OpenAPI/tests/translator_test_client.py b/OpenAPI/tests/translator_test_client.py index f6935ee02..0653fa996 100644 --- a/OpenAPI/tests/translator_test_client.py +++ b/OpenAPI/tests/translator_test_client.py @@ -2,7 +2,6 @@ import translator_client as tc - translator_config = tc.Configuration() translator_config.host = "127.0.0.1:8084" diff --git a/README.rst b/README.rst index 227014db1..9fef7cc80 100644 --- a/README.rst +++ b/README.rst @@ -1,8 +1,8 @@ Data Activated 流 Graph Engine ============================== -.. image:: https://travis-ci.org/ICRAR/daliuge.svg?branch=master - :target: https://travis-ci.org/ICRAR/daliuge +.. image:: https://travis-ci.com/ICRAR/daliuge.svg?branch=master + :target: https://travis-ci.com/github/ICRAR/daliuge .. image:: https://coveralls.io/repos/github/ICRAR/daliuge/badge.svg?branch=master :target: https://coveralls.io/github/ICRAR/daliuge?branch=master diff --git a/__init__.py b/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/daliuge-common/build_common.sh b/daliuge-common/build_common.sh index 077fbb908..457da754a 100755 --- a/daliuge-common/build_common.sh +++ b/daliuge-common/build_common.sh @@ -19,7 +19,7 @@ case "$1" in "devcuda") export VCS_TAG=`git rev-parse --abbrev-ref HEAD | tr '[:upper:]' '[:lower:]'` echo "Building daliuge-common development version using tag ${VCS_TAG}" - docker build --build-arg VCS_TAG=${VCS_TAG} --no-cache -t icrar/daliuge-common:${VCS_TAG} -f docker/Dockerfile.cuda . + docker build --build-arg VCS_TAG=${VCS_TAG} --no-cache -t icrar/daliuge-common:${VCS_TAG} -f docker/Dockerfile.devcuda . echo "Build finished!" 
exit 0;; "casa") diff --git a/daliuge-common/dlg/__init__.py b/daliuge-common/dlg/__init__.py index 0cb96f675..ba61d0929 100644 --- a/daliuge-common/dlg/__init__.py +++ b/daliuge-common/dlg/__init__.py @@ -22,3 +22,11 @@ # Declaring this as a namespace package __path__ = __import__("pkgutil").extend_path(__path__, __name__) # @ReservedAssignment +# set the version +try: + from dlg.common import version + + __version__ = version.full_version +except: + # This can happen when running from source + __version__ = "unknown" diff --git a/daliuge-common/dlg/clients.py b/daliuge-common/dlg/clients.py index 281efc043..bfef4ef66 100644 --- a/daliuge-common/dlg/clients.py +++ b/daliuge-common/dlg/clients.py @@ -165,6 +165,33 @@ def session_status(self, sessionId): ) return status + def session_repro_status(self, sessionId): + """ + Returns the reproducibility status of session `sessionId`. + """ + status = self._get_json("/sessions/%s/repro/status" % (quote(sessionId),)) + logger.debug( + "Successfully read session %s reproducibility status (%s) from %s:%s", + sessionId, + status, + self.host, + self.port, + ) + return status + + def session_repro_data(self, sessionId): + """ + Returns the graph-wide reproducibility information of session `sessionId`. + """ + data = self._get_json("/sessions/%s/repro/data" % (quote(sessionId),)) + logger.debug( + "Successfully read session %s reproducibility data from %s:%s", + sessionId, + self.host, + self.port, + ) + return data + def graph_size(self, sessionId): """ Returns the size of the graph of session `sessionId` diff --git a/daliuge-common/dlg/common/__init__.py b/daliuge-common/dlg/common/__init__.py index 3198ce50a..5c6bbfd11 100644 --- a/daliuge-common/dlg/common/__init__.py +++ b/daliuge-common/dlg/common/__init__.py @@ -39,7 +39,7 @@ class Categories: PLASMA = "Plasma" PLASMAFLIGHT = "PlasmaFlight" PARSET = "ParameterSet" - ENVIRONMENTVARS = "EnvironmentVars" + ENVIRONMENTVARS = "EnvironmentVariables" MKN = "MKN" SCATTER = "Scatter" @@ -74,7 +74,7 @@ class Categories: Categories.PLASMA, Categories.PLASMAFLIGHT, Categories.PARSET, - Categories.ENVIRONMENTVARS + Categories.ENVIRONMENTVARS, } APP_DROP_TYPES = [ Categories.COMPONENT, @@ -125,29 +125,47 @@ class dropdict(dict): DROPManager. 
""" - def _addSomething(self, other, key): + def _addSomething(self, other, key, IdText=None): if key not in self: self[key] = [] if other["oid"] not in self[key]: - self[key].append(other["oid"]) + append = {other["oid"]: IdText} if IdText else other["oid"] + self[key].append(append) - def addConsumer(self, other): - self._addSomething(other, "consumers") + def addConsumer(self, other, IdText=None): + self._addSomething(other, "consumers", IdText=IdText) - def addStreamingConsumer(self, other): - self._addSomething(other, "streamingConsumers") + def addStreamingConsumer(self, other, IdText=None): + self._addSomething(other, "streamingConsumers", IdText=IdText) - def addInput(self, other): - self._addSomething(other, "inputs") + def addInput(self, other, IdText=None): + self._addSomething(other, "inputs", IdText=IdText) - def addStreamingInput(self, other): - self._addSomething(other, "streamingInputs") + def addStreamingInput(self, other, IdText=None): + self._addSomething(other, "streamingInputs", IdText=IdText) - def addOutput(self, other): - self._addSomething(other, "outputs") + def addOutput(self, other, IdText=None): + self._addSomething(other, "outputs", IdText=IdText) - def addProducer(self, other): - self._addSomething(other, "producers") + def addProducer(self, other, IdText=None): + self._addSomething(other, "producers", IdText=IdText) + + +def _sanitize_links(links): + """ + Links can now be dictionaries, but we only need + the key. + """ + if isinstance(links, list): + nlinks = [] + for l in links: + if isinstance(l, dict): # could be a list of dicts + nlinks.extend(list(l.keys())) + else: + nlinks.extend(l) if isinstance(l, list) else nlinks.append(l) + return nlinks + elif isinstance(links, dict): + return list(links.keys()) if isinstance(links, dict) else links def get_roots(pg_spec): @@ -169,14 +187,17 @@ def get_roots(pg_spec): if dropspec.get("inputs", None) or dropspec.get("streamingInputs", None): nonroots.add(oid) if dropspec.get("outputs", None): - nonroots |= set(dropspec["outputs"]) + do = _sanitize_links(dropspec["outputs"]) + nonroots |= set(do) elif dropspec["type"] == DropType.PLAIN: if dropspec.get("producers", None): nonroots.add(oid) if dropspec.get("consumers", None): - nonroots |= set(dropspec["consumers"]) + dc = _sanitize_links(dropspec["consumers"]) + nonroots |= set(dc) if dropspec.get("streamingConsumers", None): - nonroots |= set(dropspec["streamingConsumers"]) + dsc = _sanitize_links(dropspec["streamingConsumers"]) + nonroots |= set(dsc) return all_oids - nonroots @@ -200,18 +221,23 @@ def get_leaves(pg_spec): if dropspec.get("outputs", None): nonleaves.add(oid) if dropspec.get("streamingInputs", None): - nonleaves |= set(dropspec["streamingInputs"]) + dsi = _sanitize_links(dropspec["streamingInputs"]) + nonleaves |= set(dsi) if dropspec.get("inputs", None): - nonleaves |= set(dropspec["inputs"]) + di = _sanitize_links(dropspec["inputs"]) + nonleaves |= set(di) if dropspec["type"] == DropType.SERVICE_APP: nonleaves.add(oid) # services are never leaves if dropspec.get("streamingInputs", None): - nonleaves |= set(dropspec["streamingInputs"]) + dsi = _sanitize_links(dropspec["streamingInputs"]) + nonleaves |= set(dsi) if dropspec.get("inputs", None): - nonleaves |= set(dropspec["inputs"]) + di = _sanitize_links(dropspec["inputs"]) + nonleaves |= set(di) elif dropspec["type"] == DropType.PLAIN: if dropspec.get("producers", None): - nonleaves |= set(dropspec["producers"]) + dp = _sanitize_links(dropspec["producers"]) + nonleaves |= set(dp) if 
dropspec.get("consumers", None) or dropspec.get( "streamingConsumers", None ): diff --git a/daliuge-common/dlg/common/network.py b/daliuge-common/dlg/common/network.py index 13358b8b4..1d2efbd32 100644 --- a/daliuge-common/dlg/common/network.py +++ b/daliuge-common/dlg/common/network.py @@ -19,12 +19,11 @@ # Foundation, Inc., 59 Temple Place, Suite 330, Boston, # MA 02111-1307 USA # +import contextlib import errno import logging import socket import time -import contextlib - logger = logging.getLogger(__name__) @@ -162,7 +161,7 @@ def connect_to(host, port, timeout=None): return s -def write_to(host, port, data, timeout=None): +def write_to(host, port, data, timeout=5): """ Connects to ``host``:``port`` within the given timeout and write the given piece of ``data`` into the connected socket. diff --git a/daliuge-common/dlg/common/osutils.py b/daliuge-common/dlg/common/osutils.py index 784cd7807..34a0769fd 100644 --- a/daliuge-common/dlg/common/osutils.py +++ b/daliuge-common/dlg/common/osutils.py @@ -24,7 +24,6 @@ import math import time - logger = logging.getLogger(__name__) diff --git a/daliuge-common/dlg/common/reproducibility/__init__.py b/daliuge-common/dlg/common/reproducibility/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/daliuge-common/dlg/common/reproducibility/apps.py b/daliuge-common/dlg/common/reproducibility/apps.py new file mode 100644 index 000000000..03bb9773a --- /dev/null +++ b/daliuge-common/dlg/common/reproducibility/apps.py @@ -0,0 +1,83 @@ +""" +Contains several very basic apps to test python function reproducibility. +""" +import numpy as np + +from dlg.apps.pyfunc import PyFuncApp + + +def write_in(): + """ + :return: "world" always + """ + return "world" + + +def write_out(phrase="everybody"): + """ + Appends s to "Hello " + :param phrase: The string to be appended + :return: "Hello " + s + """ + return "Hello " + phrase + + +def numpy_av(nums): + """ + Finds the mean of a list of numbers using numpy. + :param nums: The numbers to be averaged. + :return: The mean. + """ + return np.asscalar(np.mean(nums)) + + +def my_av(nums): + """ + Finds the mean of a list of numbers manually + :param nums: The numbers to be averaged + :return: The mean. 
+ """ + res = 0.0 + for num in nums: + res += num + return res / len(nums) + + +class HelloWorldPythonIn(PyFuncApp): + """ + Wrapper app turning writeIn into a Python function app + """ + + def initialize(self, **kwargs): + fname = "dlg.common.reproducibility.apps.write_in" + super().initialize(func_name=fname) + + +class HelloWorldPythonOut(PyFuncApp): + """ + Wrapper app turning writeOut into a Python function app + """ + + def initialize(self, **kwargs): + fname = "dlg.common.reproducibility.apps.write_out" + super().initialize(func_name=fname) + + +class NumpyAverage(PyFuncApp): + """ + Wrapper app turning numpy_av into a Python function app + """ + + def initialize(self, **kwargs): + fname = "dlg.common.reproducibility.apps.numpy_av" + super().initialize(func_name=fname) + + +class MyAverage(PyFuncApp): + """ + Wrapper app turning my_av into a Python function app + """ + + def initialize(self, **kwargs): + fname = "dlg.common.reproducibility.apps.my_av" + super().initialize(func_name=fname) diff --git a/daliuge-common/dlg/common/reproducibility/apps_lowpass.py b/daliuge-common/dlg/common/reproducibility/apps_lowpass.py new file mode 100644 index 000000000..0ba06b459 --- /dev/null +++ b/daliuge-common/dlg/common/reproducibility/apps_lowpass.py @@ -0,0 +1,457 @@ +""" +Implements several DALiuGE drops to build low-pass filters with various methods. +""" + +import numpy as np +import pyfftw +from dlg import droputils +from dlg.apps.simple import BarrierAppDROP +from dlg.common.reproducibility.constants import system_summary +from dlg.meta import dlg_batch_output, dlg_streaming_input +from dlg.meta import dlg_component, dlg_batch_input +from dlg.meta import dlg_int_param, dlg_list_param, dlg_float_param, dlg_bool_param + + +def determine_size(length): + """ + :param length: + :return: Computes the next largest power of two needed to contain |length| elements + """ + return int(2 ** np.ceil(np.log2(length))) - 1 + + +class LP_SignalGenerator(BarrierAppDROP): + """ + Generates a noisy sine signal for filtering. Effectively an input generator. + """ + + component_meta = dlg_component( + "LPSignalGen", + "Low-pass filter example signal generator", + [None], + [dlg_batch_output("binary/*", [])], + [dlg_streaming_input("binary/*")], + ) + + # default values + length = dlg_int_param("length", 256) + srate = dlg_int_param("sample rate", 5000) + freqs = dlg_list_param("frequencies", [440, 800, 1000, 2000]) + noise = dlg_list_param("noise", []) + series = None + + def add_noise( + self, series: np.array, mean, std, freq, sample_rate, seed, alpha=0.1 + ): + """ + A noise to the provided signal by producing random values of a given frequency + :param series: The input (and output) numpy array signal series + :param mean: The average value + :param std: The standard deviation of the value + :param freq: The frequency of the noisy signal + :param sample_rate: The sample rate of the input series + :param seed: The random seed + :param alpha: The multiplier + :return: The input series with noisy values added + """ + np.random.seed(seed) + samples = alpha * np.random.normal(mean, std, size=len(series)) + for i in range(len(series)): + samples[i] += np.sin(2 * np.pi * i * freq / sample_rate) + np.add(series, samples, out=series) + return series + + def gen_sig(self): + """ + Generates an initial signal + :return: Numpy array of signal values. 
+ """ + series = np.zeros(self.length, dtype=np.float64) + for freq in self.freqs: + for i in range(self.length): + series[i] += np.sin(2 * np.pi * i * freq / self.srate) + return series + + def run(self): + """ + Called by DALiuGE to start signal generation. Conditionally adds noise if parameters are set + :return: Writes signal to output ports. + """ + outs = self.outputs + if len(outs) < 1: + raise Exception("At least one output required for %r" % self) + self.series = self.gen_sig() + if len(self.noise) > 0: + self.noise[0] = 1 / self.noise[0] + self.series = self.add_noise( + self.series, + self.noise[2], + self.noise[4], + self.noise[1], + self.srate, + self.noise[3], + self.noise[0], + ) + + data = self.series.tostring() + for output in outs: + output.len = len(data) + output.write(data) + + def generate_recompute_data(self): + # This will do for now + return { + "length": self.length, + "sample_rate": self.srate, + "frequencies": self.freqs, + "status": self.status, + "system": system_summary(), + } + + +class LP_WindowGenerator(BarrierAppDROP): + """ + Generates a Hann window for low-pass filtering. + """ + + component_meta = dlg_component( + "LPWindowGen", + "Low-pass filter example window generator", + [None], + [dlg_batch_output("binary/*", [])], + [dlg_streaming_input("binary/*")], + ) + + # default values + length = dlg_int_param("length", 256) + cutoff = dlg_int_param("cutoff", 600) + srate = dlg_int_param("sample_rate", 5000) + series = None + + def sinc(self, x_val: np.float64): + """ + Computes the sin_c value for the input float + :param x_val: + """ + if np.isclose(x_val, 0.0): + return 1.0 + return np.sin(np.pi * x_val) / (np.pi * x_val) + + def gen_win(self): + """ + Generates the window values. + :return: Numpy array of window series. + """ + alpha = 2 * self.cutoff / self.srate + win = np.zeros(self.length, dtype=np.float64) + for i in range(int(self.length)): + ham = 0.54 - 0.46 * np.cos( + 2 * np.pi * i / int(self.length) + ) # Hamming coefficient + hsupp = i - int(self.length) / 2 + win[i] = ham * alpha * self.sinc(alpha * hsupp) + return win + + def run(self): + """ + Called by DALiuGE to start drop execution + :return: + """ + outs = self.outputs + if len(outs) < 1: + raise Exception("At least one output required for %r" % self) + self.series = self.gen_win() + data = self.series.tostring() + for output in outs: + output.len = len(data) + output.write(data) + + def generate_recompute_data(self): + output = dict() + output["length"] = self.length + output["cutoff"] = self.cutoff + output["sample_rate"] = self.srate + output["status"] = self.status + output["system"] = system_summary() + return output + + +class LP_AddNoise(BarrierAppDROP): + """ + Component to add additional noise to a signal array. + """ + + component_meta = dlg_component( + "LPAddNoise", + "Adds noise to a signal generated " "for the low-pass filter example", + [dlg_batch_input("binary/*", [])], + [dlg_batch_output("binary/*", [])], + [dlg_streaming_input("binary/*")], + ) + + # default values + mean = dlg_float_param("avg_noise", 0.0) + std = dlg_float_param("std_deviation", 1.0) + freq = dlg_int_param("frequency", 1200) + srate = dlg_int_param("sample_rate", 5000) + seed = dlg_int_param("random_seed", 42) + alpha = dlg_float_param("noise_multiplier", 0.1) + signal = np.empty([1]) + + def add_noise(self): + """ + Adds noise at a specified frequency. 
+ :return: Modified signal + """ + np.random.seed(self.seed) + samples = self.alpha * np.random.normal( + self.mean, self.std, size=len(self.signal) + ) + for i in range(len(self.signal)): + samples[i] += np.sin(2 * np.pi * i * self.freq / self.srate) + np.add(self.signal, samples, out=self.signal) + return self.signal + + def get_inputs(self): + """ + Reads input data into a numpy array. + :return: + """ + ins = self.inputs + if len(ins) != 1: + raise Exception("Precisely one input required for %r" % self) + + array = np.fromstring(droputils.allDropContents(ins[0])) + self.signal = np.frombuffer(array) + + def run(self): + """ + Called by DALiuGE to start drop execution. + :return: + """ + outs = self.outputs + if len(outs) < 1: + raise Exception("At least one output required for %r" % self) + self.get_inputs() + sig = self.add_noise() + data = sig.tobytes() + for output in outs: + output.len = len(data) + output.write(data) + + def generate_recompute_data(self): + return { + "mean": self.mean, + "std": self.std, + "sample_rate": self.srate, + "seed": self.seed, + "alpha": self.alpha, + "system": system_summary(), + "status": self.status, + } + + +class LP_filter_fft_np(BarrierAppDROP): + """ + Uses numpy to filter a noisy signal. + """ + + component_meta = dlg_component( + "LP_filter_np", + "Filters a signal with " "a provided window using numpy", + [dlg_batch_input("binary/*", [])], + [dlg_batch_output("binary/*", [])], + [dlg_streaming_input("binary/*")], + ) + + PRECISIONS = { + "double": {"float": np.float64, "complex": np.complex128}, + "single": {"float": np.float32, "complex": np.complex64}, + } + precision = {} + # default values + double_prec = dlg_bool_param("doublePrec", True) + series = [] + output = np.zeros([1]) + + def initialize(self, **kwargs): + super().initialize(**kwargs) + if self.double_prec: + self.precision = self.PRECISIONS["double"] + else: + self.precision = self.PRECISIONS["single"] + + def get_inputs(self): + """ + Reads input arrays into numpy array + :return: Sets class series variable. + """ + ins = self.inputs + if len(ins) != 2: + raise Exception("Precisely two inputs required for %r" % self) + + array = [np.fromstring(droputils.allDropContents(inp)) for inp in ins] + self.series = array + + def filter(self): + """ + Actually performs the filtering + :return: Numpy array of filtered signal.
+ """ + signal = self.series[0] + window = self.series[1] + nfft = determine_size(len(signal) + len(window) - 1) + print(nfft) + sig_zero_pad = np.zeros(nfft, dtype=self.precision["float"]) + win_zero_pad = np.zeros(nfft, dtype=self.precision["float"]) + sig_zero_pad[0 : len(signal)] = signal + win_zero_pad[0 : len(window)] = window + sig_fft = np.fft.fft(sig_zero_pad) + win_fft = np.fft.fft(win_zero_pad) + out_fft = np.multiply(sig_fft, win_fft) + out = np.fft.ifft(out_fft) + return out.astype(self.precision["complex"]) + + def run(self): + """ + Called by DALiuGE to start execution + :return: + """ + outs = self.outputs + if len(outs) < 1: + raise Exception("At least one output required for %r" % self) + self.get_inputs() + self.output = self.filter() + data = self.output.tostring() + for output in outs: + output.len = len(data) + output.write(data) + + def generate_recompute_data(self): + return { + "precision_float": str(self.precision["float"]), + "precision_complex": str(self.precision["complex"]), + "system": system_summary(), + "status": self.status, + } + + +class LP_filter_fft_fftw(LP_filter_fft_np): + """ + Uses fftw to implement a low-pass filter + """ + + component_meta = dlg_component( + "LP_filter_fftw", + "Filters a signal with " "a provided window using FFTW", + [dlg_batch_input("binary/*", [])], + [dlg_batch_output("binary/*", [])], + [dlg_streaming_input("binary/*")], + ) + + def filter(self): + """ + Actually performs the filtering + :return: Filtered signal as numpy array. + """ + pyfftw.interfaces.cache.disable() + signal = self.series[0] + window = self.series[1] + nfft = determine_size(len(signal) + len(window) - 1) + sig_zero_pad = pyfftw.empty_aligned(len(signal), dtype=self.precision["float"]) + win_zero_pad = pyfftw.empty_aligned(len(window), dtype=self.precision["float"]) + sig_zero_pad[0 : len(signal)] = signal + win_zero_pad[0 : len(window)] = window + sig_fft = pyfftw.interfaces.numpy_fft.fft(sig_zero_pad, n=nfft) + win_fft = pyfftw.interfaces.numpy_fft.fft(win_zero_pad, n=nfft) + out_fft = np.multiply(sig_fft, win_fft) + out = pyfftw.interfaces.numpy_fft.ifft(out_fft, n=nfft) + return out.astype(self.precision["complex"]) + + +class LP_filter_fft_cuda(LP_filter_fft_np): + """ + Uses pycuda to implement a low-pass filter + """ + + component_meta = dlg_component( + "LP_filter_fft_cuda", + "Filters a signal with " "a provided window using cuda", + [dlg_batch_input("binary/*", [])], + [dlg_batch_output("binary/*", [])], + [dlg_streaming_input("binary/*")], + ) + + def filter(self): + """ + Actually performs the filtering + :return: + """ + import pycuda.gpuarray as gpuarray + import skcuda.fft as cu_fft + import skcuda.linalg as linalg + import pycuda.driver as cuda + from pycuda.tools import make_default_context + + cuda.init() + context = make_default_context() + device = context.get_device() + signal = self.series[0] + window = self.series[1] + linalg.init() + nfft = determine_size(len(signal) + len(window) - 1) + # Move data to GPU + sig_zero_pad = np.zeros(nfft, dtype=self.precision["float"]) + win_zero_pad = np.zeros(nfft, dtype=self.precision["float"]) + sig_gpu = gpuarray.zeros(sig_zero_pad.shape, dtype=self.precision["float"]) + win_gpu = gpuarray.zeros(win_zero_pad.shape, dtype=self.precision["float"]) + sig_zero_pad[0 : len(signal)] = signal + win_zero_pad[0 : len(window)] = window + sig_gpu.set(sig_zero_pad) + win_gpu.set(win_zero_pad) + + # Plan forwards + sig_fft_gpu = gpuarray.zeros(nfft, dtype=self.precision["complex"]) + win_fft_gpu = 
gpuarray.zeros(nfft, dtype=self.precision["complex"]) + sig_plan_forward = cu_fft.Plan( + sig_fft_gpu.shape, self.precision["float"], self.precision["complex"] + ) + win_plan_forward = cu_fft.Plan( + win_fft_gpu.shape, self.precision["float"], self.precision["complex"] + ) + cu_fft.fft(sig_gpu, sig_fft_gpu, sig_plan_forward) + cu_fft.fft(win_gpu, win_fft_gpu, win_plan_forward) + + # Convolve + out_fft = linalg.multiply(sig_fft_gpu, win_fft_gpu, overwrite=True) + linalg.scale(2.0, out_fft) + + # Plan inverse + out_gpu = gpuarray.zeros_like(out_fft) + plan_inverse = cu_fft.Plan( + out_fft.shape, self.precision["complex"], self.precision["complex"] + ) + cu_fft.ifft(out_fft, out_gpu, plan_inverse, True) + out_np = np.zeros(len(out_gpu), self.precision["complex"]) + out_gpu.get(out_np) + context.pop() + return out_np + + +class LP_filter_pointwise_np(LP_filter_fft_np): + """ + Uses raw numpy to implement a low-pass filter + """ + + component_meta = dlg_component( + "LP_filter_pointwise_np", + "Filters a signal with " "a provided window using numpy", + [dlg_batch_input("binary/*", [])], + [dlg_batch_output("binary/*", [])], + [dlg_streaming_input("binary/*")], + ) + + def filter(self): + return np.convolve(self.series[0], self.series[1], mode="full").astype( + self.precision["complex"] + ) diff --git a/daliuge-common/dlg/common/reproducibility/constants.py b/daliuge-common/dlg/common/reproducibility/constants.py new file mode 100644 index 000000000..a69f3caa3 --- /dev/null +++ b/daliuge-common/dlg/common/reproducibility/constants.py @@ -0,0 +1,142 @@ +""" +Defines constant values for reproducibility DAG construction and associated utility functions. +""" + +import hashlib +import platform +import sys +from enum import Enum + +import GPUtil +import psutil +from merklelib import MerkleTree + +PROTOCOL_VERSION = 1.0 + + +class ReproducibilityFlags(int, Enum): + """ + Enum for supported reproducibility modes. + TODO: Link to more detailed description + """ + + ALL = -1 # Builds and maintains all standards (1-8) + NOTHING = 0 + RERUN = 1 + REPEAT = 2 + RECOMPUTE = 4 + REPRODUCE = 5 + REPLICATE_SCI = 6 # Rerun + Reproduce + REPLICATE_COMP = 7 # Recompute + Reproduce + REPLICATE_TOTAL = 8 # Repeat + Reproduce + EXPERIMENTAL = 9 + + +ALL_RMODES = [ + ReproducibilityFlags.RERUN, + ReproducibilityFlags.REPEAT, + ReproducibilityFlags.RECOMPUTE, + ReproducibilityFlags.REPRODUCE, + ReproducibilityFlags.REPLICATE_SCI, + ReproducibilityFlags.REPLICATE_COMP, + ReproducibilityFlags.REPLICATE_TOTAL, +] +REPRO_DEFAULT = ReproducibilityFlags.NOTHING +HashingAlg = hashlib.sha3_256 + + +def rflag_caster(val, default=REPRO_DEFAULT): + """ + Function to safely cast strings and ints to their appropriate ReproducibilityFlag + E.g. rflag_caster(1) -> ReproducibilityFlags.RERUN + E.g. rflag_caster("4") -> ReproducibilityFlags.RECOMPUTE + E.g. rflag_caster("two") -> REPRO_DEFAULT + :param val: The passed value (either int or str) + :param default: The default value to be returned upon failure + :return: Appropriate ReproducibilityFlag + """ + if val is not None: + out = default + try: + out = ReproducibilityFlags(val) + except ValueError: + try: + out = ReproducibilityFlags(int(val)) + except ValueError: + for rmode in ALL_RMODES: + if val == rmode.name or val == "Reproducibility." + rmode.name: + out = rmode + return out + return default + + +def rmode_supported(flag: ReproducibilityFlags): + """ + Determines if a given flag is currently supported. + A slightly pedantic solution but it does centralize the process.
+ There is the possibility that different functionality is available on a per-install basis. + Named to be used as: if rmode_supported(flag) + :param flag: A ReproducibilityFlag enum being queried + :return: True if supported, False otherwise + """ + if not isinstance(flag, ReproducibilityFlags): + raise TypeError("Need to be working with a ReproducibilityFlag enum") + return flag in ( + ReproducibilityFlags.ALL, + ReproducibilityFlags.NOTHING, + ReproducibilityFlags.RERUN, + ReproducibilityFlags.REPEAT, + ReproducibilityFlags.RECOMPUTE, + ReproducibilityFlags.REPRODUCE, + ReproducibilityFlags.REPLICATE_SCI, + ReproducibilityFlags.REPLICATE_COMP, + ReproducibilityFlags.REPLICATE_TOTAL, + ReproducibilityFlags.EXPERIMENTAL, + ) + + +def find_loaded_modules(): + """ + :return: A list of all loaded modules + """ + loaded_mods = [] + for name, module in sorted(sys.modules.items()): + if hasattr(module, "__version__"): + loaded_mods.append(name + " " + str(module.__version__)) + else: + loaded_mods.append(name) + return loaded_mods + + +def system_summary(): + """ + Summarises the system this function is run on. + Includes system, cpu, gpu and module details. + :return: A dictionary of system details + """ + merkletree = MerkleTree() + system_info = {} + uname = platform.uname() + system_info["system"] = { + "system": uname.system, + "release": uname.release, + "machine": uname.machine, + "processor": uname.processor, + } + cpu_freq = psutil.cpu_freq() + system_info["cpu"] = { + "cores_phys": psutil.cpu_count(logical=False), + "cores_logic": psutil.cpu_count(logical=True), + "max_frequency": cpu_freq.max, + "min_frequency": cpu_freq.min, + } + sys_mem = psutil.virtual_memory() + system_info["memory"] = {"total": sys_mem.total} + gpus = GPUtil.getGPUs() + system_info["gpu"] = {} + for gpu in gpus: + system_info["gpu"][gpu.id] = {"name": gpu.name, "memory": gpu.memoryTotal} + system_info["modules"] = find_loaded_modules() + merkletree.append(list(system_info.items())) + system_info["signature"] = merkletree.merkle_root + return system_info diff --git a/daliuge-common/dlg/common/reproducibility/reprodata_compare.py b/daliuge-common/dlg/common/reproducibility/reprodata_compare.py new file mode 100644 index 000000000..926b6dad5 --- /dev/null +++ b/daliuge-common/dlg/common/reproducibility/reprodata_compare.py @@ -0,0 +1,239 @@ +# +# ICRAR - International Centre for Radio Astronomy Research +# (c) UWA - The University of Western Australia, 2015 +# Copyright by UWA (in the framework of the ICRAR) +# All rights reserved +# +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, +# MA 02111-1307 USA +# +""" +This script allows the comparison of two graphs and their related reprodata files, writing the +comparison results to csv files or the command line.
+It is intended to provide a simple way to compare two or more workflow executions. +""" +from datetime import datetime +import os +import pathlib +import argparse +import json +import csv +import logging +import itertools + +from dlg.common.reproducibility.constants import ( + ALL_RMODES, + rflag_caster, + ReproducibilityFlags, +) + +logger = logging.getLogger(__name__) + + +def _unique_filemid(): + today = datetime.now() + return today.strftime("%d-%m-%Y-%H-%M-%S-%f") + + +def open_file(path: pathlib.Path): + """ + Opens the passed filepath, returns a dictionary of the contained rmode signatures + """ + with path.open("r", encoding="utf-8") as infile: + data = json.load(infile) + if isinstance(data, list): + return data[-1] + return data + + +def is_single(data): + """ + Determines if the passed reprodata contains several signatures, or a single signature. + """ + if data.get("rmode") == str(ReproducibilityFlags.ALL.value): + return False + return True + + +def process_single(data): + """ + Processes reprodata containing a single signature. + Builds a small dictionary mapping the 'rmode' to the signature + """ + return {rflag_caster(data.get("rmode")).value: data.get("signature")} + + +def process_multi(data): + """ + Processes reprodata containing multiple signatures. + Builds a dictionary mapping rmode.value to the provided signature + None if not present. + """ + out_data = {rmode.value: None for rmode in ALL_RMODES} + for rmode in ALL_RMODES: + out_data[rmode.value] = data.get(rmode.name, {}).get("signature") + return out_data + + +def process_file(filename: pathlib.Path): + """ + Processes a reprodata file, returning a summary dictionary mapping rmode to signature for + all rmodes. + """ + data = open_file(filename) + out_data = {rmode.value: None for rmode in ALL_RMODES} + if is_single(data): + out_data.update(process_single(data)) + else: + out_data.update(process_multi(data)) + return out_data + + +def process_directory(dirname: pathlib.Path): + """ + Processes a directory assumed to contain reprodata.out file(s) referring to the same workflow. + """ + out_data = {} + for file in dirname.glob("*.out"): + new_data = process_file(file) + for rmode, sig in new_data.items(): + if sig is not None: + out_data[rmode] = sig + return out_data + + +def generate_comparison(data): + """ + :param: data - a dictionary mapping workflow names to rmode signatures. + For each possible combination of workflows present in the data dictionary, compares their + rmode signatures. + Returns a dictionary mapping each pair to rmode booleans (true if matching, false otherwise) + """ + outdata = {} + for combination in itertools.combinations(data.keys(), 2): + outdata[combination[0] + ":" + combination[1]] = compare_signatures( + data[combination[0]], data[combination[1]] + ) + return outdata + + +def compare_signatures(data1, data2): + """ + Compares the rmode signatures of two workflow executions. + """ + output = {rmode.value: False for rmode in ALL_RMODES} + for rmode in ALL_RMODES: + if rmode.value in data1 and rmode.value in data2: + if data1[rmode.value] == data2[rmode.value]: + output[rmode.value] = True + return output + + +def write_outfile(data, outfilepath, outfilesuffix="summary", verbose=False): + """ + Writes a dictionary to a csv file.
+ """ + fieldnames = ["workflow"] + [rmode.name for rmode in ALL_RMODES] + with open( + outfilepath + f"-{outfilesuffix}.csv", "w+", newline="", encoding="utf-8" + ) as ofile: + writer = csv.writer(ofile, delimiter=",") + writer.writerow(fieldnames) + + for filepath, signature_data in data.items(): + row = [filepath] + [signature_data[rmode.value] for rmode in ALL_RMODES] + writer.writerow(row) + if verbose: + print(row) + + +def write_comparison(data, outfilepath, verbose=False): + """ + Writes comparison dictionary to csv file. + """ + if len(data) > 0: + write_outfile(data, outfilepath, "comparison", verbose) + + +def write_outputs(data, comparisons, outfile_root=".", verbose=False): + """ + Writes reprodata signatures for all workflows to a summary csv and comparison of these + signatures to a separate comparison csv. + """ + if verbose: + print(json.dumps(data, indent=4)) + try: + write_outfile(data, outfile_root, outfilesuffix="summary", verbose=verbose) + except IOError: + logger.debug("Could not write summary csv") + try: + write_comparison(comparisons, outfile_root, verbose) + except IOError: + logger.debug("Could not write to comparsion csv") + + +def process_logfiles(pathnames: list): + """ + Processes all logfiles present in the list of pathnames + """ + paths = [] + data = {} + for pathname in pathnames: + paths.append(pathlib.Path(pathname)) + for path in paths: + if path.is_dir(): + data[path.name] = process_directory(path) + elif path.is_file(): + data[path.name] = process_file(path) + else: + raise AttributeError(f"{path.name} is not a file or directory") + + comparisons = generate_comparison(data) + return data, comparisons + + +def _main(pathnames: list, outfilepath: str, verbose=False): + outfile_root = os.path.join(outfilepath, _unique_filemid()) + data, comparisons = process_logfiles(pathnames) + write_outputs(data, comparisons, outfile_root, verbose) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "filename", + action="store", + default=None, + nargs="+", + type=str, + help="The first filename or directory to access", + ) + parser.add_argument( + "-o", + "--outfile", + action="store", + default=".", + type=str, + help="Directory to write output files to", + ) + parser.add_argument( + "-v", + "--verbose", + default=False, + action="store_true", + help="If set, will write output to standard out", + ) + args = parser.parse_args() + _main(list(args.filename), args.outfile, args.verbose) diff --git a/daliuge-common/dlg/common/reproducibility/reproducibility.py b/daliuge-common/dlg/common/reproducibility/reproducibility.py new file mode 100644 index 000000000..effb5d143 --- /dev/null +++ b/daliuge-common/dlg/common/reproducibility/reproducibility.py @@ -0,0 +1,975 @@ +# +# ICRAR - International Centre for Radio Astronomy Research +# (c) UWA - The University of Western Australia, 2017 +# Copyright by UWA (in the framework of the ICRAR) +# All rights reserved +# +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. 
+# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, +# MA 02111-1307 USA +# + +""" +This module handles the building of reproducibility information for workflow components and graphs +at all stages of unrolling and execution. + +Functions are organized top-to-bottom as per-drop to whole-graph operations. +Within each level of detail there are several functions to deal with different graph abstractions +arranged top-to-bottom as logical to physical to runtime. +""" +import collections +import logging + +from dlg.common import STORAGE_TYPES +from dlg.common.reproducibility.constants import ( + ReproducibilityFlags, + REPRO_DEFAULT, + PROTOCOL_VERSION, + HashingAlg, + rmode_supported, + rflag_caster, + ALL_RMODES, +) +from dlg.common.reproducibility.reproducibility_fields import ( + lgt_block_fields, + lg_block_fields, + pgt_unroll_block_fields, + pgt_partition_block_fields, + pg_block_fields, + extract_fields, +) +from merklelib import MerkleTree + +logger = logging.getLogger(__name__) + + +def common_hash(value: bytes): + """ + Produces a hex digest of the `value` provided. + Assumes standard hashlib algorithm functionality. + :param value: Bytes to be hashed + :return: Hex-digest of the value + """ + return HashingAlg(value).hexdigest() + + +# ------ Drop-Based Functionality ------ +def accumulate_lgt_drop_data(drop: dict, level: ReproducibilityFlags): + """ + Accumulates relevant reproducibility fields for a single drop. + :param drop: + :param level: + :return: A dictionary containing accumulated reproducibility data for a given drop. + """ + if not rmode_supported(level): + raise NotImplementedError( + f"Reproducibility level {level.name} not yet supported" + ) + relevant_fields = lgt_block_fields(level) + data = extract_fields(drop, relevant_fields) + return data + + +def accumulate_lg_drop_data(drop: dict, level: ReproducibilityFlags): + """ + Accumulates relevant reproducibility fields for a single drop. + :param drop: + :param level: + :return: A dictionary containing accumulated reproducibility data for a given drop. + """ + if not rmode_supported(level): + raise NotImplementedError( + f"Reproducibility level {level.name} not yet supported" + ) + category_type = drop.get( + "categoryType", "" + ) # Made conditional to support older graphs + category = drop.get("category", "") + + # Cheeky way to get field list into dicts. map(dict, drop...) makes a copy + fields = {e.pop("name"): e["value"] for e in map(dict, drop["fields"])} + lg_fields = lg_block_fields(category_type, category, level) + data = extract_fields(fields, lg_fields) + return data + + +def accumulate_pgt_unroll_drop_data(drop: dict): + """ + Accumulates relevant reproducibility fields for a single drop at the physical template level. + :param drop: + :return: A dictionary containing accumulated reproducibility data for a given drop.
+ """ + if drop.get("reprodata") is None: + drop["reprodata"] = {"rmode": str(REPRO_DEFAULT.value), "lg_blockhash": None} + if drop["reprodata"].get("rmode") is None: + level = REPRO_DEFAULT + drop["reprodata"]["rmode"] = str(level.value) + else: + level = rflag_caster(drop["reprodata"]["rmode"]) + if not rmode_supported(level): + logger.warning( + "Requested reproducibility mode %s not yet implemented", str(level) + ) + level = REPRO_DEFAULT + drop["reprodata"]["rmode"] = str(level.value) + if drop.get("type") is None: + return {} + drop_type = drop["type"] + if level == ReproducibilityFlags.ALL: + data = {} + for rmode in ALL_RMODES: + pgt_fields = pgt_unroll_block_fields(drop_type, rmode) + data[rmode.name] = extract_fields(drop, pgt_fields) + else: + pgt_fields = pgt_unroll_block_fields(drop_type, level) + data = extract_fields(drop, pgt_fields) + return data + + +def accumulate_pgt_partition_drop_data(drop: dict): + """ + Is as combination of unroll drop data + :param drop: + :return: + """ + if drop.get("reprodata") is None: + drop["reprodata"] = {"rmode": str(REPRO_DEFAULT.value), "lg_blockhash": None} + if drop["reprodata"].get("rmode") is None: + level = REPRO_DEFAULT + drop["reprodata"]["rmode"] = str(level.value) + else: + level = rflag_caster(drop["reprodata"]["rmode"]) + if not rmode_supported(level): + logger.warning( + "Requested reproducibility mode %s not yet implemented", str(level) + ) + level = REPRO_DEFAULT + drop["reprodata"]["rmode"] = str(level.value) + if level == ReproducibilityFlags.ALL: + data = {} + unroll_data = accumulate_pgt_unroll_drop_data(drop) + for rmode in ALL_RMODES: + pgt_fields = pgt_partition_block_fields(rmode) + data[rmode.name] = extract_fields(drop, pgt_fields) + unroll_data[rmode.name].update(data[rmode.name]) + return unroll_data + else: + pgt_fields = pgt_partition_block_fields(level) + data = extract_fields(drop, pgt_fields) + return_data = accumulate_pgt_unroll_drop_data(drop) + return_data.update(data) + return return_data + + +def accumulate_pg_drop_data(drop: dict): + """ + Accumulate relevant reproducibility fields for a single drop at the physical graph level. + :param drop: + :return: A dictionary containing accumulated reproducibility data for a given drop. + """ + level = rflag_caster(drop["reprodata"]["rmode"]) + if not rmode_supported(level): + logger.warning( + "Requested reproducibility mode %s not yet implemented", str(level) + ) + level = REPRO_DEFAULT + drop["reprodata"]["rmode"] = str(level.value) + if level == ReproducibilityFlags.ALL: + data = {} + for rmode in ALL_RMODES: + pg_fields = pg_block_fields(rmode) + data[rmode.name] = extract_fields(drop, pg_fields) + else: + pg_fields = pg_block_fields(level) + data = extract_fields(drop, pg_fields) + return data + + +def init_lgt_repro_drop_data(drop: dict, level: ReproducibilityFlags): + """ + Creates and appends per-drop reproducibility information at the logical template stage. + :param drop: + :param level: + :return: The same drop with appended reproducibility information. 
+ """ + # Catch pre-set per-drop rmode + if "reprodata" in drop.keys(): + if "rmode" in drop["reprodata"].keys(): + level = rflag_caster(drop["reprodata"]["rmode"]) + else: + drop["reprodata"] = {"rmode": str(level.value)} + if level == ReproducibilityFlags.ALL: + for rmode in ALL_RMODES: + data = accumulate_lgt_drop_data(drop, rmode) + merkletree = MerkleTree(data.items(), common_hash) + data["merkleroot"] = merkletree.merkle_root + drop["reprodata"][rmode.name] = { + "rmode": str(rmode.value), + "lgt_data": data, + "lg_parenthashes": {}, + } + else: + data = accumulate_lgt_drop_data(drop, level) + merkletree = MerkleTree(data.items(), common_hash) + data["merkleroot"] = merkletree.merkle_root + drop["reprodata"] = { + "rmode": str(level.value), + "lgt_data": data, + "lg_parenthashes": {}, + } + return drop + + +def init_lg_repro_drop_data(drop: dict): + """ + Creates and appends per-drop reproducibility information at the logical graph stage. + :param drop: + :return: The same drop with appended reproducibility information + """ + level = rflag_caster(drop["reprodata"]["rmode"]) + if not rmode_supported(level): + logger.warning( + "Requested reproducibility mode %s not yet implemented", str(level) + ) + level = REPRO_DEFAULT + drop["reprodata"]["rmode"] = str(level.value) + if level == ReproducibilityFlags.ALL: + for rmode in ALL_RMODES: + data = accumulate_lg_drop_data(drop, rmode) + merkletree = MerkleTree(data.items(), common_hash) + data["merkleroot"] = merkletree.merkle_root + drop["reprodata"][rmode.name]["lg_data"] = data + drop["reprodata"][rmode.name]["lg_parenthashes"] = {} + else: + data = accumulate_lg_drop_data(drop, level) + merkletree = MerkleTree(data.items(), common_hash) + data["merkleroot"] = merkletree.merkle_root + drop["reprodata"]["lg_data"] = data + drop["reprodata"]["lg_parenthashes"] = {} + return drop + + +def append_pgt_repro_data(drop: dict, data: dict): + """ + Adds provided data dictionary to drop description at PGT level. + :param drop: The drop description + :param data: The data to be added - arbitrary dictionary + :return: + """ + level = rflag_caster(drop["reprodata"]["rmode"]) + if level == ReproducibilityFlags.ALL: + for rmode in ALL_RMODES: + merkletree = MerkleTree(data[rmode.name].items(), common_hash) + data[rmode.name]["merkleroot"] = merkletree.merkle_root + drop["reprodata"][rmode.name]["pgt_parenthashes"] = {} + drop["reprodata"][rmode.name]["pgt_data"] = data[rmode.name] + else: + merkletree = MerkleTree(data.items(), common_hash) + data["merkleroot"] = merkletree.merkle_root + # Separated so chaining can happen on independent elements (or both later) + drop["reprodata"]["pgt_parenthashes"] = {} + drop["reprodata"]["pgt_data"] = data + return drop + + +def init_pgt_unroll_repro_drop_data(drop: dict): + """ + Creates and appends per-drop reproducibility information + at the physical graph template stage when unrolling. + :param drop: The drop description + :return: The same drop with appended reproducibility information + """ + data = accumulate_pgt_unroll_drop_data(drop) + append_pgt_repro_data(drop, data) + return drop + + +def init_pgt_partition_repro_drop_data(drop: dict): + """ + Creates and appends per-drop reproducibility information + at the physical graph template stage when partitioning. 
+ :param drop: The drop description + :return: The same drop with appended reproducibility information + """ + data = accumulate_pgt_partition_drop_data(drop) + append_pgt_repro_data(drop, data) + return drop + + +def init_pg_repro_drop_data(drop: dict): + """ + Creates and appends per-drop reproducibility information at the physical graph stage. + :param drop: The drop description + :return: The same drop with appended reproducibility information + """ + level = rflag_caster(drop["reprodata"]["rmode"]) + data = accumulate_pg_drop_data(drop) + if level == ReproducibilityFlags.ALL: + for rmode in ALL_RMODES: + merkletree = MerkleTree(data[rmode.name].items(), common_hash) + data[rmode.name]["merkleroot"] = merkletree.merkle_root + drop["reprodata"][rmode.name]["pg_parenthashes"] = {} + drop["reprodata"][rmode.name]["pg_data"] = data[rmode.name] + else: + merkletree = MerkleTree(data.items(), common_hash) + data["merkleroot"] = merkletree.merkle_root + # Separated so chaining can happen on independent elements (or both later) + drop["reprodata"]["pg_parenthashes"] = {} + drop["reprodata"]["pg_data"] = data + return drop + + +def init_rg_repro_drop_data(drop: dict): + """ + Creates and appends per-drop reproducibility information at the runtime graph stage. + :param drop: + :return: The same drop with appended reproducibility information + """ + level = rflag_caster(drop["reprodata"]["rmode"]) + if level == ReproducibilityFlags.ALL: + for rmode in ALL_RMODES: + drop["reprodata"][rmode.name]["rg_parenthashes"] = {} + elif level != ReproducibilityFlags.NOTHING: + drop["reprodata"]["rg_parenthashes"] = {} + return drop + + +# ------ Graph-Wide Functionality ------ + + +def accumulate_meta_data(): + """ + WARNING: Relies on naming convention in hashlib. + """ + data = {"repro_protocol": PROTOCOL_VERSION, "HashingAlg": str(HashingAlg)} + return data + + +def build_lg_block_data(drop: dict, rmode=None): + """ + Builds the logical graph reprodata entry for a processed drop description + :param drop: The drop description + :return: + """ + if rmode is None: + block_data = [drop["reprodata"]["lgt_data"]["merkleroot"]] + if "merkleroot" in drop["reprodata"]["lg_data"]: + lg_hash = drop["reprodata"]["lg_data"]["merkleroot"] + block_data.append(lg_hash) + for parenthash in sorted(drop["reprodata"]["lg_parenthashes"].values()): + block_data.append(parenthash) + mtree = MerkleTree(block_data, common_hash) + drop["reprodata"]["lg_blockhash"] = mtree.merkle_root + else: + block_data = [drop["reprodata"][rmode.name]["lgt_data"]["merkleroot"]] + if "merkleroot" in drop["reprodata"][rmode.name]["lg_data"]: + lg_hash = drop["reprodata"][rmode.name]["lg_data"]["merkleroot"] + block_data.append(lg_hash) + for parenthash in sorted( + drop["reprodata"][rmode.name]["lg_parenthashes"].values() + ): + block_data.append(parenthash) + mtree = MerkleTree(block_data, common_hash) + drop["reprodata"][rmode.name]["lg_blockhash"] = mtree.merkle_root + + +def build_pgt_block_data(drop: dict, rmode=None): + """ + Builds the physical graph template reprodata entry for a processed drop description + :param drop: The drop description + :return: + """ + if rmode is None: + block_data = [] + if "pgt_data" in drop["reprodata"]: + if "merkleroot" in drop["reprodata"]["pgt_data"]: + block_data.append(drop["reprodata"]["pgt_data"]["merkleroot"]) + if "lg_blockhash" in drop["reprodata"]: + block_data.append(drop["reprodata"]["lg_blockhash"]) + for parenthash in sorted(drop["reprodata"]["pgt_parenthashes"].values()): + 
block_data.append(parenthash) + mtree = MerkleTree(block_data, common_hash) + drop["reprodata"]["pgt_blockhash"] = mtree.merkle_root + else: + block_data = [] + if "pgt_data" in drop["reprodata"][rmode.name]: + if "merkleroot" in drop["reprodata"][rmode.name]["pgt_data"]: + block_data.append( + drop["reprodata"][rmode.name]["pgt_data"]["merkleroot"] + ) + if "lg_blockhash" in drop["reprodata"][rmode.name]: + block_data.append(drop["reprodata"][rmode.name]["lg_blockhash"]) + for parenthash in sorted( + drop["reprodata"][rmode.name]["pgt_parenthashes"].values() + ): + block_data.append(parenthash) + mtree = MerkleTree(block_data, common_hash) + drop["reprodata"][rmode.name]["pgt_blockhash"] = mtree.merkle_root + + +def build_pg_block_data(drop: dict, rmode=None): + """ + Builds the physical graph reprodata entry for a processed drop description + :param drop: The drop description + :return: + """ + if rmode is None: + block_data = [ + drop["reprodata"]["pg_data"]["merkleroot"], + drop["reprodata"]["pgt_blockhash"], + drop["reprodata"]["lg_blockhash"], + ] + for parenthash in sorted(drop["reprodata"]["pg_parenthashes"].values()): + block_data.append(parenthash) + mtree = MerkleTree(block_data, common_hash) + drop["reprodata"]["pg_blockhash"] = mtree.merkle_root + else: + block_data = [ + drop["reprodata"][rmode.name]["pg_data"]["merkleroot"], + drop["reprodata"][rmode.name]["pgt_blockhash"], + drop["reprodata"][rmode.name]["lg_blockhash"], + ] + for parenthash in sorted( + drop["reprodata"][rmode.name]["pg_parenthashes"].values() + ): + block_data.append(parenthash) + mtree = MerkleTree(block_data, common_hash) + drop["reprodata"][rmode.name]["pg_blockhash"] = mtree.merkle_root + + +def build_rg_block_data(drop: dict, rmode=None): + """ + Builds the runtime graph reprodata entry for a processed drop description. + :param drop: The drop description + :return: + """ + if rmode is None: + block_data = [ + drop["reprodata"]["rg_data"]["merkleroot"], + drop["reprodata"]["pg_blockhash"], + drop["reprodata"]["pgt_blockhash"], + drop["reprodata"]["lg_blockhash"], + ] + for parenthash in sorted(drop["reprodata"]["rg_parenthashes"].values()): + block_data.append(parenthash) + mtree = MerkleTree(block_data, common_hash) + drop["reprodata"]["rg_blockhash"] = mtree.merkle_root + else: + import json + + block_data = [ + drop["reprodata"][rmode.name].get("rg_data", {"merkleroot": b""})[ + "merkleroot" + ], + drop["reprodata"][rmode.name]["pg_blockhash"], + drop["reprodata"][rmode.name]["pgt_blockhash"], + drop["reprodata"][rmode.name]["lg_blockhash"], + ] + for parenthash in sorted( + drop["reprodata"][rmode.name]["rg_parenthashes"].values() + ): + block_data.append(parenthash) + mtree = MerkleTree(block_data, common_hash) + drop["reprodata"][rmode.name]["rg_blockhash"] = mtree.merkle_root + + +def lg_build_blockdag(logical_graph: dict, level=None): + """ + Uses Kahn's algorithm to topologically sort a logical graph dictionary. + Exploits that a DAG contains at least one node with in-degree 0. + Processes drops in-order. + O(V + E) time complexity. + :param logical_graph: The logical graph description (template or actual) + :return: leaves set and the list of visited components (in order). 
+ """ + dropset = {} # Also contains in-degree information + neighbourset = {} + roots = [] + leaves = [] + visited = [] + queue = collections.deque() + # TODO: Deal with MKN/Scatter Input drops + for drop in logical_graph.get("nodeDataArray", []): + did = int(drop["key"]) + dropset[did] = [drop, 0, 0] + neighbourset[did] = [] + + for edge in logical_graph.get("linkDataArray", []): + src = int(edge["from"]) + dest = int(edge["to"]) + dropset[dest][1] += 1 + dropset[src][2] += 1 + neighbourset[src].append(dest) + + # did == 'drop id' + for did, drop in dropset.items(): + if drop[1] == 0: + queue.append(did) + roots.append(did) + if not neighbourset[did]: # Leaf node + leaves.append(did) + + while queue: + did = queue.pop() + # Process + if "reprodata" not in dropset[did][0]: + continue + build_lg_block_data(dropset[did][0], level) + visited.append(did) + rmode = rflag_caster(dropset[did][0]["reprodata"]["rmode"]) + if rmode == ReproducibilityFlags.ALL: + rmode = level # Only building one layer at a time. + for neighbour in neighbourset[did]: + dropset[neighbour][1] -= 1 + parenthash = {} + if rmode != ReproducibilityFlags.NOTHING: + if rmode == ReproducibilityFlags.REPRODUCE: + if ( + dropset[did][0]["category"] in STORAGE_TYPES + and (dropset[did][1] == 0 or dropset[did][2] == 0) + and (did in roots or did in leaves) + ): + # Add my new hash to the parent-hash list + if did not in parenthash: + if level is None: + parenthash[did] = dropset[did][0]["reprodata"][ + "lg_blockhash" + ] + else: + parenthash[did] = dropset[did][0]["reprodata"][ + level.name + ]["lg_blockhash"] + # parenthash.append(dropset[did][0]['reprodata']['lg_blockhash']) + else: + # Add my parenthashes to the parent-hash list + if level is None: + parenthash.update( + dropset[did][0]["reprodata"]["lg_parenthashes"] + ) + else: + parenthash.update( + dropset[did][0]["reprodata"][level.name][ + "lg_parenthashes" + ] + ) + # parenthash.extend(dropset[did][0]['reprodata']['lg_parenthashes']) + if rmode != ReproducibilityFlags.REPRODUCE: # Non-compressing behaviour + if level is None: + parenthash[did] = dropset[did][0]["reprodata"]["lg_blockhash"] + else: + parenthash[did] = dropset[did][0]["reprodata"][level.name][ + "lg_blockhash" + ] + # parenthash.append(dropset[did][0]['reprodata']['lg_blockhash']) + # Add our new hash to the parent-hash list + # We deal with duplicates later + if level is None: + dropset[neighbour][0]["reprodata"]["lg_parenthashes"].update( + parenthash + ) + else: + dropset[neighbour][0]["reprodata"][level.name][ + "lg_parenthashes" + ].update(parenthash) + if dropset[neighbour][1] == 0: # Add drops at the DAG-frontier + queue.append(neighbour) + + if len(visited) != len(dropset): + logger.warning("Untraversed graph") + + logger.info("BlockDAG Generated at LG/T level") + + for i, leaf in enumerate(leaves): + if level is None: + # WARNING: Remove once dealt with MKN Nodes + if "reprodata" in dropset[leaf][0]: + leaves[i] = dropset[leaf][0]["reprodata"].get("lg_blockhash", "") + else: + if "reprodata" in dropset[leaf][0]: + leaves[i] = dropset[leaf][0]["reprodata"][level.name].get( + "lg_blockhash", "" + ) + return leaves, visited + + +def build_blockdag(drops: list, abstraction: str = "pgt", level=None): + """ + Uses Kahn's algorithm to topologically sort a logical graph dictionary. + Exploits that a DAG contains at least one node with in-degree 0. + Processes drops in-order. + O(V + E) time complexity. 
+ :param drops: The list of drops + :param abstraction: The level of graph abstraction 'pgt' || 'pg' + :return: + """ + blockstr = "pgt" + parentstr = "pgt_parenthashes" + block_builder = build_pgt_block_data + if abstraction == "pg": + blockstr = "pg" + parentstr = "pg_parenthashes" + block_builder = build_pg_block_data + if abstraction == "rg": + blockstr = "rg" + parentstr = "rg_parenthashes" + block_builder = build_rg_block_data + + dropset = {} + neighbourset = {} + roots = [] + leaves = [] + visited = [] + queue = collections.deque() + for drop in drops: + did = drop["oid"] + dropset[did] = [drop, 0, 0] + for drop in drops: + did = drop["oid"] + neighbourset[did] = [] + if "outputs" in drop: + # Assumes the model where all edges are defined from source to destination. + # This may not always be the case. + for dest in drop["outputs"]: + if isinstance(dest, dict): + dest = next(iter(dest)) + dropset[dest][1] += 1 + dropset[did][2] += 1 + neighbourset[did].append(dest) + if ( + "consumers" in drop + ): # There may be some bizarre scenario when a drop has both + for dest in drop["consumers"]: + if isinstance(dest, dict): + dest = next(iter(dest)) + dropset[dest][1] += 1 + dropset[did][2] += 1 + neighbourset[did].append( + dest + ) # TODO: Appending may not be correct behaviour + for did, drop_val in dropset.items(): + if drop_val[1] == 0: + queue.append(did) + roots.append(did) + if not neighbourset[did]: # Leaf node + leaves.append(did) + while queue: + did = queue.pop() + block_builder(dropset[did][0], level) + visited.append(did) + rmode = rflag_caster(dropset[did][0]["reprodata"]["rmode"]) + if rmode == ReproducibilityFlags.ALL: + rmode = level + for neighbour in neighbourset[did]: + dropset[neighbour][1] -= 1 + parenthash = {} + if rmode != ReproducibilityFlags.NOTHING: + if rmode == ReproducibilityFlags.REPRODUCE: + # WARNING: Hack! 
may break later, proceed with caution + if level is None: + category = dropset[did][0]["reprodata"]["lgt_data"]["category"] + else: + category = dropset[did][0]["reprodata"][rmode.name]["lgt_data"][ + "category" + ] + if ( + category in STORAGE_TYPES + and (dropset[did][1] == 0 or dropset[did][2] == 0) + and (did in roots or did in leaves) + ): + # Add my new hash to the parent-hash list + if did not in parenthash: + if level is None: + parenthash[did] = dropset[did][0]["reprodata"][ + blockstr + "_blockhash" + ] + else: + parenthash[did] = dropset[did][0]["reprodata"][ + level.name + ][blockstr + "_blockhash"] + # parenthash.append(dropset[did][0]['reprodata'] \ + # [blockstr + "_blockhash"]) + else: + # Add my parenthashes to the parent-hash list + if level is None: + parenthash.update(dropset[did][0]["reprodata"][parentstr]) + else: + parenthash.update( + dropset[did][0]["reprodata"][level.name][parentstr] + ) + if rmode != ReproducibilityFlags.REPRODUCE: + if level is None: + parenthash[did] = dropset[did][0]["reprodata"][ + blockstr + "_blockhash" + ] + else: + parenthash[did] = dropset[did][0]["reprodata"][level.name][ + blockstr + "_blockhash" + ] + # Add our new hash to the parent-hash list if on the critical path + if rmode == ReproducibilityFlags.RERUN: + if "iid" in dropset[did][0].keys(): + if ( + dropset[did][0]["iid"] == "0/0" + ): # TODO: This is probably wrong + if level is None: + dropset[neighbour][0]["reprodata"][parentstr].update( + parenthash + ) + else: + dropset[neighbour][0]["reprodata"][level.name][ + parentstr + ].update(parenthash) + else: + if level is None: + dropset[neighbour][0]["reprodata"][parentstr].update( + parenthash + ) + else: + dropset[neighbour][0]["reprodata"][level.name][ + parentstr + ].update(parenthash) + elif rmode != ReproducibilityFlags.RERUN: + if level is None: + dropset[neighbour][0]["reprodata"][parentstr].update(parenthash) + else: + dropset[neighbour][0]["reprodata"][level.name][ + parentstr + ].update(parenthash) + if dropset[neighbour][1] == 0: + queue.append(neighbour) + + if len(visited) != len(dropset): + logger.warning("Not a DAG") + + for i, leaf in enumerate(leaves): + + if level is None: + leaves[i] = dropset[leaf][0]["reprodata"][blockstr + "_blockhash"] + else: + leaves[i] = dropset[leaf][0]["reprodata"][level.name][ + blockstr + "_blockhash" + ] + return leaves, visited + + # logger.info("BlockDAG Generated at" + abstraction + " level") + + +def agglomerate_leaves(leaves: list): + """ + Inserts all hash values in `leaves` into a merkleTree in sorted order (ascending). + Returns the root of this tree + """ + merkletree = MerkleTree(sorted(leaves)) + return merkletree.merkle_root + + +def init_lgt_repro_data(logical_graph_template: dict, rmode: str): + """ + Creates and appends graph-wide reproducibility data at the logical template stage. + Currently, this is basically a stub that adds the requested flag to the graph. + Later, this will contain significantly more information. 
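As a side note on the hashing pattern used by the build_*_block_data helpers and agglomerate_leaves above: each layer's block hash is the Merkle root over the drop's own data hash, the block hashes of the earlier layers, and the sorted parent hashes. The sketch below reproduces that chaining with merklelib's MerkleTree (the dependency added by this patch), but uses the library's default hash rather than the module's common_hash, and all hash strings are invented placeholders.

from merklelib import MerkleTree


def chain_block_hash(data_merkleroot, earlier_blockhashes, parent_hashes):
    """Merkle root over a drop's own data hash, the block hashes of earlier
    graph layers and its (sorted) parent hashes, mirroring build_pg_block_data."""
    block_data = [data_merkleroot]
    block_data.extend(earlier_blockhashes)
    block_data.extend(sorted(parent_hashes))
    return MerkleTree(block_data).merkle_root


# Invented values; in the module above these come from drop["reprodata"].
pg_blockhash = chain_block_hash(
    "aa11",                            # merkleroot of this drop's pg_data
    ["bb22", "cc33"],                  # pgt_blockhash and lg_blockhash
    {"parent-oid": "dd44"}.values(),   # pg_parenthashes
)

# A graph-wide signature is then the Merkle root of the sorted leaf block
# hashes, which is exactly what agglomerate_leaves() does.
signature = MerkleTree(sorted([pg_blockhash])).merkle_root
print(pg_blockhash, signature)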
+    :param logical_graph_template: The logical graph data structure (a JSON object (a dict))
+    :param rmode: One of several values 0-5 defined in constants.py
+    :return: The same lgt object with new information appended
+    """
+    rmode = rflag_caster(rmode)
+    if not rmode_supported(rmode):
+        logger.warning(
+            "Requested reproducibility mode %s not yet implemented", str(rmode)
+        )
+        rmode = REPRO_DEFAULT
+    if rmode == ReproducibilityFlags.NOTHING:
+        return logical_graph_template
+    reprodata = {"rmode": str(rmode.value), "meta_data": accumulate_meta_data()}
+    meta_tree = MerkleTree(reprodata.items(), common_hash)
+    reprodata["merkleroot"] = meta_tree.merkle_root
+    for drop in logical_graph_template.get("nodeDataArray", []):
+        init_lgt_repro_drop_data(drop, rmode)
+    logical_graph_template["reprodata"] = reprodata
+    logger.info("Reproducibility data finished at LGT level")
+    return logical_graph_template
+
+
+def init_lg_repro_data(logical_graph: dict):
+    """
+    Handles adding reproducibility data at the logical graph level.
+    Also builds the logical data blockdag over the entire structure.
+    :param logical_graph: The logical graph data structure (a JSON object (a dict))
+    :return: The same lg object with new information appended
+    """
+    if "reprodata" not in logical_graph:
+        return logical_graph
+    level = rflag_caster(logical_graph["reprodata"]["rmode"])
+    if not rmode_supported(level):
+        logger.warning(
+            "Requested reproducibility mode %s not yet implemented", str(level)
+        )
+        level = REPRO_DEFAULT
+    if level == ReproducibilityFlags.NOTHING:
+        return logical_graph
+    for drop in logical_graph.get("nodeDataArray", []):
+        init_lg_repro_drop_data(drop)
+    if level == ReproducibilityFlags.ALL:
+        for rmode in ALL_RMODES:
+            if rmode.name not in logical_graph["reprodata"]:
+                logical_graph["reprodata"][rmode.name] = {}
+            leaves, _ = lg_build_blockdag(logical_graph, rmode)
+            logical_graph["reprodata"][rmode.name]["signature"] = agglomerate_leaves(
+                leaves
+            )
+    else:
+        leaves, _ = lg_build_blockdag(logical_graph)
+        logical_graph["reprodata"]["signature"] = agglomerate_leaves(leaves)
+    logger.info("Reproducibility data finished at LG level")
+    return logical_graph
+
+
+def init_pgt_unroll_repro_data(physical_graph_template: list):
+    """
+    Handles adding reproducibility data at the physical graph template level.
+ :param physical_graph_template: The physical graph template structure + (a list of drops + reprodata dictionary) + :return: The same pgt object with new information appended + """ + reprodata = physical_graph_template.pop() + if "rmode" not in reprodata: + for drop in physical_graph_template: + if "reprodata" in drop: + drop.pop("reprodata") + physical_graph_template.append(reprodata) + return physical_graph_template + level = rflag_caster(reprodata["rmode"]) + if not rmode_supported(level): + logger.warning( + "Requested reproducibility mode %s not yet implemented", str(level) + ) + level = REPRO_DEFAULT + if level == ReproducibilityFlags.NOTHING: + physical_graph_template.append(reprodata) + for drop in physical_graph_template: + drop.pop("reprodata") + return physical_graph_template + for drop in physical_graph_template: + init_pgt_unroll_repro_drop_data(drop) + if level == ReproducibilityFlags.ALL: + for rmode in ALL_RMODES: + if rmode.name not in reprodata: + reprodata[rmode.name] = {} + leaves, _ = build_blockdag(physical_graph_template, "pgt", rmode) + reprodata[rmode.name]["signature"] = agglomerate_leaves(leaves) + else: + leaves, _ = build_blockdag(physical_graph_template, "pgt") + reprodata["signature"] = agglomerate_leaves(leaves) + physical_graph_template.append(reprodata) + logger.info("Reproducibility data finished at PGT unroll level") + return physical_graph_template + + +def init_pgt_partition_repro_data(physical_graph_template: list): + """ + Handles adding reproducibility data at the physical graph template level + after resource partitioning. + :param physical_graph_template: The physical graph template structure + (a list of drops + reprodata dictionary) + :return: The same pgt object with new information recorded + """ + reprodata = physical_graph_template.pop() + if "rmode" not in reprodata: + physical_graph_template.append(reprodata) + return physical_graph_template + level = rflag_caster(reprodata["rmode"]) + if not rmode_supported(level): + logger.warning( + "Requested reproducibility mode %s not yet implemented", str(level) + ) + level = REPRO_DEFAULT + if level == ReproducibilityFlags.NOTHING: + physical_graph_template.append(reprodata) + return physical_graph_template + for drop in physical_graph_template: + init_pgt_partition_repro_drop_data(drop) + if level == ReproducibilityFlags.ALL: + for rmode in ALL_RMODES: + if rmode.name not in reprodata: + reprodata[rmode.name] = {} + leaves, _ = build_blockdag(physical_graph_template, "pgt", rmode) + reprodata[rmode.name]["signature"] = agglomerate_leaves(leaves) + else: + leaves, _ = build_blockdag(physical_graph_template, "pgt") + reprodata["signature"] = agglomerate_leaves(leaves) + physical_graph_template.append(reprodata) + logger.info("Reproducibility data finished at PGT partition level") + return physical_graph_template + + +def init_pg_repro_data(physical_graph: list): + """ + Handles adding reproducibility data at the physical graph template level. 
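For orientation, the list-plus-trailer layout the init_pgt_* functions above operate on can be written out explicitly. The example below is purely illustrative: the drop entry, hash strings and meta_data content are invented, only the key names mirror those used in the code above.

# A physical graph template as consumed by init_pgt_unroll_repro_data /
# init_pgt_partition_repro_data: a list of drop dictionaries followed by one
# graph-wide reprodata dictionary. All values below are invented placeholders.
example_pgt = [
    {"oid": "drop-0", "reprodata": {"rmode": "1", "pgt_parenthashes": {}}},
    {
        "rmode": "1",                 # str(ReproducibilityFlags.<mode>.value)
        "meta_data": {"note": "output of accumulate_meta_data()"},
        "merkleroot": "f00d",         # Merkle root of the entries above
        "signature": "beef",          # agglomerate_leaves() over the leaf block hashes
    },
]

# With rmode == ReproducibilityFlags.ALL the trailer instead carries one
# sub-dictionary per mode, e.g. {"rmode": "...", "RERUN": {"signature": "..."}, ...}.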
+ :param physical_graph: The logical graph data structure (a list of drops + reprodata dictionary) + :return: The same pg object with new information appended + """ + reprodata = physical_graph.pop() + if "rmode" not in reprodata: + physical_graph.append(reprodata) + return physical_graph + level = rflag_caster(reprodata["rmode"]) + if not rmode_supported(level): + logger.warning( + "Requested reproducibility mode %s not yet implemented", str(level) + ) + level = REPRO_DEFAULT + if level == ReproducibilityFlags.NOTHING: + physical_graph.append(reprodata) + return physical_graph + for drop in physical_graph: + init_pg_repro_drop_data(drop) + if level == ReproducibilityFlags.ALL: + for rmode in ALL_RMODES: + leaves, _ = build_blockdag(physical_graph, "pg", rmode) + reprodata[rmode.name]["signature"] = agglomerate_leaves(leaves) + else: + leaves, _ = build_blockdag(physical_graph, "pg") + reprodata["signature"] = agglomerate_leaves(leaves) + physical_graph.append(reprodata) + logger.info("Reproducibility data finished at PG level") + return physical_graph + + +def init_runtime_repro_data(runtime_graph: dict, reprodata: dict): + """ + Adds reproducibility data at the runtime level to graph-wide values. + :param runtime_graph: + :param reprodata: + :return: + """ + if reprodata is None: + return runtime_graph + level = rflag_caster(reprodata["rmode"]) + if not rmode_supported(level): + # TODO: Logging needs sessionID at this stage + # logger.warning("Requested reproducibility mode %s not yet implemented", str(rmode)) + level = REPRO_DEFAULT + reprodata["rmode"] = str(level.value) + for drop in runtime_graph.values(): + init_rg_repro_drop_data(drop) + if level == ReproducibilityFlags.ALL: + for rmode in ALL_RMODES: + leaves, _ = build_blockdag(list(runtime_graph.values()), "rg", rmode) + reprodata[rmode.name]["signature"] = agglomerate_leaves(leaves) + else: + leaves, _ = build_blockdag(list(runtime_graph.values()), "rg") + reprodata["signature"] = agglomerate_leaves(leaves) + runtime_graph["reprodata"] = reprodata + # logger.info("Reproducibility data finished at runtime level") + return runtime_graph diff --git a/daliuge-common/dlg/common/reproducibility/reproducibility_fields.py b/daliuge-common/dlg/common/reproducibility/reproducibility_fields.py new file mode 100644 index 000000000..3d17d5be1 --- /dev/null +++ b/daliuge-common/dlg/common/reproducibility/reproducibility_fields.py @@ -0,0 +1,244 @@ +# +# ICRAR - International Centre for Radio Astronomy Research +# (c) UWA - The University of Western Australia, 2017 +# Copyright by UWA (in the framework of the ICRAR) +# All rights reserved +# +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, +# MA 02111-1307 USA +# +""" +This module defines the fields each drop takes for each reproducibility standard defined. +Consider this module partially documentation, partially code. 
+Data generated by instanced drops at runtime are defined with that drop's implementation. +""" + +from enum import Enum + +from dlg.common import Categories +from dlg.common.reproducibility.constants import ReproducibilityFlags + + +class FieldOps(Enum): + """ + Defines the operations possible on drop data for provenance collection. + """ + + STORE = 0 + COUNT = 1 + REMOVE_FIRST = 2 # Removes the first char of an assumed string + + +def extract_fields(drop: dict, fields: dict): + """ + Attempts to extract fields with the names in fields from the drop description. + If not found, the key will not be present in the returned dictionary. + """ + data = {} + for key, operation in fields.items(): + if drop.get(key) is not None: + if operation == FieldOps.STORE: + data[key] = drop.get(key) + elif operation == FieldOps.COUNT: + data[key] = len(drop.get(key)) + elif operation == FieldOps.REMOVE_FIRST: + data[key] = drop.get(key)[1:] + return data + + +def lgt_block_fields(rmode: ReproducibilityFlags): + """ + Collects dict of fields and operations for all drop types at the lgt layer for + the supplied reproducibility standard. + :param rmode: The reproducibility level in question + :return: Dictionary of pairs + """ + if rmode == ReproducibilityFlags.NOTHING: + return {} + data = { + "categoryType": FieldOps.STORE, + "category": FieldOps.STORE, + "inputPorts": FieldOps.COUNT, + "outputPorts": FieldOps.COUNT, + "inputLocalPorts": FieldOps.COUNT, + "outputLocalPorts": FieldOps.COUNT, # MKN Nodes + "streaming": FieldOps.STORE, + } + if rmode == ReproducibilityFlags.REPRODUCE: + del data["inputPorts"] + del data["outputPorts"] + del data["inputLocalPorts"] + del data["outputLocalPorts"] + del data["streaming"] + return data + + +def lg_block_fields( + category: Categories, category_type: str, rmode: ReproducibilityFlags +): + """ + Collects dict of fields and operations for all drop types at the lg layer for + the supplied reproducibility standard. 
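Since extract_fields above is pure dictionary manipulation, a short usage example may help. The drop description below is invented; the import path is the new module introduced in this diff.

from dlg.common.reproducibility.reproducibility_fields import FieldOps, extract_fields

# An invented drop description; extract_fields keeps only the requested keys.
drop = {"category": "File", "inputPorts": ["in_a", "in_b"], "node": "#3"}
spec = {
    "category": FieldOps.STORE,      # copy the value as-is
    "inputPorts": FieldOps.COUNT,    # record only how many entries there are
    "node": FieldOps.REMOVE_FIRST,   # strip the first character of the string
    "missing": FieldOps.STORE,       # absent keys are skipped silently
}
print(extract_fields(drop, spec))
# -> {'category': 'File', 'inputPorts': 2, 'node': '3'}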
+ :param category: The broad type of drop + :param category_type: The specific type of drop + :param rmode: The reproducibility level in question + :return: Dictionary of pairs + """ + data = {} + if rmode in ( + ReproducibilityFlags.NOTHING, + ReproducibilityFlags.RERUN, + ReproducibilityFlags.REPRODUCE, + ReproducibilityFlags.REPLICATE_SCI, + ): + return data + # Drop category considerations + if category == "Application": + data["execution_time"] = FieldOps.STORE + data["num_cpus"] = FieldOps.STORE + elif category == "Group": + data["inputApplicationName"] = FieldOps.STORE + data["inputApplicationType"] = FieldOps.STORE + elif category == Categories.DATA: # An anomaly, I know + data["data_volume"] = FieldOps.STORE + + # Drop type considerations + if category_type == Categories.START: + pass + elif category_type == Categories.END: + pass + elif category_type == Categories.MEMORY: + pass + elif category_type == Categories.SHMEM: + pass + elif category_type == Categories.FILE: + data["check_filepath_exists"] = FieldOps.STORE + if rmode in ( + ReproducibilityFlags.RECOMPUTE, + ReproducibilityFlags.REPLICATE_COMP, + ): + data["filepath"] = FieldOps.STORE + data["dirname"] = FieldOps.STORE + elif category_type == Categories.NULL: + pass + elif category_type == Categories.JSON: + pass + elif category_type == Categories.NGAS: + pass + elif category_type == Categories.S3: + pass + elif category_type == Categories.PLASMA: + data["plasma_path"] = FieldOps.STORE + data["object_id"] = FieldOps.STORE + elif category_type == Categories.PLASMAFLIGHT: + data["plasma_path"] = FieldOps.STORE + data["object_id"] = FieldOps.STORE + data["flight_path"] = FieldOps.STORE + elif category_type == Categories.PARSET: + pass + elif category_type == Categories.ENVIRONMENTVARS: + pass + elif category_type == Categories.MKN: + data["m"] = FieldOps.STORE + data["k"] = FieldOps.STORE + data["n"] = FieldOps.STORE + elif category_type == Categories.SCATTER: + data["num_of_copies"] = FieldOps.STORE + data["scatter_axis"] = FieldOps.STORE + elif category_type == Categories.GATHER: + data["num_of_inputs"] = FieldOps.STORE + data["gather_axis"] = FieldOps.STORE + elif category_type == Categories.LOOP: + data["num_of_iter"] = FieldOps.STORE + elif category_type == Categories.GROUP_BY: + data["group_key"] = FieldOps.STORE + data["group_axis"] = FieldOps.STORE + elif category_type == Categories.VARIABLES: + pass + elif category_type == Categories.BRANCH: + data["appclass"] = FieldOps.STORE + elif category_type == Categories.PYTHON_APP: + data["appclass"] = FieldOps.STORE + elif category_type == Categories.COMPONENT: + data["appclass"] = FieldOps.STORE + elif category_type == Categories.BASH_SHELL_APP: + data["Arg01"] = FieldOps.STORE + elif category_type == Categories.MPI: + data["num_of_procs"] = FieldOps.STORE + elif category_type == Categories.DOCKER: + data["image"] = FieldOps.STORE + data["command"] = FieldOps.STORE + data["user"] = FieldOps.STORE + data["ensureUserAndSwitch"] = FieldOps.STORE + data["removeContainer"] = FieldOps.STORE + data["additionalBindings"] = FieldOps.STORE + elif category_type == Categories.DYNLIB_APP: + data["libpath"] = FieldOps.STORE + elif category_type == Categories.DYNLIB_PROC_APP: + data["libpath"] = FieldOps.STORE + return data + + +def pgt_unroll_block_fields(category_type, rmode: ReproducibilityFlags): + """ + Collects dict of fields and operations for all drop types at the pgt unroll layer for + the supplied reproducibility standard. 
+ :param category_type: The specific type of drop + :param rmode: The reproducibility level in question + :return: Dictionary of pairs + """ + data = {} + if rmode == ReproducibilityFlags.NOTHING: + return data + if rmode != ReproducibilityFlags.NOTHING: + data["type"] = FieldOps.STORE + if rmode != ReproducibilityFlags.REPRODUCE: + if category_type != "plain": + data["dt"] = FieldOps.STORE + if category_type == "plain": + data["storage"] = FieldOps.STORE + if rmode in (ReproducibilityFlags.RECOMPUTE, ReproducibilityFlags.REPLICATE_COMP): + data["rank"] = FieldOps.STORE + + return data + + +def pgt_partition_block_fields(rmode: ReproducibilityFlags): + """ + Collects dict of fields and operations for all drop types at the pgt partition layer for + the supplied reproducibility standard. + :param rmode: The reproducibility level in question + :return: Dictionary of pairs + """ + data = {} + if rmode in (ReproducibilityFlags.RECOMPUTE, ReproducibilityFlags.REPLICATE_COMP): + data["node"] = FieldOps.REMOVE_FIRST + data["island"] = FieldOps.REMOVE_FIRST + return data + + +def pg_block_fields(rmode: ReproducibilityFlags): + """ + Collects dict of fields and operations for all drop types at the pg layer for + the supplied reproducibility standard. + :param rmode: The reproducibility level in question + :return: Dictionary of pairs + """ + # These two happen to have the same data. + data = {} + if rmode in (ReproducibilityFlags.RECOMPUTE, ReproducibilityFlags.REPLICATE_COMP): + data["node"] = FieldOps.STORE + data["island"] = FieldOps.STORE + return data diff --git a/daliuge-common/docker/Dockerfile b/daliuge-common/docker/Dockerfile index d309e5ab5..8750cab18 100644 --- a/daliuge-common/docker/Dockerfile +++ b/daliuge-common/docker/Dockerfile @@ -7,7 +7,9 @@ FROM ubuntu:20.04 ARG BUILD_ID LABEL stage=builder LABEL build=$BUILD_ID -RUN apt-get update && apt-get install -y gcc python3 python3.8-venv python3-pip python3-distutils libmetis-dev curl && apt-get clean +RUN apt-get update && \ + apt-get install -y gcc python3 python3.8-venv python3-pip python3-distutils python3-appdirs libmetis-dev curl && \ + apt-get clean COPY / /daliuge diff --git a/daliuge-common/docker/Dockerfile.dev b/daliuge-common/docker/Dockerfile.dev index 18ded1617..eba084089 100644 --- a/daliuge-common/docker/Dockerfile.dev +++ b/daliuge-common/docker/Dockerfile.dev @@ -8,13 +8,14 @@ ARG BUILD_ID LABEL stage=builder LABEL build=$BUILD_ID RUN apt-get update && \ - apt-get install -y gcc python3 python3.8-venv python3-pip python3-distutils libmetis-dev curl git sudo && \ + apt-get install -y gcc python3 python3.8-venv python3-pip python3-distutils python3-appdirs libmetis-dev curl git sudo && \ apt-get clean COPY / /daliuge RUN cd / && python3 -m venv dlg && cd /daliuge && \ . /dlg/bin/activate && \ + pip install --upgrade pip && \ pip install wheel numpy && \ pip install . 
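Stepping back to the reproducibility_fields module that precedes the Dockerfile changes above: the per-layer *_block_fields helpers are intended to be combined with extract_fields. A small sketch follows; the drop values are invented, while the module paths and function names come from this diff.

from dlg.common.reproducibility.constants import ReproducibilityFlags
from dlg.common.reproducibility.reproducibility_fields import (
    extract_fields,
    pg_block_fields,
)

# Invented physical-graph drop; only placement matters at the pg layer.
drop = {"oid": "drop-0", "node": "192.168.0.10", "island": "192.168.0.1"}

# RECOMPUTE keeps the node and island placement ...
print(extract_fields(drop, pg_block_fields(ReproducibilityFlags.RECOMPUTE)))
# -> {'node': '192.168.0.10', 'island': '192.168.0.1'}

# ... whereas RERUN records nothing at this layer.
print(extract_fields(drop, pg_block_fields(ReproducibilityFlags.RERUN)))
# -> {}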
diff --git a/daliuge-common/docker/Dockerfile.cuda b/daliuge-common/docker/Dockerfile.devcuda similarity index 69% rename from daliuge-common/docker/Dockerfile.cuda rename to daliuge-common/docker/Dockerfile.devcuda index cd93affac..103b6c598 100644 --- a/daliuge-common/docker/Dockerfile.cuda +++ b/daliuge-common/docker/Dockerfile.devcuda @@ -7,23 +7,18 @@ FROM ubuntu:20.04 ARG BUILD_ID LABEL stage=builder LABEL build=$BUILD_ID -RUN apt-get update && apt-get install -y gcc python3 python3.8-venv && apt-get clean +RUN apt-get update && \ + apt-get install -y gcc python3 python3.8-venv python3-pip python3-distutils libmetis-dev curl git sudo && \ + apt-get clean COPY / /daliuge -RUN cd && python3 -m venv dlg && cd /daliuge && \ - . ${HOME}/dlg/bin/activate && \ - pip install numpy && \ - pip install . && \ - apt-get remove -y gcc && \ - apt-get autoremove -y +RUN cd / && python3 -m venv dlg && cd /daliuge && \ + . /dlg/bin/activate && \ + pip install wheel numpy && \ + pip install . - -FROM ubuntu:20.04 -RUN apt-get update && apt-get install -y bash -COPY --from=0 /root/dlg /root/dlg - -RUN apt install -y wget gnupg2 software-properties-common +RUN DEBIAN_FRONTEND=noninteractive apt install -y wget gnupg2 software-properties-common RUN mkdir -p /code && cd /code &&\ wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-ubuntu2004.pin &&\ mv cuda-ubuntu2004.pin /etc/apt/preferences.d/cuda-repository-pin-600 &&\ diff --git a/daliuge-common/setup.py b/daliuge-common/setup.py index aff84a530..43e0eeeb8 100644 --- a/daliuge-common/setup.py +++ b/daliuge-common/setup.py @@ -47,6 +47,8 @@ def do_versioning(): return _globals["write_version_info"](VERSION, VERSION_FILE, RELEASE) +install_requires = ["gputil>=1.4.0", "merklelib>=1.0"] + setup( name="daliuge-common", version=do_versioning(), @@ -61,4 +63,5 @@ def do_versioning(): entry_points={ "console_scripts": ["dlg=dlg.common.tool:run"] }, # One tool to rule them all + install_requires=install_requires, ) diff --git a/daliuge-engine/build_engine.sh b/daliuge-engine/build_engine.sh index e35957ece..00e674eff 100755 --- a/daliuge-engine/build_engine.sh +++ b/daliuge-engine/build_engine.sh @@ -3,9 +3,11 @@ # branch name or with a release tag depending whether this is a development or deployment # version. +export VCS_TAG=`git describe --tags --abbrev=0|sed s/v//` +export DEV_TAG=`git rev-parse --abbrev-ref HEAD | tr '[:upper:]' '[:lower:]'` + case "$1" in "dep") - export VCS_TAG=`git describe --tags --abbrev=0|sed s/v//` echo "Building daliuge-engine version using tag ${VCS_TAG}" echo $VCS_TAG > dlg/manager/web/VERSION cp ../LICENSE dlg/manager/web/. @@ -13,38 +15,40 @@ case "$1" in echo "Build finished!" exit 0 ;; "dev") + C_TAG="master" [[ ! -z $2 ]] && C_TAG=$2 export VERSION=`git describe --tags --abbrev=0|sed s/v//` - export VCS_TAG=`git rev-parse --abbrev-ref HEAD | tr '[:upper:]' '[:lower:]'` - echo "Building daliuge-engine development version using daliuge-common:${VCS_TAG}" + export VCS_TAG=DEV_TAG + echo "Building daliuge-engine development version using daliuge-common:${C_TAG}" echo "$VERSION:$VCS_TAG" > dlg/manager/web/VERSION git rev-parse --verify HEAD >> dlg/manager/web/VERSION cp ../LICENSE dlg/manager/web/. - docker build --build-arg VCS_TAG=${VCS_TAG} --no-cache -t icrar/daliuge-engine:${VCS_TAG} -f docker/Dockerfile.dev . + docker build --build-arg VCS_TAG=${C_TAG} --no-cache -t icrar/daliuge-engine:${DEV_TAG} -f docker/Dockerfile.dev . echo "Build finished!" exit 0;; "devall") [[ ! 
-z $2 ]] && C_TAG=$2 export VERSION=`git describe --tags --abbrev=0|sed s/v//` export VCS_TAG=`git rev-parse --abbrev-ref HEAD | tr '[:upper:]' '[:lower:]'` - echo "Building daliuge-engine development version using daliuge-common:${VCS_TAG}" + echo "Building daliuge-engine development version using daliuge-common:${DEV_TAG}" echo "$VERSION:$VCS_TAG" > dlg/manager/web/VERSION git rev-parse --verify HEAD >> dlg/manager/web/VERSION cp ../LICENSE dlg/manager/web/. - docker build --build-arg VCS_TAG=${VCS_TAG} --no-cache -t icrar/daliuge-engine:${VCS_TAG} -f docker/Dockerfile.devall . + docker build --build-arg VCS_TAG=${VCS_TAG} --no-cache -t icrar/daliuge-engine:${DEV_TAG} -f docker/Dockerfile.devall . echo "Build finished!" exit 0;; "slim") - export VCS_TAG=`git describe --tags --abbrev=0|sed s/v//` - echo "Building daliuge-engine slim version ${VCS_TAG}" - docker build --build-arg VCS_TAG=${VCS_TAG} --no-cache -t icrar/daliuge-engine.big:${VCS_TAG} -f docker/Dockerfile . + C_TAG="master" + [[ ! -z $2 ]] && C_TAG=$2 + echo "Building daliuge-engine slim version ${VCS_TAG} using daliuge-common:${C_TAG}" + docker build --build-arg VCS_TAG=${VCS_TAG} --no-cache -t icrar/daliuge-engine:${DEV_TAG} -f docker/Dockerfile.dev . echo "Build finished! Slimming the image now" echo ">>>>> docker-slim output <<<<<<<<<" docker run -it --rm -v /var/run/docker.sock:/var/run/docker.sock dslim/docker-slim build --include-shell \ - --include-path /usr/local/lib --include-path /usr/local/bin --include-path /usr/lib/python3.8/multiprocessing \ - --include-path /dlg --include-path /daliuge --publish-exposed-ports=true \ - --http-probe-exec start_local_managers.sh --http-probe=true --tag=icrar/daliuge-engine:${VCS_TAG}\ - icrar/daliuge-engine.big:${VCS_TAG} \ + --include-path /etc --include-path /usr/local/lib --include-path /usr/local/bin --include-path /usr/lib/python3.8 \ + --include-path /usr/lib/python3 --include-path /dlg --include-path /daliuge --publish-exposed-ports=true \ + --http-probe-exec start_local_managers.sh --http-probe=true --tag=icrar/daliuge-engine.slim:${DEV_TAG}\ + icrar/daliuge-engine:${DEV_TAG} \ ;; *) echo "Usage: build_engine.sh " diff --git a/daliuge-engine/dlg/__init__.py b/daliuge-engine/dlg/__init__.py index 0cb96f675..ba61d0929 100644 --- a/daliuge-engine/dlg/__init__.py +++ b/daliuge-engine/dlg/__init__.py @@ -22,3 +22,11 @@ # Declaring this as a namespace package __path__ = __import__("pkgutil").extend_path(__path__, __name__) # @ReservedAssignment +# set the version +try: + from dlg.common import version + + __version__ = version.full_version +except: + # This can happen when running from source + __version__ = "unknown" diff --git a/daliuge-engine/dlg/apps/DALIUGE/xml/simple.xml b/daliuge-engine/dlg/apps/DALIUGE/xml/simple.xml new file mode 100644 index 000000000..e69de29bb diff --git a/daliuge-engine/dlg/apps/bash_shell_app.py b/daliuge-engine/dlg/apps/bash_shell_app.py index f0ab1ac6f..ba609dc55 100644 --- a/daliuge-engine/dlg/apps/bash_shell_app.py +++ b/daliuge-engine/dlg/apps/bash_shell_app.py @@ -157,7 +157,8 @@ class BashShellBase(object): Common class for BashShell apps. It simply requires a command to be specified. 
""" - #TODO: use the shlex module for most of the construction of the + + # TODO: use the shlex module for most of the construction of the # command line to get a proper and safe shell syntax command = dlg_string_param("Bash command", None) @@ -170,8 +171,7 @@ def initialize(self, **kwargs): self._cmdLineArgs = self._getArg(kwargs, "command_line_arguments", "") self._applicationArgs = self._getArg(kwargs, "applicationArgs", {}) self._argumentPrefix = self._getArg(kwargs, "argumentPrefix", "--") - self._paramValueSeparator = self._getArg(kwargs, \ - "paramValueSeparator", " ") + self._paramValueSeparator = self._getArg(kwargs, "paramValueSeparator", " ") if not self.command: self.command = self._getArg(kwargs, "command", None) @@ -180,6 +180,8 @@ def initialize(self, **kwargs): self, "No command specified, cannot create BashShellApp" ) + self._recompute_data = {} + def _run_bash(self, inputs, outputs, stdin=None, stdout=subprocess.PIPE): """ Runs the given `cmd`. If any `inputs` and/or `outputs` are given @@ -197,8 +199,9 @@ def _run_bash(self, inputs, outputs, stdin=None, stdout=subprocess.PIPE): session_id = ( self._dlg_session.sessionId if self._dlg_session is not None else "" ) - argumentString = droputils.serialize_applicationArgs(self._applicationArgs, \ - self._argumentPrefix, self._paramValueSeparator) + argumentString = droputils.serialize_applicationArgs( + self._applicationArgs, self._argumentPrefix, self._paramValueSeparator + ) # complete command including all additional parameters and optional redirects cmd = f"{self.command} {argumentString} {self._cmdLineArgs} " if self._outputRedirect: @@ -252,10 +255,13 @@ def _run_bash(self, inputs, outputs, stdin=None, stdout=subprocess.PIPE): if stdout != subprocess.PIPE: pstdout = b"" pcode = process.returncode - end = time.time() logger.info("Finished in %.3f [s] with exit code %d", (end - start), pcode) + logger.info("Finished in %.3f [s] with exit code %d", (end - start), pcode) + self._recompute_data["stdout"] = str(pstdout) + self._recompute_data["stderr"] = str(pstderr) + self._recompute_data["status"] = str(pcode) if pcode == 0 and logger.isEnabledFor(logging.DEBUG): logger.debug( mesage_stdouts("Command finished successfully", pstdout, pstderr) @@ -275,6 +281,10 @@ def cancel(self): except: logger.exception("Error while terminating process %r", self.proc) + def generate_recompute_data(self): + self._recompute_data["command"] = self.command + return self._recompute_data + class StreamingInputBashAppBase(BashShellBase, AppDROP): """ diff --git a/daliuge-engine/dlg/apps/crc.py b/daliuge-engine/dlg/apps/crc.py index baf3a21f4..ac63b9a55 100644 --- a/daliuge-engine/dlg/apps/crc.py +++ b/daliuge-engine/dlg/apps/crc.py @@ -25,6 +25,8 @@ from ..drop import BarrierAppDROP, AppDROP from dlg.ddap_protocol import AppDROPStates + +from ..drop import BarrierAppDROP, AppDROP from ..meta import dlg_component, dlg_batch_input, dlg_batch_output, dlg_streaming_input try: diff --git a/daliuge-engine/dlg/apps/dockerapp.py b/daliuge-engine/dlg/apps/dockerapp.py index 948d36c7d..17a57cf43 100644 --- a/daliuge-engine/dlg/apps/dockerapp.py +++ b/daliuge-engine/dlg/apps/dockerapp.py @@ -74,7 +74,7 @@ def waitForIp(self, timeout=None): ## # @brief Docker -# @details +# @details A component wrapping docker based applications. 
# @par EAGLE_START # @param category Docker # @param tag template @@ -93,7 +93,7 @@ def waitForIp(self, timeout=None): # @param[in] cparam/command_line_arguments Command Line Arguments//String/readwrite/False//False/ # \~English Additional command line arguments to be added to the command line to be executed # @param[in] cparam/paramValueSeparator Param value separator/ /String/readwrite/False//False/ -# \~English Separator character(s) between parameters on the command line +# \~English Separator character(s) between parameters and their respective values on the command line # @param[in] cparam/argumentPrefix Argument prefix/"--"/String/readwrite/False//False/ # \~English Prefix to each keyed argument on the command line # @param[in] cparam/execution_time Execution Time/5/Float/readonly/False//False/ @@ -236,6 +236,7 @@ class DockerApp(BarrierAppDROP): running in a container must quit themselves after successfully performing their task. """ + _container: Optional[Container] = None # signals for stopping this drop must first wait @@ -259,8 +260,7 @@ def initialize(self, **kwargs): self._cmdLineArgs = self._getArg(kwargs, "command_line_arguments", "") self._applicationArgs = self._getArg(kwargs, "applicationArgs", {}) self._argumentPrefix = self._getArg(kwargs, "argumentPrefix", "--") - self._paramValueSeparator = self._getArg(kwargs, \ - "paramValueSeparator", " ") + self._paramValueSeparator = self._getArg(kwargs, "paramValueSeparator", " ") if not self._image: raise InvalidDropException( self, "No docker image specified, cannot create DockerApp" @@ -277,9 +277,7 @@ def initialize(self, **kwargs): self._noBash = False if not self._command or self._command[:2].strip() == "%%": - logger.warning( - "Assume a default command is executed in the container" - ) + logger.warning("Assume a default command is executed in the container") self._command = self._command.strip()[2:].strip() if self._command else "" self._noBash = True # This makes sure that we can retain any command defined in the image, but still be @@ -287,8 +285,9 @@ def initialize(self, **kwargs): # "%%" at the start of the command, else it is interpreted as a normal command. # construct the actual command line from all application parameters - argumentString = droputils.serialize_applicationArgs(self._applicationArgs, \ - self._argumentPrefix, self._paramValueSeparator) + argumentString = droputils.serialize_applicationArgs( + self._applicationArgs, self._argumentPrefix, self._paramValueSeparator + ) # complete command including all additional parameters and optional redirects cmd = f"{self._command} {argumentString} {self._cmdLineArgs} " cmd = cmd.strip() @@ -300,10 +299,12 @@ def initialize(self, **kwargs): # container running as a component. pw = pwd.getpwuid(os.getuid()) - self._user = pw.pw_name # use current user by default + self._user = pw.pw_name # use current user by default self._userid = pw.pw_uid self._groupid = pw.pw_gid - logger.debug(f"User for docker container: {self._user} {self._userid}:{self._groupid}") + logger.debug( + f"User for docker container: {self._user} {self._userid}:{self._groupid}" + ) # By default containers are removed from the filesystem, but people # might want to preserve them. @@ -323,13 +324,17 @@ def initialize(self, **kwargs): # on the host system. 
They are given either as a list or as a # comma-separated string self._additionalBindings = {} - bindings = [f"{utils.getDlgDir()}:{utils.getDlgDir()}", - f"{utils.getDlgDir()}/workspace/settings/passwd:/etc/passwd", - f"{utils.getDlgDir()}/workspace/settings/group:/etc/group" + bindings = [ + f"{utils.getDlgDir()}:{utils.getDlgDir()}", + f"{utils.getDlgDir()}/workspace/settings/passwd:/etc/passwd", + f"{utils.getDlgDir()}/workspace/settings/group:/etc/group", ] additionalBindings = self._getArg(kwargs, "additionalBindings", []) - additionalBindings = additionalBindings.split(",") if isinstance(additionalBindings, str) \ + additionalBindings = ( + additionalBindings.split(",") + if isinstance(additionalBindings, str) else additionalBindings + ) bindings += additionalBindings for binding in bindings: if len(binding) == 0: @@ -371,7 +376,7 @@ def initialize(self, **kwargs): logger.debug("Docker Image inspection: %r", inspection) self.workdir = inspection.get("ContainerConfig", {}).get("WorkingDir", None) # self.workdir = None - self._sessionId = (self._dlg_session.sessionId if self._dlg_session else "") + self._sessionId = self._dlg_session.sessionId if self._dlg_session else "" if not self.workdir: default_workingdir = os.path.join(utils.getDlgWorkDir(), self._sessionId) self.workdir = self._getArg(kwargs, "workingDir", default_workingdir) @@ -381,6 +386,11 @@ def initialize(self, **kwargs): self._containerIp = None self._containerId = None self._waiters = [] + self._recompute_data = { + "image": self._image, + "user": self._user, + "command": self._command, + } @property def containerIp(self): @@ -418,12 +428,14 @@ def run(self): fsInputs = {uid: i for uid, i in iitems if droputils.has_path(i)} fsOutputs = {uid: o for uid, o in oitems if droputils.has_path(o)} dockerInputs = { - # uid: DockerPath(utils.getDlgDir() + i.path) for uid, i in fsInputs.items() - uid: DockerPath(i.path) for uid, i in fsInputs.items() + # uid: DockerPath(utils.getDlgDir() + i.path) for uid, i in fsInputs.items() + uid: DockerPath(i.path) + for uid, i in fsInputs.items() } dockerOutputs = { - # uid: DockerPath(utils.getDlgDir() + o.path) for uid, o in fsOutputs.items() - uid: DockerPath(o.path) for uid, o in fsOutputs.items() + # uid: DockerPath(utils.getDlgDir() + o.path) for uid, o in fsOutputs.items() + uid: DockerPath(o.path) + for uid, o in fsOutputs.items() } dataURLInputs = {uid: i for uid, i in iitems if not droputils.has_path(i)} dataURLOutputs = {uid: o for uid, o in oitems if not droputils.has_path(o)} @@ -442,7 +454,9 @@ def run(self): # directory, maintaining the rest of their original paths. 
# Outputs are bound only up to their dirname (see class doc for details) # Volume bindings are setup for FileDROPs and DirectoryContainers only - binds = [i.path + ":" + dockerInputs[uid].path for uid, i in fsInputs.items()] + binds = [ + i.path + ":" + dockerInputs[uid].path for uid, i in fsInputs.items() + ] binds += [ os.path.dirname(o.path) + ":" + os.path.dirname(dockerOutputs[uid].path) for uid, o in fsOutputs.items() @@ -458,7 +472,7 @@ def run(self): ] binds = list(set(binds)) # make this a unique list else docker complains try: - binds.remove(':') + binds.remove(":") except: pass logger.debug("Volume bindings: %r", binds) @@ -488,27 +502,26 @@ def run(self): # deal with environment variables env = {} - env.update({ - "DLG_UID": self._uid}, - ) + env.update({"DLG_UID": self._uid}) if self._dlg_session: - env.update({"DLG_SESSION_ID":self._dlg_session.sessionId}) + env.update({"DLG_SESSION_ID": self._dlg_session.sessionId}) if self._user is not None: - env.update({ - "USER": self._user, - "DLG_ROOT": utils.getDlgDir() - }) + env.update({"USER": self._user, "DLG_ROOT": utils.getDlgDir()}) if self._env is not None: logger.debug(f"Found environment variable setting: {self._env}") - if self._env.lower() == "all": # pass on all environment variables from host + if ( + self._env.lower() == "all" + ): # pass on all environment variables from host env.update(os.environ) elif self._env[0] in ["{", "["]: try: addEnv = json.loads(self._env) except json.JSONDecodeError: - logger.warning("Ignoring provided environment variables: Format wrong? Check documentation") + logger.warning( + "Ignoring provided environment variables: Format wrong? Check documentation" + ) addEnv = {} - if isinstance(addEnv, dict): # if it is a dict populate directly + if isinstance(addEnv, dict): # if it is a dict populate directly # but replace placeholders first for key in addEnv: value = droputils.replace_path_placeholders( @@ -519,27 +532,34 @@ def run(self): ) addEnv[key] = value env.update(addEnv) - elif isinstance(addEnv, list): # if it is a list populate from host environment + elif isinstance( + addEnv, list + ): # if it is a list populate from host environment for e in addEnv: env.update(os.environ[e]) else: - logger.warning("Ignoring provided environment variables: Format wrong! Check documentation") + logger.warning( + "Ignoring provided environment variables: Format wrong! 
Check documentation" + ) logger.debug(f"Adding environment variables: {env}") - # Wrap everything inside bash if len(cmd) > 0 and not self._noBash: - cmd = '/bin/bash -c "%s"' % (utils.escapeQuotes(cmd, singleQuotes=False)) + cmd = '/bin/bash -c "%s"' % ( + utils.escapeQuotes(cmd, singleQuotes=False) + ) logger.debug("Command after user creation and wrapping is: %s", cmd) else: - logger.debug("executing container with default cmd and wrapped arguments") + logger.debug( + "executing container with default cmd and wrapped arguments" + ) cmd = f"{utils.escapeQuotes(cmd, singleQuotes=False)}" c = DockerApp._get_client() logger.debug(f"Final user for container: {self._user}:{self._userid}") - + # Create container - self._container = c.containers.create( # type: ignore + self._container = c.containers.create( # type: ignore self._image, cmd, volumes=binds, @@ -566,7 +586,6 @@ def run(self): self.container.start() logger.info("Started container %s", cId) - # Figure out the container's IP and save it # Setting self.containerIp will trigger an event being sent to the # registered listeners @@ -609,7 +628,11 @@ def run(self): ) elif self._exitCode != 0: msg = f"Container {cId} didn't finish successfully (exit code {self._exitCode})" - if self._exitCode == 137 or self._exitCode == 139 or self._exitCode == 143: + if ( + self._exitCode == 137 + or self._exitCode == 139 + or self._exitCode == 143 + ): # termination via SIGKILL, SIGSEGV, and SIGTERM is expected for some services logger.warning( f"{msg}, output follows.\n==STDOUT==\n%s==STDERR==\n%s", @@ -680,3 +703,6 @@ def _kwargs_from_env(cls, ssl_version=None, assert_hostname=False): if os.path.exists(config_file_name): return ConfigObj(config_file_name) return {} + + def generate_recompute_data(self): + return self._recompute_data diff --git a/daliuge-engine/dlg/apps/dynlib.py b/daliuge-engine/dlg/apps/dynlib.py index ef03a5407..7f7534f21 100644 --- a/daliuge-engine/dlg/apps/dynlib.py +++ b/daliuge-engine/dlg/apps/dynlib.py @@ -26,13 +26,13 @@ import multiprocessing import queue import threading +import six from .. 
import rpc, utils from ..ddap_protocol import AppDROPStates from ..drop import AppDROP, BarrierAppDROP from ..exceptions import InvalidDropException - logger = logging.getLogger(__name__) _read_cb_type = ctypes.CFUNCTYPE( @@ -93,6 +93,12 @@ class CDlgApp(ctypes.Structure): ("data", ctypes.c_void_p), ] + def pack_python(self): + out = {} + for key, val in self._fields_: + out[key] = repr(getattr(self, key)) + return out + def _to_c_input(i): """ @@ -347,6 +353,13 @@ def addStreamingInput(self, streamingInputDrop, back=True): super(DynlibStreamApp, self).addStreamingInput(streamingInputDrop, back) self._c_app.n_streaming_inputs += 1 + def generate_recompute_data(self): + out = {"status": self.status} + data = self._c_app.pack_python() + if data is not None: + out.update(data) + return out + ## # @brief DynlibApp @@ -380,6 +393,14 @@ def run(self): self._ensure_c_outputs_are_set() run(self.lib, self._c_app, input_closers) + def generate_recompute_data(self): + out = {"status": self.status} + if self._c_app is None: + return out + else: + out.update(self._c_app.pack_python()) + return out + class FinishSubprocess(Exception): pass diff --git a/daliuge-engine/dlg/apps/mpi.py b/daliuge-engine/dlg/apps/mpi.py index 6a82378a6..1f6ed1867 100644 --- a/daliuge-engine/dlg/apps/mpi.py +++ b/daliuge-engine/dlg/apps/mpi.py @@ -29,7 +29,6 @@ from ..drop import BarrierAppDROP from ..exceptions import InvalidDropException - logger = logging.getLogger(__name__) ## @@ -134,6 +133,7 @@ def run(self): any_failed = False for rank, (stdout, stderr, code) in enumerate(children_data): + self._recompute_data[str(rank)] = [code, str(stdout), str(stderr)] if code == 0: continue any_failed = True @@ -149,10 +149,12 @@ def run(self): else: comm_children.barrier() + def generate_recompute_data(self): + return self._recompute_data + # When we are called by the MPIApp def module_as_main(): - # Get the parent communicator before anything else happens # This way we ensure the communicator is valid from mpi4py import MPI diff --git a/daliuge-engine/dlg/apps/plasmaflight.py b/daliuge-engine/dlg/apps/plasmaflight.py index 0eec56c3a..a566cd1a6 100644 --- a/daliuge-engine/dlg/apps/plasmaflight.py +++ b/daliuge-engine/dlg/apps/plasmaflight.py @@ -35,7 +35,13 @@ class PlasmaFlightClient: """ Client for accessing plasma-backed arrow flight data server. 
""" - def __init__(self, socket: str, scheme: str = "grpc+tcp", connection_args: Optional[dict] = None): + + def __init__( + self, + socket: str, + scheme: str = "grpc+tcp", + connection_args: Optional[dict] = None, + ): """ Args: socket (str): The socket of the local plasma store @@ -91,9 +97,7 @@ def put_raw_buffer(self, data: memoryview, object_id: plasma.ObjectID): self.plasma_client.put_raw_buffer(data, object_id) def get_buffer( - self, - object_id: plasma.ObjectID, - owner: Optional[str] = None + self, object_id: plasma.ObjectID, owner: Optional[str] = None ) -> memoryview: """ Gets the plasma object from the local store if it's available, diff --git a/daliuge-engine/dlg/apps/pyfunc.py b/daliuge-engine/dlg/apps/pyfunc.py index c8f203c51..782f4e07a 100644 --- a/daliuge-engine/dlg/apps/pyfunc.py +++ b/daliuge-engine/dlg/apps/pyfunc.py @@ -31,6 +31,8 @@ from typing import Callable import dill +from io import StringIO +from contextlib import redirect_stdout from dlg import droputils, utils from dlg.drop import BarrierAppDROP @@ -64,33 +66,45 @@ def serialize_func(f): f = getattr(importlib.import_module(".".join(parts[:-1])), parts[-1]) fser = dill.dumps(f) - fdefaults = {} + fdefaults = {"args": [], "kwargs": {}} + adefaults = {"args": [], "kwargs": {}} a = inspect.getfullargspec(f) if a.defaults: - fdefaults = dict( - zip(a.args[-len(a.defaults):], [serialize_data(d) for d in a.defaults]) + fdefaults["kwargs"] = dict( + zip(a.args[-len(a.defaults) :], [serialize_data(d) for d in a.defaults]) ) - logger.debug("Defaults for function %r: %r", f, fdefaults) + adefaults["kwargs"] = dict( + zip(a.args[-len(a.defaults) :], [d for d in a.defaults]) + ) + logger.debug(f"Introspection of function {f}: {a}") + logger.debug("Defaults for function %r: %r", f, adefaults) return fser, fdefaults def import_using_name(app, fname): - # The name has the form pack1.pack2.mod.func + # If only one part check if builtin parts = fname.split(".") if len(parts) < 2: - msg = "%s does not contain a module name" % fname - raise InvalidDropException(app, msg) - - modname, fname = ".".join(parts[:-1]), parts[-1] - try: - mod = importlib.import_module(modname, __name__) - return getattr(mod, fname) - except ImportError as e: - raise InvalidDropException( - app, "Error when loading module %s: %s" % (modname, str(e)) - ) - except AttributeError: - raise InvalidDropException(app, "Module %s has no member %s" % (modname, fname)) + b = globals()['__builtins__'] + logger.debug(f"Builtins: {type(b)}") + logger.debug(f"Function {fname}: {hasattr(b, fname)}") + if fname in b: + return b[fname] + else: + msg = "%s is not builtin and does not contain a module name" % fname + raise InvalidDropException(app, msg) + else: + modname, fname = ".".join(parts[:-1]), parts[-1] + try: + mod = importlib.import_module(modname, __name__) + return getattr(mod, fname) + except ImportError as e: + raise InvalidDropException( + app, "Error when loading module %s: %s" % (modname, str(e)) + ) + except AttributeError: + raise InvalidDropException(app, "Module %s has no member %s" % (modname, fname)) + def import_using_code(code): @@ -107,7 +121,7 @@ def import_using_code(code): # being written to its corresponding output. 
# @par EAGLE_START # @param category PythonApp -# @param tag daliuge +# @param tag template # @param[in] cparam/appclass Application Class/dlg.apps.pyfunc.PyFuncApp/String/readonly/False//False/ # \~English Application class # @param[in] cparam/execution_time Execution Time/5/Float/readonly/False//False/ @@ -117,14 +131,14 @@ def import_using_code(code): # @param[in] cparam/group_start Group start/False/Boolean/readwrite/False//False/ # \~English Is this node the start of a group? # @param[in] cparam/input_error_threshold "Input error rate (%)"/0/Integer/readwrite/False//False/ -# \~English the allowed failure rate of the inputs (in percent), before this component goes to ERROR state and is not executed +# \~English The allowed failure rate of the inputs (in percent), before this component goes to ERROR state and is not executed # @param[in] cparam/n_tries Number of tries/1/Integer/readwrite/False//False/ # \~English Specifies the number of times the 'run' method will be executed before finally giving up # @param[in] aparam/func_name Function Name//String/readwrite/False//False/ -# \~English Python fuction name +# \~English Python function name # @param[in] aparam/func_code Function Code//String/readwrite/False//False/ -# \~English Python fuction code, e.g. 'def fuction_name(args): return args' -# @param[in] aparam/pickle Pickle//Boolean/readwrite/False//False/ +# \~English Python function code, e.g. 'def function_name(args): return args' +# @param[in] aparam/pickle Pickle/false/Boolean/readwrite/False//False/ # \~English Whether the python arguments are pickled. # @param[in] aparam/func_defaults Function Defaults//String/readwrite/False//False/ # \~English Mapping from argname to default value. Should match only the last part of the argnames list. @@ -167,7 +181,7 @@ class PyFuncApp(BarrierAppDROP): ``{"kwargs":{"kw1_name":kw1_value, "kw2_name":kw2_value}, "args":[arg1, arg2]}`` - The positional args will be used in order of appearance. + The positional onlyargs will be used in order of appearance. """ component_meta = dlg_component( @@ -188,38 +202,113 @@ class PyFuncApp(BarrierAppDROP): func_defaults = dlg_dict_param("func_defaults", {}) - f: Callable fdefaults: dict + def _init_func_defaults(self): + """ + Inititalize self.func_defaults dictionary from values provided. + Multiple options exist and some are here for compatibility. + """ + logger.debug(f"Starting evaluation of func_defaults: {self.func_defaults}") + if ( + isinstance(self.func_defaults, dict) + and len(self.func_defaults) > 0 + and list(self.func_defaults.keys()) == ["kwargs", "args"] + ): + # we bring everything back to just kwargs, because positional args are messy + # NOTE: This means that positional ONLY arguments won't work, but those are not used + # too often. 
+ for arg in self.func_defaults["args"]: + self.func_defaults["kwargs"][arg] = arg + self.func_defaults = self.func_defaults["kwargs"] + elif ( + isinstance(self.func_defaults, dict) + and "kwargs" in self.func_defaults + and isinstance(self.func_defaults["kwargs"], dict) + ): + self.func_defaults = self.func_defaults["kwargs"] + # we came all this way, now assume that any resulting dict is correct + if not isinstance(self.func_defaults, dict): + logger.error( + f"Wrong format or type for function defaults for " + + "{self.f.__name__}: {self.func_defaults}, {type(self.func_defaults)}" + ) + raise ValueError + if self.pickle: + # only values are pickled, get them unpickled + for name, value in self.func_defaults.items(): + self.func_defaults[name] = deserialize_data(value) + + # set the function defaults from introspection + if self.arguments: + self.fn_npos = len(self.arguments.args) - self.fn_ndef + self.fn_defaults = { + name: None for name in self.arguments.args[: self.fn_npos] + } + logger.debug(f"initialized fn_defaults with {self.fn_defaults}") + # deal with args and kwargs + kwargs = ( + dict(zip(self.arguments.args[self.fn_npos :], self.arguments.defaults)) + if self.arguments.defaults + else {} + ) + self.fn_defaults.update(kwargs) + logger.debug(f"fn_defaults updated with {kwargs}") + # deal with kwonlyargs + if self.arguments.kwonlydefaults: + kwonlyargs = dict( + zip(self.arguments.kwonlyargs, self.arguments.kwonlydefaults) + ) + self.fn_defaults.update(kwonlyargs) + logger.debug(f"fn_defaults updated with {kwonlyargs}") + + self.fn_posargs = self.arguments.args[ + : self.fn_npos + ] # positional arg names + def initialize(self, **kwargs): + """ + The initialization of a function component is mainly dealing with mapping + inputs and provided applicationArgs to the function arguments. All of this + should be driven by matching names, but currently that is not being done. 
+ """ BarrierAppDROP.initialize(self, **kwargs) self._applicationArgs = self._getArg(kwargs, "applicationArgs", {}) self.func_code = self._getArg(kwargs, "func_code", None) - # check for args in applicationArgs, original still has preference - for kw in [ + # check for function definition arguments in applicationArgs + self.func_def_keywords = [ "func_code", "func_name", "func_arg_mapping", "pickle", "func_defaults" - ]: + ] + for kw in self.func_def_keywords: dum_arg = new_arg = "gIbbERiSH:askldhgol" - if kw in self._applicationArgs: # these are the preferred ones now - if isinstance(self._applicationArgs[kw]["value"], bool): # always transfer booleans - new_arg = self._applicationArgs[kw]['value'] - elif self._applicationArgs[kw]["value"]: # only transfer if there is a value - # we allow python expressions as values, means that strings need to be quoted - new_arg = self._applicationArgs[kw]['value'] + if kw in self._applicationArgs: # these are the preferred ones now + if isinstance( + self._applicationArgs[kw]["value"], bool + ): # always transfer booleans + new_arg = self._applicationArgs.pop(kw) + elif ( + self._applicationArgs[kw]["value"] + or self._applicationArgs[kw]["precious"] + ): + # only transfer if there is a value or precious is True + new_arg = self._applicationArgs.pop(kw) if new_arg != dum_arg: - logger.debug(f"Setting {kw} to {new_arg}") - self.__setattr__(kw, new_arg) - + logger.debug(f"Setting {kw} to {new_arg['value']}") + # we allow python expressions as values, means that strings need to be quoted + self.__setattr__(kw, new_arg["value"]) + self.num_args = len( + self._applicationArgs + ) # number of additional arguments provided if not self.func_name and not self.func_code: raise InvalidDropException( @@ -234,98 +323,218 @@ def initialize(self, **kwargs): self.func_code = base64.b64decode(self.func_code.encode("utf8")) self.f = import_using_code(self.func_code) # make sure defaults are dicts - if isinstance(self.func_defaults, str): + if isinstance(self.func_defaults, str): self.func_defaults = ast.literal_eval(self.func_defaults) - if isinstance(self.func_arg_mapping, str): + if isinstance(self.func_arg_mapping, str): self.func_arg_mapping = ast.literal_eval(self.func_arg_mapping) - - if self.pickle: - self.fdefaults = {name: deserialize_data(d) for name, d in self.func_defaults.items()} - if isinstance(self.func_defaults, dict) and len(self.func_defaults) > 0 and \ - list(self.func_defaults.keys()) == ["kwargs", "args"]: - pass - elif isinstance(self.func_defaults, (dict, str)) and len(self.func_defaults) == 0: - pass - elif isinstance(self.func_defaults, dict): - self.func_defaults = {"kwargs": self.func_defaults, "args":[]} - else: - logger.error(f"Wrong format or type for function defaults for {self.f.__name__}: {self.func_defaults}, {type(self.func_defaults)}") - raise ValueError - - logger.debug(f"Default values for function {self.func_name}: {self.func_defaults}") + self.arguments = inspect.getfullargspec(self.f) + logger.debug(f"Function inspection revealed {self.arguments}") + self.fn_nargs = len(self.arguments.args) + self.fn_ndef = len(self.arguments.defaults) if self.arguments.defaults else 0 + self._init_func_defaults() + logger.info(f"Args summary for '{self.func_name}':") + logger.info(f"Args: {self.arguments.args}") + logger.info(f"Args defaults: {self.arguments.defaults}") + logger.info(f"Args positional: {self.arguments.args[:self.fn_npos]}") + logger.info(f"Args keyword: {self.arguments.args[self.fn_npos:]}") + logger.info(f"Args supplied: 
{self.func_defaults}") + logger.info(f"VarArgs allowed: {self.arguments.varargs}") + logger.info(f"VarKwds allowed: {self.arguments.varkw}") # Mapping between argument name and input drop uids logger.debug(f"Input mapping: {self.func_arg_mapping}") + self._recompute_data = {} def run(self): + """ + Function positional and keyword argument treatment: + + Function arguments can be provided in four different ways: + 1) Through an input port + 2) By specifying ApplicationArgs (one for each argument) + 3) By specifying a func_defaults dictionary in the ComponentParameters + 4) Through defaults at the time of function definition + + The priority follows the list above with input ports overruling the others. + Function arguments in Python can be passed as positional, kw-value, positional + only, kw-value only, and catch-all args and kwargs, which don't provide any + hint about the names of accepted parameters. All of them are now supported. If + positional arguments or kw-value arguments are provided by the user, but are + not explicitely defined in the function signiture AND args and/or kwargs are + allowed then these arguments are passed to the function. For args this is + somewhat risky, since the order is relevant and in this code derived from the + order defined in the graph (same order as defined in the component description). + + Input ports will NOT be used by order (anymore), but by the IdText (name field + in EAGLE) of the port. Since each input port requires an associated data drop, + this provides a unique mapping. This also allows to pass values to any function + argument through a port. + + Function argument values as well as the function code can be provided in + serialised (pickle) form by setting the 'pickle' flag. Note that this flag + is valid for all arguments and the code (if specified) in a global way. + """ # Inputs are un-pickled and treated as the arguments of the function # Their order must be preserved, so we use an OrderedDict if self.pickle: all_contents = lambda x: pickle.loads(droputils.allDropContents(x)) else: - all_contents = lambda x: ast.literal_eval(droputils.allDropContents(x).decode('utf-8')) + all_contents = lambda x: ast.literal_eval( + droputils.allDropContents(x).decode("utf-8") + ) inputs = collections.OrderedDict() for uid, drop in self._inputs.items(): inputs[uid] = all_contents(drop) - - self.funcargs = {"kwargs":{}, "args":[]} + self.funcargs = {} # Keyword arguments are made up of the default values plus the inputs # that match one of the keyword argument names - n_def = len(self.func_defaults) # if defaults dict has not been specified at all we'll go ahead anyway - n_args = (len(self.func_defaults["args"]), len(self.func_defaults["kwargs"])) if n_def else (0,0) - argnames = inspect.getfullargspec(self.f).args - n_args_req = len(argnames) - if n_def and (n_args_req > (sum(n_args))): - logger.warning(f"Function {self.f.__name__} expects {n_args_req} argument defaults") - logger.warning(f"only {sum(n_args)} found!") - logger.warning("Please correct the function default specification") - #raise ValueError + n_args = len(self.func_defaults) + argnames = self.arguments.args + # use explicit mapping of inputs to arguments first + # TODO: Required by dlg_delayed?? Else, we should really not do this. 
kwargs = { name: inputs.pop(uid) for name, uid in self.func_arg_mapping.items() - if name in self.fdefaults or name not in argnames + if name in self.func_defaults or name not in argnames } - self.funcargs["kwargs"] = kwargs - # The rest of the inputs are missing arguments - args = list(inputs.values()) - self.funcargs["args"] = args - - if len(kwargs) + n_args[1] + len(args) < n_args_req: # There are kwargs missing fill with defaults - def_kwargs = self.func_defaults["kwargs"] - for kw in def_kwargs.keys(): - if kw not in kwargs: - kwargs.update({kw: def_kwargs[kw]}) - - - # fill the rest with default args - n_missing = n_args_req - len(kwargs) - len(args) - if n_missing > 0: - logger.warning(f"Expected {n_args_req} inputs for {self.f.__name__} missing {n_missing}") - logger.debug(f"Trying to fill with arg defaults") - for a in range(n_missing): - try: - args.append(self.func_defaults["args"][a]) - except IndexError: - logger.warning("Insufficient number of function defaults?", exc_info=True) - - logger.debug(f"Running {self.func_name} with args={args}, kwargs={kwargs}") - result = self.f(*args, **kwargs) + logger.debug(f"updating funcargs with {kwargs}") + self.funcargs = kwargs + + # Fill arguments with rest of inputs + logger.debug(f"available inputs: {inputs}") + + # if we have named ports use the inputs with + # the correct UIDs + logger.debug(f"Parameters found: {self.parameters}") + posargs = self.arguments.args[:self.fn_npos] + kwargs = {} + self.pargs = [] + pargsDict = {} # Initialize pargs dictionary + if ('inputs' in self.parameters and isinstance(self.parameters['inputs'][0], dict)): + logger.debug(f"Using named ports to identify inputs: "+\ + f"{self.parameters['inputs']}") + for i in range(min(len(inputs),self.fn_nargs +\ + len(self.arguments.kwonlyargs))): + # key for final dict is value in named ports dict + key = list(self.parameters["inputs"][i].values())[0] + # value for final dict is value in inputs dict + value = inputs[list(self.parameters['inputs'][i].keys())[0]] + if key in posargs: + pargsDict.update({key:value}) + else: + kwargs.update({key:value}) + else: + for i in range(min(len(inputs), self.fn_nargs)): + kwargs.update({self.arguments.args[i]: list(inputs.values())[i]}) + + logger.debug(f"updated pos-args with input ports {pargsDict}") + logger.debug(f"updating kw-args with input ports {kwargs}") + self.funcargs.update(kwargs) + + # Try to get values for still missing positional arguments from Application Args + if "applicationArgs" in self.parameters: + appArgs = self.parameters["applicationArgs"] # we'll pop them + _dum = [appArgs.pop(k) for k in self.func_def_keywords if k in appArgs] + for pa in posargs: + if pa not in self.funcargs and pa not in pargsDict: + if pa in appArgs: + arg = appArgs.pop(pa) + value = arg['value'] + ptype = arg['type'] + if ptype in ["Complex", "Json"]: + try: + value = ast.literal_eval(value) + except: + pass + pargsDict.update({ + pa: + value + }) + elif pa != 'self': + logger.warning(f"Required positional argument '{pa}' not found!") + logger.debug(f"updating posargs with {list(kwargs.values())}") + self.pargs.extend(list(pargsDict.values())) + + # Try to get values for still missing kwargs arguments from Application kws + kwargs = {} + kws = self.arguments.args[self.fn_npos :] + for ka in kws: + if ka not in self.funcargs and ka not in pargsDict: + if ka in appArgs: + arg = appArgs.pop(ka) + value = arg['value'] + ptype = arg['type'] + if ptype in ["Complex", "Json"]: + try: + value = ast.literal_eval(value) + except: + pass 
+ kwargs.update({ka: value}) + else: + logger.warning(f"Keyword argument '{ka}' not found!") + logger.debug(f"updating funcargs with {kwargs}") + self.funcargs.update(kwargs) + vparg = [] + vkarg = {} + logger.debug(f"Remaining AppArguments {appArgs}") + for arg in appArgs: + if appArgs[arg]['type'] in ['Json', 'Complex']: + value = ast.literal_eval(appArgs[arg]['value']) + else: + value = appArgs[arg]['value'] + if appArgs[arg]['positional']: + vparg.append(value) + else: + vkarg.update({arg:value}) + + # any remaining application arguments will be used for vargs and vkwargs + if self.arguments.varargs: + self.pargs.extend(vparg) + if self.arguments.varkw: + self.funcargs.update(vkarg) + + # Fill rest with default arguments if there are any more + kwargs = {} + for kw in self.func_defaults.keys(): + value = self.func_defaults[kw] + if kw not in self.funcargs and kw not in pargsDict: + kwargs.update({kw: value}) + logger.debug(f"updating funcargs with {kwargs}") + self.funcargs.update(kwargs) + self._recompute_data["args"] = self.funcargs.copy() + logger.debug(f"Running {self.func_name} with *{self.pargs} **{self.funcargs}") + + # we capture and log whatever is produced on STDOUT + capture = StringIO() + with redirect_stdout(capture): + result = self.f(*self.pargs, **self.funcargs) + logger.info(f"Captured output from function app '{self.func_name}': {capture.getvalue()}") + logger.debug(f"Finished execution of {self.func_name}.") # Depending on how many outputs we have we treat our result # as an iterable or as a single object. Each result is pickled # and written to its corresponding output + self.write_results(result) + + def write_results(self, result): outputs = self.outputs - if len(outputs) == 1: - result = [result] - for r, o in zip(result, outputs): - if self.pickle: - o.write(pickle.dumps(r)) # @UndefinedVariable - else: - o.write(repr(r).encode('utf-8')) + if len(outputs) > 0: + if len(outputs) == 1: + result = [result] + for r, o in zip(result, outputs): + p = pickle.dumps(r) + if self.pickle: + logger.debug(f"Writing pickeled result {type(r)} to {o}") + o.write(pickle.dumps(r)) # @UndefinedVariable + else: + o.write(repr(r).encode("utf-8")) + + def generate_recompute_data(self): + return self._recompute_data diff --git a/daliuge-engine/dlg/apps/scp.py b/daliuge-engine/dlg/apps/scp.py index 772d84604..d894c9a68 100644 --- a/daliuge-engine/dlg/apps/scp.py +++ b/daliuge-engine/dlg/apps/scp.py @@ -91,13 +91,27 @@ class ScpApp(BarrierAppDROP): [ dlg_batch_input( "binary/*", - [NgasDROP, InMemoryDROP, SharedMemoryDROP, NullDROP, RDBMSDrop, ContainerDROP] + [ + NgasDROP, + InMemoryDROP, + SharedMemoryDROP, + NullDROP, + RDBMSDrop, + ContainerDROP, + ], ) ], [ dlg_batch_output( "binary/*", - [NgasDROP, InMemoryDROP, SharedMemoryDROP, NullDROP, RDBMSDrop, ContainerDROP] + [ + NgasDROP, + InMemoryDROP, + SharedMemoryDROP, + NullDROP, + RDBMSDrop, + ContainerDROP, + ], ) ], [dlg_streaming_input("binary/*")], diff --git a/daliuge-engine/dlg/apps/simple.py b/daliuge-engine/dlg/apps/simple.py index 5d4766e25..fbf6ce627 100644 --- a/daliuge-engine/dlg/apps/simple.py +++ b/daliuge-engine/dlg/apps/simple.py @@ -20,13 +20,14 @@ # MA 02111-1307 USA # """Applications used as examples, for testing, or in simple situations""" +import _pickle from numbers import Number import pickle import random from typing import List, Optional import urllib.error import urllib.request - +import logging import time import ast import numpy as np @@ -34,19 +35,23 @@ from dlg import droputils, utils from dlg.drop 
import BarrierAppDROP, BranchAppDrop, ContainerDROP from dlg.meta import ( - dlg_float_param, + dlg_float_param, dlg_string_param, - dlg_bool_param, + dlg_bool_param, dlg_int_param, dlg_list_param, - dlg_component, + dlg_component, dlg_batch_input, - dlg_batch_output, - dlg_streaming_input + dlg_batch_output, + dlg_streaming_input, ) from dlg.exceptions import DaliugeException from dlg.apps.pyfunc import serialize_data, deserialize_data + +logger = logging.getLogger(__name__) + + class NullBarrierApp(BarrierAppDROP): component_meta = dlg_component( "NullBarrierApp", @@ -61,6 +66,25 @@ class NullBarrierApp(BarrierAppDROP): def run(self): pass +## +# @brief PythonApp +# @details A placeholder APP to aid construction of new applications. +# This is mainly useful (and used) when starting a new workflow from scratch. +# @par EAGLE_START +# @param category PythonApp +# @param tag template +# @param[in] cparam/appclass Application Class//String/readonly/False//False/ +# \~English Application class +# @param[in] cparam/num_cpus No. of CPUs/1/Integer/readonly/False//False/ +# \~English Number of cores used +# @param[in] cparam/execution_time Execution Time/5/Float/readonly/False//False/ +# \~English Estimated execution time +# @param[in] cparam/group_start Group start/False/Boolean/readwrite/False//False/ +# \~English Is this node the start of a group? +# @par EAGLE_END +class PythonApp(BarrierAppDROP): + """A placeholder BarrierAppDrop that just aids the generation of the palette component""" + pass ## # @brief SleepApp @@ -69,21 +93,15 @@ def run(self): # without executing real algorithms. Very useful for debugging. # @par EAGLE_START # @param category PythonApp -# @param tag daliuge +# @param tag template # @param[in] aparam/sleepTime Sleep Time/5/Integer/readwrite/False//False/ # \~English The number of seconds to sleep # @param[in] cparam/appclass Application Class/dlg.apps.simple.SleepApp/String/readonly/False//False/ # \~English Application class # @param[in] cparam/execution_time Execution Time/5/Float/readonly/False//False/ # \~English Estimated execution time -# @param[in] cparam/num_cpus No. of CPUs/1/Integer/readonly/False//False/ -# \~English Number of cores used # @param[in] cparam/group_start Group start/False/Boolean/readwrite/False//False/ # \~English Is this node the start of a group? 
-# @param[in] cparam/input_error_threshold "Input error rate (%)"/0/Integer/readwrite/False//False/ -# \~English the allowed failure rate of the inputs (in percent), before this component goes to ERROR state and is not executed -# @param[in] cparam/n_tries Number of tries/1/Integer/readwrite/False//False/ -# \~English Specifies the number of times the 'run' method will be executed before finally giving up # @par EAGLE_END class SleepApp(BarrierAppDROP): """A BarrierAppDrop that sleeps the specified amount of time (0 by default)""" @@ -340,7 +358,7 @@ def __init__(self, oid, uid, **kwargs): self.marray = [] def initialize(self, **kwargs): - super(AverageArraysApp, self).initialize(**kwargs) + super().initialize(**kwargs) def run(self): # At least one output should have been added @@ -371,7 +389,10 @@ def getInputArrays(self): print(f"Input does not contain data!") else: sarray = pickle.loads(sarray) - marray.extend(sarray) + if isinstance(sarray, (list, tuple, np.ndarray)): + marray.extend(list(sarray)) + else: + marray.append(sarray) self.marray = marray def averageArray(self): @@ -400,7 +421,7 @@ def averageArray(self): # \~English Specifies the number of times the 'run' method will be executed before finally giving up # @param[in] cparam/function Function/sum/Select/readwrite/False/sum,prod,min,max,add,multiply,maximum,minimum/False/ # \~English The function used for gathering -# @param[in] cparam/function reduce_axes/None/String/readonly/False//False/ +# @param[in] cparam/reduce_axes "Reduce Axes"/None/String/readonly/False//False/ # \~English The ndarray axes to reduce, None reduces all axes for sum, prod, max, min functions # @param[in] port/array Array/npy/ # \~English Port for the input array(s) @@ -410,9 +431,10 @@ def averageArray(self): class GenericNpyGatherApp(BarrierAppDROP): """ A BarrierAppDrop that reduces then gathers one or more inputs using cummulative operations. - function: string <['sum']|'prod'|'min'|'max'|'add'|'multiply'|'maximum'|'minimum'>. + function: string <'sum'|'prod'|'min'|'max'|'add'|'multiply'|'maximum'|'minimum'>. """ + component_meta = dlg_component( "GenericNpyGatherApp", "Generic Npy Gather App.", @@ -422,29 +444,22 @@ class GenericNpyGatherApp(BarrierAppDROP): ) # reduce and combine operation pair names + # reduce operation reduces the dimensionality of a ndarray + # gather operation combines ndarrays and retains dimensionality functions = { - # reduce and gather e.g. 
output dimension is reduces - - "sum": "add", # sum reduction of inputs along an axis first then reduces across drops - "prod": "multiply", # prod reduction of inputs along an axis first then reduces across drops - "max": "maximum", # max reduction of input along an axis first then reduces across drops - "min": "minimum", # min reduction of input along an axis first then reduces across drops - + # reduce and gather (output dimension is reduced) + "sum": "add", # sum reduction of inputs along an axis first then gathers across drops + "prod": "multiply", # prod reduction of inputs along an axis first then gathers across drops + "max": "maximum", # max reduction of input along an axis first then gathers across drops + "min": "minimum", # min reduction of input along an axis first then gathers across drops # gather only - "add": None, # elementwise addition of inputs, ndarrays must be of same shape - "multiply": None, # elementwise multiplication of inputs, ndarrays must be of same shape - "maximum": None, # elementwise maximums of inputs, ndarrays must be of same shape - "minimum": None # elementwise minimums of inputs, ndarrays must be of same shape - + "add": None, # elementwise addition of inputs, ndarrays must be of same shape + "multiply": None, # elementwise multiplication of inputs, ndarrays must be of same shape + "maximum": None, # elementwise maximums of inputs, ndarrays must be of same shape + "minimum": None, # elementwise minimums of inputs, ndarrays must be of same shape } - function: str = dlg_string_param("function", "sum") - reduce_axes: list = dlg_list_param("reduce_axes", "None") - - def __init__(self, oid, uid, **kwargs): - super().__init__(oid, kwargs) - - def initialize(self, **kwargs): - super().initialize(**kwargs) + function: str = dlg_string_param("function", "sum") # type: ignore + reduce_axes: list = dlg_list_param("reduce_axes", "None") # type: ignore def run(self): if len(self.inputs) < 1: @@ -454,27 +469,38 @@ def run(self): if self.function not in self.functions: raise Exception(f"Function {self.function} not supported by {self}") - result = self.reduce_combine_inputs() if self.functions[self.function] is not None else self.combine_inputs() + result = ( + self.reduce_gather_inputs() + if self.functions[self.function] is not None + else self.gather_inputs() + ) + for o in self.outputs: droputils.save_numpy(o, result) - def reduce_combine_inputs(self): + def reduce_gather_inputs(self): + """reduces then gathers each input drop interpreted as an npy drop""" result: Optional[Number] = None reduce = getattr(np, f"{self.function}") - combine = getattr(np, f"{self.functions[self.function]}") + gather = getattr(np, f"{self.functions[self.function]}") for input in self.inputs: data = droputils.load_numpy(input) - result = reduce(data, axis=self.reduce_axes)\ - if result is None\ - else combine(result, reduce(data, axis=self.reduce_axes)) + # skip gather for the first input + result = ( + reduce(data, axis=self.reduce_axes) + if result is None + else gather(result, reduce(data, axis=self.reduce_axes)) + ) return result - def combine_inputs(self): + def gather_inputs(self): + """gathers each input drop interpreted as an npy drop""" result: Optional[Number] = None - combine = getattr(np, f"{self.functions[self.function]}") + gather = getattr(np, f"{self.function}") for input in self.inputs: data = droputils.load_numpy(input) - result = data if result is None else combine(result, data) + # assign instead of gather for the first input + result = data if result is None else 
gather(result, data) return result @@ -531,9 +557,11 @@ def run(self): elif len(ins) != 1: raise Exception("Only one input expected for %r" % self) else: # the input is expected to be a vector. We'll use the first element - self.greeting = "Hello %s" % str( - pickle.loads(droputils.allDropContents(ins[0]))[0] - ) + try: + phrase = str(pickle.loads(droputils.allDropContents(ins[0]))[0]) + except _pickle.UnpicklingError: + phrase = str(droputils.allDropContents(ins[0]), encoding="utf-8") + self.greeting = f"Hello {phrase}" outs = self.outputs if len(outs) < 1: @@ -717,21 +745,19 @@ class GenericNpyScatterApp(BarrierAppDROP): # automatically populated by scatter node num_of_copies: int = dlg_int_param("num_of_copies", 1) - scatter_axes: List[int] = dlg_string_param("scatter_axes", "[0]") - - def initialize(self, **kwargs): - super(GenericNpyScatterApp, self).initialize(**kwargs) - self.scatter_axes = ast.literal_eval(self.scatter_axes) + scatter_axes: List[int] = dlg_list_param("scatter_axes", "[0]") def run(self): if len(self.inputs) * self.num_of_copies != len(self.outputs): - raise DaliugeException(\ + raise DaliugeException( f"expected {len(self.inputs) * self.num_of_copies} outputs,\ - got {len(self.outputs)}") + got {len(self.outputs)}" + ) if len(self.inputs) != len(self.scatter_axes): - raise DaliugeException(\ + raise DaliugeException( f"expected {len(self.inputs)} axes,\ - got {len(self.scatter_axes)}, {self.scatter_axes}") + got {len(self.scatter_axes)}, {self.scatter_axes}" + ) # split it as many times as we have outputs self.num_of_copies = self.num_of_copies @@ -739,7 +765,9 @@ def run(self): for in_index in range(len(self.inputs)): nObj = droputils.load_numpy(self.inputs[in_index]) try: - result = np.array_split(nObj, self.num_of_copies, axis=self.scatter_axes[in_index]) + result = np.array_split( + nObj, self.num_of_copies, axis=self.scatter_axes[in_index] + ) except IndexError as err: raise err for split_index in range(self.num_of_copies): @@ -797,13 +825,17 @@ class ListAppendThrashingApp(BarrierAppDROP): size: int, number of array elements """ - compontent_meta = dlg_component('ListAppendThrashingApp', 'List Append Thrashing', - [dlg_batch_input('binary/*', [])], - [dlg_batch_output('binary/*', [])], - [dlg_streaming_input('binary/*')]) + + compontent_meta = dlg_component( + "ListAppendThrashingApp", + "List Append Thrashing", + [dlg_batch_input("binary/*", [])], + [dlg_batch_output("binary/*", [])], + [dlg_streaming_input("binary/*")], + ) def initialize(self, **kwargs): - self.size = self._getArg(kwargs, 'size', 100) + self.size = self._getArg(kwargs, "size", 100) self.marray = [] super(ListAppendThrashingApp, self).initialize(**kwargs) @@ -811,8 +843,7 @@ def run(self): # At least one output should have been added outs = self.outputs if len(outs) < 1: - raise Exception( - 'At least one output should have been added to %r' % self) + raise Exception("At least one output should have been added to %r" % self) self.marray = self.generateArray() for o in outs: d = pickle.dumps(self.marray) diff --git a/daliuge-engine/dlg/apps/socket_listener.py b/daliuge-engine/dlg/apps/socket_listener.py index 8c1b3c3ff..d7d22d448 100644 --- a/daliuge-engine/dlg/apps/socket_listener.py +++ b/daliuge-engine/dlg/apps/socket_listener.py @@ -41,7 +41,6 @@ dlg_streaming_input, ) - logger = logging.getLogger(__name__) diff --git a/daliuge-engine/dlg/dask_emulation.py b/daliuge-engine/dlg/dask_emulation.py index a58dd23c0..5913080f0 100644 --- a/daliuge-engine/dlg/dask_emulation.py +++ 
b/daliuge-engine/dlg/dask_emulation.py @@ -36,7 +36,6 @@ from .drop import BarrierAppDROP from .exceptions import InvalidDropException - logger = logging.getLogger(__name__) @@ -139,7 +138,6 @@ def compute(value, **kwargs): class _DelayedDrop(object): - _drop_count = 0 def __init__(self, producer=None): @@ -244,7 +242,6 @@ def __init__(self, *drops): logger.debug("Created %r", self) def _to_physical_graph(self, visited, graph): - output = _DataDrop(producer=self) output._append_to_graph(visited, graph) diff --git a/daliuge-engine/dlg/deploy/common.py b/daliuge-engine/dlg/deploy/common.py index 1a437a830..8b26e40e2 100644 --- a/daliuge-engine/dlg/deploy/common.py +++ b/daliuge-engine/dlg/deploy/common.py @@ -124,6 +124,53 @@ def monitor_sessions( time.sleep(poll_interval) +def monitor_sessions_repro( + session_id=None, + poll_interval=10, + host="127.0.0.1", + port=constants.ISLAND_DEFAULT_REST_PORT, + timeout=60, + status_dump_path=None, +): + """ + Very similar to monitoring execution status of all (or one) session specified by `session_id` + by polling `host`:`port`, and returns when they all have finalized their reproducibility data. + """ + client = _get_client(host, port, timeout, status_dump_path) + if session_id: + while True: + repro_status = client.session_repro_status(session_id) + if repro_status: + return True + time.sleep(poll_interval) + else: + while True: + sessions = client.sessions() + if all(client.session_repro_status(s) for s in sessions): + return {s["sessionId"]: s["repro"] for s in sessions} + time.sleep(poll_interval) + + +def fetch_reproducibility( + session_id=None, + poll_interval=10, + host="127.0.0.1", + port=constants.ISLAND_DEFAULT_REST_PORT, + timeout=60, +): + """ + Fetches the final graph and associated reproducibility information for `session_id`. 
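    Editor's illustration (an assumption, not part of the original patch): a
    caller waiting for a finished session would typically do::

        repro = fetch_reproducibility("my-session",
                                      host="127.0.0.1",
                                      port=constants.ISLAND_DEFAULT_REST_PORT)

    The call keeps polling every `poll_interval` seconds until the manager
    returns the session's reproducibility data.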
+ """ + if session_id is None: + return {} + client = _get_client(host, port, timeout) + while True: + repro_data = client.session_repro_data(session_id) + if repro_data is not None: + return repro_data + time.sleep(poll_interval) + + def submit( pg, host="127.0.0.1", @@ -138,7 +185,7 @@ def submit( """ client = _get_client(host, port, timeout) session_id = session_id or "%f" % (time.time()) - completed_uids = droputils.get_roots(pg) + completed_uids = droputils.get_roots(pg[:-1]) with client: client.create_session(session_id) logger.info("Session %s created", session_id) diff --git a/daliuge-engine/dlg/deploy/configs/__init__.py b/daliuge-engine/dlg/deploy/configs/__init__.py index 1066639c6..58b543619 100644 --- a/daliuge-engine/dlg/deploy/configs/__init__.py +++ b/daliuge-engine/dlg/deploy/configs/__init__.py @@ -39,13 +39,14 @@ class DefaultConfig(object): MODULES = "" VENV = "" + def __init__(self): self._dict = dict() l = self.init_list() self.setpar("acc", l[0]) self.setpar("log_root", l[1]) - self.setpar("modules",l[2].strip()) - self.setpar("venv",l[3].strip()) + self.setpar("modules", l[2].strip()) + self.setpar("venv", l[3].strip()) def init_list(self): pass @@ -56,16 +57,18 @@ def setpar(self, k, v): def getpar(self, k): return self._dict.get(k) + ############################# + class ICRARoodConfig(DefaultConfig): MODULES = """ module load python/3.8.12 """ # The following is more a workaround than a solution # requires the user to have a venv exectly in that place - ACCOUNT = os.environ['USER'] - HOME_DIR = os.environ['HOME'] + ACCOUNT = os.environ["USER"] + HOME_DIR = os.environ["HOME"] LOG_DIR = f"{HOME_DIR}/dlg/runs" VENV = f"source {HOME_DIR}/dlg/venv/bin/activate" @@ -73,9 +76,8 @@ def __init__(self): super(ICRARoodConfig, self).__init__() def init_list(self): # TODO please fill in - return [self.ACCOUNT, self.LOG_DIR, - self.MODULES, - self.VENV] + return [self.ACCOUNT, self.LOG_DIR, self.MODULES, self.VENV] + class GalaxyMWAConfig(DefaultConfig): def __init__(self): @@ -92,6 +94,7 @@ class GalaxyASKAPConfig(DefaultConfig): module load mpi4py """ VENV = "" + def __init__(self): super(GalaxyASKAPConfig, self).__init__() @@ -106,6 +109,7 @@ def __init__(self): def init_list(self): return ["pawsey0129", "/group/pawsey0129/daliuge_logs"] + class TianHe2Config(DefaultConfig): def __init__(self): super(TianHe2Config, self).__init__() @@ -114,9 +118,9 @@ def init_list(self): # TODO please fill in return ["SHAO", "/group/shao/daliuge_logs"] - ########################################## + class ConfigFactory: mapping = { "galaxy_mwa": GalaxyMWAConfig, diff --git a/daliuge-engine/dlg/deploy/create_dlg_job.py b/daliuge-engine/dlg/deploy/create_dlg_job.py index dc4c5c4b4..03446d221 100644 --- a/daliuge-engine/dlg/deploy/create_dlg_job.py +++ b/daliuge-engine/dlg/deploy/create_dlg_job.py @@ -60,7 +60,9 @@ class LogEntryPair: def __init__(self, name, gstart, gend): self._name = name - self._gstart = (gstart + 2) # group 0 is the whole matching line, group 1 is the catchall + self._gstart = ( + gstart + 2 + ) # group 0 is the whole matching line, group 1 is the catchall self._gend = gend + 2 self._start_time = None self._end_time = None @@ -274,8 +276,12 @@ def parse(self, out_csv=None): if not os.path.isdir(os.path.join(self._log_dir, log_directory_file_name)): continue nm_logf = os.path.join(self._log_dir, log_directory_file_name, "dlgNM.log") - nm_dim_logf = os.path.join(self._log_dir, log_directory_file_name, "dlgDIM.log") - nm_mm_logf = os.path.join(self._log_dir, 
log_directory_file_name, "dlgMM.log") + nm_dim_logf = os.path.join( + self._log_dir, log_directory_file_name, "dlgDIM.log" + ) + nm_mm_logf = os.path.join( + self._log_dir, log_directory_file_name, "dlgMM.log" + ) if not os.path.exists(nm_logf): if os.path.exists(nm_dim_logf) or os.path.exists(nm_mm_logf): num_dims += 1 @@ -338,9 +344,10 @@ def parse(self, out_csv=None): deploy_time = indexed_leps["node_deploy_time"].get_duration() if deploy_time is None: # since some node managers failed to start continue - exec_time = (indexed_leps["completion_time"].get_duration() - or indexed_leps["completion_time_old"].get_duration() - ) + exec_time = ( + indexed_leps["completion_time"].get_duration() + or indexed_leps["completion_time_old"].get_duration() + ) if exec_time is None: continue real_exec_time = exec_time - (max_node_deploy_time - deploy_time) @@ -386,7 +393,8 @@ def check_log_dir(self, log_dir): def main(): parser = optparse.OptionParser( - usage='\n%prog -a [1|2] -f [options]\n\n%prog -h for further help') + usage="\n%prog -a [1|2] -f [options]\n\n%prog -h for further help" + ) parser.add_option( "-a", @@ -613,7 +621,9 @@ def main(): ) for path_to_graph_file in (opts.logical_graph, opts.physical_graph): if path_to_graph_file and not os.path.exists(path_to_graph_file): - parser.error("Cannot locate graph file at '{0}'".format(path_to_graph_file)) + parser.error( + "Cannot locate graph file at '{0}'".format(path_to_graph_file) + ) client = SlurmClient( facility=opts.facility, @@ -630,7 +640,7 @@ def main(): check_with_session=opts.check_with_session, logical_graph=opts.logical_graph, physical_graph=opts.physical_graph, - submit=opts.submit in ['True', 'true'], + submit=opts.submit in ["True", "true"], ) client._visualise_graph = opts.visualise_graph client.submit_job() diff --git a/daliuge-engine/dlg/deploy/deployment_utils.py b/daliuge-engine/dlg/deploy/deployment_utils.py index f690359c6..2d13ba1aa 100644 --- a/daliuge-engine/dlg/deploy/deployment_utils.py +++ b/daliuge-engine/dlg/deploy/deployment_utils.py @@ -20,6 +20,12 @@ # MA 02111-1307 USA # import json +import logging +import re +import subprocess +import time + +logger = logging.getLogger(__name__) class ListTokens(object): @@ -104,6 +110,20 @@ def list_as_string(s): return _parse_list_tokens(iter(_list_tokenizer(s))) +def check_k8s_env(): + """ + Makes sure kubectl can be called and is accessible. 
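    Editor's note (illustrative, not part of the original patch): the match only
    succeeds when ``kubectl version`` reports both a client and a server
    version, so callers can guard a deployment with::

        if not check_k8s_env():
            raise RuntimeError("kubectl unavailable or no cluster reachable")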
+ """ + try: + output = subprocess.run( + ["kubectl version"], capture_output=True, shell=True + ).stdout + pattern = re.compile(r"^Client Version:.*\nServer Version:.*") + return re.match(pattern, output.decode(encoding="utf-8")) + except subprocess.SubprocessError: + return False + + def find_numislands(physical_graph_template_file): """ Given the physical graph data extract the graph name and the total number of @@ -119,10 +139,10 @@ def find_numislands(physical_graph_template_file): (pgt_name, pgt) = pgt_data except: raise ValueError(type(pgt_data)) - nodes = list(map(lambda x: x['node'], pgt)) - islands = list(map(lambda x: x['island'], pgt)) - num_islands = len(dict(zip(islands,range(len(islands))))) - num_nodes = len(dict(zip(nodes,range(len(nodes))))) + nodes = list(map(lambda x: x["node"], pgt)) + islands = list(map(lambda x: x["island"], pgt)) + num_islands = len(dict(zip(islands, range(len(islands))))) + num_nodes = len(dict(zip(nodes, range(len(nodes))))) pip_name = pgt_name return num_islands, num_nodes, pip_name @@ -146,7 +166,91 @@ def num_daliuge_nodes(num_nodes: int, run_proxy: bool): else: ret = num_nodes - 0 # exclude the data island node? if ret <= 0: - raise Exception( - "Not enough nodes {0} to run DALiuGE.".format(num_nodes) - ) + raise Exception("Not enough nodes {0} to run DALiuGE.".format(num_nodes)) return ret + + +def find_node_ips(): + query = subprocess.check_output( + [ + r"kubectl get nodes --selector=kubernetes.io/role!=master -o jsonpath={.items[*].status.addresses[?\(@.type==\"InternalIP\"\)].address}" + ], + shell=True, + ) + node_ips = query.decode(encoding="utf-8").split(" ") + return node_ips + + +def find_service_ips(num_expected, retries=3, timeout=10): + pattern = r"^daliuge-daemon-service-.*\s*ClusterIP\s*\d+\.\d+\.\d+\.\d+" + ip_pattern = r"\d+\.\d+\.\d+\.\d+" + ips = [] + attempts = 0 + while len(ips) < num_expected and attempts < retries: + ips = [] + query = subprocess.check_output( + [r"kubectl get svc -o wide"], shell=True + ).decode(encoding="utf-8") + outcome = re.findall(pattern, query, re.M) + for service in outcome: + ip = re.search(ip_pattern, service) + if ip: + ips.append(ip.group(0)) + logger.info(f"K8s service ips: {ips}") + time.sleep(timeout) + return ips + + +def find_pod_ips(num_expected, retries=3, timeout=10): + ips = [] + attempts = 0 + while len(ips) < num_expected and attempts < retries: + ips = [] + query = str( + subprocess.check_output([r"kubectl get pods -o wide"], shell=True).decode( + encoding="utf-8" + ) + ) + pattern = r"^daliuge-daemon.*" + ip_pattern = r"\d+\.\d+\.\d+\.\d+" + outcome = re.findall(pattern, query, re.M) + for pod in outcome: + ip = re.search(ip_pattern, pod) + if ip: + ips.append(ip.group(0)) + logger.info(f"K8s pod ips: {ips}") + time.sleep(timeout) + return ips + + +def _status_all_running(statuses): + if statuses == []: + return False + for status in statuses: + if status != "Running": + return False + return True + + +def wait_for_pods(num_expected, retries=18, timeout=10): + all_running = False + attempts = 0 + while not all_running and attempts < retries: + query = str( + subprocess.check_output([r"kubectl get pods -o wide"], shell=True).decode( + encoding="utf-8" + ) + ) + logger.info(query) + pattern = r"^daliuge-daemon.*" + outcome = re.findall(pattern, query, re.M) + if len(outcome) < num_expected: + all_running = False + continue + all_running = True + for pod in outcome: + if "Running" not in pod: + all_running = False + attempts += 1 + time.sleep(timeout) + return all_running diff 
--git a/daliuge-engine/dlg/deploy/dlg_monitor.py b/daliuge-engine/dlg/deploy/dlg_monitor.py index 97395c264..340f02f14 100644 --- a/daliuge-engine/dlg/deploy/dlg_monitor.py +++ b/daliuge-engine/dlg/deploy/dlg_monitor.py @@ -51,7 +51,6 @@ from ..utils import b2s - BUFF_SIZE = 16384 outstanding_conn = 20 default_publication_port = 20000 diff --git a/daliuge-engine/dlg/deploy/dlg_proxy.py b/daliuge-engine/dlg/deploy/dlg_proxy.py index 8a42a0b87..9d9f9453f 100644 --- a/daliuge-engine/dlg/deploy/dlg_proxy.py +++ b/daliuge-engine/dlg/deploy/dlg_proxy.py @@ -36,10 +36,11 @@ -------------------------------------------------------------------------------- """ +import logging import select import socket import struct -import sys, logging +import sys import time from ..utils import b2s diff --git a/daliuge-engine/dlg/deploy/helm_client.py b/daliuge-engine/dlg/deploy/helm_client.py index 7e8b57d4a..344c0fbb1 100644 --- a/daliuge-engine/dlg/deploy/helm_client.py +++ b/daliuge-engine/dlg/deploy/helm_client.py @@ -23,52 +23,94 @@ Contains a module translating physical graphs to kubernetes helm charts. """ import json -import re -import time +import logging import os -import sys -import shutil import pathlib +import shutil +import subprocess +import sys +import threading +import time import dlg import yaml -import subprocess from dlg.common.version import version as dlg_version -from dlg.restutils import RestClient +from dlg.constants import NODE_DEFAULT_REST_PORT from dlg.deploy.common import submit +from dlg.deploy.deployment_utils import ( + find_node_ips, + find_service_ips, + find_pod_ips, + wait_for_pods, + check_k8s_env, +) +from dlg.dropmake import pg_generator +from dlg.restutils import RestClient + +logger = logging.getLogger(__name__) + +def _num_deployments_required(islands, nodes): + machines = set() + machines.update(islands) + machines.update(nodes) + return len(machines) -def _write_chart(chart_dir, name: str, chart_name: str, version: str, app_version: str, home: str, - description, keywords: list, sources: list, kubeVersion: str): - chart_info = {'apiVersion': "v2", 'name': chart_name, 'type': 'application', 'version': version, - 'appVersion': app_version, 'home': home, 'description': description, - 'keywords': keywords, 'sources': sources, 'kubeVersion': kubeVersion} + +def _write_chart( + chart_dir, + name: str, + chart_name: str, + version: str, + app_version: str, + home: str, + description, + keywords: list, + sources: list, + kubeVersion: str, +): + chart_info = { + "apiVersion": "v2", + "name": chart_name, + "type": "application", + "version": version, + "appVersion": app_version, + "home": home, + "description": description, + "keywords": keywords, + "sources": sources, + "kubeVersion": kubeVersion, + } # TODO: Fix app_version quotations. 
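    # Editor's note (illustrative): with the defaults used by HelmClient further
    # below, the yaml.dump call writes a Chart.yaml roughly of the form
    #
    #   apiVersion: v2
    #   appVersion: <daliuge version>
    #   description: DALiuGE k8s deployment
    #   home: https://github.com/ICRAR/daliuge/daliuge-k8s
    #   keywords:
    #   - daliuge
    #   - workflow
    #   kubeVersion: '>=1.10.0-0'
    #   name: daliuge-daemon
    #   sources:
    #   - https://github.com/ICRAR/daliuge/daliuge-k8s
    #   type: application
    #   version: 0.1.0
    #
    # (key order and quoting depend on yaml.dump defaults; the values shown are
    # examples only.)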
- with open(f'{chart_dir}{os.sep}{name}', 'w', encoding='utf-8') as chart_file: + with open(f"{chart_dir}{os.sep}{name}", "w", encoding="utf-8") as chart_file: yaml.dump(chart_info, chart_file) def _write_values(chart_dir, config): - with open(f"{chart_dir}{os.sep}custom-values.yaml", 'w', encoding='utf-8') as value_file: + with open( + f"{chart_dir}{os.sep}custom-values.yaml", "w+", encoding="utf-8" + ) as value_file: yaml.dump(config, value_file) + logger.info("Written custom-values file.") def _read_values(chart_dir): - with open(f"{chart_dir}{os.sep}values.yaml", 'r', encoding='utf-8') as old_file: + with open(f"{chart_dir}{os.sep}values.yaml", "r", encoding="utf-8") as old_file: data = yaml.safe_load(old_file) - with open(f"{chart_dir}{os.sep}values.yaml", 'r', encoding='utf-8') as custom_file: + with open(f"{chart_dir}{os.sep}values.yaml", "r", encoding="utf-8") as custom_file: new_data = yaml.safe_load(custom_file) data.update(new_data) + logger.info("Read yaml values file") return data def _find_resources(pgt_data): pgt = json.loads(pgt_data) - nodes = list(map(lambda x: x['node'], pgt)) - islands = list(map(lambda x: x['island'], pgt)) - num_islands = len(dict(zip(islands, nodes))) - num_nodes = len(nodes) - return num_islands, num_nodes + nodes = list(map(lambda x: x["node"], pgt)) + islands = list(map(lambda x: x["island"], pgt)) + islands = list(set(islands)) + nodes = list(set(nodes)) + return islands, nodes class HelmClient: @@ -76,105 +118,262 @@ class HelmClient: Writes necessary files to launch job with kubernetes. """ - def __init__(self, deploy_name, chart_name="daliuge-daemon", deploy_dir="./", - submit=True, chart_version="0.1.0", - value_config=None, physical_graph_file=None, chart_vars=None): + def __init__( + self, + deploy_name, + chart_name="daliuge-daemon", + deploy_dir="./", + submit=True, + chart_version="0.1.0", + value_config=None, + physical_graph_file=None, + chart_vars=None, + ): if value_config is None: value_config = dict() + self._k8s_access = check_k8s_env() self._chart_name = chart_name - self._chart_vars = {'name': 'daliuge-daemon', - 'appVersion': 'v1.0.0', - 'home': 'https://github.com/ICRAR/daliuge/daliuge-k8s', - 'description': 'DALiuGE k8s deployment', - 'keywords': ['daliuge', 'workflow'], - 'sources': ['https://github.com/ICRAR/daliuge/daliuge-k8s'], - 'kubeVersion': ">=1.10.0-0" - } + self._chart_vars = { + "name": "daliuge-daemon", + "appVersion": "v1.0.0", + "home": "https://github.com/ICRAR/daliuge/daliuge-k8s", + "description": "DALiuGE k8s deployment", + "keywords": ["daliuge", "workflow"], + "sources": ["https://github.com/ICRAR/daliuge/daliuge-k8s"], + "kubeVersion": ">=1.10.0-0", + } if chart_vars is not None: self._chart_vars.update(chart_vars) self._deploy_dir = deploy_dir - self._chart_dir = os.path.join(self._deploy_dir, 'daliuge-daemon') + self._chart_dir = os.path.join(self._deploy_dir, "daliuge-daemon") self._chart_version = chart_version self._deploy_name = deploy_name self._submit = submit self._value_data = value_config if value_config is not None else {} self._submission_endpoint = None + self._k8s_nodes = find_node_ips() + self._num_machines = 1 + self._pod_details = {} if physical_graph_file is not None: self._set_physical_graph(physical_graph_file) # Copy in template files. 
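        # The stock helm templates shipped with the repository under
        # daliuge-k8s/helm are copied next to the generated Chart.yaml;
        # dirs_exist_ok is only available from Python 3.8 onwards, hence the
        # version check around shutil.copytree below.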
library_root = pathlib.Path(os.path.dirname(dlg.__file__)).parent.parent - print(library_root) + logger.debug(f"Helm chart copied to: {library_root}") if sys.version_info >= (3, 8): - shutil.copytree(os.path.join(library_root, 'daliuge-k8s', 'helm'), self._deploy_dir, - dirs_exist_ok=True) + shutil.copytree( + os.path.join(library_root, "daliuge-k8s", "helm"), + self._deploy_dir, + dirs_exist_ok=True, + ) else: - shutil.copytree(os.path.join(library_root, 'daliuge-k8s', 'helm'), self._deploy_dir) + shutil.copytree( + os.path.join(library_root, "daliuge-k8s", "helm"), self._deploy_dir + ) - def _set_physical_graph(self, physical_graph_content): + def _set_physical_graph(self, physical_graph_content, co_host=True): self._physical_graph_file = physical_graph_content - self._num_islands, self._num_nodes = _find_resources( - self._physical_graph_file) + self._islands, self._nodes = _find_resources(self._physical_graph_file) + self._num_machines = _num_deployments_required(self._islands, self._nodes) - ( + 1 if co_host else 0 + ) + + def _find_pod_details(self): + # NOTE: +1 for the master. + service_ips = find_service_ips(self._num_machines + 1) + pod_ips = find_pod_ips(self._num_machines + 1) + labels = sorted([str(x) for x in range(self._num_machines)]) + for i in range(len(labels)): + self._pod_details[labels[i]] = {"ip": pod_ips[i], "svc": service_ips[i]} + self._pod_details["master"] = {"ip": pod_ips[-1], "svc": service_ips[-1]} + logger.debug(f"Pod details: {self._pod_details}") - def create_helm_chart(self, physical_graph_content): + def create_helm_chart(self, physical_graph_content, co_host=True): """ Translates a physical graph to a kubernetes helm chart. For now, it will just try to run everything in a single container. """ - _write_chart(self._chart_dir, 'Chart.yaml', self._chart_name, self._chart_version, - dlg_version, - self._chart_vars['home'], self._chart_vars['description'], - self._chart_vars['keywords'], self._chart_vars['sources'], - self._chart_vars['kubeVersion']) + if not self._k8s_access: + raise RuntimeError("Cannot access k8s") + # Add charts + self._set_physical_graph(physical_graph_content, co_host) + _write_chart( + self._chart_dir, + "Chart.yaml", + self._chart_name, + self._chart_version, + dlg_version, + self._chart_vars["home"], + self._chart_vars["description"], + self._chart_vars["keywords"], + self._chart_vars["sources"], + self._chart_vars["kubeVersion"], + ) # Update values.yaml _write_values(self._chart_dir, self._value_data) self._value_data = _read_values(self._chart_dir) - # Add charts - # TODO: Add charts to helm - self._set_physical_graph(physical_graph_content) # Update template - # TODO: Update templates in helm - def launch_helm(self): + def start_manager(self, manager_node): + if not self._k8s_access: + raise RuntimeError("Cannot access k8s") + self._submission_endpoint = self._pod_details[manager_node]["svc"] + client = RestClient( + self._submission_endpoint, + self._value_data["service"]["daemon"]["port"], + timeout=30, + ) + node_ips = [x["ip"] for x in self._pod_details.values()] + print(node_ips) + data = json.dumps({"nodes": node_ips}).encode("utf-8") + time.sleep(5) + logger.debug(f"Starting manager on {self._submission_endpoint}") + client._POST( + "/managers/island/start", content=data, content_type="application/json" + ).read() + client._POST( + "/managers/master/start", content=data, content_type="application/json" + ).read() + + def start_nodes(self): + if not self._k8s_access: + raise RuntimeError("Cannot access k8s") + ips = 
[x["svc"] for x in self._pod_details.values()] + ips.remove(self._pod_details["master"]["svc"]) + for ip in ips: + client = RestClient( + ip, self._value_data["service"]["daemon"]["port"], timeout=30 + ) + time.sleep(5) + logger.debug(f"Starting node on {ip}") + # node_ips = ['127.0.0.1'] + [x['ip'] for x in self._pod_details.values()] + node_ips = [x["ip"] for x in self._pod_details.values()] + # data = json.dumps({'nodes': ['127.0.0.1']}).encode('utf-8') + data = json.dumps({"nodes": node_ips}).encode("utf-8") + client._POST( + "/managers/master/start", content=data, content_type="application/json" + ).read() + + def launch_helm(self, co_host=False): """ Launches the built helm chart using the most straightforward commands possible. Assumes all files are prepared and validated. """ + if not self._k8s_access: + raise RuntimeError("Cannot access k8s") if self._submit: os.chdir(self._deploy_dir) - instruction = f'helm install {self._deploy_name} {self._chart_name}/ ' \ - f'--values {self._chart_name}{os.sep}custom-values.yaml' - print(subprocess.check_output([instruction], - shell=True).decode('utf-8')) - query = str(subprocess.check_output(['kubectl get svc -o wide'], shell=True)) - # WARNING: May be problematic later if multiple services are running - pattern = r"-service\s*ClusterIP\s*\d+\.\d+\.\d+\.\d+" - ip_pattern = r"\d+\.\d+\.\d+\.\d+" - outcome = re.search(pattern, query) - if outcome: - manager_ip = re.search(ip_pattern, outcome.string) - self._submission_endpoint = manager_ip.group(0) - client = RestClient(self._submission_endpoint, - self._value_data['service']['daemon']['port']) - data = json.dumps({'nodes': ["127.0.0.1"]}).encode('utf-8') - time.sleep(5) # TODO: Deterministic deployment information - client._POST('/managers/island/start', content=data, - content_type='application/json') - client._POST('/managers/master/start', content=data, - content_type='application/json') + _write_values( + self._chart_dir, + {"deploy_id": "master", "name": f"{self._chart_name}-master"}, + ) + instruction = ( + f"helm install {self._deploy_name}-master {self._chart_name}/ " + f"--values {self._chart_name}{os.sep}custom-values.yaml" + ) + process_return_string = subprocess.check_output( + [instruction], shell=True + ).decode("utf-8") + logger.info(f"{process_return_string}") + for i in range(self._num_machines): + _write_values( + self._chart_dir, {"deploy_id": i, "name": f"{self._chart_name}-{i}"} + ) + instruction = ( + f"helm install {self._deploy_name}-{i} {self._chart_name}/ " + f"--values {self._chart_name}{os.sep}custom-values.yaml" + ) + process_return_string = subprocess.check_output( + [instruction], shell=True + ).decode("utf-8") + logger.info(f"{process_return_string}") + # TODO: Check running nodes before launching another + self._find_pod_details() + if wait_for_pods(self._num_machines): + self.start_manager("master") + # self.start_nodes() else: - print("Could not find manager IP address") - + logger.error("K8s pods did not start in timeframe allocated") + self.teardown() + raise RuntimeWarning("K8s pods did not start in timeframe allocated") else: - print(f"Created helm chart {self._chart_name} in {self._deploy_dir}") + logger.info(f"Created helm chart {self._chart_name} in {self._deploy_dir}") def teardown(self): - subprocess.check_output(['helm uninstall daliuge-daemon'], shell=True) + if not self._k8s_access: + raise RuntimeError("Cannot access k8s") + for i in range(self._num_machines - 1, -1, -1): + subprocess.check_output([f"helm uninstall daliuge-daemon-{i}"], 
shell=True) + subprocess.check_output([f"helm uninstall daliuge-daemon-master"], shell=True) + + def _monitor(self, session_id=None): + def _task(): + while True: + try: + dlg.deploy.common.monitor_sessions( + session_id=session_id, + host=self._submission_endpoint, + port=NODE_DEFAULT_REST_PORT, + ) + break + except: + logger.exception("Monitoring failed, attempting to restart") + + threads = threading.Thread(target=_task) + threads.start() + return threads + + def submit_pgt(self): + """ + There is a semi-dynamic element to fetching the IPs of Node(s) to deploy to. + Hence, launching the chart and initiating graph execution have been de-coupled. + """ + if not self._k8s_access: + raise RuntimeError("Cannot access k8s") + # TODO: Check all nodes are operational first. + pgt_data = json.loads(self._physical_graph_file) + node_ips = [x["ip"] for x in self._pod_details.values()] + node_ips.remove(self._pod_details["master"]["ip"]) + node_ips = [self._pod_details["master"]["ip"]] + node_ips + # node_ips = ['127.0.0.1'] + physical_graph = pg_generator.resource_map(pgt_data, node_ips, co_host_dim=True) + # TODO: Add dumping to log-dir + submit( + physical_graph, + self._submission_endpoint, + port=NODE_DEFAULT_REST_PORT, + skip_deploy=False, + ) + + def submit_and_monitor_pgt(self): + """ + Combines submission and monitoring steps of a pgt. + """ + session_id = self.submit_pgt() + monitoring_thread = self._monitor(session_id) + monitoring_thread.join() - def submit_job(self): + def submit_pg(self): """ There is a semi-dynamic element to fetching the IPs of Node(s) to deploy to. Hence, launching the chart and initiating graph execution have been de-coupled. """ + if not self._k8s_access: + raise RuntimeError("Cannot access k8s") + # TODO: Check all nodes are operational first. pg_data = json.loads(self._physical_graph_file) - submit(pg_data, self._submission_endpoint) + # TODO: Add dumping to log-dir + submit( + pg_data, + self._submission_endpoint, + port=NODE_DEFAULT_REST_PORT, + skip_deploy=False, + ) + + def submit_and_monitor_pg(self): + """ + Combines submission and monitoring steps of a pg. + """ + session_id = self.submit_pg() + monitoring_thread = self._monitor(session_id) + monitoring_thread.join() diff --git a/daliuge-engine/dlg/deploy/remotes.py b/daliuge-engine/dlg/deploy/remotes.py index 38245cd57..327ec08b7 100644 --- a/daliuge-engine/dlg/deploy/remotes.py +++ b/daliuge-engine/dlg/deploy/remotes.py @@ -29,7 +29,6 @@ from . import deployment_utils - logger = logging.getLogger(__name__) @@ -46,7 +45,7 @@ def __init__(self, options, my_ip): def _get_ip_from_name(self, hostname): rx = re.compile("^(?:[0-9]{1,3}\.){3}[0-9]{1,3}$") - if rx.match(hostname): # already an IP?? + if rx.match(hostname): # already an IP?? 
return hostname else: return socket.gethostbyname(hostname) @@ -58,8 +57,7 @@ def _set_world(self, rank, size, sorted_peers): if len(set(sorted_peers)) != self.size: raise RuntimeError("More than one task started per node, cannot continue") # convert nodes to IP addresses if hostnames - self.sorted_peers = list(map(lambda x:self._get_ip_from_name(x), - sorted_peers)) + self.sorted_peers = list(map(lambda x: self._get_ip_from_name(x), sorted_peers)) nm_range = self._nm_range() if nm_range[0] == nm_range[1]: raise RuntimeError( diff --git a/daliuge-engine/dlg/deploy/slurm_client.py b/daliuge-engine/dlg/deploy/slurm_client.py index 1f96c0724..357037241 100644 --- a/daliuge-engine/dlg/deploy/slurm_client.py +++ b/daliuge-engine/dlg/deploy/slurm_client.py @@ -50,26 +50,26 @@ class SlurmClient: """ def __init__( - self, - log_root=None, - acc=None, - physical_graph_template_data=None, # JSON formatted physical graph template - logical_graph=None, - job_dur=30, - num_nodes=None, - run_proxy=False, - mon_host=DEFAULT_AWS_MON_HOST, - mon_port=DEFAULT_AWS_MON_PORT, - logv=1, - facility=None, - zerorun=False, - max_threads=0, - sleepncopy=False, - num_islands=None, - all_nics=False, - check_with_session=False, - submit=True, - pip_name=None, + self, + log_root=None, + acc=None, + physical_graph_template_data=None, # JSON formatted physical graph template + logical_graph=None, + job_dur=30, + num_nodes=None, + run_proxy=False, + mon_host=DEFAULT_AWS_MON_HOST, + mon_port=DEFAULT_AWS_MON_PORT, + logv=1, + facility=None, + zerorun=False, + max_threads=0, + sleepncopy=False, + num_islands=None, + all_nics=False, + check_with_session=False, + submit=True, + pip_name=None, ): self._config = ConfigFactory.create_config(facility=facility) self._acc = self._config.getpar("acc") if (acc is None) else acc @@ -97,7 +97,8 @@ def __init__( self._submit = submit self._dtstr = datetime.datetime.now().strftime("%Y-%m-%dT%H-%M-%S") # .%f self._num_islands, self._num_nodes, self._pip_name = find_numislands( - self._physical_graph_template_data) + self._physical_graph_template_data + ) def get_log_dirname(self): """ @@ -106,7 +107,7 @@ def get_log_dirname(self): # Moved setting of dtstr to init # to ensure it doesn't change for this instance of SlurmClient() # dtstr = datetime.datetime.now().strftime("%Y-%m-%dT%H-%M-%S") # .%f - graph_name = self._pip_name.split('_')[0] # use only the part of the graph name + graph_name = self._pip_name.split("_")[0] # use only the part of the graph name return "{0}_{1}".format(graph_name, self._dtstr) def create_job_desc(self, physical_graph_file): @@ -121,7 +122,7 @@ def create_job_desc(self, physical_graph_file): pardict["SESSION_ID"] = os.path.split(log_dir)[-1] pardict["JOB_DURATION"] = label_job_dur(self._job_dur) pardict["ACCOUNT"] = self._acc - pardict["PY_BIN"] = 'python3' if pardict["VENV"] else sys.executable + pardict["PY_BIN"] = "python3" if pardict["VENV"] else sys.executable pardict["LOG_DIR"] = log_dir pardict["GRAPH_PAR"] = ( '-L "{0}"'.format(self._logical_graph) @@ -155,7 +156,7 @@ def submit_job(self): os.makedirs(log_dir) physical_graph_file_name = "{0}/{1}".format(log_dir, self._pip_name) - with open(physical_graph_file_name, 'w') as physical_graph_file: + with open(physical_graph_file_name, "w") as physical_graph_file: physical_graph_file.write(self._physical_graph_template_data) physical_graph_file.close() diff --git a/daliuge-engine/dlg/deploy/start_dlg_cluster.py b/daliuge-engine/dlg/deploy/start_dlg_cluster.py index f42fb2fac..7aefc076b 100644 --- 
a/daliuge-engine/dlg/deploy/start_dlg_cluster.py +++ b/daliuge-engine/dlg/deploy/start_dlg_cluster.py @@ -62,11 +62,7 @@ GRAPH_MONITOR_INTERVAL = 5 VERBOSITY = "5" LOGGER = logging.getLogger("deploy.dlg.cluster") -APPS = ( - None, - "test.graphsRepository.SleepApp", - "test.graphsRepository.SleepAndCopyApp", -) +APPS = (None, "test.graphsRepository.SleepApp", "test.graphsRepository.SleepAndCopyApp") def check_host(host, port, timeout=5, check_with_session=False): @@ -99,7 +95,7 @@ def check_and_add(ip_addr): ntries = retry while ntries: if check_host( - ip_addr, port, timeout=timeout, check_with_session=check_with_session + ip_addr, port, timeout=timeout, check_with_session=check_with_session ): LOGGER.info("Host %s:%d is running", ip_addr, port) return ip_addr @@ -139,8 +135,7 @@ def get_workspace_dir(log_dir): def start_node_mgr( - log_dir, my_ip, logv=1, max_threads=0, host=None, event_listeners="", - use_tool=True + log_dir, my_ip, logv=1, max_threads=0, host=None, event_listeners="", use_tool=True ): """ Start node manager @@ -162,9 +157,10 @@ def start_node_mgr( str(max_threads), "--no-dlm", ] - if event_listeners: args += ["--event-listeners", event_listeners] + if event_listeners: + args += ["--event-listeners", event_listeners] - if use_tool: + if use_tool: # This returns immediately proc = tool.start_process("nm", args) LOGGER.info("Node manager process started with pid %d", proc.pid) @@ -224,11 +220,14 @@ def start_mm(node_list, log_dir, logv=1): proc = tool.start_process("mm", args) LOGGER.info("Master manager process started with pid %d", proc.pid) return proc + + # cmdline.dlgMM(parser, args) def _stop(endpoints): LOGGER.info(f"Stopping ThreadPool") + def _the_stop(endpoint): common.BaseDROPManagerClient(endpoint[0], endpoint[1]).stop() @@ -258,9 +257,10 @@ def _task(): dump_path = None if opts.dump: dump_path = os.path.join(opts.log_dir, "status-monitoring.json") - if submit: - session_id = common.submit(physical_graph, host=host, port=port, - session_id=opts.ssid) + if submit: + session_id = common.submit( + physical_graph, host=host, port=port, session_id=opts.ssid + ) else: session_id = opts.ssid @@ -324,7 +324,7 @@ def get_pg(opts, nms, dims): opts.part_algo, num_partitions=num_nms, num_islands=num_dims, - **algo_params + **algo_params, ) del unrolled # quickly dispose of potentially big object else: @@ -345,8 +345,9 @@ def get_pg(opts, nms, dims): retry=3, ) LOGGER.info(f"Mapping graph to available resources: nms {nms}, dims {dims}") - physical_graph = pg_generator.resource_map(pgt, dims + nms, num_islands=num_dims, - co_host_dim=opts.co_host_dim) + physical_graph = pg_generator.resource_map( + pgt, dims + nms, num_islands=num_dims, co_host_dim=opts.co_host_dim + ) graph_name = os.path.basename(opts.log_dir) graph_name = f"{graph_name.split('_')[0]}.json" # get just the graph name with open(os.path.join(opts.log_dir, graph_name), "wt") as pg_file: @@ -622,8 +623,10 @@ def main(): log_dir = "{0}/{1}".format(options.log_dir, remote.my_ip) os.makedirs(log_dir) logfile = log_dir + "/start_dlg_cluster.log" - log_format = "%(asctime)-15s [%(levelname)5.5s] [%(threadName)15.15s] " \ - "%(name)s#%(funcName)s:%(lineno)s %(message)s" + log_format = ( + "%(asctime)-15s [%(levelname)5.5s] [%(threadName)15.15s] " + "%(name)s#%(funcName)s:%(lineno)s %(message)s" + ) logging.basicConfig(filename=logfile, level=logging.DEBUG, format=log_format) LOGGER.info("This node has IP address: %s", remote.my_ip) @@ -651,7 +654,7 @@ def main(): # need to check for NM first and go on of co-hosted 
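    # Editor's note: co_hosted is True when this host also appears in
    # remote.dim_ips, i.e. it runs a node manager and an island manager side by
    # side; the node manager is then started through the tool helper
    # (use_tool=co_hosted), which returns immediately so the island manager can
    # be brought up next on the same host.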
if remote.is_nm: - co_hosted = (remote.my_ip in remote.dim_ips) + co_hosted = remote.my_ip in remote.dim_ips nm_proc = start_node_mgr( log_dir, remote.my_ip, @@ -659,8 +662,8 @@ def main(): max_threads=options.max_threads, host=None if options.all_nics else remote.my_ip, event_listeners=options.event_listeners, - use_tool = co_hosted - ) + use_tool=co_hosted, + ) if remote.is_proxy: # Wait until the Island Manager is open @@ -676,21 +679,21 @@ def main(): "Couldn't connect to the main drop manager, proxy not started" ) elif remote.my_ip in remote.dim_ips: - LOGGER.info(f"Starting island managers on nodes: {remote.dim_ips}") - dim_proc = start_dim(remote.nm_ips, log_dir, remote.my_ip, logv=logv) - # whichever way we came from, now we have to wait until session is finished - # we always monitor the island, else we will have race conditions - physical_graph = get_pg(options, remote.nm_ips, remote.dim_ips) - monitoring_thread = submit_and_monitor( - physical_graph, options, remote.dim_ips[0], REST_PORT, submit=co_hosted - ) - monitoring_thread.join() - # now the session is finished + LOGGER.info(f"Starting island managers on nodes: {remote.dim_ips}") + dim_proc = start_dim(remote.nm_ips, log_dir, remote.my_ip, logv=logv) + # whichever way we came from, now we have to wait until session is finished + # we always monitor the island, else we will have race conditions + physical_graph = get_pg(options, remote.nm_ips, remote.dim_ips) + monitoring_thread = submit_and_monitor( + physical_graph, options, remote.dim_ips[0], REST_PORT, submit=co_hosted + ) + monitoring_thread.join() + # now the session is finished - # still shutting DIM down first to avoid monitoring conflicts - stop_dims(remote.dim_ips) - # now stop all the NMs - stop_nms(remote.nm_ips) + # still shutting DIM down first to avoid monitoring conflicts + stop_dims(remote.dim_ips) + # now stop all the NMs + stop_nms(remote.nm_ips) # shouldn't need this in addition # if dim_proc is not None: @@ -720,7 +723,9 @@ def main(): ) mm_proc = start_mm(remote.dim_ips, log_dir, logv=logv) monitoring_thread.join() - stop_mm(remote.my_ip) # TODO: I don't think we need this and least not in the single island case + stop_mm( + remote.my_ip + ) # TODO: I don't think we need this and least not in the single island case stop_dims(remote.dim_ips) else: nm_ips = remote.recv_dim_nodes() diff --git a/daliuge-engine/dlg/deploy/start_helm_cluster.py b/daliuge-engine/dlg/deploy/start_helm_cluster.py index 24ac2577e..d716bd9e9 100644 --- a/daliuge-engine/dlg/deploy/start_helm_cluster.py +++ b/daliuge-engine/dlg/deploy/start_helm_cluster.py @@ -29,63 +29,64 @@ import argparse import json import os +import logging import tempfile -from dlg.dropmake import pg_generator +import dlg.exceptions +import dlg.restutils from dlg.deploy.helm_client import HelmClient +from dlg.dropmake import pg_generator -def get_pg(opts, node_managers: list, data_island_managers: list): +def get_pg(opts, num_node_managers, num_data_island_managers): if not opts.logical_graph and not opts.physical_graph: return [] - num_nms = len(node_managers) - num_dims = len(data_island_managers) if opts.logical_graph: unrolled_graph = pg_generator.unroll(opts.logical_graph) - pgt = pg_generator.partition(unrolled_graph, algo='metis', num_partitons=num_nms, - num_islands=num_dims) + pgt = pg_generator.partition( + unrolled_graph, + algo="metis", + num_partitions=num_node_managers, + num_islands=num_data_island_managers, + ) del unrolled_graph else: - with open(opts.physical_graph, 'rb', encoding='utf-8') 
as pg_file: + with open(opts.physical_graph, "r", encoding="utf-8") as pg_file: pgt = json.load(pg_file) - physical_graph = pg_generator.resource_map(pgt, node_managers + data_island_managers) - # TODO: Add dumping to log-dir - return physical_graph + return pgt def start_helm(physical_graph_template, num_nodes: int, deploy_dir: str): # TODO: Dynamic helm chart logging dir - # TODO: Multiple node deployments - available_ips = ["127.0.0.1"] pgt = json.loads(physical_graph_template) - pgt = pg_generator.partition(pgt, algo='metis', num_partitons=len(available_ips), - num_islands=len(available_ips)) - pg = pg_generator.resource_map(pgt, available_ips + available_ips) + pgt = pg_generator.partition(pgt, algo="metis", num_partitons=1, num_islands=1) helm_client = HelmClient( - deploy_name='daliuge-daemon', - chart_name='daliuge-daemon', - deploy_dir=deploy_dir + deploy_name="daliuge-daemon", chart_name="daliuge-daemon", deploy_dir=deploy_dir ) + helm_client.create_helm_chart(json.dumps(pgt), co_host=True) try: - helm_client.create_helm_chart(json.dumps(pg)) helm_client.launch_helm() - helm_client.submit_job() + helm_client.submit_pgt() + except dlg.restutils.RestClientException as exp: + raise exp + except dlg.exceptions.InvalidGraphException as exp2: + raise exp2 + finally: helm_client.teardown() - except Exception as ex: - raise + logging.info("Finished deployment") def main(): parser = argparse.ArgumentParser() parser.add_argument( - '-L', - '--logical-graph', + "-L", + "--logical-graph", action="store", type=str, dest="logical_graph", help="The filename of the logical graph to deploy", - default=None + default=None, ) parser.add_argument( "-P", @@ -96,6 +97,15 @@ def main(): help="The filename of the physical graph (template) to deploy", default=None, ) + parser.add_argument( + "-N", + "--num_nodes", + action="store", + type=int, + dest="num_nodes", + help="The number of compute nodes you would like to try and deploy", + default=1, + ) options = parser.parse_args() if bool(options.logical_graph) == bool(options.physical_graph): @@ -106,19 +116,29 @@ def main(): if graph_file_name and not os.path.exists(graph_file_name): parser.error(f"Cannot locate graph_file at {graph_file_name}") - available_ips = ["127.0.0.1"] - physical_graph = get_pg(options, available_ips, available_ips) + if options.num_nodes <= 0: + parser.error("The number of nodes must be a positive integer") - helm_client = HelmClient( - deploy_name='daliuge-daemon', - chart_name='daliuge-daemon', - deploy_dir='/home/nicholas/dlg_temp/demo' - ) - helm_client.create_helm_chart(json.dumps(physical_graph)) - helm_client.launch_helm() - helm_client.submit_job() - helm_client.teardown() + physical_graph = get_pg(options, options.num_nodes, 1) + # TODO: dynamic deployment directory. 
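The block below deploys the chart from a temporary directory and tears it down again in a finally clause. For driving that same path outside of main(), a minimal standalone sketch; the input file name pgt.json is hypothetical, and a working Helm/Kubernetes environment is assumed:

    import json
    import tempfile

    from dlg.deploy.helm_client import HelmClient

    # Load a pre-generated physical graph template (hypothetical file name).
    with open("pgt.json", "r", encoding="utf-8") as f:
        pgt = json.load(f)

    # Deploy from a throw-away directory, mirroring the flow in main() below.
    with tempfile.TemporaryDirectory() as tdir:
        client = HelmClient(
            deploy_name="daliuge-daemon",
            chart_name="daliuge-daemon",
            deploy_dir=tdir,
        )
        client.create_helm_chart(json.dumps(pgt))
        try:
            client.launch_helm()
            client.submit_and_monitor_pgt()
        finally:
            client.teardown()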
+ with tempfile.TemporaryDirectory() as tdir: + helm_client = HelmClient( + deploy_name="daliuge-daemon", + chart_name="daliuge-daemon", + deploy_dir=tdir, + ) + helm_client.create_helm_chart(json.dumps(physical_graph)) + try: + helm_client.launch_helm() + helm_client.submit_and_monitor_pgt() + except dlg.restutils.RestClientException as exp: + raise exp + except dlg.exceptions.InvalidGraphException as exp2: + raise exp2 + finally: + helm_client.teardown() if __name__ == "__main__": + logging.basicConfig(level=logging.DEBUG) main() diff --git a/daliuge-engine/dlg/deploy/utils/monitor_replayer.py b/daliuge-engine/dlg/deploy/utils/monitor_replayer.py index 16920e440..99fb66c69 100644 --- a/daliuge-engine/dlg/deploy/utils/monitor_replayer.py +++ b/daliuge-engine/dlg/deploy/utils/monitor_replayer.py @@ -32,14 +32,21 @@ this module also depends on networkx (included in Daliuge), which produces the edge list that becomes input for gephi vis tool. """ -import pygraphviz as pgv -import networkx as nx -import json, os, logging, optparse, sys, commands, filecmp +import filecmp +import json +import logging +import optparse +import os +import sqlite3 as dbdrv +import sys from collections import defaultdict from datetime import datetime as dt from xml.etree.ElementTree import ElementTree -import sqlite3 as dbdrv + +import commands +import networkx as nx import numpy as np +import pygraphviz as pgv logger = logging.getLogger(__name__) diff --git a/daliuge-engine/dlg/drop.py b/daliuge-engine/dlg/drop.py index aee65890f..949194e7e 100644 --- a/daliuge-engine/dlg/drop.py +++ b/daliuge-engine/dlg/drop.py @@ -32,6 +32,7 @@ import errno import heapq import importlib +import inspect import io import logging import math @@ -47,6 +48,17 @@ from typing import List, Optional, Union import numpy as np +import pyarrow.plasma as plasma +import six +from dlg.common.reproducibility.constants import ( + ReproducibilityFlags, + REPRO_DEFAULT, + rmode_supported, + ALL_RMODES, +) +from dlg.common.reproducibility.reproducibility import common_hash +from merklelib import MerkleTree +from six import BytesIO from .ddap_protocol import ( ExecutionMode, @@ -57,9 +69,9 @@ DROPStates, DROPRel, ) -from .event import EventFirer -from .exceptions import InvalidDropException, InvalidRelationshipException -from .io import ( +from dlg.event import EventFirer +from dlg.exceptions import InvalidDropException, InvalidRelationshipException +from dlg.io import ( DataIO, OpenMode, FileIO, @@ -72,14 +84,29 @@ PlasmaFlightIO, ) -DEFAULT_INTERNAL_PARAMETERS = {'storage', 'rank', 'loop_cxt', 'dw', 'iid', 'dt', 'consumers', - 'config_data', 'mode'} +DEFAULT_INTERNAL_PARAMETERS = { + "storage", + "rank", + "loop_cxt", + "dw", + "iid", + "dt", + "consumers", + "config_data", + "mode", +} if sys.version_info >= (3, 8): - from .io import SharedMemoryIO -from .utils import prepare_sql, createDirIfMissing, isabs, object_tracking, getDlgVariable + from dlg.io import SharedMemoryIO +from dlg.utils import ( + prepare_sql, + createDirIfMissing, + isabs, + object_tracking, + getDlgVariable, +) from dlg.process import DlgProcess -from .meta import ( +from dlg.meta import ( dlg_float_param, dlg_int_param, dlg_list_param, @@ -272,6 +299,14 @@ def __init__(self, oid, uid, **kwargs): self._checksumType = None self._size = None + # Recording runtime reproducibility information is handled via MerkleTrees + # Switching on the reproducibility level will determine what information is recorded. 
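The attributes initialised just below back the commit() machinery added further down in this class. As a pointer to how they end up being used, a minimal sketch of the hashing step; the example dictionary is made up, and it assumes merklelib and dlg.common.reproducibility are importable:

    from merklelib import MerkleTree

    from dlg.common.reproducibility.reproducibility import common_hash

    # Hypothetical runtime summary for a single drop (e.g. at RERUN level).
    merkle_data = {"status": 2}

    # Hash the (key, value) pairs into a Merkle tree; the resulting root is the
    # short digest that later identifies this drop's runtime state.
    tree = MerkleTree(merkle_data.items(), common_hash)
    print(tree.merkle_root)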
+ self._committed = False + self._merkleRoot = None + self._merkleTree = None + self._merkleData = [] + self._reproducibility = REPRO_DEFAULT + # The DataIO instance we use in our write method. It's initialized to # None because it's lazily initialized in the write method, since data # might be written externally and not through this DROP @@ -356,7 +391,7 @@ def getmembers(object, predicate=None): # Take a class dlg defined parameter class attribute and create an instanced attribute on object for attr_name, obj in getmembers( - self, lambda a: not (inspect.isfunction(a) or isinstance(a, property)) + self, lambda a: not (inspect.isfunction(a) or isinstance(a, property)) ): if isinstance(obj, dlg_float_param): value = kwargs.get(attr_name, obj.default_value) @@ -377,7 +412,10 @@ def getmembers(object, predicate=None): elif isinstance(obj, dlg_list_param): value = kwargs.get(attr_name, obj.default_value) if isinstance(value, str): - value = ast.literal_eval(value) + if value == "": + value = [] + else: + value = ast.literal_eval(value) if value is not None and not isinstance(value, list): raise Exception( "dlg_list_param {} is not a list. It is a {}".format( @@ -387,7 +425,10 @@ def getmembers(object, predicate=None): elif isinstance(obj, dlg_dict_param): value = kwargs.get(attr_name, obj.default_value) if isinstance(value, str): - value = ast.literal_eval(value) + if value == "": + value = {} + else: + value = ast.literal_eval(value) if value is not None and not isinstance(value, dict): raise Exception( "dlg_dict_param {} is not a dict. It is a {}".format( @@ -443,11 +484,14 @@ def get_environment_variable(self, key: str): """ if self._dlg_var_matcher.fullmatch(key): return getDlgVariable(key) - if len(key) < 2 or key[0] != '$': + if len(key) < 2 or key[0] != "$": # Reject malformed entries return key key_edit = key[1:] - env_var_ref, env_var_key = key_edit.split('.')[0], '.'.join(key_edit.split('.')[1:]) + env_var_ref, env_var_key = ( + key_edit.split(".")[0], + ".".join(key_edit.split(".")[1:]), + ) env_var_drop = None for producer in self._producers: if producer.name == env_var_ref: @@ -470,6 +514,164 @@ def get_environment_variables(self, keys: list): return_values.append(self.get_environment_variable(key)) return return_values + @property + def merkleroot(self): + return self._merkleRoot + + @property + def reproducibility_level(self): + return self._reproducibility + + @reproducibility_level.setter + def reproducibility_level(self, new_flag): + if type(new_flag) != ReproducibilityFlags: + raise TypeError("new_flag must be a reproducibility flag enum.") + elif rmode_supported(new_flag): # TODO: Support custom checkers for repro-level + self._reproducibility = new_flag + if new_flag == ReproducibilityFlags.ALL: + self._committed = False + self._merkleRoot = {rmode.name: None for rmode in ALL_RMODES} + self._merkleTree = {rmode.name: None for rmode in ALL_RMODES} + self._merkleData = {rmode.name: [] for rmode in ALL_RMODES} + elif self._committed: + # Current behaviour, set to un-committed again after change + self._committed = False + self._merkleRoot = None + self._merkleTree = None + self._merkleData = [] + else: + raise NotImplementedError("new_flag %d is not supported", new_flag.value) + + def generate_rerun_data(self): + """ + Provides a serailized list of Rerun data. + At runtime, Rerunning only requires execution success or failure. 
+ :return: A dictionary containing rerun values + """ + return {"status": self._status} + + def generate_repeat_data(self): + """ + Provides a list of Repeat data. + At runtime, repeating, like rerunning only requires execution success or failure. + :return: A dictionary containing runtime exclusive repetition values. + """ + return {"status": self._status} + + def generate_recompute_data(self): + """ + Provides a dictionary containing recompute data. + At runtime, recomputing, like repeating and rerunning, by default, only shows success or failure. + We anticipate that any further implemented behaviour be done at a lower class. + :return: A dictionary containing runtime exclusive recompute values. + """ + return {"status": self._status} + + def generate_reproduce_data(self): + """ + Provides a list of Reproducibility data (specifically). + The default behaviour is to return nothing. Per-class behaviour is to be achieved by overriding this method. + :return: A dictionary containing runtime exclusive reproducibility data. + """ + return {} + + def generate_replicate_sci_data(self): + """ + Provides a list of scientific replication data. + This is by definition a merging of both reproduction and rerun data + :return: A dictionary containing runtime exclusive scientific replication data. + """ + res = {} + res.update(self.generate_rerun_data()) + res.update(self.generate_reproduce_data()) + return res + + def generate_replicate_comp_data(self): + """ + Provides a list of computational replication data. + This is by definition a merging of both reproduction and recompute data + :return: A dictionary containing runtime exclusive computational replication data. + """ + res = {} + recomp_data = self.generate_recompute_data() + if recomp_data is not None: + res.update(self.generate_recompute_data()) + res.update(self.generate_reproduce_data()) + return res + + def generate_replicate_total_data(self): + """ + Provides a list of total replication data. + This is by definition a merging of reproduction and repetition data + :return: A dictionary containing runtime exclusive total replication data. + """ + res = {} + res.update(self.generate_repeat_data()) + res.update(self.generate_reproduce_data()) + return res + + def generate_merkle_data(self): + """ + Provides a serialized summary of data as a list. + Fields constitute a single entry in this list. + Wraps several methods dependent on this DROPs reproducibility level + Some of these are abstract. 
+ :return: A dictionary of elements constituting a summary of this drop + """ + if self._reproducibility is ReproducibilityFlags.NOTHING: + return {} + elif self._reproducibility is ReproducibilityFlags.RERUN: + return self.generate_rerun_data() + elif self._reproducibility is ReproducibilityFlags.REPEAT: + return self.generate_repeat_data() + elif self._reproducibility is ReproducibilityFlags.RECOMPUTE: + return self.generate_recompute_data() + elif self._reproducibility is ReproducibilityFlags.REPRODUCE: + return self.generate_reproduce_data() + elif self._reproducibility is ReproducibilityFlags.REPLICATE_SCI: + return self.generate_replicate_sci_data() + elif self._reproducibility is ReproducibilityFlags.REPLICATE_COMP: + return self.generate_replicate_comp_data() + elif self._reproducibility is ReproducibilityFlags.REPLICATE_TOTAL: + return self.generate_replicate_total_data() + elif self._reproducibility is ReproducibilityFlags.ALL: + return { + ReproducibilityFlags.RERUN.name: self.generate_rerun_data(), + ReproducibilityFlags.REPEAT.name: self.generate_repeat_data(), + ReproducibilityFlags.RECOMPUTE.name: self.generate_recompute_data(), + ReproducibilityFlags.REPRODUCE.name: self.generate_reproduce_data(), + ReproducibilityFlags.REPLICATE_SCI.name: self.generate_replicate_sci_data(), + ReproducibilityFlags.REPLICATE_COMP.name: self.generate_replicate_comp_data(), + ReproducibilityFlags.REPLICATE_TOTAL.name: self.generate_replicate_total_data(), + } + else: + raise NotImplementedError("Currently other levels are not in development.") + + def commit(self): + """ + Generates the MerkleRoot of this DROP + Should only be called once this DROP is completed. + """ + if not self._committed: + # Generate the MerkleData + self._merkleData = self.generate_merkle_data() + if self._reproducibility == ReproducibilityFlags.ALL: + for rmode in ALL_RMODES: + self._merkleTree[rmode.name] = MerkleTree( + self._merkleData[rmode.name].items(), common_hash + ) + self._merkleRoot[rmode.name] = self._merkleTree[ + rmode.name + ].merkle_root + else: + # Fill MerkleTree, add data and set the MerkleRoot Value + self._merkleTree = MerkleTree(self._merkleData.items(), common_hash) + self._merkleRoot = self._merkleTree.merkle_root + # Set as committed + self._committed = True + else: + logger.debug("Trying to re-commit DROP %s, cannot overwrite." % self) + @property def oid(self): """ @@ -699,6 +901,9 @@ def addConsumer(self, consumer, back=True): logger.debug("Adding back %r as input of %r", self, consumer) consumer.addInput(self, False) + # Add reproducibility subscription + self.subscribe(consumer, "reproducibility") + @property def producers(self): """ @@ -739,6 +944,8 @@ def handleEvent(self, e): """ if e.type == "producerFinished": self.producerFinished(e.uid, e.status) + elif e.type == "reproducibility": + self.dropReproComplete(e.uid, e.reprodata) @track_current_drop def producerFinished(self, uid, drop_state): @@ -777,6 +984,13 @@ def producerFinished(self, uid, drop_state): else: self.setCompleted() + def dropReproComplete(self, uid, reprodata): + """ + Callback invoved when a DROP with UID `uid` has finishing processing its reproducibility information. + Importantly, this is independent of that drop being completed. 
+ """ + # TODO: Perform some action + @property def streamingConsumers(self): """ @@ -827,6 +1041,19 @@ def addStreamingConsumer(self, streamingConsumer, back=True): if self.executionMode == ExecutionMode.DROP: self.subscribe(streamingConsumer, "dropCompleted") + # Add reproducibility subscription + self.subscribe(streamingConsumer, "reproducibility") + + def completedrop(self): + """ + Builds final reproducibility data for this drop and fires a 'dropComplete' event. + This should be called once a drop is finished in success or error + :return: + """ + self.commit() + reprodata = {"data": self._merkleData, "merkleroot": self.merkleroot} + self._fire(eventType="reproducibility", reprodata=reprodata) + @track_current_drop def setError(self): """ @@ -842,7 +1069,8 @@ def setError(self): self.status = DROPStates.ERROR # Signal our subscribers that the show is over - self._fire("dropCompleted", status=DROPStates.ERROR) + self._fire(eventType="dropCompleted", status=DROPStates.ERROR) + self.completedrop() @track_current_drop def setCompleted(self): @@ -863,14 +1091,16 @@ def setCompleted(self): "%r not in INITIALIZED or WRITING state (%s), cannot setComplete()" % (self, self.status) ) - - self._closeWriters() + try: + self._closeWriters() + except AttributeError as exp: + logger.debug(exp) logger.debug("Moving %r to COMPLETED", self) self.status = DROPStates.COMPLETED - # Signal our subscribers that the show is over - self._fire("dropCompleted", status=DROPStates.COMPLETED) + self._fire(eventType="dropCompleted", status=DROPStates.COMPLETED) + self.completedrop() def isCompleted(self): """ @@ -906,7 +1136,10 @@ def parameters(self): class PathBasedDrop(object): - """Base class for data drops that handle paths (i.e., file and directory drops)""" + """ + Base class for data drops that handle paths (i.e., file and directory drops) + """ + _path: str = None def get_dir(self, dirname): @@ -980,10 +1213,7 @@ def open(self, **kwargs): if self.status != DROPStates.COMPLETED: raise Exception( "%r is in state %s (!=COMPLETED), cannot be opened for reading" - % ( - self, - self.status, - ) + % (self, self.status) ) io = self.getIO() @@ -1041,11 +1271,7 @@ def read(self, descriptor, count=4096, **kwargs): def _checkStateAndDescriptor(self, descriptor): if self.status != DROPStates.COMPLETED: raise Exception( - "%r is in state %s (!=COMPLETED), cannot be read" - % ( - self, - self.status, - ) + "%r is in state %s (!=COMPLETED), cannot be read" % (self, self.status) ) if descriptor is None: raise ValueError("Illegal empty descriptor given") @@ -1238,7 +1464,7 @@ def dataURL(self) -> str: # @details A standard file on a filesystem mounted to the deployment machine # @par EAGLE_START # @param category File -# @param tag template +# @param tag daliuge # @param[in] cparam/data_volume Data volume/5/Float/readwrite/False//False/ # \~English Estimated size of the data contained in this node # @param[in] cparam/group_end Group end/False/Boolean/readwrite/False//False/ @@ -1286,6 +1512,7 @@ class FileDROP(DataDROP, PathBasedDrop): ``dirname``, ``$u`` is the drop's UID and ``$B`` is the base directory for this drop's session, namely ``/the/cwd/$session_id``. 
""" + # filepath = dlg_string_param("filepath", None) # dirname = dlg_string_param("dirname", None) delete_parent_directory = dlg_bool_param("delete_parent_directory", False) @@ -1294,8 +1521,10 @@ class FileDROP(DataDROP, PathBasedDrop): def sanitize_paths(self, filepath, dirname): # first replace any ENV_VARS on the names - if filepath: filepath = os.path.expandvars(filepath) - if dirname: dirname = os.path.expandvars(dirname) + if filepath: + filepath = os.path.expandvars(filepath) + if dirname: + dirname = os.path.expandvars(dirname) # No filepath has been given, there's nothing to sanitize if not filepath: return filepath, dirname @@ -1319,8 +1548,8 @@ def initialize(self, **kwargs): """ # filepath, dirpath the two pieces of information we offer users to tweak # These are very intermingled but are not exactly the same, see below - self.filepath = self.parameters.get('filepath', None) - self.dirname = self.parameters.get('dirname', None) + self.filepath = self.parameters.get("filepath", None) + self.dirname = self.parameters.get("dirname", None) # Duh! if isabs(self.filepath) and self.dirname: raise InvalidDropException( @@ -1406,19 +1635,30 @@ def setCompleted(self): self._size = 0 # Signal our subscribers that the show is over self._fire("dropCompleted", status=DROPStates.COMPLETED) + self.completedrop() @property def dataURL(self) -> str: hostname = os.uname()[1] # TODO: change when necessary return "file://" + hostname + self._path + # Override + def generate_reproduce_data(self): + from .droputils import allDropContents + + try: + data = allDropContents(self, self.size) + except Exception: + data = b"" + return {"data_hash": common_hash(data)} + ## # @brief NGAS # @details An archive on the Next Generation Archive System (NGAS). # @par EAGLE_START # @param category NGAS -# @param tag template +# @param tag daliuge # @param[in] cparam/data_volume Data volume/5/Float/readwrite/False//False/ # \~English Estimated size of the data contained in this node # @param[in] cparam/group_end Group end/False/Boolean/readwrite/False//False/ @@ -1528,18 +1768,27 @@ def setCompleted(self): logger.debug("Moving %r to COMPLETED", self) self.status = DROPStates.COMPLETED self._fire("dropCompleted", status=DROPStates.COMPLETED) + self.completedrop() @property def dataURL(self) -> str: return "ngas://%s:%d/%s" % (self.ngasSrv, self.ngasPort, self.fileId) + # Override + def generate_reproduce_data(self): + # TODO: This is a bad implementation. 
Will need to sort something better out + from .droputils import allDropContents + + data = allDropContents(self, self.size) + return {"data_hash": common_hash(data)} + ## # @brief Memory # @details In-memory storage of intermediate data products # @par EAGLE_START # @param category Memory -# @param tag template +# @param tag daliuge # @param[in] cparam/data_volume Data volume/5/Float/readwrite/False//False/ # \~English Estimated size of the data contained in this node # @param[in] cparam/group_end Group end/False/Boolean/readwrite/False//False/ @@ -1560,7 +1809,11 @@ def initialize(self, **kwargs): self._buf = io.BytesIO(*args) def getIO(self): - if hasattr(self, '_tp') and hasattr(self, '_sessID') and sys.version_info >= (3, 8): + if ( + hasattr(self, "_tp") + and hasattr(self, "_sessID") + and sys.version_info >= (3, 8) + ): return SharedMemoryIO(self.oid, self._sessID) else: return MemoryIO(self._buf) @@ -1570,6 +1823,13 @@ def dataURL(self) -> str: hostname = os.uname()[1] return "mem://%s/%d/%d" % (hostname, os.getpid(), id(self._buf)) + # Override + def generate_reproduce_data(self): + from .droputils import allDropContents + + data = allDropContents(self, self.size) + return {"data_hash": common_hash(data)} + ## # @brief SharedMemory @@ -1602,14 +1862,18 @@ def initialize(self, **kwargs): def getIO(self): if sys.version_info >= (3, 8): - if hasattr(self, '_sessID'): + if hasattr(self, "_sessID"): return SharedMemoryIO(self.oid, self._sessID) else: # Using Drop without manager, just generate a random name. - sess_id = ''.join(random.choices(string.ascii_uppercase + string.digits, k=10)) + sess_id = "".join( + random.choices(string.ascii_uppercase + string.digits, k=10) + ) return SharedMemoryIO(self.oid, sess_id) else: - raise NotImplementedError("Shared memory is only available with Python >= 3.8") + raise NotImplementedError( + "Shared memory is only available with Python >= 3.8" + ) @property def dataURL(self) -> str: @@ -1617,6 +1881,17 @@ def dataURL(self) -> str: return f"shmem://{hostname}/{os.getpid()}/{id(self._buf)}" +## +# @brief NULL +# @details A Drop not storing any data (useful for just passing on events) +# @par EAGLE_START +# @param category Memory +# @param tag daliuge +# @param[in] cparam/data_volume Data volume/0/Float/readonly/False//False/ +# \~English This never stores any data +# @param[in] cparam/group_end Group end/False/Boolean/readwrite/False//False/ +# \~English Is this node the end of a group? +# @par EAGLE_END class NullDROP(DataDROP): """ A DROP that doesn't store any data. @@ -1636,6 +1911,27 @@ class EndDROP(NullDROP): """ +## +# @brief RDBMS +# @details A Drop allowing storage and retrieval from a SQL DB. +# @par EAGLE_START +# @param category File +# @param tag template +# @param[in] cparam/data_volume Data volume/5/Float/readwrite/False//False/ +# \~English Estimated size of the data contained in this node +# @param[in] cparam/group_end Group end/False/Boolean/readwrite/False//False/ +# \~English Is this node the end of a group? +# @param[in] cparam/dbmodule Python DB module//String/readwrite/False//False/ +# \~English Load path for python DB module +# @param[in] cparam/dbtable DB table name//String/readwrite/False//False/ +# \~English The name of the table to use +# @param[in] cparam/vals Values dictionary//Json/readwrite/False//False/ +# \~English Json encoded values dictionary used for INSERT. The keys of ``vals`` are used as the column names. 
+# @param[in] cparam/condition Whats used after WHERE//String/readwrite/False//False/ +# \~English Condition for SELECT. For this the WHERE statement must be written using the "{X}" or "{}" placeholders +# @param[in] cparam/selectVals values for WHERE//Json/readwrite/False//False/ +# \~English Values for the WHERE statement +# @par EAGLE_END class RDBMSDrop(DataDROP): """ A Drop that stores data in a table of a relational database @@ -1660,6 +1956,9 @@ def initialize(self, **kwargs): # The table this Drop points at self._db_table = kwargs.pop("dbtable") + # Data store for reproducibility + self._querylog = [] + def getIO(self): # This Drop cannot be accessed directly return ErrorIO() @@ -1704,13 +2003,7 @@ def select(self, columns=None, condition=None, vals=()): # Build up SQL with optional columns and conditions columns = columns or ("*",) - sql = [ - "SELECT %s FROM %s" - % ( - ",".join(columns), - self._db_table, - ) - ] + sql = ["SELECT %s FROM %s" % (",".join(columns), self._db_table)] if condition: sql.append(" WHERE ") sql.append(condition) @@ -1720,8 +2013,11 @@ def select(self, columns=None, condition=None, vals=()): logger.debug("Executing SQL with parameters: %s / %r", sql, vals) cur.execute(sql, vals) if cur.description: - return cur.fetchall() - return [] + ret = cur.fetchall() + else: + ret = [] + self._querylog.append((sql, vals, ret)) + return ret @property def dataURL(self) -> str: @@ -1731,6 +2027,10 @@ def dataURL(self) -> str: self._db_params, ) + # Override + def generate_reproduce_data(self): + return {"query_log": self._querylog} + class ContainerDROP(DataDROP): """ @@ -1847,7 +2147,7 @@ def exists(self): # @details An object in a Apache Arrow Plasma in-memory object store # @par EAGLE_START # @param category Plasma -# @param tag template +# @param tag daliuge # @param[in] cparam/data_volume Data volume/5/Float/readwrite/False//False/ # \~English Estimated size of the data contained in this node # @param[in] cparam/group_end Group end/False/Boolean/readwrite/False//False/ @@ -1872,15 +2172,19 @@ def initialize(self, **kwargs): super().initialize(**kwargs) self.plasma_path = os.path.expandvars(self.plasma_path) if self.object_id is None: - self.object_id = np.random.bytes(20) if len(self.uid) != 20 else self.uid.encode('ascii') + self.object_id = ( + np.random.bytes(20) if len(self.uid) != 20 else self.uid.encode("ascii") + ) elif isinstance(self.object_id, str): - self.object_id = self.object_id.encode('ascii') + self.object_id = self.object_id.encode("ascii") def getIO(self): - return PlasmaIO(plasma.ObjectID(self.object_id), - self.plasma_path, - expected_size=self._expectedSize, - use_staging=self.use_staging) + return PlasmaIO( + plasma.ObjectID(self.object_id), + self.plasma_path, + expected_size=self._expectedSize, + use_staging=self.use_staging, + ) @property def dataURL(self) -> str: @@ -1893,7 +2197,7 @@ def dataURL(self) -> str: # to a Plasma in-memory object store # @par EAGLE_START # @param category PlasmaFlight -# @param tag template +# @param tag daliuge # @param[in] cparam/data_volume Data volume/5/Float/readwrite/False//False/ # \~English Estimated size of the data contained in this node # @param[in] cparam/group_end Group end/False/Boolean/readwrite/False//False/ @@ -1919,9 +2223,11 @@ def initialize(self, **kwargs): super().initialize(**kwargs) self.plasma_path = os.path.expandvars(self.plasma_path) if self.object_id is None: - self.object_id = np.random.bytes(20) if len(self.uid) != 20 else self.uid.encode('ascii') + self.object_id = ( + 
np.random.bytes(20) if len(self.uid) != 20 else self.uid.encode("ascii") + ) elif isinstance(self.object_id, str): - self.object_id = self.object_id.encode('ascii') + self.object_id = self.object_id.encode("ascii") def getIO(self): return PlasmaFlightIO( @@ -1929,7 +2235,7 @@ def getIO(self): self.plasma_path, flight_path=self.flight_path, expected_size=self._expectedSize, - use_staging=self.use_staging + use_staging=self.use_staging, ) @property @@ -2094,6 +2400,7 @@ def _notifyAppIsFinished(self): self.status = DROPStates.COMPLETED logger.debug("Moving %r to %s", self, "FINISHED" if not is_error else "ERROR") self._fire("producerFinished", status=self.status, execStatus=self.execStatus) + self.completedrop() def cancel(self): """Moves this application drop to its CANCELLED state""" @@ -2278,7 +2585,7 @@ def execute(self, _send_notifications=True): self.execStatus = AppDROPStates.RUNNING while tries < self.n_tries: try: - if hasattr(self, '_tp'): + if hasattr(self, "_tp"): proc = DlgProcess(target=self.run, daemon=True) proc.start() proc.join() @@ -2327,7 +2634,8 @@ class BarrierAppDROP(InputFiredAppDROP): def initialize(self, **kwargs): # Blindly override existing value if any kwargs["n_effective_inputs"] = -1 - super(BarrierAppDROP, self).initialize(**kwargs) + super().initialize(**kwargs) + ## # @brief Branch diff --git a/daliuge-engine/dlg/droputils.py b/daliuge-engine/dlg/droputils.py index dd929eb84..438642ef7 100644 --- a/daliuge-engine/dlg/droputils.py +++ b/daliuge-engine/dlg/droputils.py @@ -112,7 +112,8 @@ def __exit__(self, typ, value, tb): if self._test: for evt in self._evts: self._test.assertTrue( - evt.wait(to), "Waiting for DROP failed with timeout %d %s" % (to, evt) + evt.wait(to), + "Waiting for DROP failed with timeout %d %s" % (to, evt), ) @@ -242,7 +243,6 @@ def breadFirstTraverse(toVisit): # See how many arguments we should used when calling func while toVisit: - # Pay the node a visit node = toVisit.popleft() dependencies = getDownstreamObjects(node) @@ -505,7 +505,8 @@ def replace_dataurl_placeholders(cmd, inputs, outputs): return cmd -def serialize_applicationArgs(applicationArgs, prefix='--', separator=' '): + +def serialize_applicationArgs(applicationArgs, prefix="--", separator=" "): """ Unpacks the applicationArgs dictionary and returns a string that can be used as command line parameters. 
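The hunk below only reformats the argument-serialisation logic, but a small usage reference may help. The dictionary layout is inferred from the code; each entry is assumed to carry "value", "precious" and "positional" keys, and the output shown is indicative only:

    from dlg.droputils import serialize_applicationArgs

    # Hypothetical applicationArgs entries.
    app_args = {
        "cores": {"value": 4, "precious": False, "positional": False},
        "mode": {"value": "fast", "precious": False, "positional": False},
    }
    cmdline = serialize_applicationArgs(app_args, prefix="--", separator=" ")
    # Expected to yield something like "--cores 4 --mode fast"; single-letter
    # names would be rendered with a single dash, e.g. "-c 4".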
@@ -520,8 +521,8 @@ def serialize_applicationArgs(applicationArgs, prefix='--', separator=' '): for (name, vdict) in applicationArgs.items(): if vdict in [None, False, ""]: continue - elif isinstance(vdict,bool): - value = '' + elif isinstance(vdict, bool): + value = "" elif isinstance(vdict, dict): precious = vdict["precious"] value = vdict["value"] @@ -533,12 +534,12 @@ def serialize_applicationArgs(applicationArgs, prefix='--', separator=' '): pargs.append(str(value).strip()) else: if prefix == "--" and len(name) == 1: - arg = [f'-{name} {value}'] + arg = [f"-{name} {value}"] else: - arg = [f'{prefix}{name}{separator}{value}'.strip()] + arg = [f"{prefix}{name}{separator}{value}".strip()] args += arg - - return f"{' '.join(args + pargs)}" # add positional arguments to end of args + + return f"{' '.join(args + pargs)}" # add positional arguments to end of args # Easing the transition from single- to multi-package diff --git a/daliuge-engine/dlg/environmentvar_drop.py b/daliuge-engine/dlg/environmentvar_drop.py index 339a0a3cf..d0f5948c8 100644 --- a/daliuge-engine/dlg/environmentvar_drop.py +++ b/daliuge-engine/dlg/environmentvar_drop.py @@ -27,8 +27,8 @@ from dlg.drop import AbstractDROP, DEFAULT_INTERNAL_PARAMETERS from dlg.io import MemoryIO -class KeyValueDROP: +class KeyValueDROP: @abc.abstractmethod def get(self, key): """ @@ -53,15 +53,19 @@ def set(self, key, value): def _filter_parameters(parameters: dict): - return {key: val for key, val in parameters.items() if - key not in DEFAULT_INTERNAL_PARAMETERS} + return { + key: val + for key, val in parameters.items() + if key not in DEFAULT_INTERNAL_PARAMETERS + } ## # @brief Environment variables # @details A set of environment variables, wholly specified in EAGLE and accessible to all drops. # @par EAGLE_START -# @param category EnvironmentVars +# @param category EnvironmentVariables +# @param tag daliuge # @par EAGLE_END class EnvironmentVarDROP(AbstractDROP, KeyValueDROP): """ @@ -78,7 +82,7 @@ def initialize(self, **kwargs): self._variables.update(_filter_parameters(self.parameters)) def getIO(self): - return MemoryIO(io.BytesIO(json.dumps(self._variables).encode('utf-8'))) + return MemoryIO(io.BytesIO(json.dumps(self._variables).encode("utf-8"))) def get(self, key): """ @@ -98,7 +102,8 @@ def get_multiple(self, keys: list): def set(self, key, value): raise NotImplementedError( - 'Setting EnvironmentVariables mid-execution is not currently implemented') + "Setting EnvironmentVariables mid-execution is not currently implemented" + ) @property def dataURL(self) -> str: diff --git a/daliuge-engine/dlg/event.py b/daliuge-engine/dlg/event.py index f1029fec1..c05653600 100644 --- a/daliuge-engine/dlg/event.py +++ b/daliuge-engine/dlg/event.py @@ -23,7 +23,6 @@ import collections import logging - logger = logging.getLogger(__name__) diff --git a/daliuge-engine/dlg/graph_loader.py b/daliuge-engine/dlg/graph_loader.py index b6f90e466..ffd231e23 100644 --- a/daliuge-engine/dlg/graph_loader.py +++ b/daliuge-engine/dlg/graph_loader.py @@ -28,8 +28,13 @@ import importlib import logging +from dlg.common.reproducibility.constants import ReproducibilityFlags + +from numpy import isin + from . 
import droputils from .apps.socket_listener import SocketListenerApp +from .common import Categories from .ddap_protocol import DROPRel, DROPLinkType from .drop import ( ContainerDROP, @@ -42,7 +47,7 @@ NullDROP, EndDROP, PlasmaDROP, - PlasmaFlightDROP + PlasmaFlightDROP, ) from .environmentvar_drop import EnvironmentVarDROP from dlg.parset_drop import ParameterSetDROP @@ -62,7 +67,7 @@ Categories.PLASMA: PlasmaDROP, Categories.PLASMAFLIGHT: PlasmaFlightDROP, Categories.PARSET: ParameterSetDROP, - Categories.ENVIRONMENTVARS: EnvironmentVarDROP + Categories.ENVIRONMENTVARS: EnvironmentVarDROP, } try: @@ -132,13 +137,18 @@ def removeUnmetRelationships(dropSpecList): unmetRelationships = [] # Step #1: Get all OIDs - oids = {dropSpec["oid"] for dropSpec in dropSpecList} + oids = [] + for dropSpec in dropSpecList: + oid = dropSpec["oid"] + oid = list(oid.keys())[0] if isinstance(oid, dict) else oid + oids.append(oid) # Step #2: find unmet relationships and remove them from the original # DROP spec, keeping track of them for dropSpec in dropSpecList: this_oid = dropSpec["oid"] + this_oid = list(this_oid.keys())[0] if isinstance(this_oid, dict) else this_oid to_delete = [] for rel in dropSpec: @@ -150,13 +160,16 @@ def removeUnmetRelationships(dropSpecList): # Find missing OIDs in this relationship and keep track of them, # removing them from the current DROP spec - missingOids = [oid for oid in dropSpec[rel] if oid not in oids] + ds = dropSpec[rel] + if isinstance(ds[0], dict): + ds = [list(d.keys())[0] for d in ds] + missingOids = [oid for oid in ds if oid not in oids] for oid in missingOids: unmetRelationships.append(DROPRel(oid, link, this_oid)) - dropSpec[rel].remove(oid) + ds.remove(oid) # Remove the relationship list entirely if it has no elements - if not dropSpec[rel]: + if not ds: to_delete.append(rel) # N-1 relationships @@ -166,6 +179,7 @@ def removeUnmetRelationships(dropSpecList): # Check if OID is missing oid = dropSpec[rel] + oid = list(oid.keys())[0] if isinstance(oid, dict) else oid if oid in oids: continue @@ -176,7 +190,9 @@ def removeUnmetRelationships(dropSpecList): to_delete.append(rel) for rel in to_delete: - del dropSpec[rel] + ds = dropSpec[rel] + ds = list(ds.keys())[0] if isinstance(ds, dict) else ds + del ds return unmetRelationships @@ -199,10 +215,17 @@ def loadDropSpecs(dropSpecList): all DROP specifications (i.e., a dictionary of dictionaries) keyed on the OID of each DROP. Unlike `readObjectGraph` and `readObjectGraphS`, this method doesn't actually create the DROPs themselves. + + Slices off graph-wise reproducibility data for later use """ # Step #1: Check the DROP specs and collect them dropSpecs = {} + reprodata = None + if dropSpecList is None: + raise InvalidGraphException("DropSpec is empty %r" % dropSpecList) + if dropSpecList[-1].get("rmode"): + reprodata = dropSpecList.pop() for n, dropSpec in enumerate(dropSpecList): # "type" and 'oit' are mandatory @@ -216,6 +239,8 @@ def loadDropSpecs(dropSpecList): logger.debug("Found %d DROP definitions", len(dropSpecs)) # Step #2: check relationships + # TODO: shouldn't this loop be done the other way around, going through all __TOMANY + # and __TOONE and directly address the respective dropSpec attribute? 
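Before the relationship checks that follow, it helps to see the input shape this change expects: the graph-wide reproducibility record travels as the last element of the drop spec list and is split off before any per-drop processing. A minimal sketch; the spec contents are illustrative and heavily simplified, real specs carry more fields:

    from dlg.graph_loader import loadDropSpecs

    drop_spec_list = [
        {"oid": "A", "type": "plain", "storage": "Memory"},
        {"oid": "B", "type": "plain", "storage": "Memory"},
        {"rmode": "1"},  # graph-wide reprodata always rides at the end
    ]

    # loadDropSpecs now returns the validated specs plus the popped reprodata
    # entry (None when the list carries no trailing "rmode" record).
    drop_specs, reprodata = loadDropSpecs(drop_spec_list)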
for dropSpec in dropSpecList: # 1-N relationships @@ -225,19 +250,24 @@ def loadDropSpecs(dropSpecList): # A KeyError will be raised if a oid has been specified in the # relationship list but doesn't exist in the list of DROPs for oid in dropSpec[rel]: + oid = list(oid.keys())[0] if isinstance(oid, dict) else oid dropSpecs[oid] # N-1 relationships elif rel in __TOONE: + port = ( + list(dropSpecs[rel].keys()) + if isinstance(dropSpecs[rel], dict) + else dropSpecs[rel] + ) # See comment above - dropSpecs[dropSpec[rel]] + dropSpecs[port] # Done! - return dropSpecs + return dropSpecs, reprodata def createGraphFromDropSpecList(dropSpecList, session=None): - logger.debug("Found %d DROP definitions", len(dropSpecList)) # Step #1: create the actual DROPs @@ -251,6 +281,12 @@ def createGraphFromDropSpecList(dropSpecList, session=None): cf = __CREATION_FUNCTIONS[dropType] drop = cf(dropSpec, session=session) + if session is not None: + # Now using per-drop reproducibility setting. + drop.reproducibility_level = ReproducibilityFlags( + int(dropSpec.get("reprodata", {}).get("rmode", "0")) + ) + # session.reprodata['rmode'] drops[drop.oid] = drop # Step #2: establish relationships @@ -261,11 +297,12 @@ def createGraphFromDropSpecList(dropSpecList, session=None): oid = dropSpec["oid"] drop = drops[oid] - for rel in dropSpec: + for attr in dropSpec: # 1-N relationships - if rel in __TOMANY: - link = __TOMANY[rel] - for oid in dropSpec[rel]: + if attr in __TOMANY: + link = __TOMANY[attr] + for rel in dropSpec[attr]: + oid = list(rel.keys())[0] if isinstance(rel, dict) else rel lhDrop = drops[oid] relFuncName = LINKTYPE_1TON_APPEND_METHOD[link] try: @@ -281,9 +318,11 @@ def createGraphFromDropSpecList(dropSpecList, session=None): relFunc(lhDrop) # N-1 relationships - elif rel in __TOONE: - link = __TOONE[rel] - lhDrop = drops[dropSpec[rel]] + elif attr in __TOONE: + link = __TOONE[attr] + rel = dropSpec[attr] + rel = list(rel.keys())[0] if isinstance(rel, dict) else rel + lhDrop = drops[rel] propName = LINKTYPE_NTO1_PROPERTY[link] setattr(drop, propName, lhDrop) @@ -345,7 +384,6 @@ def _createSocket(dropSpec, dryRun=False, session=None): def _createApp(dropSpec, dryRun=False, session=None): oid, uid = _getIds(dropSpec) kwargs = _getKwargs(dropSpec) - del kwargs["app"] appName = dropSpec[DropType.APP] parts = appName.split(".") @@ -359,11 +397,7 @@ def _createApp(dropSpec, dryRun=False, session=None): appType = getattr(module, parts[-1]) except (ImportError, AttributeError): raise InvalidGraphException( - "drop %s specifies non-existent application: %s" - % ( - oid, - appName, - ) + "drop %s specifies non-existent application: %s" % (oid, appName) ) if dryRun: @@ -381,10 +415,17 @@ def _getIds(dropSpec): def _getKwargs(dropSpec): + REMOVE = [ + "oid", + "uid", + "app", + ] kwargs = dict(dropSpec) - del kwargs["oid"] - if "uid" in kwargs: - del kwargs["uid"] + for kw in REMOVE: + if kw in kwargs: + del kwargs[kw] + for name, spec in dropSpec.get("applicationArgs", dict()).items(): + kwargs[name] = spec["value"] return kwargs diff --git a/daliuge-engine/dlg/io.py b/daliuge-engine/dlg/io.py index e1b0c5e39..1201226a7 100644 --- a/daliuge-engine/dlg/io.py +++ b/daliuge-engine/dlg/io.py @@ -36,6 +36,7 @@ import pyarrow import pyarrow.plasma as plasma + if sys.version_info >= (3, 8): from dlg.shared_memory import DlgSharedMemory @@ -47,10 +48,11 @@ class OpenMode: """ Open Mode for Data Drops """ + OPEN_WRITE, OPEN_READ = range(2) -class DataIO(): +class DataIO: """ A class used to read/write data stored in a 
particular kind of storage in an abstract way. This base class simply declares a number of methods that @@ -240,6 +242,7 @@ class MemoryIO(DataIO): A DataIO class that reads/write from/into the BytesIO object given at construction time """ + _desc: io.BytesIO # TODO: This might actually be a problem def __init__(self, buf: io.BytesIO, **kwargs): @@ -299,7 +302,7 @@ class SharedMemoryIO(DataIO): def __init__(self, uid, session_id, **kwargs): super().__init__() - self._name = f'{session_id}_{uid}' + self._name = f"{session_id}_{uid}" self._written = 0 self._pos = 0 self._buf = None @@ -318,10 +321,10 @@ def _write(self, data, **kwargs) -> int: total_size = len(data) + self._written if total_size > self._buf.size: self._buf.resize(total_size) - self._buf.buf[self._written:total_size] = data + self._buf.buf[self._written : total_size] = data self._written = total_size else: - self._buf.buf[self._written:total_size] = data + self._buf.buf[self._written : total_size] = data self._written = total_size self._buf.resize(total_size) # It may be inefficient to resize many times, but assuming data is written 'once' this is @@ -363,14 +366,16 @@ class FileIO(DataIO): """ A file-based implementation of DataIO """ + _desc: io.BufferedReader + def __init__(self, filename, **kwargs): super().__init__() self._fnm = filename def _open(self, **kwargs) -> io.BufferedReader: - flag = 'r' if self._mode is OpenMode.OPEN_READ else 'w' - flag += 'b' + flag = "r" if self._mode is OpenMode.OPEN_READ else "w" + flag += "b" return open(self._fnm, flag) @overrides @@ -476,11 +481,11 @@ def _close(self, **kwargs): dataSize=self._writtenDataSize, ) self._buf = None - if reply != 200: + if reply.http_status != 200: # Probably msg is not enough, we need to unpack the status XML doc # from the returning data and extract the real error message from # there - raise Exception(msg) + raise Exception(reply.message) # Release the reference to _desc so the client object gets destroyed del self._desc @@ -492,7 +497,10 @@ def _read(self, count, **kwargs): @overrides def _write(self, data, **kwargs) -> int: - self._buf += data + if type(data) == bytes: + self._buf += str(data) + else: + self._buf += data self._writtenDataSize += len(data) return len(data) @@ -535,6 +543,7 @@ class NgasLiteIO(DataIO): The `ngaslite` module doesn't support the STATUS command yet, and because of that this class will throw an error if its `exists` method is invoked. """ + _desc: HTTPConnection def __init__( @@ -668,14 +677,15 @@ class PlasmaIO(DataIO): memory buffers. Note: not compatible with PlasmaClient put()/get() which performs data pickling before writing. """ + _desc: plasma.PlasmaClient def __init__( self, object_id: plasma.ObjectID, plasma_path="/tmp/plasma", - expected_size: Optional[int]=None, - use_staging=False + expected_size: Optional[int] = None, + use_staging=False, ): """Initializer Args: @@ -690,7 +700,9 @@ def __init__( self._reader = None self._writer = None # treat sizes <1 as None - self._expected_size = expected_size if expected_size and expected_size > 0 else None + self._expected_size = ( + expected_size if expected_size and expected_size > 0 else None + ) self._buffer_size = 0 self._use_staging = use_staging @@ -725,7 +737,9 @@ def _write(self, data, **kwargs) -> int: If use_staging is True, any number of writes may occur with a small performance penalty. 
""" # NOTE: data must be a collection of bytes for len to represent the buffer bytesize - assert isinstance(data, Union[memoryview, bytes, bytearray, pyarrow.Buffer].__args__) + assert isinstance( + data, Union[memoryview, bytes, bytearray, pyarrow.Buffer].__args__ + ) databytes = data.nbytes if isinstance(data, memoryview) else len(data) if self._use_staging: @@ -735,13 +749,23 @@ def _write(self, data, **kwargs) -> int: else: if not self._writer: # write directly into fixed size plasma buffer - self._buffer_size = self._expected_size if self._expected_size is not None else databytes + self._buffer_size = ( + self._expected_size + if self._expected_size is not None + else databytes + ) plasma_buffer = self._desc.create(self._object_id, self._buffer_size) self._writer = pyarrow.FixedSizeBufferWriter(plasma_buffer) if self._writer.tell() + databytes > self._buffer_size: - raise IOError("".join([f"attempted to write {self._writer.tell() + databytes} ", - f"bytes to plasma buffer of size {self._buffer_size}, ", - "consider using staging or expected_size argument"])) + raise IOError( + "".join( + [ + f"attempted to write {self._writer.tell() + databytes} ", + f"bytes to plasma buffer of size {self._buffer_size}, ", + "consider using staging or expected_size argument", + ] + ) + ) self._writer.write(data) return len(data) @@ -768,6 +792,7 @@ class PlasmaFlightIO(DataIO): """ A plasma drop managed by an arrow flight network protocol """ + _desc: PlasmaFlightClient def __init__( @@ -776,7 +801,7 @@ def __init__( plasma_path="/tmp/plasma", flight_path: Optional[str] = None, expected_size: Optional[int] = None, - use_staging = False + use_staging=False, ): super().__init__() self._object_id = object_id @@ -785,7 +810,9 @@ def __init__( self._reader = None self._writer = None # treat sizes <1 as None - self._expected_size = expected_size if expected_size and expected_size > 0 else None + self._expected_size = ( + expected_size if expected_size and expected_size > 0 else None + ) self._buffer_size = 0 self._use_staging = use_staging @@ -799,7 +826,9 @@ def _close(self, **kwargs): self._writer.close() else: if self._expected_size != self._writer.tell(): - logger.debug(f"written {self._writer.tell()} but expected {self._expected_size} bytes") + logger.debug( + f"written {self._writer.tell()} but expected {self._expected_size} bytes" + ) self._desc.seal(self._object_id) if self._reader: self._reader.close() @@ -813,16 +842,24 @@ def _read(self, count, **kwargs): def _write(self, data, **kwargs) -> int: # NOTE: data must be a collection of bytes for len to represent the buffer bytesize - assert isinstance(data, Union[memoryview, bytes, bytearray, pyarrow.Buffer].__args__) + assert isinstance( + data, Union[memoryview, bytes, bytearray, pyarrow.Buffer].__args__ + ) databytes = data.nbytes if isinstance(data, memoryview) else len(data) if not self._writer: if self._use_staging: # stream into resizeable buffer - logger.warning("Using dynamically sized Plasma buffer. Performance may be reduced.") + logger.warning( + "Using dynamically sized Plasma buffer. Performance may be reduced." 
+ ) self._writer = io.BytesIO() else: # write directly to fixed size plasma buffer - self._buffer_size = self._expected_size if self._expected_size is not None else databytes + self._buffer_size = ( + self._expected_size + if self._expected_size is not None + else databytes + ) plasma_buffer = self._desc.create(self._object_id, self._buffer_size) self._writer = pyarrow.FixedSizeBufferWriter(plasma_buffer) self._writer.write(data) diff --git a/daliuge-engine/dlg/lifecycle/dlm.py b/daliuge-engine/dlg/lifecycle/dlm.py index 061cc0031..36721315c 100644 --- a/daliuge-engine/dlg/lifecycle/dlm.py +++ b/daliuge-engine/dlg/lifecycle/dlm.py @@ -130,7 +130,6 @@ from ..ddap_protocol import DROPStates, DROPPhases, AppDROPStates from ..drop import ContainerDROP - logger = logging.getLogger(__name__) diff --git a/daliuge-engine/dlg/lifecycle/hsm/manager.py b/daliuge-engine/dlg/lifecycle/hsm/manager.py index 295c84f84..1e6ac89fb 100644 --- a/daliuge-engine/dlg/lifecycle/hsm/manager.py +++ b/daliuge-engine/dlg/lifecycle/hsm/manager.py @@ -30,7 +30,6 @@ from ..hsm import store - logger = logging.getLogger(__name__) diff --git a/daliuge-engine/dlg/lifecycle/hsm/store.py b/daliuge-engine/dlg/lifecycle/hsm/store.py index d7e8eb940..389b3f55e 100644 --- a/daliuge-engine/dlg/lifecycle/hsm/store.py +++ b/daliuge-engine/dlg/lifecycle/hsm/store.py @@ -27,16 +27,15 @@ @author: rtobar """ -from abc import ABCMeta, abstractmethod import json import logging import os +from abc import ABCMeta, abstractmethod import psutil from ...drop import FileDROP, InMemoryDROP, NgasDROP - logger = logging.getLogger(__name__) diff --git a/daliuge-engine/dlg/lifecycle/registry.py b/daliuge-engine/dlg/lifecycle/registry.py index 29d93aea0..61faffef9 100644 --- a/daliuge-engine/dlg/lifecycle/registry.py +++ b/daliuge-engine/dlg/lifecycle/registry.py @@ -31,15 +31,14 @@ @author: rtobar """ -from abc import abstractmethod, ABCMeta import importlib import logging import time +from abc import abstractmethod, ABCMeta from ..ddap_protocol import DROPPhases from ..utils import prepare_sql - logger = logging.getLogger(__name__) diff --git a/daliuge-engine/dlg/manager/client.py b/daliuge-engine/dlg/manager/client.py index 453a7fe89..0a7be9a78 100644 --- a/daliuge-engine/dlg/manager/client.py +++ b/daliuge-engine/dlg/manager/client.py @@ -22,7 +22,6 @@ """Backwards compatibility for client""" from .. import clients - BaseDROPManagerClient = clients.BaseDROPManagerClient NodeManagerClient = clients.NodeManagerClient CompositeManagerClient = clients.CompositeManagerClient diff --git a/daliuge-engine/dlg/manager/cmdline.py b/daliuge-engine/dlg/manager/cmdline.py index 15ed7403f..ad33aa71c 100644 --- a/daliuge-engine/dlg/manager/cmdline.py +++ b/daliuge-engine/dlg/manager/cmdline.py @@ -50,10 +50,10 @@ _terminating = False -class DlgFormatter(logging.Formatter): +class DlgFormatter(logging.Formatter): def format(self, record: logging.LogRecord) -> str: - arg_pattern = re.compile(r'%\((\w+)\)') + arg_pattern = re.compile(r"%\((\w+)\)") arg_names = [x.group(1) for x in arg_pattern.finditer(self._fmt)] for field in arg_names: if field not in record.__dict__: @@ -263,7 +263,7 @@ def start(options, parser): else: working_dir = options.work_dir or "." 
tree = "/settings" - utils.createDirIfMissing(working_dir+tree) + utils.createDirIfMissing(working_dir + tree) os.chdir(working_dir) launchServer(options) diff --git a/daliuge-engine/dlg/manager/composite_manager.py b/daliuge-engine/dlg/manager/composite_manager.py index 28512ba6b..e2b43b356 100644 --- a/daliuge-engine/dlg/manager/composite_manager.py +++ b/daliuge-engine/dlg/manager/composite_manager.py @@ -30,12 +30,12 @@ from .client import NodeManagerClient from .constants import ISLAND_DEFAULT_REST_PORT, NODE_DEFAULT_REST_PORT from .drop_manager import DROPManager -from .. import remote, graph_loader +from .. import graph_loader +from ..common.reproducibility.reproducibility import init_pg_repro_data from ..ddap_protocol import DROPRel from ..exceptions import InvalidGraphException, DaliugeException, SubManagerException from ..utils import portIsOpen - logger = logging.getLogger(__name__) @@ -45,8 +45,15 @@ def uid_for_drop(dropSpec): return dropSpec["oid"] -def sanitize_relations(interDMRelations, graph): +def sanitize_link(link): + """ + Links can now be dictionaries, but we only need + the key. + """ + return list(link.keys())[0] if isinstance(link, dict) else link + +def sanitize_relations(interDMRelations, graph): # TODO: Big change required to remove this hack here # # Values in the interDMRelations array use OIDs to identify drops. @@ -79,8 +86,11 @@ def sanitize_relations(interDMRelations, graph): # know about the OIDs. newDMRelations = [] for rel in interDMRelations: - lhs = uid_for_drop(graph[rel.lhs]) - rhs = uid_for_drop(graph[rel.rhs]) + lhs = rel.lhs + lhs = sanitize_link(rel.lhs) + lhs = uid_for_drop(graph[lhs]) + rhs = sanitize_link(rel.rhs) + rhs = uid_for_drop(graph[rhs]) new_rel = DROPRel(lhs, rel.rel, rhs) newDMRelations.append(new_rel) interDMRelations[:] = newDMRelations @@ -339,6 +349,16 @@ def addGraphSpec(self, sessionId, graphSpec): # attribute set logger.info(f"Separating graph using partition attribute {self._partitionAttr}") perPartition = collections.defaultdict(list) + try: + if graphSpec[-1]["rmode"] is not None: + init_pg_repro_data(graphSpec) + self._graph["reprodata"] = graphSpec.pop() + logger.debug( + "Composite manager found reprodata in dropspecList, rmode=%s", + self._graph["reprodata"]["rmode"], + ) + except KeyError: + pass for dropSpec in graphSpec: if self._partitionAttr not in dropSpec: msg = "Drop %s doesn't specify a %s attribute" % ( @@ -360,7 +380,6 @@ def addGraphSpec(self, sessionId, graphSpec): # Add the drop specs to our graph self._graph[uid_for_drop(dropSpec)] = dropSpec - # At each partition the relationships between DROPs should be local at the # moment of submitting the graph; thus we record the inter-partition # relationships separately and remove them from the original graph spec @@ -391,6 +410,9 @@ def addGraphSpec(self, sessionId, graphSpec): # Create the individual graphs on each DM now that they are correctly # separated. logger.info("Adding individual graphSpec of session %s to each DM", sessionId) + for partition in perPartition: + if self._graph.get("reprodata") is not None: + perPartition[partition].append(self._graph["reprodata"]) self.replicate( sessionId, self._addGraphSpec, diff --git a/daliuge-engine/dlg/manager/constants.py b/daliuge-engine/dlg/manager/constants.py index 1170057c8..ed6500138 100644 --- a/daliuge-engine/dlg/manager/constants.py +++ b/daliuge-engine/dlg/manager/constants.py @@ -22,7 +22,6 @@ """Backwards compatibility for constants""" from .. 
import constants - DEFAULT_PORTS = constants.DEFAULT_PORTS NODE_DEFAULT_REST_PORT = constants.NODE_DEFAULT_REST_PORT ISLAND_DEFAULT_REST_PORT = constants.ISLAND_DEFAULT_REST_PORT diff --git a/daliuge-engine/dlg/manager/drop_manager.py b/daliuge-engine/dlg/manager/drop_manager.py index d3148c164..2ba4f860f 100644 --- a/daliuge-engine/dlg/manager/drop_manager.py +++ b/daliuge-engine/dlg/manager/drop_manager.py @@ -68,6 +68,13 @@ def getSessionStatus(self, sessionId): Returns the status of the session `sessionId`. """ + @abc.abstractmethod + def getSessionReproStatus(self, sessionId): + """ + Returns the reproducibility status of the session `sessionId`. + Not guaranteed to be identical to the usual SessionStatus. + """ + @abc.abstractmethod def addGraphSpec(self, sessionId, graphSpec): """ @@ -89,6 +96,12 @@ def getGraph(self, sessionId): `sessionId`. """ + @abc.abstractmethod + def getGraphReproData(self, sessionId): + """ + Returns the graph-wide reproducibility data for session `sessionId` + """ + @abc.abstractmethod def deploySession(self, sessionId, completedDrops=[]): """ diff --git a/daliuge-engine/dlg/manager/node_manager.py b/daliuge-engine/dlg/manager/node_manager.py index f0b3017b6..762ccae1e 100644 --- a/daliuge-engine/dlg/manager/node_manager.py +++ b/daliuge-engine/dlg/manager/node_manager.py @@ -127,13 +127,13 @@ class NodeManagerBase(DROPManager): __metaclass__ = abc.ABCMeta def __init__( - self, - useDLM=False, - dlgPath=None, - error_listener=None, - event_listeners=[], - max_threads=0, - logdir=utils.getDlgLogsDir(), + self, + useDLM=False, + dlgPath=None, + error_listener=None, + event_listeners=[], + max_threads=0, + logdir=utils.getDlgLogsDir(), ): self._dlm = DataLifecycleManager() if useDLM else None @@ -149,8 +149,8 @@ def __init__( sys.path.append(dlgPath) # we also add underlying site-packages dir to support # the --prefix installation of code - pyVer = f'{sys.version_info.major}.{sys.version_info.minor}' - extraPath = f'{dlgPath}/lib/python{pyVer}/site-packages' + pyVer = f"{sys.version_info.major}.{sys.version_info.minor}" + extraPath = f"{dlgPath}/lib/python{pyVer}/site-packages" logger.info("Adding %s to the system path", extraPath) sys.path.append(extraPath) @@ -173,7 +173,9 @@ def __init__( self._memoryManager = DlgSharedMemoryManager() if max_threads > 1: logger.info("Initializing thread pool with %d threads", max_threads) - self._threadpool = multiprocessing.pool.ThreadPool(processes=max_threads) + self._threadpool = multiprocessing.pool.ThreadPool( + processes=max_threads + ) # Event handler that only logs status changes debugging = logger.isEnabledFor(logging.DEBUG) @@ -220,7 +222,8 @@ def deliver_event(self, evt): """ if not evt.session_id in self._sessions: logger.warning( - "No session %s found, event (%s) will be dropped" % (evt.session_id, evt.type) + "No session %s found, event (%s) will be dropped" + % (evt.session_id, evt.type) ) return self._sessions[evt.session_id].deliver_event(evt) @@ -239,6 +242,12 @@ def getSessionStatus(self, sessionId): self._check_session_id(sessionId) return self._sessions[sessionId].status + def getSessionReproStatus(self, sessionId): + return self._sessions[sessionId].reprostatus + + def getGraphReproData(self, sessionId): + return self._sessions[sessionId].reprodata + def linkGraphParts(self, sessionId, lhOID, rhOID, linkType): self._check_session_id(sessionId) self._sessions[sessionId].linkGraphParts(lhOID, rhOID, linkType) @@ -253,6 +262,7 @@ def getGraphStatus(self, sessionId): def getGraph(self, sessionId): 
self._check_session_id(sessionId) + # TODO: Ensure returns reproducibility data. return self._sessions[sessionId].getGraph() def getLogDir(self): @@ -261,7 +271,7 @@ def getLogDir(self): def deploySession(self, sessionId, completedDrops=[]): self._check_session_id(sessionId) session = self._sessions[sessionId] - if hasattr(self, '_memoryManager'): + if hasattr(self, "_memoryManager"): self._memoryManager.register_session(sessionId) def foreach(drop): @@ -274,7 +284,8 @@ def foreach(drop): elif isinstance(drop, SharedMemoryDROP): if sys.version_info < (3, 8): raise NotImplementedError( - "Shared memory is not implemented when using Python < 3.8") + "Shared memory is not implemented when using Python < 3.8" + ) drop._sessID = sessionId self._memoryManager.register_drop(drop.uid, sessionId) if self._dlm: @@ -291,6 +302,7 @@ def foreach(drop): log_evt_listener = self._logging_event_listener if log_evt_listener: drop.subscribe(log_evt_listener, "status") + drop.subscribe(log_evt_listener, "reproducibility") if isinstance(drop, AppDROP): drop.subscribe(log_evt_listener, "execStatus") @@ -312,7 +324,7 @@ def destroySession(self, sessionId): logger.info("Destroying session: %s", sessionId) self._check_session_id(sessionId) session = self._sessions.pop(sessionId) - if hasattr(self, '_memoryManager'): + if hasattr(self, "_memoryManager"): self._memoryManager.shutdown_session(sessionId) session.destroy() @@ -363,7 +375,7 @@ def call_drop(self, sessionId, uid, method, *args): return self._sessions[sessionId].call_drop(uid, method, *args) def shutdown(self): - if hasattr(self, '_threadpool') and self._threadpool is not None: + if hasattr(self, "_threadpool") and self._threadpool is not None: self._threadpool.close() self._threadpool.join() @@ -558,16 +570,16 @@ class RpcMixIn(rpc.RPCClient, rpc.RPCServer): # Final NodeManager class class NodeManager(EventMixIn, RpcMixIn, NodeManagerBase): def __init__( - self, - useDLM=True, - dlgPath=utils.getDlgPath(), - error_listener=None, - event_listeners=[], - max_threads=0, - logdir=utils.getDlgLogsDir(), - host=None, - rpc_port=constants.NODE_DEFAULT_RPC_PORT, - events_port=constants.NODE_DEFAULT_EVENTS_PORT, + self, + useDLM=True, + dlgPath=utils.getDlgPath(), + error_listener=None, + event_listeners=[], + max_threads=0, + logdir=utils.getDlgLogsDir(), + host=None, + rpc_port=constants.NODE_DEFAULT_RPC_PORT, + events_port=constants.NODE_DEFAULT_EVENTS_PORT, ): # We "just know" that our RpcMixIn will have a create_context static # method, which in reality means we are using the ZeroRPCServer class diff --git a/daliuge-engine/dlg/manager/proc_daemon.py b/daliuge-engine/dlg/manager/proc_daemon.py index 9e7931c3a..5244a30a2 100644 --- a/daliuge-engine/dlg/manager/proc_daemon.py +++ b/daliuge-engine/dlg/manager/proc_daemon.py @@ -38,7 +38,6 @@ from .. 
import utils from ..restserver import RestServer - logger = logging.getLogger(__name__) diff --git a/daliuge-engine/dlg/manager/replay.py b/daliuge-engine/dlg/manager/replay.py index d44622cb4..0381023c8 100644 --- a/daliuge-engine/dlg/manager/replay.py +++ b/daliuge-engine/dlg/manager/replay.py @@ -30,7 +30,6 @@ from .session import SessionStates from ..exceptions import NoSessionException, InvalidSessionState - logger = logging.getLogger(__name__) build_step = 3 diff --git a/daliuge-engine/dlg/manager/rest.py b/daliuge-engine/dlg/manager/rest.py index f257069ff..5d1af58bf 100644 --- a/daliuge-engine/dlg/manager/rest.py +++ b/daliuge-engine/dlg/manager/rest.py @@ -158,6 +158,13 @@ def __init__(self, dm, maxreqsize=10): app.get("/api/sessions/<sessionId>/graph/size", callback=self.getGraphSize) app.get("/api/sessions/<sessionId>/graph/status", callback=self.getGraphStatus) app.post("/api/sessions/<sessionId>/graph/append", callback=self.addGraphParts) + app.get( + "/api/sessions/<sessionId>/repro/data", callback=self.getSessionReproData + ) + app.get( + "/api/sessions/<sessionId>/repro/status", + callback=self.getSessionReproStatus, + ) app.route("/api/sessions", method="OPTIONS", callback=self.acceptPreflight) app.route( @@ -233,6 +240,31 @@ def getSessionInformation(self, sessionId): status = 0 return {"status": status, "graph": graphDict} + @daliuge_aware + def getSessionReproStatus(self, sessionId): + return self.dm.getSessionReproStatus(sessionId) + + @daliuge_aware + def getSessionsReproStatus(self): + sessions = [] + for sessionId in self.dm.getSessionIds(): + sessions.append( + { + "sessionId": sessionId, + "status": self.dm.getSessionStatus(sessionId), + "size": self.dm.getGraphSize(sessionId), + "repro": self.dm.getSessionReproStatus(sessionId), + } + ) + return sessions + + @daliuge_aware + def getSessionReproData(self, sessionId): + # For now, we only have information on a per-graph basis.
+ graphDict = self.dm.getGraph(sessionId) + reprodata = self.dm.getGraphReproData(sessionId) + return {"graph": graphDict, "reprodata": reprodata} + @daliuge_aware def destroySession(self, sessionId): self.dm.destroySession(sessionId) @@ -268,7 +300,12 @@ def getGraphStatus(self, sessionId): # TODO: addGraphParts v/s addGraphSpec @daliuge_aware def addGraphParts(self, sessionId): - if bottle.request.content_type != "application/json": + # WARNING: TODO: Somehow, the content_type can be overwritten to 'text/plain' + logger.debug(bottle.request.content_type) + if ( + "application/json" not in bottle.request.content_type + and "text/plain" not in bottle.request.content_type + ): bottle.response.status = 415 return @@ -338,10 +375,12 @@ def shutdown_node_manager(self): def getNMStatus(self): # we currently return the sessionIds, more things might be added in the # future + logger.debug("NM REST call: status") return {"sessions": self.sessions()} @daliuge_aware def getLogFile(self, sessionId): + logger.debug("NM REST call: logfile") logdir = self.dm.getLogDir() logfile = generateLogFileName(logdir, sessionId) if not os.path.isfile(logfile): @@ -352,6 +391,7 @@ def getLogFile(self, sessionId): @daliuge_aware def linkGraphParts(self, sessionId): + logger.debug("NM REST call: graph/link") params = bottle.request.params lhOID = params["lhOID"] rhOID = params["rhOID"] @@ -360,6 +400,7 @@ def linkGraphParts(self, sessionId): @daliuge_aware def add_node_subscriptions(self, sessionId): + logger.debug(f"NM REST call: add_subscriptions {bottle.request.json}") if bottle.request.content_type != "application/json": bottle.response.status = 415 return diff --git a/daliuge-engine/dlg/manager/session.py b/daliuge-engine/dlg/manager/session.py index face036e3..2ea5e307a 100644 --- a/daliuge-engine/dlg/manager/session.py +++ b/daliuge-engine/dlg/manager/session.py @@ -25,16 +25,22 @@ import collections import inspect +import json import logging +import os import threading import time import socket +from dlg.common.reproducibility.reproducibility import init_runtime_repro_data +from dlg.utils import createDirIfMissing + from . import constants from .. import droputils from .. import graph_loader from .. import rpc from .. 
import utils +from ..common.reproducibility.constants import ReproducibilityFlags, ALL_RMODES from ..ddap_protocol import DROPLinkType, DROPRel, DROPStates from ..drop import ( AbstractDROP, @@ -51,7 +57,6 @@ DaliugeException, ) - logger = logging.getLogger(__name__) @@ -83,6 +88,25 @@ def handleEvent(self, evt): self._session.finish() +class ReproFinishedListener(object): + def __init__(self, graph, session): + self._session = session + self._nexpected = len(graph) + self._completed = 0 + + def handleEvent(self, evt): + self._completed += 1 + self._session.append_reprodata(evt.oid, evt.reprodata) + logger.debug( + "%d/%d drops filed reproducibility", self._completed, self._nexpected + ) + if self._completed == self._nexpected: + logger.debug("Building Reproducibility BlockDAG") + init_runtime_repro_data(self._session._graph, self._session._graphreprodata) + self._session.reprostatus = True + self._session.write_reprodata() + + class EndListener(object): """ Listener for an EndDROP that will end the session when complete @@ -134,6 +158,8 @@ def __init__(self, sessionId, nm=None): self._error_status_listener = None self._nm = nm self._dropsubs = {} + self._graphreprodata = None + self._reprofinished = False class SessionFilter(logging.Filter): def __init__(self, sessionId): @@ -184,6 +210,27 @@ def roots(self): def drops(self): return self._drops + @property + def reprodata(self): + return self._graphreprodata + + @property + def reprostatus(self): + return self._reprofinished + + @reprostatus.setter + def reprostatus(self, status): + with self._statusLock: # TODO: Consider creating another lock + self._reprofinished = status + + def write_reprodata(self): + parts = [utils.getDlgLogsDir(), self._sessionId] + the_dir = os.path.abspath(os.path.normpath(os.path.join(*parts))) + createDirIfMissing(the_dir) + the_path = os.path.join(the_dir, "reprodata.out") + with open(the_path, "w+", encoding="utf-8") as file: + json.dump([self._graph, self._graphreprodata], file, indent=4) + @track_current_session def addGraphSpec(self, graphSpec): """ @@ -193,7 +240,10 @@ def addGraphSpec(self, graphSpec): DROP. Each DROP specification is checked to see it contains all the necessary details to construct a proper DROP. If one DROP specification is found to be inconsistent the whole operation - fill wail. + will fail. + + This operation also 'slices off' a dictionary containing graph-wide + reproducibility information. This is stored as a class variable for later use. 
Adding graph specs to the session is only allowed while the session is in the PRISTINE or BUILDING status; otherwise an exception will be @@ -214,8 +264,7 @@ def addGraphSpec(self, graphSpec): self.status = SessionStates.BUILDING # This will check the consistency of each dropSpec - graphSpecDict = graph_loader.loadDropSpecs(graphSpec) - + graphSpecDict, self._graphreprodata = graph_loader.loadDropSpecs(graphSpec) # Check for duplicates duplicates = set(graphSpecDict) & set(self._graph) if duplicates: @@ -299,6 +348,9 @@ def deploy(self, completedDrops=[], event_listeners=[], foreach=None): ) logger.info("%d drops successfully created", len(self._graph)) + # Add listeners for reproducibility information + repro_listener = ReproFinishedListener(self._graph, self) + for drop, _ in droputils.breadFirstTraverse(self._roots): # Register them @@ -313,6 +365,8 @@ def deploy(self, completedDrops=[], event_listeners=[], foreach=None): # Register them with the error handler for l in event_listeners: drop.subscribe(l) + # Register each drop for reproducibility listening + drop.subscribe(repro_listener, "reproducibility") logger.info("Stored all drops, proceeding with further customization") @@ -391,7 +445,7 @@ def deliver_event(self, evt): Called when an event has been fired by a remote drop. The event is then delivered to the interested drops of this session. """ - if not evt.uid in self._dropsubs: + if evt.uid not in self._dropsubs: logger.debug("No subscription found for drop %s", evt.uid) return for tgt in self._dropsubs[evt.uid]: @@ -456,6 +510,29 @@ def add_node_subscriptions(self, relationships): self._proxyinfo.append((host, rpc_port, local_uid, mname, remote_uid)) + def append_reprodata(self, oid, reprodata): + if oid in self._graph: + if self._graph[oid].get("reprodata") is None: + return + if self._graph[oid]["reprodata"]["rmode"] == str( + ReproducibilityFlags.ALL.value + ): + drop_reprodata = reprodata.get("data", {}) + drop_hashes = reprodata.get("merkleroot", {}) + for rmode in ALL_RMODES: + self._graph[oid]["reprodata"][rmode.name][ + "rg_data" + ] = drop_reprodata[rmode.name] + self._graph[oid]["reprodata"][rmode.name]["rg_data"][ + "merkleroot" + ] = drop_hashes.get(rmode.name, b"") + + else: + self._graph[oid]["reprodata"]["rg_data"] = reprodata.get("data", {}) + self._graph[oid]["reprodata"]["rg_data"]["merkleroot"] = reprodata.get( + "merkleroot", b"" + ) + @track_current_session def finish(self): self.status = SessionStates.FINISHED diff --git a/daliuge-engine/dlg/manager/shared_memory_manager.py b/daliuge-engine/dlg/manager/shared_memory_manager.py index f03ee7fae..3ca7e1473 100644 --- a/daliuge-engine/dlg/manager/shared_memory_manager.py +++ b/daliuge-engine/dlg/manager/shared_memory_manager.py @@ -31,13 +31,13 @@ def _cleanup_block(session_id, name): - mem = DlgSharedMemory(f'{session_id}_{name}') + mem = DlgSharedMemory(f"{session_id}_{name}") mem.close() mem.unlink() # It is unlinking that is critical to freeing resources from the OS def _close_block(session_id, name): - mem = DlgSharedMemory(f'{session_id}_{name}') + mem = DlgSharedMemory(f"{session_id}_{name}") mem.close() diff --git a/daliuge-engine/dlg/manager/web/dim.html b/daliuge-engine/dlg/manager/web/dim.html index 97283eb3a..92deb9280 100644 --- a/daliuge-engine/dlg/manager/web/dim.html +++ b/daliuge-engine/dlg/manager/web/dim.html @@ -31,78 +31,91 @@
[The body of this hunk is the HTML markup of the Data Island Manager page: a navigation list plus the "Sessions" table (columns Session ID, State, # Drops, Details) and the "Nodes" table (columns Node, Details). The markup was mangled during extraction and is not reproduced here; the hunk restructures these tables, and the new version of the file ends without a trailing newline.]
diff --git a/daliuge-engine/dlg/manager/web/dm.html b/daliuge-engine/dlg/manager/web/dm.html
index e0d0313b0..9a69535e6 100644
--- a/daliuge-engine/dlg/manager/web/dm.html
+++ b/daliuge-engine/dlg/manager/web/dm.html
@@ -31,7 +31,7 @@
[A one-line markup change in dm.html; the HTML content was also lost in extraction.]
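For reference, the new reproducibility endpoints registered in rest.py above can be exercised with a plain HTTP client. A minimal sketch, assuming a Node Manager listening on 127.0.0.1:8000, a hypothetical session id, and that the REST responses decode as plain JSON bodies (the <sessionId> route parameter follows the bottle routes shown in the diff):

    import json
    import urllib.request

    HOST = "http://127.0.0.1:8000"  # assumed manager address
    SESSION = "my-session"          # hypothetical session id

    def get_json(path):
        # Fetch and decode one JSON payload from the manager REST API.
        with urllib.request.urlopen(HOST + path) as resp:
            return json.loads(resp.read().decode())

    # Reproducibility status of a single session (True once the BlockDAG is built).
    status = get_json(f"/api/sessions/{SESSION}/repro/status")

    # Graph plus graph-wide reproducibility data for the same session.
    payload = get_json(f"/api/sessions/{SESSION}/repro/data")
    print(status, list(payload))

Session.write_reprodata() in the session.py hunk also persists the finished graph and its graph-wide reproducibility block to reprodata.out under the session's log directory, so the same information can be inspected offline. A sketch, assuming the default log-directory layout under ~/dlg/logs:

    import json
    import os

    logs_dir = os.path.expanduser("~/dlg/logs")  # assumed DALiuGE logs dir
    session_id = "my-session"                    # hypothetical session id

    path = os.path.join(logs_dir, session_id, "reprodata.out")
    with open(path, encoding="utf-8") as fh:
        graph, graph_reprodata = json.load(fh)   # dumped as a two-element list

    # Each drop spec carries its runtime reproducibility block under "reprodata".
    for oid, spec in graph.items():
        print(oid, spec.get("reprodata", {}).get("rg_data", {}))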