In [None]:
# default_exp experiment.tracking

# Flow Experiment Tracking

`sacred` ...

An issue that prevents greater adoption of the SIO stack (`sacred`/`incense`/`omniboard`) is its dependence on an external service, namely MongoDB.

> This `sacred` observer stores experiment data in a data lake. All data is written to block storage under a root experiment directory. Each experiment component, e.g. artifacts, metrics, or runs, is stored in its own directory. Components like runs and metrics can be queried using a lake-compatible query engine with a client ODBC driver. Files and other nested/unstructured entities can be accessed from the block storage client directly. The goal is to provide the same capability as the `MongoDBObserver` and hence to be compatible with key downstream libraries such as `incense` and `omniboard`.

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
# export


import datetime

from sacred.stdout_capturing import get_stdcapturer
from sacred.utils import IntervalTimer

# Step-level

In [None]:
# export


class StepTracker:
    """Track captured output, metrics and heartbeats for one step of a flow run.

    The tracker accumulates captured stdout in ``captured_out`` and logged
    metrics in ``saved_metrics``. A background heartbeat (see
    ``start_heartbeat``) periodically calls ``_emit_heartbeat``.

    NOTE(review): several members used by the persistence methods are not
    defined in this class: ``save_json``, ``save_file``, ``run_entry``,
    ``runs_dir``, ``artifacts_dir``, ``_metrics``, ``info`` and ``result``.
    They must be supplied by a subclass or set on the instance before
    ``log_metrics``, ``add_artifact``, ``_emit_heartbeat`` or
    ``heartbeat_event`` are called.

    Parameters
    ----------
    flow_base_uri
        Stored as ``self.flow_base_uri``; not otherwise used in this class.
    flow_run_id
        Stored as ``self.flow_run_id``; not otherwise used in this class.
    step_name
        Stored as ``self.step_name``; not otherwise used in this class.
    capture_mode
        Mode name handed to ``sacred``'s ``get_stdcapturer`` by ``capture_out``.
    """

    def __init__(self, flow_base_uri, flow_run_id, step_name, capture_mode="sys"):
        self.flow_base_uri = flow_base_uri
        self.flow_run_id = flow_run_id
        self.step_name = step_name
        self.capture_mode = capture_mode
        # Heartbeat state; populated by start_heartbeat().
        self._stop_heartbeat_event = None
        self._heartbeat = None
        # Capture handle; callers bind it via
        # ``with tracker.capture_out() as tracker._output_file``.
        self._output_file = None
        # Accumulated captured text (None until something has been read).
        self.captured_out = None
        # metric name -> {"values": [...], "steps": [...], "timestamps": [...]}
        self.saved_metrics = {}

    def start_heartbeat(self):
        """Create and start the interval timer that drives ``_emit_heartbeat``."""
        print("Starting Heartbeat")
        # The callback must be the bound method; a bare ``_emit_heartbeat``
        # is not a module-level name and would raise NameError here.
        self._stop_heartbeat_event, self._heartbeat = IntervalTimer.create(
            self._emit_heartbeat
        )
        self._heartbeat.start()

    def stop_heartbeat(self):
        """Signal the heartbeat timer to stop and wait up to 2 seconds for it."""
        print("Stopping Heartbeat")
        # Only stop if a heartbeat was actually started.
        if self._heartbeat is not None:
            self._stop_heartbeat_event.set()
            self._heartbeat.join(timeout=2)

    def capture_out(self):
        """Return the stdout-capturing context manager for ``capture_mode``.

        The object yielded by the context manager is what
        ``get_captured_out`` expects to find in ``self._output_file``.
        """
        #
        # TODO figure out why only sys seems to work in Sagemaker
        _, capture_stdout = get_stdcapturer(self.capture_mode)
        return capture_stdout()

    def get_captured_out(self):
        """Append newly captured output to ``captured_out`` and return it.

        Returns
        -------
        str or None
            The text accumulated so far. ``None`` if nothing has been
            captured yet. When no capture handle is attached, or the handle
            is already closed, the previously accumulated text is returned
            unchanged.
        """
        output_file = self._output_file
        # No capture started yet (or already closed): nothing new to read.
        if output_file is None or output_file.closed:
            return self.captured_out
        text = output_file.get()
        if isinstance(text, bytes):
            # Undecodable bytes are replaced rather than raising.
            text = text.decode("utf-8", "replace")
        if self.captured_out:
            text = self.captured_out + text
        self.captured_out = text
        return self.captured_out

    def log_metrics(self, metrics_by_name, info):
        """Store new measurements into metrics.json.

        Parameters
        ----------
        metrics_by_name
            Mapping of metric name to a dict with ``"values"``, ``"steps"``
            and ``"timestamps"`` sequences; timestamps must provide
            ``isoformat()``.
        info
            Currently unused.
        """
        for metric_name, metric_ptr in metrics_by_name.items():
            saved = self.saved_metrics.setdefault(
                metric_name, {"values": [], "steps": [], "timestamps": []}
            )
            saved["values"] += metric_ptr["values"]
            saved["steps"] += metric_ptr["steps"]
            # Timestamps are stored as ISO-8601 strings.
            saved["timestamps"] += [ts.isoformat() for ts in metric_ptr["timestamps"]]

        # NOTE(review): every other save_json call in this class passes a
        # directory as the first argument; confirm the intended signature
        # once save_json is implemented.
        self.save_json(self.saved_metrics, "metrics.json")

    def add_artifact(self, name, filename, metadata=None, content_type=None):
        """Save ``filename`` as artifact ``name`` and record it in the run entry.

        ``metadata`` and ``content_type`` are accepted but currently unused.
        """
        self.save_file(self.artifacts_dir, filename, name)
        self.run_entry["artifacts"].append(name)
        self.save_json(self.runs_dir, self.run_entry, "run.json")

    def _emit_heartbeat(self):
        """Heartbeat callback: flush captured output and metrics, then persist."""
        self.get_captured_out()
        # Read all measured metrics since last heartbeat
        # NOTE(review): ``self._metrics``, ``self.info`` and
        # ``linearize_metrics`` are not defined in this notebook section.
        logged_metrics = self._metrics.get_last_metrics()
        metrics_by_name = linearize_metrics(logged_metrics)
        self.log_metrics(metrics_by_name, self.info)
        self.heartbeat_event()

    def heartbeat_event(self):
        """Update heartbeat time, captured output and result in ``run.json``."""
        beat_time = datetime.datetime.utcnow()
        self.run_entry["heartbeat"] = beat_time.isoformat()
        # get_captured_out() returns the accumulated text, so the run entry
        # records the real output instead of None.
        self.run_entry["captured_out"] = self.get_captured_out()
        self.run_entry["result"] = self.result
        self.save_json(self.runs_dir, self.run_entry, "run.json")

In [None]:
# # export
# class AWSLakeObserver(RunObserver):
#     VERSION = "AWSLakeObserver-0.1.0"

#     def __init__(
#         self,
#         flow_base_uri,
#         flow_run_id,
#         bucket_name=None,
#         experiments_key_prefix=None,
#         priority=DEFAULT_S3_PRIORITY,
#         region="eu-west-1",
#     ):
#         """Constructor for a AWSLakeObserver object.

#         Run when the object is first created,
#         before it's used within an experiment.

#         Parameters
#         ----------
#         experiment_name
#             The name of this experiment
#         bucket_name
#             The name of the bucket you want to store results in.
#             Doesn't need to contain `s3://`, but needs to be a valid bucket name
#         experiments_key_prefix
#             The relative path inside your bucket where you want this experiment to store results
#         priority
#             The priority to assign to this observer if
#             multiple observers are present
#         region
#             The AWS region in which you want to create and access
#             buckets. Needs to be either set here or configured in your AWS
#         """
#         self.experiment_name = experiment_name
#         if bucket_name is None:
#             try:
#                 bucket_name = os.environ["SCIFLOW_BUCKET"]
#             except KeyError:
#                 raise ValueError(
#                     "Bucket name must be provided or set using SCIFLOW_BUCKET env"
#                 )
#         self.bucket_name = (
#             os.environ["SCIFLOW_BUCKET"] if bucket_name is None else bucket_name
#         )
#         if not is_valid_bucket(self.bucket_name):
#             raise ValueError(
#                 "Your chosen bucket name doesn't follow AWS bucket naming rules"
#             )
#         self.experiments_key_prefix = (
#             f"{project}/experiments"
#             if experiments_key_prefix is None
#             else experiments_key_prefix
#         )
#         self.experiments_key = s3_join(
#             self.experiments_key_prefix, self.experiment_name
#         )
#         self.experiment_dir = s3_join(self.bucket_name, self.experiments_key)
#         self.bucket_name = bucket_name
#         self.priority = priority
#         self.resource_dir = None
#         self.source_dir = None
#         self.runs_dir = None
#         self.metrics_dir = None
#         self.artifacts_dir = None
#         self.run_entry = None
#         self.config = None
#         self.info = None
#         self.experiment_id = None
#         self.cout = ""
#         self.cout_write_cursor = 0
#         self.saved_metrics = {}
#         if region is not None:
#             self.region = region
#             self.s3 = boto3.resource("s3", region_name=region)
#         else:
#             session = boto3.session.Session()
#             if session.region_name is not None:
#                 self.region = session.region_name
#                 self.s3 = boto3.resource("s3")
#             else:
#                 raise ValueError(
#                     "You must either pass in an AWS region name, or have a "
#                     "region name specified in your AWS config file"
#                 )

#     def put_data(self, key, binary_data):
#         self.s3.Object(self.bucket_name, key).put(Body=binary_data)

#     def save_json(self, table_dir, obj, filename):
#         key = s3_join(table_dir, filename)
#         self.put_data(key, json.dumps(flatten(obj), sort_keys=True, indent=2))

#     def save_file(self, file_save_dir, filename, target_name=None):
#         target_name = target_name or os.path.basename(filename)
#         key = s3_join(file_save_dir, target_name)
#         self.put_data(key, open(filename, "rb"))

#     def save_sources(self, ex_info):
#         base_dir = ex_info["base_dir"]
#         source_info = []
#         for s, m in ex_info["sources"]:
#             abspath = os.path.join(base_dir, s)
#             store_path, md5sum = self.find_or_save(abspath, self.source_dir)
#             source_info.append(
#                 [s, os.path.relpath(store_path, self.experiments_key_prefix)]
#             )
#         return source_info

#     def find_or_save(self, filename, store_dir):
#         source_name, ext = os.path.splitext(os.path.basename(filename))
#         md5sum = get_digest(filename)
#         store_name = source_name + "_" + md5sum + ext
#         store_path = s3_join(store_dir, store_name)
#         if len(list_s3_subdirs(self.s3, self.bucket_name, prefix=store_path)) == 0:
#             self.save_file(self.source_dir, filename, store_path)
#         return store_path, md5sum

#     def _determine_run_dir(self, run_id):
#         self.runs_dir = s3_join(self.experiments_key, "runs", str(run_id))
#         self.metrics_dir = s3_join(self.experiments_key, "metrics", str(run_id))
#         self.artifacts_dir = s3_join(self.experiments_key, "artifacts", str(run_id))
#         self.resource_dir = s3_join(self.experiments_key, "resources", str(run_id))
#         self.source_dir = s3_join(self.experiments_key, "sources", str(run_id))

#         self.dirs = (
#             self.runs_dir,
#             self.metrics_dir,
#             self.artifacts_dir,
#             self.resource_dir,
#             self.source_dir,
#         )
#         for dir_to_check in self.dirs:
#             if objects_exist_in_dir(self.s3, self.bucket_name, dir_to_check):
#                 raise FileExistsError(
#                     "S3 dir at {}/{} already exists; check your run_id is unique".format(
#                         self.bucket_name, dir_to_check
#                     )
#                 )

#     def started_event(
#         self, ex_info, command, host_info, start_time, config, meta_info, _id
#     ):
#         self._determine_run_dir(meta_info["run_id"])
#         self.experiment_id = meta_info["run_id"]

#         ex_info["sources"] = self.save_sources(ex_info)

#         self.run_entry = {
#             "experiment_id": self.experiment_id,
#             "experiment": dict(ex_info),
#             "format": self.VERSION,
#             "command": command,
#             "host": dict(host_info),
#             "start_time": start_time.isoformat(),
#             "config": flatten(config),
#             "meta": meta_info,
#             "status": "RUNNING",
#             "resources": [],
#             "artifacts": [],
#             "captured_out": "",
#             "info": {},
#             "heartbeat": None,
#         }
#         self.config = config
#         self.info = {}
#         self.cout = ""
#         self.cout_write_cursor = 0

#         self.save_json(self.runs_dir, self.run_entry, "run.json")

#         return _id


#     def completed_event(self, stop_time, result):
#         self.run_entry["stop_time"] = stop_time.isoformat()
#         self.run_entry["result"] = result
#         self.run_entry["status"] = "COMPLETED"

#         self.save_json(self.runs_dir, self.run_entry, "run.json")

#     def interrupted_event(self, interrupt_time, status):
#         self.run_entry["stop_time"] = interrupt_time.isoformat()
#         self.run_entry["status"] = status
#         self.save_json(self.runs_dir, self.run_entry, "run.json")

#     def failed_event(self, fail_time, fail_trace):
#         self.run_entry["stop_time"] = fail_time.isoformat()
#         self.run_entry["status"] = "FAILED"
#         self.run_entry["fail_trace"] = fail_trace
#         self.save_json(self.runs_dir, self.run_entry, "run.json")

#     def resource_event(self, filename):
#         store_path, md5sum = self.find_or_save(filename, self.resource_dir)
#         self.run_entry["resources"].append([filename, store_path])
#         self.save_json(self.runs_dir, self.run_entry, "run.json")


#     def __eq__(self, other):
#         if isinstance(other, AWSLakeObserver):
#             return (
#                 self.experiment_name == other.experiment_name
#                 and self.bucket_name == other.bucket_name
#                 and self.experiments_key_prefix == other.experiments_key_prefix
#             )
#         else:
#             return False

In [None]:
# Tracker instance shared by the smoke-test cells below.
tracker = StepTracker(
    flow_base_uri="file://tmp/test-flow/",
    flow_run_id="flow-123",
    step_name="experiment-test",
)

In [None]:
# Nothing has been captured before the first capture context is entered.
assert tracker.captured_out is None
with tracker.capture_out() as tracker._output_file:
    # Two identical lines are written to stdout while capture is active.
    print("Some text", "Some text", sep="\n")
    # Pull the captured text into the tracker before the context exits.
    tracker.get_captured_out()
assert tracker.captured_out == "Some text\n" * 2

# Flow-level

In [None]:
# export


def _emit_started(flow_run_id):
    pass

In [None]:
# export


def _emit_interrupted(flow_run_id):
    pass

In [None]:
# export


def _emit_failed(flow_run_id):
    pass

In [None]:
# export


def _emit_completed(flow_run_id):
    pass