In [None]:
# default_exp experiment.tracking

# Flow Experiment Tracking

`sacred` ...

An issue that prevents greater adoption of the SIO stack (sacred/incense/omniboard) is its dependence on an external service, namely MongoDB.

> This `sacred` observer adds support for a data lake observer. This observer stores all data in block storage under a root experiment directory. Each experiment component, e.g. artifacts, metrics, runs, is stored in its own directory. Components like runs and metrics can be queried using a lake compatible query engine with a client ODBC driver. Files and other nested/unstructured entities can be accessed from the block storage client directly. The goal is to provide the same capability as the MongoDBObserver and hence to be compatible with key downstream libraries like: `incense` and `omniboard`.

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
# export


import datetime
import json
import socket
import time
import uuid
from pathlib import Path

import boto3
import pandas as pd
from sacred import metrics_logger
from sacred.host_info import get_host_info
from sacred.serializer import flatten
from sacred.stdout_capturing import get_stdcapturer
from sacred.utils import IntervalTimer

from sciflow.s3_utils import (
    delete_dir,
    list_bucket,
    objects_exist_in_dir,
    put_data,
    s3_join,
)

# Step-level

In [None]:
# export


def save_json(s3_res, bucket_name, key, filename, obj):
    """Write ``obj`` as pretty-printed JSON to ``<key>/<filename>`` in a bucket.

    Args:
        s3_res: S3 resource handed through to ``put_data``.
        bucket_name: Name of the target bucket.
        key: Key prefix ("directory") that will contain the file.
        filename: Name of the JSON object below ``key``.
        obj: Object to store; it is passed through sacred's ``flatten``
            before being dumped with sorted keys and a 2-space indent.
    """
    object_key = s3_join(key, filename)
    payload = json.dumps(flatten(obj), sort_keys=True, indent=2)
    put_data(s3_res, bucket_name, object_key, payload)

In [None]:
# export


class StepTracker:
    """Record metrics, artifacts and run metadata for one flow step in S3.

    Everything is written below ``<flow_base_key>/<flow_run_id>/experiment``
    in ``bucket_name``, split into ``runs``, ``metrics``, ``artifacts``,
    ``resources`` and ``sources`` sub-prefixes (see ``init_keys``).
    """

    def __init__(
        self,
        bucket_name,
        flow_base_key,
        flow_run_id,
        step_name,
        capture_mode="sys",
        region="eu-west-1",
    ):
        """Set up tracker state, the S3 resource and the storage keys.

        Args:
            bucket_name: S3 bucket that receives all tracked objects.
            flow_base_key: Key prefix under which the run is stored.
            flow_run_id: Identifier of this run; also stored as
                ``experiment_id`` in the run entry.
            step_name: Name of the tracked step (only stored on the instance).
            capture_mode: Mode handed to sacred's ``get_stdcapturer``.
            region: AWS region for the S3 resource. ``None`` falls back to
                the region of the default boto3 session.

        Raises:
            ValueError: If ``region`` is ``None`` and the boto3 session has
                no region configured.
            FileExistsError: If objects already exist under any of the
                experiment sub-prefixes (raised by ``init_keys``).
        """
        self.bucket_name = bucket_name
        self.flow_base_key = flow_base_key
        self.flow_run_id = flow_run_id
        self.exp_base_key = s3_join(flow_base_key, flow_run_id, "experiment")
        self.step_name = step_name
        self.capture_mode = capture_mode
        self._stop_heartbeat_event = None
        self._heartbeat = None
        self._output_file = None
        self._metrics = metrics_logger.MetricsLogger()
        self.captured_out = None
        self.saved_metrics = {}
        self.info = {}
        self.result = None

        # Collect host information once and reuse it for both entries.
        host_info = get_host_info()

        # TODO read run_entry from run.json...
        self.run_entry = {
            "experiment_id": self.flow_run_id,
            "experiment": {},
            "format": None,
            "command": None,
            "host": host_info,
            "all_hosts": {socket.gethostname(): host_info},
            "start_time": datetime.datetime.utcnow().isoformat(),
            "config": {},
            "meta": {},
            "status": "RUNNING",
            "resources": [],
            "artifacts": [],
            "captured_out": "",
            "info": self.info,
            "heartbeat": None,
        }

        if region is not None:
            self.region = region
            self.s3_res = boto3.resource("s3", region_name=region)
        else:
            session = boto3.session.Session()
            if session.region_name is not None:
                self.region = session.region_name
                self.s3_res = boto3.resource("s3")
            else:
                raise ValueError(
                    "You must either pass in an AWS region name, or have a "
                    "region name specified in your AWS config file"
                )

        self.init_keys()

    def start_heartbeat(self, beat_interval=10.0):
        """Start a background timer that calls ``_emit_heartbeat``.

        Args:
            beat_interval: Interval passed to sacred's ``IntervalTimer``
                (presumably seconds - confirm against sacred).
        """
        print("Starting Heartbeat")
        self._stop_heartbeat_event, self._heartbeat = IntervalTimer.create(
            self._emit_heartbeat, beat_interval
        )
        self._heartbeat.start()

    def stop_heartbeat(self):
        """Signal the heartbeat timer to stop and wait up to 2s for it.

        Does nothing beyond printing if no heartbeat was started.
        """
        print("Stopping Heartbeat")
        if self._heartbeat is not None:
            self._stop_heartbeat_event.set()
            self._heartbeat.join(timeout=2)

    def capture_out(self):
        """Return a stdout-capturing context manager for ``capture_mode``.

        Callers are expected to bind the context's value to
        ``self._output_file`` so ``get_captured_out`` can read from it.
        """
        # TODO figure out why only "sys" seems to work in Sagemaker? - tee is installed
        _, capture_stdout = get_stdcapturer(self.capture_mode)
        return capture_stdout()

    def get_captured_out(self):
        """Append newly captured output to ``self.captured_out``.

        Returns ``None`` in all cases; the accumulated text lives on
        ``self.captured_out``. Nothing happens when no capture stream has
        been attached yet or when the stream is already closed.
        """
        # The heartbeat may fire before any capture context was opened.
        if self._output_file is None or self._output_file.closed:
            return
        text = self._output_file.get()
        if isinstance(text, bytes):
            text = text.decode("utf-8", "replace")
        if self.captured_out:
            text = self.captured_out + text
        self.captured_out = text

    def log_metric(self, metric_name, metric_value, metric_step):
        """Record one metric observation and rewrite ``metrics.json``.

        Args:
            metric_name: Name of the metric series.
            metric_value: Observed value.
            metric_step: Step index the value belongs to.
        """
        if metric_name not in self.saved_metrics:
            self.saved_metrics[metric_name] = {
                "values": [],
                "steps": [],
                "timestamps": [],
            }

        self.saved_metrics[metric_name]["values"].append(metric_value)
        self.saved_metrics[metric_name]["steps"].append(metric_step)
        self.saved_metrics[metric_name]["timestamps"].append(
            datetime.datetime.utcnow().isoformat()
        )
        # The whole metrics dict is re-uploaded on every call.
        save_json(
            self.s3_res,
            self.bucket_name,
            self.metrics_key,
            "metrics.json",
            self.saved_metrics,
        )

    def add_artifact(self, artifact_path):
        """Upload a local file as an artifact and list it in ``run.json``.

        Args:
            artifact_path: Path of the local file; its base name becomes
                the object name under the artifacts prefix.
        """
        name = Path(artifact_path).name
        self.save_file(self.artifacts_key, artifact_path, name)
        self.run_entry["artifacts"].append(name)
        save_json(
            self.s3_res, self.bucket_name, self.runs_key, "run.json", self.run_entry
        )

    def _emit_heartbeat(self):
        """Refresh heartbeat time, captured output and result in ``run.json``."""
        beat_time = datetime.datetime.utcnow().isoformat()
        self.run_entry["heartbeat"] = beat_time
        print(f"Emitted heartbeat at: {beat_time}")
        # get_captured_out() updates self.captured_out and returns None, so
        # the run entry must be filled from the attribute, not the return value.
        self.get_captured_out()
        if self.captured_out is not None:
            self.run_entry["captured_out"] = self.captured_out
        self.run_entry["result"] = self.result
        save_json(
            self.s3_res, self.bucket_name, self.runs_key, "run.json", self.run_entry
        )

    def save_file(self, file_save_dir, filename, target_name=None):
        """Upload the local file ``filename`` below ``file_save_dir``.

        Args:
            file_save_dir: Key prefix that will contain the object.
            filename: Path of the local file to upload.
            target_name: Object name; defaults to the base name of
                ``filename``.
        """
        target_name = target_name or Path(filename).name
        key = s3_join(file_save_dir, target_name)
        # Close the handle after the upload.
        # NOTE(review): assumes put_data consumes the stream before returning - confirm.
        with open(filename, "rb") as file_obj:
            put_data(self.s3_res, self.bucket_name, key, file_obj)

    def init_keys(self):
        """Derive the experiment sub-prefixes and ensure none is in use.

        Raises:
            FileExistsError: If objects already exist under one of the
                sub-prefixes.
        """
        self.runs_key = s3_join(self.exp_base_key, "runs")
        self.metrics_key = s3_join(self.exp_base_key, "metrics")
        self.artifacts_key = s3_join(self.exp_base_key, "artifacts")
        self.resource_key = s3_join(self.exp_base_key, "resources")
        self.source_key = s3_join(self.exp_base_key, "sources")

        self.keys = (
            self.runs_key,
            self.metrics_key,
            self.artifacts_key,
            self.resource_key,
            self.source_key,
        )
        for key_to_check in self.keys:
            if objects_exist_in_dir(self.s3_res, self.bucket_name, key_to_check):
                raise FileExistsError(
                    f"S3 dir at {self.bucket_name}/{key_to_check} already exists; check your run_id is unique"
                )

In [None]:
# Fixtures for the cells below: a fixed bucket and run id, plus a random
# flow prefix so every notebook run writes under its own key.
bucket_name = "pprsandboxpdlras3"
flow_run_id = "sample_flow_instance_123"
flow_base_key = f"flow-{uuid.uuid4()}"
flow_run_key = s3_join(flow_base_key, flow_run_id)
s3_res = boto3.resource("s3")

In [None]:
# Show the randomly generated flow prefix used for this notebook run.
flow_base_key

In [None]:
# Tracker under test; capture mode and region keep their defaults.
tracker = StepTracker(
    bucket_name=bucket_name,
    flow_base_key=flow_base_key,
    flow_run_id=flow_run_id,
    step_name="experiment-test",
)

# Metrics

In [None]:
# Log four "auc" observations at steps 0..3.
for metric_step, auc_value in enumerate((0.37, 0.45, 0.63, 0.89)):
    tracker.log_metric("auc", auc_value, metric_step)

In [None]:
# Read metrics.json back from S3 and round-trip it through pandas' JSON
# serialisation into plain Python dicts.
metrics_uri = f"s3://{bucket_name}/{flow_base_key}/{flow_run_id}/experiment/metrics/metrics.json"
metrics_json = pd.read_json(metrics_uri).to_json()
metrics_by_name = json.loads(metrics_json)

In [None]:
def metrics_to_df(metrics_by_name, run_id=None):
    """Flatten a ``{metric_name: columns}`` mapping into one long DataFrame.

    Args:
        metrics_by_name: Mapping from metric name to anything
            ``pd.DataFrame`` accepts (here: the per-metric ``values``,
            ``steps`` and ``timestamps`` columns).
        run_id: Value for the ``flow_run_id`` column. Defaults to the
            notebook-level ``flow_run_id`` variable, as before.

    Returns:
        A DataFrame with the metric columns plus ``metric`` and
        ``flow_run_id``, re-indexed from 0. For an empty mapping an empty
        frame with just those two columns is returned instead of letting
        ``pd.concat`` raise.
    """
    metric_frames = [
        pd.DataFrame(metric_ptr).assign(metric=metric_name)
        for metric_name, metric_ptr in metrics_by_name.items()
    ]
    if not metric_frames:
        return pd.DataFrame(columns=["metric", "flow_run_id"])
    if run_id is None:
        # Backward-compatible fallback to the notebook global.
        run_id = flow_run_id
    metrics = pd.concat(metric_frames).reset_index(drop=True)
    metrics["flow_run_id"] = run_id
    return metrics

In [None]:
# The frame must reproduce exactly what was logged above.
metrics = metrics_to_df(metrics_by_name)
expected_by_column = {
    "steps": [0, 1, 2, 3],
    "values": [0.37, 0.45, 0.63, 0.89],
    "metric": ["auc", "auc", "auc", "auc"],
}
for column_name, expected_values in expected_by_column.items():
    assert metrics[column_name].tolist() == expected_values

In [None]:
# So far only metrics.json should exist under this run's prefix.
contents = list_bucket(bucket_name, flow_run_key)
assert len(contents) == 1
only_object_name = contents[0].rsplit("/", 1)[-1]
assert only_object_name == "metrics.json"

# Out Capture

In [None]:
# Nothing has been captured before the first capture context is opened.
assert tracker.captured_out is None
# The capture stream is bound directly onto the tracker so that
# get_captured_out() can read from it.
with tracker.capture_out() as tracker._output_file:
    print("Some text")
    print("Some text")
    # Must be called inside the block: get_captured_out() returns early once
    # the stream is closed.
    tracker.get_captured_out()
assert tracker.captured_out == "Some text\nSome text\n"
# Reset so later capture tests start from an empty buffer.
tracker.captured_out = None

# Artifacts

> Support is provided for the same artifact types as found in `sacred`; however we will not be testing the creation, saving or loading of mp4s here as this would require external dependencies for video creation such as ffmpeg. 

Supported artifact types:

* `.txt`: `text/csv`,
* `.csv`: `text/csv`,
* `.png`: `image/png`,
* `.jpg`: `image/jpeg`,
* `.mp4`: `video/mp4`,
* `.pickle`: `application/octet-stream`,

In [None]:
import tempfile

import pandas as pd

In [None]:
# Small frame used as the payload for the artifact files created below.
df = pd.DataFrame({"a": [1, 2, 3], "b": ["a", "b", "c"]})

In [None]:
%matplotlib auto

In [None]:
with tempfile.TemporaryDirectory() as temp_dir:
    # Tabular data, written twice to cover both text extensions.
    csv_path = f"{temp_dir}/testfile.csv"
    txt_path = f"{temp_dir}/testfile.txt"
    for table_path in (csv_path, txt_path):
        df.to_csv(table_path)

    # One histogram figure, saved once as PNG and once as PDF.
    fig = df.a.plot.hist().figure
    png_path = f"{temp_dir}/testfile.png"
    pdf_path = f"{temp_dir}/testfile.pdf"
    for figure_path in (png_path, pdf_path):
        fig.savefig(figure_path)

    pickle_path = f"{temp_dir}/testfile.pkl"
    df.to_pickle(pickle_path)

    # Upload while the temporary files still exist.
    artifacts = [csv_path, txt_path, png_path, pdf_path, pickle_path]
    for artifact_path in artifacts:
        tracker.add_artifact(artifact_path)

In [None]:
# mock tracker in user mode
# running in a flow??

# Heartbeat

In [None]:
# Run a heartbeat with a 1.0 interval for about four seconds while stdout is
# being captured, then stop it before leaving the capture context.
with tracker.capture_out() as tracker._output_file:
    tracker.start_heartbeat(1.0)
    print("Some text")
    time.sleep(4)
    print("Some text")
    tracker.stop_heartbeat()
    # Pull any remaining output while the stream is still open.
    tracker.get_captured_out()
# Only the two user prints are counted; other captured lines (e.g. the
# tracker's own heartbeat messages) are ignored by the filter.
assert len([t for t in tracker.captured_out.split("\n") if t == "Some text"]) == 2

# Run Entry

In [None]:
# Inspect the run entry as left by the artifact and heartbeat cells above.
tracker.run_entry

# Tear-down - delete created remote objects

In [None]:
# Clean up the test prefix; presumably delete_dir removes every object under
# flow_base_key - confirm against sciflow.s3_utils.
delete_dir(s3_res, bucket_name, flow_base_key)

# Flow-level

In [None]:
# export


def tracking_started(s3_res, bucket_name, flow_base_key, flow_run_id):
    """Create the initial run entry for a flow run and write it to ``run.json``.

    The entry is stored at
    ``<flow_base_key>/<flow_run_id>/experiment/runs/run.json`` with status
    ``"RUNNING"``.

    Args:
        s3_res: S3 resource handed through to ``save_json``.
        bucket_name: Name of the target bucket.
        flow_base_key: Key prefix under which the run is stored.
        flow_run_id: Identifier of this run; stored as ``experiment_id``.

    Returns:
        The run entry dict that was written.
    """
    # Still open (from the original notes):
    # experiment = experiment_info - sacred.ingredient.experiment_info
    # Command = run_flow?
    # Config = params?
    # meta = startup metadata put in FlowTracker

    host_info = get_host_info()
    # This is a plain function, so the entry is a local built from the
    # arguments; there is no instance state to read from.
    run_entry = {
        "experiment_id": flow_run_id,
        "experiment": {},
        "format": None,
        "command": None,
        "host": host_info,
        "all_hosts": {socket.gethostname(): host_info},
        "start_time": datetime.datetime.utcnow().isoformat(),
        "config": {},
        "meta": {},
        "status": "RUNNING",
        "resources": [],
        "artifacts": [],
        "captured_out": "",
        "info": {},
        "heartbeat": None,
    }

    runs_key = s3_join(flow_base_key, flow_run_id, "experiment", "runs")

    # save_json expects (..., filename, obj) in that order.
    save_json(s3_res, bucket_name, runs_key, "run.json", run_entry)
    return run_entry

In [None]:
# export


def tracking_interrupted(flow_run_id):
    """Placeholder for recording an interrupted flow run; currently a no-op.

    Planned steps (not implemented yet):
      1. Read the run entry from run.json.
      2. Add the interrupt.
      3. Write run.json.
    """
    return None

In [None]:
# export


def tracking_failed(flow_run_id):
    """Placeholder for recording a failed flow run; currently a no-op.

    Planned steps (not implemented yet):
      1. Read the run entry from run.json.
      2. Add the failure.
      3. Write run.json.
    """
    return None

In [None]:
# export


def tracking_completed(flow_run_id):
    """Placeholder for recording a completed flow run; currently a no-op.

    Planned steps (not implemented yet):
      1. Read the run entry from run.json.
      2. Record the completion (the original note said "Add failure",
         which looks like a copy-paste slip - TODO confirm).
      3. Write run.json.
    """
    return None