In [1]:
# default_exp experiment.lake_observer

# Sacred Data Lake Observer

`sacred` is an excellent library for tracking machine learning experiments. It has an observer model for experiments, and there are many different types of observer, which accommodate many destinations. When combined with community-provided tooling like `incense` and `omniboard`, this is as complete an experiment management capability as Data Scientists need.

An issue that prevents greater adoption of the SIO stack (sacred/incense/omniboard) is its dependence on an external service, namely MongoDB. It is not easy for Data Scientists to deploy a MongoDB instance within a production environment. However, most Data Science notebook environments now permit access to data lake storage such as S3.

> This `sacred` observer adds support for a data lake observer. This observer stores all data in block storage under a root experiment directory. Each experiment component, e.g. artifacts, metrics, runs, is stored in its own directory. Components like runs and metrics can be queried using a lake-compatible query engine with a client ODBC driver. Files and other nested/unstructured entities can be accessed from the block storage client directly. The goal is to provide the same capability as the MongoDBObserver and hence to be compatible with key downstream libraries like `incense` and `omniboard`.

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
# export
import json
import os
import os.path
import uuid

import boto3
import pandas as pd
from sacred.dependencies import get_digest
from sacred.observers.base import RunObserver
from sacred.serializer import flatten
from sciflow.s3_utils import (
    delete_dir,
    is_valid_bucket,
    list_s3_subdirs,
    objects_exist_in_dir,
    s3_join,
)
from sciflow.utils import prepare_env

In [4]:
# export
# Default value of the `priority` argument of AWSLakeObserver.__init__.
DEFAULT_S3_PRIORITY = 20

In [5]:
# export
class AWSLakeObserver(RunObserver):
    """Sacred observer that writes run data to S3 under an experiment prefix.

    Every run gets its own sub-directory (keyed by ``meta_info["run_id"]``)
    beneath ``runs``, ``metrics``, ``artifacts``, ``resources`` and
    ``sources`` inside ``<experiments_key_prefix>/<experiment_name>``.
    The run record is stored as ``run.json``; metrics are stored as
    ``metrics.csv``.
    """

    VERSION = "AWSLakeObserver-0.1.0"

    def __init__(
        self,
        project,
        experiment_name,
        bucket_name=None,
        experiments_key_prefix=None,
        priority=DEFAULT_S3_PRIORITY,
        region="eu-west-1",
    ):
        """Constructor for a AWSLakeObserver object.

        Run when the object is first created,
        before it's used within an experiment.

        Parameters
        ----------
        project
            Project name; used to build the default key prefix
            ``{project}/experiments`` when `experiments_key_prefix` is None
        experiment_name
            The name of this experiment
        bucket_name
            The name of the bucket you want to store results in.
            Doesn't need to contain `s3://`, but needs to be a valid bucket name.
            When None, the `SCIFLOW_BUCKET` environment variable is used
        experiments_key_prefix
            The relative path inside your bucket where you want this experiment to store results
        priority
            The priority to assign to this observer if
            multiple observers are present
        region
            The AWS region in which you want to create and access
            buckets. Needs to be either set here or configured in your AWS
            config file

        Raises
        ------
        ValueError
            If no bucket name can be determined, if the bucket name fails
            `is_valid_bucket`, or if no region is passed and none is
            configured for the default boto3 session
        """
        self.experiment_name = experiment_name
        if bucket_name is None:
            try:
                bucket_name = os.environ["SCIFLOW_BUCKET"]
            except KeyError:
                raise ValueError(
                    "Bucket name must be provided or set using SCIFLOW_BUCKET env"
                )
        if not is_valid_bucket(bucket_name):
            raise ValueError(
                "Your chosen bucket name doesn't follow AWS bucket naming rules"
            )
        # Single assignment: the name is already resolved and validated above.
        self.bucket_name = bucket_name
        self.experiments_key_prefix = (
            f"{project}/experiments"
            if experiments_key_prefix is None
            else experiments_key_prefix
        )
        self.experiments_key = s3_join(
            self.experiments_key_prefix, self.experiment_name
        )
        self.experiment_dir = s3_join(self.bucket_name, self.experiments_key)
        self.priority = priority
        # Per-run locations; filled in by _determine_run_dir once a run id is known.
        self.resource_dir = None
        self.source_dir = None
        self.runs_dir = None
        self.metrics_dir = None
        self.artifacts_dir = None
        self.run_entry = None
        self.config = None
        self.info = None
        self.experiment_id = None
        self.cout = ""
        self.cout_write_cursor = 0
        self.saved_metrics = {}
        if region is not None:
            self.region = region
            self.s3 = boto3.resource("s3", region_name=region)
        else:
            session = boto3.session.Session()
            if session.region_name is not None:
                self.region = session.region_name
                self.s3 = boto3.resource("s3")
            else:
                raise ValueError(
                    "You must either pass in an AWS region name, or have a "
                    "region name specified in your AWS config file"
                )

    def put_data(self, key, binary_data):
        """Upload `binary_data` as the body of the object `key` in the bucket."""
        self.s3.Object(self.bucket_name, key).put(Body=binary_data)

    def save_json(self, table_dir, obj, filename):
        """Serialise `obj` (after sacred's `flatten`) to JSON at `table_dir/filename`."""
        key = s3_join(table_dir, filename)
        self.put_data(key, json.dumps(flatten(obj), sort_keys=True, indent=2))

    def save_file(self, file_save_dir, filename, target_name=None):
        """Upload the local file `filename` into `file_save_dir`.

        The object is named `target_name`, or the file's basename when
        `target_name` is not given.
        """
        target_name = target_name or os.path.basename(filename)
        key = s3_join(file_save_dir, target_name)
        # Close the handle once the upload call returns instead of leaking it.
        with open(filename, "rb") as file_handle:
            self.put_data(key, file_handle)

    def save_sources(self, ex_info):
        """Upload the experiment's source files and return their stored locations.

        Returns a list of ``[source, stored_path]`` pairs where `stored_path`
        is relative to `experiments_key_prefix`.
        """
        base_dir = ex_info["base_dir"]
        source_info = []
        for source, _digest in ex_info["sources"]:
            abspath = os.path.join(base_dir, source)
            store_path, _md5sum = self.find_or_save(abspath, self.source_dir)
            source_info.append(
                [source, os.path.relpath(store_path, self.experiments_key_prefix)]
            )
        return source_info

    def find_or_save(self, filename, store_dir):
        """Upload `filename` into `store_dir` under a digest-suffixed name.

        Returns ``(store_path, md5sum)`` where `store_path` is the key
        ``store_dir/<name>_<md5sum><ext>``.
        """
        source_name, ext = os.path.splitext(os.path.basename(filename))
        md5sum = get_digest(filename)
        store_name = source_name + "_" + md5sum + ext
        store_path = s3_join(store_dir, store_name)
        # NOTE(review): list_s3_subdirs is used as the existence check here;
        # confirm it reports a hit when `prefix` is a full object key.
        if len(list_s3_subdirs(self.s3, self.bucket_name, prefix=store_path)) == 0:
            # Upload into the requested directory under the digest-suffixed
            # name so the object lands exactly at the returned store_path.
            self.save_file(store_dir, filename, store_name)
        return store_path, md5sum

    def _determine_run_dir(self, run_id):
        """Compute the per-run S3 directories and ensure none is already in use.

        Raises
        ------
        FileExistsError
            If any of the run's directories already contains objects
        """
        self.runs_dir = s3_join(self.experiments_key, "runs", str(run_id))
        self.metrics_dir = s3_join(self.experiments_key, "metrics", str(run_id))
        self.artifacts_dir = s3_join(self.experiments_key, "artifacts", str(run_id))
        self.resource_dir = s3_join(self.experiments_key, "resources", str(run_id))
        self.source_dir = s3_join(self.experiments_key, "sources", str(run_id))

        self.dirs = (
            self.runs_dir,
            self.metrics_dir,
            self.artifacts_dir,
            self.resource_dir,
            self.source_dir,
        )
        for dir_to_check in self.dirs:
            if objects_exist_in_dir(self.s3, self.bucket_name, dir_to_check):
                raise FileExistsError(
                    "S3 dir at {}/{} already exists; check your run_id is unique".format(
                        self.bucket_name, dir_to_check
                    )
                )

    def queued_event(
        self, ex_info, command, host_info, queue_time, config, meta_info, _id
    ):
        """Record a queued run as ``run.json`` with status ``QUEUED``."""
        self._determine_run_dir(meta_info["run_id"])

        self.run_entry = {
            "experiment": dict(ex_info),
            "command": command,
            "host": dict(host_info),
            "config": flatten(config),
            "meta": meta_info,
            "status": "QUEUED",
        }
        self.config = config
        self.info = {}

        # save_json needs the target directory as its first argument.
        self.save_json(self.runs_dir, self.run_entry, "run.json")

        return _id

    def started_event(
        self, ex_info, command, host_info, start_time, config, meta_info, _id
    ):
        """Upload sources and write the initial ``run.json`` with status ``RUNNING``."""
        self._determine_run_dir(meta_info["run_id"])
        self.experiment_id = meta_info["run_id"]

        ex_info["sources"] = self.save_sources(ex_info)

        self.run_entry = {
            "experiment_id": self.experiment_id,
            "experiment": dict(ex_info),
            "format": self.VERSION,
            "command": command,
            "host": dict(host_info),
            "start_time": start_time.isoformat(),
            "config": flatten(config),
            "meta": meta_info,
            "status": "RUNNING",
            "resources": [],
            "artifacts": [],
            "captured_out": "",
            "info": {},
            "heartbeat": None,
        }
        self.config = config
        self.info = {}
        self.cout = ""
        self.cout_write_cursor = 0

        self.save_json(self.runs_dir, self.run_entry, "run.json")

        return _id

    def heartbeat_event(self, info, captured_out, beat_time, result):
        """Refresh heartbeat time, captured output and result in ``run.json``."""
        self.info = info
        self.run_entry["heartbeat"] = beat_time.isoformat()
        self.run_entry["captured_out"] = captured_out
        self.run_entry["result"] = result
        self.save_json(self.runs_dir, self.run_entry, "run.json")

    def completed_event(self, stop_time, result):
        """Mark the run ``COMPLETED`` and store its stop time and result."""
        self.run_entry["stop_time"] = stop_time.isoformat()
        self.run_entry["result"] = result
        self.run_entry["status"] = "COMPLETED"

        self.save_json(self.runs_dir, self.run_entry, "run.json")

    def interrupted_event(self, interrupt_time, status):
        """Store the interrupt time and the given `status` in ``run.json``."""
        self.run_entry["stop_time"] = interrupt_time.isoformat()
        self.run_entry["status"] = status
        self.save_json(self.runs_dir, self.run_entry, "run.json")

    def failed_event(self, fail_time, fail_trace):
        """Mark the run ``FAILED`` and store the failure time and trace."""
        self.run_entry["stop_time"] = fail_time.isoformat()
        self.run_entry["status"] = "FAILED"
        self.run_entry["fail_trace"] = fail_trace
        self.save_json(self.runs_dir, self.run_entry, "run.json")

    def resource_event(self, filename):
        """Upload a resource file to the run's resources directory and record it."""
        store_path, _md5sum = self.find_or_save(filename, self.resource_dir)
        self.run_entry["resources"].append([filename, store_path])
        self.save_json(self.runs_dir, self.run_entry, "run.json")

    def artifact_event(self, name, filename, metadata=None, content_type=None):
        """Upload an artifact as `name` in the artifacts directory and record it."""
        self.save_file(self.artifacts_dir, filename, name)
        self.run_entry["artifacts"].append(name)
        self.save_json(self.runs_dir, self.run_entry, "run.json")

    def log_metrics(self, metrics_by_name, info):
        """Store new measurements into metrics.csv

        One frame is built per entry of `metrics_by_name`; the frames are
        concatenated, tagged with `experiment_id`, and written to
        ``metrics.csv`` in the run's metrics directory. Nothing is written
        when `metrics_by_name` is empty.
        """
        # NOTE(review): every call writes to the same path, so a later call
        # presumably replaces the measurements written by an earlier one —
        # confirm this is intended.
        if len(metrics_by_name) > 0:
            metric_frames = [pd.DataFrame(v) for v in metrics_by_name.values()]
            metrics = pd.concat(metric_frames).reset_index(drop=True)
            metrics["experiment_id"] = self.experiment_id
            metrics_path = f"s3://{self.bucket_name}/{self.metrics_dir}/metrics.csv"
            metrics.to_csv(metrics_path, index=False)

    def __eq__(self, other):
        """Observers are equal when experiment name, bucket and key prefix match."""
        if isinstance(other, AWSLakeObserver):
            return (
                self.experiment_name == other.experiment_name
                and self.bucket_name == other.bucket_name
                and self.experiments_key_prefix == other.experiments_key_prefix
            )
        else:
            return False

In [6]:
# Shared values for the constructor checks in the following cells.
project = "sciflow"
experiment_name = "lake_observer"
experiments_key_prefix = "sciflow/experiments"
# Used to construct an observer without checking that the bucket exists.
missing_bucket = "s3awsmissing"
# Used to trigger the ValueError raised for invalid bucket names.
invalid_bucket = "some bucket"

In [7]:
from sciflow.utils import prepare_env

# NOTE(review): presumably sets up environment variables such as SCIFLOW_BUCKET,
# which the cells below read — confirm against sciflow.utils.prepare_env.
prepare_env()

In [8]:
# With no bucket_name argument the bucket comes from the SCIFLOW_BUCKET env var.
observer = AWSLakeObserver(project, experiment_name)
observer = AWSLakeObserver(
    project, experiment_name, experiments_key_prefix=experiments_key_prefix
)
observer = AWSLakeObserver(
    project,
    experiment_name,
    experiments_key_prefix=experiments_key_prefix,
    region="eu-west-1",
)
assert observer.region == "eu-west-1"
# Do not check for missing bucket yet
observer = AWSLakeObserver(
    project, experiment_name, missing_bucket, experiments_key_prefix, region="eu-west-1"
)
# An invalid bucket name must be rejected; the else branch makes the check
# fail if the constructor silently accepts it.
try:
    AWSLakeObserver(
        project,
        experiment_name,
        invalid_bucket,
        experiments_key_prefix,
        region="eu-west-1",
    )
except ValueError as ve:
    assert "naming" in str(ve).lower()
else:
    raise AssertionError("Expected ValueError for an invalid bucket name")

# Test Run Experiments

In [9]:
from sacred import Experiment
from sacred.run import Run

ex_name = "test_lake_obs"
ex = Experiment("test_lake_obs", interactive=True)

# Observer under test, pointed at the bucket named by SCIFLOW_BUCKET.
obs = AWSLakeObserver(
    project,
    experiment_name=ex_name,
    bucket_name=os.environ["SCIFLOW_BUCKET"],
    experiments_key_prefix=experiments_key_prefix,
)

ex.observers.append(obs)


@ex.config
def my_config():
    recipient = "test"
    message = f"Hello {recipient}!"
    f"{message}"


@ex.main
def my_main(message, _run: Run):
    """Add two artifacts, log one scalar and print the configured message."""
    _run.add_artifact("test/requirements-generated.txt")
    _run.add_artifact("test/dataframe_artifact.csv")
    _run.log_scalar("another one", 9.12, 0)
    print(message)

In [10]:
# Random UUID with its hyphens replaced by underscores; used as the run id below.
sample_id = str(uuid.uuid4()).replace("-", "_")

In [11]:
# The observer reads meta_info["run_id"] to name the run's S3 directories.
ex.run(meta_info={"run_id": sample_id})

INFO - test_lake_obs - Running command 'my_main'
INFO - test_lake_obs - Started
INFO - test_lake_obs - Completed after 0:00:00


Hello test!


<sacred.run.Run at 0x7fa60c9b5190>

# Check they were created correctly

In [14]:
# Keys, relative to the experiment directory, that the run is expected to have written.
expected_keys = [
    f"artifacts/{sample_id}/requirements-generated.txt",
    f"artifacts/{sample_id}/dataframe_artifact.csv",
    f"metrics/{sample_id}/metrics.csv",
    f"runs/{sample_id}/run.json",
]
s3_resource = boto3.resource("s3")
for expected_key in expected_keys:
    full_key = f"{experiments_key_prefix}/test_lake_obs/{expected_key}"
    # The message names the missing key so a failure is actionable.
    assert objects_exist_in_dir(
        s3_resource,
        os.environ["SCIFLOW_BUCKET"],
        full_key,
    ), f"Expected object not found in S3: {full_key}"

# Clean up test resources

In [15]:
# Remove what the test experiment stored under its key prefix.
s3 = boto3.resource("s3")
delete_dir(s3, os.environ["SCIFLOW_BUCKET"], f"{experiments_key_prefix}/test_lake_obs")