In [1]:
# default_exp lake_observer

# Sacred Data Lake Observer

`sacred` is an excellent library for tracking machine learning experiments. It has an observer model for experiments, and there are many different types of observer, which accommodate many destinations. When combined with community-provided tooling like incense and omniboard, this is as complete an experiment management capability as Data Scientists need. 

An issue that prevents greater adoption of the SIO stack (sacred/incense/omniboard) is its dependence on an external service, namely MongoDB. It is not easy for Data Scientists to deploy a MongoDB instance within a production environment. However, most Data Science notebook environments now permit access to data lake storage such as S3.

> This module adds a `sacred` observer that targets a data lake. The observer stores all data in block storage under a root experiment directory. Each experiment component, e.g. artifacts, metrics, runs, is stored in its own directory. Components like runs and metrics can be queried using a lake-compatible query engine with a client ODBC driver. Files and other nested/unstructured entities can be accessed from the block storage client directly. The goal is to provide the same capability as the MongoDBObserver and hence to be compatible with key downstream libraries like `incense` and `omniboard`.

In [2]:
%load_ext autoreload
%autoreload 2

In [115]:
# export
import json
import os
import os.path
import pandas as pd

from sacred.commandline_options import cli_option
from sacred.dependencies import get_digest
from sacred.observers.base import RunObserver
from sacred.serializer import flatten
import re
import socket
from text_discovery.s3_utils import (s3_join,
                                     is_valid_bucket, 
                                     list_s3_subdirs, 
                                     objects_exist_in_dir,
                                     delete_dir)
import boto3

In [117]:
# export
# Priority assigned to an AWSLakeObserver when the caller does not pass one.
DEFAULT_S3_PRIORITY = 20

In [116]:
# export
class AWSLakeObserver(RunObserver):
    """Sacred observer that writes run data to an S3 bucket.

    Everything is stored below ``experiment_dir`` inside ``bucket_name``,
    one sub-directory per component and run id::

        <experiment_dir>/runs/<id>/run.json
        <experiment_dir>/metrics/<id>/metrics.csv
        <experiment_dir>/artifacts/<id>/<artifact name>
        <experiment_dir>/resources/<id>/<name>_<md5><ext>
        <experiment_dir>/sources/<id>/<name>_<md5><ext>
    """

    VERSION = "AWSLakeObserver-0.1.0"

    def __init__(
        self,
        bucket_name,
        experiment_dir,
        priority=DEFAULT_S3_PRIORITY,
        region=None,
    ):
        """Constructor for a AWSLakeObserver object.

        Run when the object is first created,
        before it's used within an experiment.

        Parameters
        ----------
        bucket_name
            The name of the bucket you want to store results in.
            Doesn't need to contain `s3://`, but needs to be a valid bucket name
        experiment_dir
            The relative path inside your bucket where you want this experiment to store results
        priority
            The priority to assign to this observer if
            multiple observers are present
        region
            The AWS region in which you want to create and access
            buckets. Needs to be either set here or configured in your AWS
            config file.

        Raises
        ------
        ValueError
            If ``bucket_name`` is rejected by ``is_valid_bucket`` or if no
            region is passed and the boto3 session has none configured.
        """
        if not is_valid_bucket(bucket_name):
            raise ValueError(
                "Your chosen bucket name doesn't follow AWS bucket naming rules"
            )
        self.experiment_dir = experiment_dir
        self.bucket_name = bucket_name
        self.priority = priority
        # The per-run directories are only known once a run id has been
        # chosen in _determine_run_dir.
        self.resource_dir = None
        self.source_dir = None
        self.runs_dir = None
        self.metrics_dir = None
        self.artifacts_dir = None
        self.run_entry = None
        self.config = None
        self.info = None
        self.experiment_id = None
        self.cout = ""
        self.cout_write_cursor = 0
        self.saved_metrics = {}
        if region is not None:
            self.region = region
            self.s3 = boto3.resource("s3", region_name=region)
        else:
            # Fall back to the region of the default boto3 session.
            session = boto3.session.Session()
            if session.region_name is None:
                raise ValueError(
                    "You must either pass in an AWS region name, or have a "
                    "region name specified in your AWS config file"
                )
            self.region = session.region_name
            self.s3 = boto3.resource("s3")

    def put_data(self, key, binary_data):
        """Upload ``binary_data`` as the body of object ``key`` in the bucket."""
        self.s3.Object(self.bucket_name, key).put(Body=binary_data)

    def save_json(self, table_dir, obj, filename):
        """Serialise ``obj`` to JSON and store it as ``table_dir/filename``.

        ``obj`` is passed through sacred's ``flatten`` before being dumped
        with sorted keys and an indent of 2.
        """
        key = s3_join(table_dir, filename)
        self.put_data(key, json.dumps(flatten(obj), sort_keys=True, indent=2))

    def save_file(self, file_save_dir, filename, target_name=None):
        """Upload the local file ``filename`` to ``file_save_dir``.

        Parameters
        ----------
        file_save_dir
            Key prefix (directory) the file is stored under.
        filename
            Path of the local file to upload.
        target_name
            Object name below ``file_save_dir``; defaults to the base name
            of ``filename``.
        """
        target_name = target_name or os.path.basename(filename)
        key = s3_join(file_save_dir, target_name)
        # Close the handle once the upload has finished instead of leaking it.
        with open(filename, "rb") as file_handle:
            self.put_data(key, file_handle)

    def save_sources(self, ex_info):
        """Upload the experiment's source files and describe where they went.

        Returns a list of ``[source, stored path relative to experiment_dir]``
        pairs, one per entry of ``ex_info["sources"]``.
        """
        base_dir = ex_info["base_dir"]
        source_info = []
        # The second element of each source entry is not needed here; the
        # digest is recomputed by find_or_save.
        for source, _ in ex_info["sources"]:
            abspath = os.path.join(base_dir, source)
            store_path, _md5sum = self.find_or_save(abspath, self.source_dir)
            source_info.append(
                [source, os.path.relpath(store_path, self.experiment_dir)]
            )
        return source_info

    def find_or_save(self, filename, store_dir):
        """Store ``filename`` under ``store_dir`` unless it is already there.

        The object is named ``<name>_<md5><ext>`` so that different contents
        of the same file never collide.

        Returns
        -------
        tuple
            ``(store_path, md5sum)`` where ``store_path`` is the object key.
        """
        source_name, ext = os.path.splitext(os.path.basename(filename))
        md5sum = get_digest(filename)
        store_name = source_name + "_" + md5sum + ext
        store_path = s3_join(store_dir, store_name)
        # NOTE(review): list_s3_subdirs is used as the existence check, as in
        # the original code; confirm it reports a plain object key prefix.
        if len(list_s3_subdirs(self.s3, self.bucket_name, prefix=store_path)) == 0:
            # Upload into the directory that was asked for, under the hashed
            # name, so the resulting key is exactly store_path.
            self.save_file(store_dir, filename, store_name)
        return store_path, md5sum

    def _determine_run_dir(self, _id):
        """Pick the run id and set the per-run directory attributes.

        When ``_id`` is None the next id is one more than the largest
        all-digit sub-directory name found under ``<experiment_dir>/runs``
        (1 when there is none).

        Raises
        ------
        FileExistsError
            If any of the run's directories already contains objects.
        """
        if _id is None:
            path_subdirs = list_s3_subdirs(
                self.s3, self.bucket_name, s3_join(self.experiment_dir, "runs")
            )
            # Directories that aren't numeric run directories are ignored.
            run_ids = [int(d) for d in (path_subdirs or []) if d.isdigit()]
            _id = max(run_ids, default=0) + 1

        self.runs_dir = s3_join(self.experiment_dir, "runs", str(_id))
        self.metrics_dir = s3_join(self.experiment_dir, "metrics", str(_id))
        self.artifacts_dir = s3_join(self.experiment_dir, "artifacts", str(_id))
        self.resource_dir = s3_join(self.experiment_dir, "resources", str(_id))
        self.source_dir = s3_join(self.experiment_dir, "sources", str(_id))

        self.dirs = (
            self.runs_dir,
            self.metrics_dir,
            self.artifacts_dir,
            self.resource_dir,
            self.source_dir,
        )
        for dir_to_check in self.dirs:
            if objects_exist_in_dir(self.s3, self.bucket_name, dir_to_check):
                # Report the directory that actually collided.
                raise FileExistsError(
                    "S3 dir at {} already exists".format(dir_to_check)
                )

        return _id

    def queued_event(
        self, ex_info, command, host_info, queue_time, config, meta_info, _id
    ):
        """Record a queued run as ``run.json`` and return its id."""
        _id = self._determine_run_dir(_id)

        self.run_entry = {
            "experiment": dict(ex_info),
            "command": command,
            "host": dict(host_info),
            "config": flatten(config),
            "meta": meta_info,
            "status": "QUEUED",
        }
        self.config = config
        self.info = {}

        self.save_json(self.runs_dir, self.run_entry, "run.json")

        return _id

    def started_event(
        self, ex_info, command, host_info, start_time, config, meta_info, _id
    ):
        """Upload the sources, write the initial ``run.json`` and return the id."""
        _id = self._determine_run_dir(_id)
        self.experiment_id = _id

        # Work on a copy so the caller's ex_info is not modified.
        experiment = dict(ex_info)
        experiment["sources"] = self.save_sources(ex_info)

        self.run_entry = {
            "experiment_id": self.experiment_id,
            "experiment": experiment,
            "format": self.VERSION,
            "command": command,
            "host": dict(host_info),
            "start_time": start_time.isoformat(),
            "config": flatten(config),
            "meta": meta_info,
            "status": "RUNNING",
            "resources": [],
            "artifacts": [],
            "captured_out": "",
            "info": {},
            "heartbeat": None,
        }
        self.config = config
        self.info = {}
        self.cout = ""
        self.cout_write_cursor = 0

        self.save_json(self.runs_dir, self.run_entry, "run.json")

        return _id

    def heartbeat_event(self, info, captured_out, beat_time, result):
        """Refresh heartbeat time, captured output and result in ``run.json``."""
        # NOTE(review): info is kept on the instance only; run_entry["info"]
        # is never updated, so it is not persisted. Confirm this is intended.
        self.info = info
        self.run_entry["heartbeat"] = beat_time.isoformat()
        self.run_entry["captured_out"] = captured_out
        self.run_entry["result"] = result
        self.save_json(self.runs_dir, self.run_entry, "run.json")

    def completed_event(self, stop_time, result):
        """Mark the run as COMPLETED and store its result and stop time."""
        self.run_entry["stop_time"] = stop_time.isoformat()
        self.run_entry["result"] = result
        self.run_entry["status"] = "COMPLETED"

        self.save_json(self.runs_dir, self.run_entry, "run.json")

    def interrupted_event(self, interrupt_time, status):
        """Store the interruption time and the given status in ``run.json``."""
        self.run_entry["stop_time"] = interrupt_time.isoformat()
        self.run_entry["status"] = status
        self.save_json(self.runs_dir, self.run_entry, "run.json")

    def failed_event(self, fail_time, fail_trace):
        """Mark the run as FAILED and store the failure time and trace."""
        self.run_entry["stop_time"] = fail_time.isoformat()
        self.run_entry["status"] = "FAILED"
        self.run_entry["fail_trace"] = fail_trace
        self.save_json(self.runs_dir, self.run_entry, "run.json")

    def resource_event(self, filename):
        """Store a resource file and list it in ``run.json``."""
        store_path, _md5sum = self.find_or_save(filename, self.resource_dir)
        self.run_entry["resources"].append([filename, store_path])
        self.save_json(self.runs_dir, self.run_entry, "run.json")

    def artifact_event(self, name, filename, metadata=None, content_type=None):
        """Upload an artifact as ``name`` and list it in ``run.json``.

        ``metadata`` and ``content_type`` are accepted but not used.
        """
        self.save_file(self.artifacts_dir, filename, name)
        self.run_entry["artifacts"].append(name)
        self.save_json(self.runs_dir, self.run_entry, "run.json")

    def log_metrics(self, metrics_by_name, info):
        """Write the given batch of measurements to ``metrics.csv``.

        Each value of ``metrics_by_name`` is turned into a DataFrame, the
        frames are concatenated and an ``experiment_id`` column is added.
        Nothing is written when the batch is empty.
        """
        if not metrics_by_name:
            # pd.concat raises ValueError on an empty list.
            return
        metric_frames = [pd.DataFrame(v) for v in metrics_by_name.values()]
        metrics = pd.concat(metric_frames).reset_index(drop=True)
        metrics['experiment_id'] = self.experiment_id
        # NOTE(review): the object is replaced on every call and the metric
        # names (the dict keys) are not written; confirm both are intended.
        metrics_path = f's3://{self.bucket_name}/{self.metrics_dir}/metrics.csv'
        metrics.to_csv(metrics_path, index=False)

    def __eq__(self, other):
        """Observers are equal when they share bucket and experiment directory."""
        if isinstance(other, AWSLakeObserver):
            return (
                self.bucket_name == other.bucket_name
                and self.experiment_dir == other.experiment_dir
            )
        return False

In [78]:
# Without an explicit region the constructor either succeeds (a region is
# configured for the boto3 session) or fails with an error about the region.
try:
    observer = AWSLakeObserver(bucket_name, experiment_dir)
except ValueError as ve:
    assert 'region' in str(ve).lower()
observer = AWSLakeObserver(bucket_name, experiment_dir, region='eu-west-1')
assert observer.region == 'eu-west-1'
# Do not check for missing bucket yet
observer = AWSLakeObserver(missing_bucket, experiment_dir, region='eu-west-1')
# An invalid bucket name must always be rejected; fail the cell if it is not,
# rather than passing silently when no exception is raised.
try:
    observer = AWSLakeObserver(invalid_bucket, experiment_dir, 
                               region='eu-west-1')
except ValueError as ve:
    assert 'naming' in str(ve).lower()
else:
    raise AssertionError('an invalid bucket name should raise ValueError')

In [91]:
from sacred import Experiment
from sacred.run import Run

# Experiment used to exercise the observer end to end.
ex = Experiment('test-lake-obs', interactive=True)

# Every object written by this test lives under this key prefix; the
# clean-up cell further down deletes by the same prefix.
test_key_prefix = 'discovery/experiments/test/'
# NOTE(review): the bucket name is hard-coded here, while other cells use the
# `bucket_name` variable; confirm both refer to the same bucket.
obs = AWSLakeObserver(bucket_name='s3bawspprwe1chatbotunpub01', 
                   experiment_dir=f'{test_key_prefix}lake_observer',
                   region='eu-west-1')

# Attach the observer so the run below is reported to it.
ex.observers.append(obs)

# Configuration registered with the experiment through @ex.config; `message`
# is the value my_main receives and prints.
@ex.config
def my_config():
    recipient = "test"
    message = "Hello %s!" % recipient

@ex.main
def my_main(message, _run: Run):
    """Attach two artifacts, log two scalars at step 0 and print the message."""
    artifact_paths = ('data/requirements.txt', 'data/dataframe_artifact.csv')
    for artifact_path in artifact_paths:
        _run.add_artifact(artifact_path)

    scalars_at_step_zero = (('rsme', 1.79), ('another one', 9.12))
    for metric_name, metric_value in scalars_at_step_zero:
        _run.log_scalar(metric_name, metric_value, 0)

    print(message)

In [112]:
# Run the experiment's main command (my_main) with the lake observer attached.
ex.run()

INFO:test-lake-obs:Running command 'my_main'
INFO:test-lake-obs:Started run with ID "3"
INFO:test-lake-obs:Completed after 0:00:00


Hello test!


<sacred.run.Run at 0x7fa9b7ed82d0>

In [114]:
# boto3 is also imported in the exported import cell near the top.
import boto3
# S3 resource handle used by the clean-up cell below.
s3_res = boto3.resource('s3')

In [113]:
# delete_dir doesn't seem to always work.. why not?
# TODO: investigate delete_dir; until then the test objects are removed here
# by deleting every object whose key starts with test_key_prefix.
# NOTE(review): assumes bucket_name is the bucket the test observer wrote to
# - confirm.
bucket = s3_res.Bucket(bucket_name)
bucket.objects.filter(Prefix=test_key_prefix).delete()

[{'ResponseMetadata': {'RequestId': 'XZ67G2706K98PZBY',
   'HostId': 'Fut0qvcoUC0sX3zr8KwMSCbC+U888m4OCB4xmSfKg/s3WNjqVN88JiXoCQJio8obkRxAgFyUx8M=',
   'HTTPStatusCode': 200,
   'HTTPHeaders': {'x-amz-id-2': 'Fut0qvcoUC0sX3zr8KwMSCbC+U888m4OCB4xmSfKg/s3WNjqVN88JiXoCQJio8obkRxAgFyUx8M=',
    'x-amz-request-id': 'XZ67G2706K98PZBY',
    'date': 'Sun, 14 Mar 2021 10:58:34 GMT',
    'content-type': 'application/xml',
    'transfer-encoding': 'chunked',
    'server': 'AmazonS3',
    'connection': 'close'},
   'RetryAttempts': 0},
  'Deleted': [{'Key': 'discovery/experiments/test/lake_observer/runs/2/run.json',
    'DeleteMarker': True,
    'DeleteMarkerVersionId': '7pMG_ItD3yJzjSQBd3KASyOkCRoZUXsd'},
   {'Key': 'discovery/experiments/test/lake_observer/artifacts/1/requirements.txt',
    'DeleteMarker': True,
    'DeleteMarkerVersionId': 'k4oizgXkkiiji1JnzoHPruXuVtfLeEP.'},
   {'Key': 'discovery/experiments/test/lake_observer/metrics/1/metrics.csv',
    'DeleteMarker': True,
    'DeleteMarker

In [33]:
# export
@cli_option("-L", "--lake")
def lake_option(args, run):
    """Add a Data Lake observer to the experiment.

    The argument value should be `s3://<bucket>/path/to/exp`.
    """
    # The docstring above is left verbatim: presumably sacred shows it as the
    # help text of the command-line option - confirm before rewording.
    location = re.match(r"s3:\/\/([^\/]*)\/(.*)", args)
    # An unmatched argument yields no parts and is rejected by the same
    # length check as a malformed match.
    parts = () if location is None else location.groups()
    if len(parts) != 2:
        raise ValueError(
            "Valid bucket specification not found. "
            "Enter bucket and directory path like: "
            "s3://<bucket>/path/to/exp"
        )
    bucket_name, experiment_dir = parts
    lake_observer = AWSLakeObserver(
        bucket_name=bucket_name, experiment_dir=experiment_dir
    )
    run.observers.append(lake_observer)