In [1]:
# default_exp experiment.lake_experiment_loader

# Sacred Data Lake Experiment Loader

> This class extends the `incense` project to allow you to load `sacred` experiments from a data lake store such as S3. It is assumed that an ODBC SQL driver exists for this lake source.

> NOTE: Initially, this class supports only S3 and turbodbc.

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
# export

import json
import os
import uuid
from functools import lru_cache
from typing import Tuple

import boto3
import numpy as np
import pandas as pd
from pandas.io.sql import DatabaseError
from sciflow.experiment.lake_experiment import CSVArtifact, LakeExperiment
from sciflow.s3_utils import delete_dir
from sciflow.utils import odbc_connect, prepare_env, query
from tinydb import Query, TinyDB
from tinydb.storages import MemoryStorage

# `maxsize` passed to every `lru_cache`-decorated finder method of `LakeExpLoader`.
MAX_CACHE_SIZE = 32

# Setup

In [4]:
# Experiment and project names shared by the loader, the observer and the S3 clean-up below.
experiment_name = "lake_experiment_loader"
project = "sciflow"

In [5]:
# NOTE(review): presumably sets up the environment (e.g. the SCIFLOW_BUCKET variable
# that later cells read from `os.environ`) — confirm against `sciflow.utils.prepare_env`.
prepare_env()

In [6]:
# Start from a clean slate: `delete_dir` is called on this experiment's key prefix in the
# bucket. NOTE(review): presumably this removes runs left by earlier executions, which the
# exact-count assertions further down rely on — confirm against `sciflow.s3_utils.delete_dir`.
s3 = boto3.resource("s3")
delete_dir(s3, os.environ["SCIFLOW_BUCKET"], f"sciflow/experiments/{experiment_name}")

In [7]:
# export
class LakeExpLoader:
    """Find `sacred` experiment runs stored in a data lake through a SQL connection.

    Runs are read from the table ``<bucket_table_alias>.<key prefix with "/" as ".">.
    <experiment_name>.runs`` and filtered on its ``dir0`` column, which holds the
    experiment id. Every matching row is wrapped in a `LakeExperiment`.

    Note on caching: the finder methods are decorated with `functools.lru_cache`
    at class level, so cached results are keyed on ``(self, *args)``, all arguments
    must be hashable (pass ids as a tuple, not a list) and cached lists are shared
    between callers. Call `cache_clear` to force fresh queries.
    """

    def __init__(
        self,
        project,
        experiment_name,
        experiments_key_prefix=None,
        connection=None,
        bucket_name=None,
        bucket_table_alias=None,
    ):
        """Resolve connection, bucket and table names for one experiment.

        Args:
            project: Project name; also used to build the default key prefix.
            experiment_name: Default experiment whose runs are queried.
            experiments_key_prefix: Key prefix holding the experiments; defaults to
                ``"<project>/experiments"``.
            connection: Existing connection to reuse; when None, `odbc_connect()` is called.
            bucket_name: Bucket name; defaults to ``os.environ["SCIFLOW_BUCKET"]``.
            bucket_table_alias: Table alias of the bucket; defaults to
                ``os.environ["SCIFLOW_BUCKET_TABLE_ALIAS"]``.

        Raises:
            KeyError: If a default is needed and the environment variable is not set.
        """
        self.project = project
        self.experiment_name = experiment_name
        self.connection = odbc_connect() if connection is None else connection
        self.bucket_name = (
            os.environ["SCIFLOW_BUCKET"] if bucket_name is None else bucket_name
        )
        self.bucket_table_alias = (
            os.environ["SCIFLOW_BUCKET_TABLE_ALIAS"]
            if bucket_table_alias is None
            else bucket_table_alias
        )
        self.experiments_key_prefix = (
            f"{project}/experiments"
            if experiments_key_prefix is None
            else experiments_key_prefix
        )
        # The key prefix uses "/" separators while the SQL table path uses ".".
        table_path = self.experiments_key_prefix.replace("/", ".")
        self.table_context = f"{self.bucket_table_alias}.{table_path}"
        self.remote_path = (
            f"{self.bucket_name}/{self.experiments_key_prefix}/{self.experiment_name}"
        )
        self.lake_table = f"{self.table_context}.{self.experiment_name}"

    @staticmethod
    def _sql_literal(value) -> str:
        """Render `value` as a single-quoted SQL string literal, doubling embedded quotes."""
        escaped = str(value).replace("'", "''")
        return f"'{escaped}'"

    @lru_cache(maxsize=MAX_CACHE_SIZE)
    def _find(
        self,
        experiment_name=None,
        experiment_ids=None,
        experiment_id: str = None,
        order_by: str = None,
        limit: int = None,
    ) -> list:
        """Query the runs table and wrap each returned row in a `LakeExperiment`.

        Args:
            experiment_name: Experiment to query; defaults to ``self.experiment_name``.
            experiment_ids: Hashable collection (e.g. tuple) of ids matched with ``IN``.
                An empty collection returns ``[]`` without querying.
            experiment_id: Single id matched with ``=``. Combined with
                `experiment_ids` using ``and`` when both are given.
            order_by: Column to sort by, always descending. Inserted into the
                statement verbatim, so it must be a trusted identifier.
            limit: Maximum number of rows; ignored when falsy.

        Returns:
            A list of `LakeExperiment` objects, one per returned row.
        """
        if experiment_name is None:
            experiment_name = self.experiment_name
        # An empty id collection can match nothing; `IN ()` is not valid SQL and
        # dropping the filter would wrongly return every run.
        if experiment_ids is not None and len(experiment_ids) == 0:
            return []
        table_name = f"{self.table_context}.{experiment_name}.runs"
        # TODO Dremio Specific code in utils.py
        query(self.connection, f"ALTER TABLE {table_name} REFRESH METADATA")

        conditions = []
        if experiment_ids is not None:
            id_list = ", ".join(self._sql_literal(i) for i in experiment_ids)
            conditions.append(f"dir0 IN ({id_list})")
        if experiment_id is not None:
            conditions.append(f"dir0 = {self._sql_literal(experiment_id)}")

        query_stmt = f"select * from {table_name}"
        if conditions:
            query_stmt += " where " + " and ".join(conditions)
        if order_by:
            query_stmt += f" order by {order_by} desc"
        if limit:
            query_stmt += f" limit {limit}"
        data = query(self.connection, query_stmt)

        experiments = []
        for i, ex_id in enumerate(data.dir0.tolist()):
            # Convert the row once; it supplies both the start time and the raw data.
            row = data.iloc[i, :].to_dict()
            experiments.append(
                LakeExperiment(
                    self.bucket_name,
                    self.experiments_key_prefix,
                    experiment_name,
                    ex_id,
                    row["start_time"],
                    row,
                )
            )
        return experiments

    @lru_cache(maxsize=MAX_CACHE_SIZE)
    def find_by_id(self, experiment_id):
        """Return the run whose id equals `experiment_id`, or None when there is none."""
        if experiment_id is None:
            # Without an id there is nothing to match; do not fall back to "all runs".
            return None
        experiments = self._find(experiment_id=experiment_id)
        return None if len(experiments) == 0 else experiments[0]

    @lru_cache(maxsize=MAX_CACHE_SIZE)
    def find_by_ids(self, experiment_ids: Tuple[str, ...]):
        """Return the runs whose ids are in `experiment_ids` (a tuple, so it is hashable).

        Raises:
            ValueError: If exactly one id is given; use `find_by_id` for that case.
        """
        if len(experiment_ids) == 1:
            raise ValueError("Use find_by_id for a single experiment")
        return self._find(experiment_ids=experiment_ids)

    @lru_cache(maxsize=MAX_CACHE_SIZE)
    def find_latest(self, n=5):
        """Return up to `n` runs ordered by ``start_time`` descending."""
        return self._find(order_by="start_time", limit=n)

    @lru_cache(maxsize=MAX_CACHE_SIZE)
    def find_all(self):
        """Return every run of the loader's default experiment."""
        return self._find()

    @lru_cache(maxsize=MAX_CACHE_SIZE)
    def find_by_name(self, experiment_name):
        """Return every run of `experiment_name`.

        A `PermissionError` is reported with a printed message and results in None;
        any other exception raised by the query propagates to the caller.
        """
        result = None
        try:
            result = self._find(experiment_name=experiment_name)
        except PermissionError:
            print("File not found or access not granted; check path information")
        return result

    def insert_docs(self, db, prop_name):
        """Insert the JSON document stored under `prop_name` of every run into `db`.

        Each document is parsed with `json.loads` and tagged with an
        ``experiment_id`` key so that matches can be mapped back to their run.
        """
        experiments = self.find_all()
        for ex in experiments:
            document = json.loads(ex._data[prop_name])
            document["experiment_id"] = ex.experiment_id
            db.insert(document)

    def find_by_key(self, prop_name, key, value):
        """Find runs whose `prop_name` document has ``document[key] == value``.

        Returns:
            None when nothing matches, a single `LakeExperiment` for exactly one
            match, otherwise the list returned by `find_by_ids`.
        """
        # A throw-away in-memory TinyDB is used purely as a document query engine.
        db = TinyDB(storage=MemoryStorage)
        self.insert_docs(db, prop_name)
        Experiment = Query()
        docs = list(db.search(Experiment[key] == value))
        if len(docs) == 0:
            return None
        if len(docs) == 1:
            return self.find_by_id(docs[0]["experiment_id"])
        return self.find_by_ids(tuple(d["experiment_id"] for d in docs))

    def find_by_config_key(self, key, value):
        """Shortcut for `find_by_key` on the ``"config"`` document."""
        return self.find_by_key("config", key, value)

    def cache_clear(self):
        """Clear all caches of all find functions.
        Useful when you want to see the updates to your database."""
        self._find.cache_clear()
        self.find_all.cache_clear()
        self.find_by_id.cache_clear()
        self.find_by_ids.cache_clear()
        self.find_by_name.cache_clear()
        self.find_latest.cache_clear()

    def __repr__(self):
        return (
            f"Project: {self.project}\n"
            f"Experiment: {self.experiment_name}\n"
            f"Remote Path: {self.remote_path}\n"
            f"Lake Table: {self.lake_table}"
        )

In [8]:
# Loader built with defaults: `LakeExpLoader.__init__` opens the connection via
# `odbc_connect()` and reads the bucket and table alias from the SCIFLOW_* environment variables.
loader = LakeExpLoader(project=project, experiment_name=experiment_name)

In [9]:
loader

Project: sciflow
Experiment: lake_experiment_loader
Remote Path: pprsandboxpdlras3/sciflow/experiments/lake_experiment_loader
Lake Table: ra_s3.sciflow.experiments.lake_experiment_loader

In [10]:
# The remote path is "<bucket>/<project>/experiments/<experiment>".
expected_remote_path = "/".join(
    [os.environ["SCIFLOW_BUCKET"], project, "experiments", experiment_name]
)
assert loader.remote_path == expected_remote_path

# The lake table is the same location expressed as a dotted table path under the alias.
expected_lake_table = ".".join(
    [os.environ["SCIFLOW_BUCKET_TABLE_ALIAS"], project, "experiments", experiment_name]
)
assert loader.lake_table == expected_lake_table

In [11]:
# Loader for a randomly named experiment, used below to exercise the lookup of an
# experiment that is expected not to exist.
# NOTE(review): the random suffix is unseeded, so the name differs on every run.
missing_loader = LakeExpLoader(
    project, f"generated_experiment_name_{np.random.randint(10**5)}"
)

In [12]:
missing_loader

Project: sciflow
Experiment: generated_experiment_name_47624
Remote Path: pprsandboxpdlras3/sciflow/experiments/generated_experiment_name_47624
Lake Table: ra_s3.sciflow.experiments.generated_experiment_name_47624

# Create Test Data

In [13]:
# Setup Experiments
from sacred import Experiment
from sacred.run import Run
from sciflow.experiment.lake_observer import AWSLakeObserver

# Sacred experiment (created with `interactive=True`) that produces the runs queried by `loader`.
ex = Experiment(experiment_name, interactive=True)

# Observer configured with the same project, experiment name and bucket that `loader` uses.
obs = AWSLakeObserver(
    project, experiment_name=experiment_name, bucket_name=os.environ["SCIFLOW_BUCKET"]
)

ex.observers.append(obs)


# Configuration for the test experiment; later cells look runs up by `recipient`
# through `loader.find_by_config_key("recipient", ...)`.
# NOTE(review): the bare f-string on the last line of the body has no effect —
# confirm whether it can be removed.
@ex.config
def my_config():
    recipient = "test"
    message = f"Hello {recipient}!"
    f"{message}"


# Main function of each run: attaches two artifacts, logs the scalar "another one"
# with value 9.12 and prints the configured message. The assertions on `ex1.metrics`
# and `ex1.artifacts` further down depend on exactly these calls.
@ex.main
def my_main(message, _run: Run):
    _run.add_artifact("test/requirements-generated.txt")
    _run.add_artifact("test/dataframe_artifact.csv")
    _run.log_scalar("another one", 9.12, 0)
    print(message)


# Two unique run ids built from UUID4 values with dashes replaced by underscores.
# NOTE(review): presumably the underscores keep the ids valid as lake directory/table
# names — confirm.
sample_id_1 = str(uuid.uuid4()).replace("-", "_")
sample_id_2 = str(uuid.uuid4()).replace("-", "_")

In [14]:
# Run the experiment twice, passing each sample id as `run_id` in `meta_info`.
# `sample_id_2` is started last, which the `find_latest` assertions below depend on.
ex.run(meta_info={"run_id": sample_id_1})
ex.run(meta_info={"run_id": sample_id_2})

INFO - lake_experiment_loader - Running command 'my_main'
INFO - lake_experiment_loader - Started
INFO - lake_experiment_loader - Completed after 0:00:01


Hello test!


INFO - lake_experiment_loader - Running command 'my_main'
INFO - lake_experiment_loader - Started
INFO - lake_experiment_loader - Completed after 0:00:00


Hello test!


<sacred.run.Run at 0x7f830d89f8b0>

In [15]:
# NOTE(review): this cell succeeds whether or not `find_all` raises, so it only
# documents that a `DatabaseError` is tolerated when the experiment does not exist.
try:
    missing_loader.find_all()
    # TODO clean up error messaging
except DatabaseError:
    pass



In [16]:
# An id that matches no run yields None (`find_by_id` returns None for an empty result).
assert loader.find_by_id(1) is None



In [17]:
# Load the first run by id and check the metric and artifacts written by `my_main`.
ex1 = loader.find_by_id(sample_id_1)

metrics = ex1.metrics
assert len(metrics) == 1
assert metrics["values"].iloc[0] == 9.12
assert type(metrics) == pd.DataFrame

artifacts = list(ex1.artifacts.values())
assert len(artifacts) == 2
assert all(type(artifact) == CSVArtifact for artifact in artifacts)



In [18]:
# `find_by_ids` must reject a single-element tuple (callers should use `find_by_id`).
# The `else` branch makes the cell fail if no ValueError is raised, instead of
# passing silently.
single_id = (sample_id_1,)
try:
    loader.find_by_ids(single_id)
except ValueError:
    pass
else:
    raise AssertionError("find_by_ids should raise ValueError for a single id")

# With both ids, both runs are returned.
ex_ids = (sample_id_1, sample_id_2)
assert len(loader.find_by_ids(ex_ids)) == 2



In [19]:
# `find_latest` orders by `start_time` descending, so the run started last
# (`sample_id_2`) comes first; `n` caps the number of runs returned.
assert [ex.experiment_id for ex in loader.find_latest()] == [sample_id_2, sample_id_1]
assert [ex.experiment_id for ex in loader.find_latest(n=1)] == [sample_id_2]



In [20]:
# `find_all` returns exactly the two runs created above; order is not asserted.
all_runs = loader.find_all()
assert len(all_runs) == 2

found_ids = sorted(run.experiment_id for run in all_runs)
expected_ids = sorted([sample_id_2, sample_id_1])
assert found_ids == expected_ids



In [21]:
# Looking up an experiment name with no table either raises `DatabaseError` from the
# query, or — when `find_by_name` swallows a `PermissionError` — returns None.
# The result itself is compared with None: `len(...)` can never be None and would
# raise `TypeError` on a None result.
try:
    assert loader.find_by_name("laketest") is None
except DatabaseError:
    print("Table not found")

Table not found




In [22]:
# Looking up the experiment's own name returns both runs created above.
assert len(loader.find_by_name(experiment_name)) == 2



In [23]:
# Both runs were configured with recipient == "test"; a value no run has gives None.
assert len(loader.find_by_config_key("recipient", "test")) == 2
assert loader.find_by_config_key("recipient", "hello") is None



In [24]:
# `find_by_key` works on any JSON document stored with a run, here "experiment":
# both runs match on the experiment name, while an unmatched value gives None.
assert len(loader.find_by_key("experiment", "name", "lake_experiment_loader")) == 2
assert loader.find_by_key("experiment", "mainfile", "extest.py") is None