In [None]:
# default_exp experiment.lake_experiment_loader

# Sacred Data Lake Experiment Loader

> This class extends the `incense` project to allow you to load `sacred` experiments from a data lake store such as S3. It is assumed that an ODBC SQL driver exists for this lake source.

> NOTE: initially this class supports S3 & turbodbc only

In [None]:
# Enable IPython's autoreload extension in mode 2 so that edited modules are
# re-imported automatically before each cell is executed.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
# export

import json
from functools import lru_cache
from typing import Tuple

import numpy as np
import pandas as pd
from incense.artifact import CSVArtifact
from incense.experiment import Experiment
from nbdev import Config
from sciflow.experiment.lake_experiment import LakeExperiment
from sciflow.utils import odbc_connect, query_odbc
from tinydb import Query, TinyDB
from tinydb.storages import MemoryStorage
from turbodbc.exceptions import DatabaseError

# Upper bound on the number of results kept by each `lru_cache` that wraps a
# finder method of the loader defined in this module.
MAX_CACHE_SIZE = 32

In [None]:
# export
class LakeExpLoader:
    """Load `sacred` experiment runs from a data lake table through an ODBC connection.

    The finder methods are memoised with `functools.lru_cache`, therefore every
    argument passed to them must be hashable (pass experiment ids as a tuple,
    not as a list). The caches are attached to the methods on the class, so
    they are shared by all loader instances and `cache_clear` empties them for
    all of them.

    NOTE: table names, ids, `order_by` and `limit` are interpolated into the
    SQL text as-is; only pass trusted values.
    """

    def __init__(
        self,
        experiment_name,
        experiments_key_prefix=None,
        connection=None,
        bucket_name=None,
        bucket_table_alias=None,
    ):
        """Resolve the lake location of `experiment_name`.

        Args:
            experiment_name: Name of the experiment whose runs are loaded.
            experiments_key_prefix: Key prefix below the bucket; defaults to
                "<lib_name>/experiments" taken from the nbdev `Config`.
            connection: Existing ODBC connection; `odbc_connect()` is called
                when omitted.
            bucket_name: Bucket holding the experiments; defaults to
                `config.bucket`.
            bucket_table_alias: Alias under which the bucket is exposed to SQL;
                defaults to `config.bucket_table_alias`.
        """
        config = Config()
        lib_name = config.lib_name
        self.experiment_name = experiment_name
        self.connection = odbc_connect() if connection is None else connection
        self.bucket_name = config.bucket if bucket_name is None else bucket_name
        self.bucket_table_alias = (
            config.bucket_table_alias
            if bucket_table_alias is None
            else bucket_table_alias
        )
        self.experiments_key_prefix = (
            f"{lib_name}/experiments"
            if experiments_key_prefix is None
            else experiments_key_prefix
        )
        # The SQL table path mirrors the object key with "/" replaced by ".".
        table_path = self.experiments_key_prefix.replace("/", ".")
        self.table_context = f"{self.bucket_table_alias}.{table_path}"
        self.remote_path = (
            f"{self.bucket_name}/{self.experiments_key_prefix}/{self.experiment_name}"
        )
        self.lake_table = f"{self.table_context}.{self.experiment_name}"

    def _runs_query(self, experiment_name, experiment_ids, experiment_id, order_by, limit):
        """Build the SQL text selecting rows from the `runs` table of `experiment_name`."""
        query = f"select * from {self.table_context}.{experiment_name}.runs"
        # Collect all filters first so that exactly one WHERE clause is emitted
        # even when both an id list and a single id are supplied.
        conditions = []
        if experiment_ids is not None:
            ids = ", ".join(str(i) for i in experiment_ids)
            conditions.append(f"dir0 IN ({ids})")
        # `is not None` keeps an id of 0 as a real filter instead of dropping it.
        if experiment_id is not None:
            conditions.append(f"dir0 = {experiment_id}")
        if conditions:
            query += " where " + " and ".join(conditions)
        if order_by:
            query += f" order by {order_by} desc"
        # `is not None` so that an explicit limit of 0 is honoured.
        if limit is not None:
            query += f" limit {limit}"
        return query

    @lru_cache(maxsize=MAX_CACHE_SIZE)
    def _find(
        self,
        experiment_name=None,
        experiment_ids=None,
        experiment_id: int = None,
        order_by: str = None,
        limit: int = None,
    ) -> list:
        """Query the runs table and wrap every returned row in a `LakeExperiment`.

        Args:
            experiment_name: Experiment to query; defaults to the loader's own.
            experiment_ids: Hashable iterable of ids matched against `dir0`.
                An empty iterable yields an empty result without querying.
            experiment_id: Single id matched against `dir0`.
            order_by: Column to sort by, in descending order.
            limit: Maximum number of rows to return.

        Returns:
            A list of `LakeExperiment` objects, one per row; the row's `dir0`
            value is used as the experiment id.
        """
        if experiment_name is None:
            experiment_name = self.experiment_name
        if experiment_ids is not None:
            experiment_ids = tuple(experiment_ids)
            if not experiment_ids:
                # Asking for "no ids" must not fall through to "all runs".
                return []
        query = self._runs_query(
            experiment_name, experiment_ids, experiment_id, order_by, limit
        )
        data = query_odbc(self.connection, query)
        experiments = [
            LakeExperiment(
                self.bucket_name,
                self.experiments_key_prefix,
                experiment_name,
                ex_id,
                data.iloc[i, :].to_dict(),
            )
            for i, ex_id in enumerate(data.dir0.tolist())
        ]
        return experiments

    @lru_cache(maxsize=MAX_CACHE_SIZE)
    def find_by_id(self, experiment_id):
        """Return the run whose id equals `experiment_id`, or None when there is none."""
        experiments = self._find(experiment_id=experiment_id)
        return None if len(experiments) == 0 else experiments[0]

    @lru_cache(maxsize=MAX_CACHE_SIZE)
    def find_by_ids(self, experiment_ids: Tuple[int, ...]):
        """Return the runs whose ids are in `experiment_ids` (a tuple, for caching).

        Raises:
            ValueError: If exactly one id is given; use `find_by_id` instead.
        """
        if len(experiment_ids) == 1:
            raise ValueError("Use find_by_id for a single experiment")
        return self._find(experiment_ids=experiment_ids)

    @lru_cache(maxsize=MAX_CACHE_SIZE)
    def find_latest(self, n=5):
        """Return up to `n` runs ordered by descending `dir0`."""
        return self._find(order_by="dir0", limit=n)

    @lru_cache(maxsize=MAX_CACHE_SIZE)
    def find_all(self):
        """Return every run of the loader's experiment."""
        return self._find()

    @lru_cache(maxsize=MAX_CACHE_SIZE)
    def find_by_name(self, experiment_name):
        """Return every run of `experiment_name` within the same table context.

        Returns None, after printing a hint, when the query raises
        `PermissionError`; other exceptions propagate to the caller.
        """
        result = None
        try:
            result = self._find(experiment_name=experiment_name)
        except PermissionError:
            print("File not found or access not granted; check path information")
        return result

    def insert_docs(self, db, prop_name):
        """Insert the JSON document stored under `prop_name` of every run into `db`.

        Each inserted document is tagged with the run's `experiment_id`.
        """
        experiments = self.find_all()
        for ex in experiments:
            document = json.loads(ex._data[prop_name])
            document["experiment_id"] = ex.experiment_id
            db.insert(document)

    def find_by_key(self, prop_name, key, value):
        """Find runs whose JSON property `prop_name` has `key` equal to `value`.

        Returns:
            None when nothing matches, a single experiment for exactly one
            match, otherwise the result of `find_by_ids` for all matches.
        """
        db = TinyDB(storage=MemoryStorage)
        self.insert_docs(db, prop_name)
        # Named `doc_query` so the imported `Experiment` class is not shadowed.
        doc_query = Query()
        docs = list(db.search(doc_query[key] == value))
        if len(docs) == 0:
            return None
        if len(docs) == 1:
            return self.find_by_id(docs[0]["experiment_id"])
        return self.find_by_ids(tuple(d["experiment_id"] for d in docs))

    def find_by_config_key(self, key, value):
        """Find runs whose `config` document has `key` equal to `value`."""
        return self.find_by_key("config", key, value)

    def cache_clear(self):
        """Clear all caches of all find functions.
        Useful when you want to see the updates to your database."""
        self._find.cache_clear()
        self.find_all.cache_clear()
        self.find_by_id.cache_clear()
        self.find_by_ids.cache_clear()
        self.find_by_name.cache_clear()
        self.find_latest.cache_clear()

    def __repr__(self):
        """Summarise the experiment name, its remote path and its lake table."""
        return (
            f"Experiment: {self.experiment_name}\n"
            f"Remote Path: {self.remote_path}\n"
            f"Lake Table: {self.lake_table}"
        )

In [None]:
# Location of the test experiment inside the data lake.
experiment_name = "lake_observer"  # should eventually default to the flow name
experiments_key_prefix = "discovery/experiments/test"  # loader default: lib_name/experiments
bucket_table_alias = "chatbot_unpublish_s3"
bucket_name = "s3bawspprwe1chatbotunpub01"  # loader default: the configured bucket

In [None]:
# Build the loader for the test experiment. Every lake coordinate is passed
# explicitly; only the ODBC connection is left to its default.
loader = LakeExpLoader(
    bucket_name=bucket_name,
    bucket_table_alias=bucket_table_alias,
    experiments_key_prefix=experiments_key_prefix,
    experiment_name=experiment_name,
)

ODBC_DRIVER /opt/dremio-odbc/lib64/libdrillodbc_sb64.so
ODBC_USER E02079
ODBC_PWD <redacted>
ODBC_PORT 31010
ODBC_HOST dremio-master-0.dremio-cluster-pod.default.svc.cluster.local
SSL_CERTS /etc/ssl/certs/ca-certificates.crt


In [None]:
# Display the loader's repr: experiment name, remote path and lake table.
loader

Experiment: lake_observer
Remote Path: s3bawspprwe1chatbotunpub01/discovery/experiments/test/lake_observer
Lake Table: chatbot_unpublish_s3.discovery.experiments.test.lake_observer

In [None]:
# The derived locations must combine bucket (or its SQL alias), key prefix and
# experiment name.
assert loader.remote_path == (
    "s3bawspprwe1chatbotunpub01/discovery/experiments/test/lake_observer"
)
assert loader.lake_table == (
    "chatbot_unpublish_s3.discovery.experiments.test.lake_observer"
)

In [None]:
# Loader for an experiment name with a random numeric suffix; all other lake
# coordinates fall back to the nbdev Config defaults. Presumably no table
# exists for such a name, which the following cells rely on.
missing_loader = LakeExpLoader(f"generated_experiment_name_{np.random.randint(10**5)}")

In [None]:
# Display the repr to show the default-derived remote path and lake table.
missing_loader

Experiment: generated_experiment_name_69049
Remote Path: pprsandboxpdlras3/sciflow/experiments/generated_experiment_name_69049
Lake Table: ra_s3.sciflow.experiments.generated_experiment_name_69049

In [None]:
# Querying an experiment table that does not exist is expected to fail with a
# DatabaseError. The `else` branch makes the cell fail if no error is raised,
# so the expectation is actually checked instead of silently passing.
try:
    missing_loader.find_all()
    # TODO clean up error messaging
except DatabaseError:
    pass
else:
    raise AssertionError("Expected DatabaseError for a missing experiment table")

In [None]:
# Run 1 must expose a metrics DataFrame of length 2 and two artifacts, each of
# them a CSVArtifact.
ex1 = loader.find_by_id(1)
assert len(ex1.metrics) == 2
assert type(ex1.metrics) is pd.DataFrame
assert len(ex1.artifacts.values()) == 2
assert all(type(art) is CSVArtifact for art in ex1.artifacts.values())

In [None]:
# find_by_ids rejects a single-element tuple (find_by_id must be used instead).
# The `else` branch fails the cell if the ValueError is not raised.
try:
    loader.find_by_ids((1,))
except ValueError:
    pass
else:
    raise AssertionError("find_by_ids should raise ValueError for a single id")

# With more than one id, every requested run is returned.
ex_ids = (1, 3)
assert len(loader.find_by_ids(ex_ids)) == 2

In [None]:
# find_latest returns runs in descending id order: five by default, otherwise
# at most `n`.
assert list(
    int(run.experiment_id) for run in loader.find_latest()
) == [5, 4, 3, 2, 1]
assert list(
    int(run.experiment_id) for run in loader.find_latest(n=2)
) == [5, 4]

In [None]:
# find_all returns all five runs of the test experiment; the ids are sorted
# before comparing because no ordering is requested.
assert len(loader.find_all()) == 5
assert sorted(
    int(run.experiment_id) for run in loader.find_all()
) == [1, 2, 3, 4, 5]

In [None]:
# Time a repeated find_all call; the method is wrapped in lru_cache, so a
# repeated call can be answered from the cache.
%time assert len(loader.find_all()) == 5

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 9.54 µs


In [None]:
# Drop every cached finder result so the next call queries the lake again.
loader.cache_clear()

In [None]:
# Time find_all again for comparison with a cached call.
%time assert len(loader.find_all()) == 5

CPU times: user 336 ms, sys: 4 ms, total: 340 ms
Wall time: 1.82 s


In [None]:
# Looking up an experiment name that has no table: either find_by_name returns
# None (the PermissionError path inside the loader) or the driver raises a
# DatabaseError. The previous `len(...) is None` check could never be true and
# would raise TypeError on a None result.
try:
    assert loader.find_by_name("laketest") is None
except DatabaseError:
    print("Table not found")

Table not found


In [None]:
# Passing the loader's own experiment name explicitly must return all five runs.
assert len(loader.find_by_name("lake_observer")) == 5

In [None]:
# find_by_config_key matches on the JSON `config` document of each run:
# all five runs have recipient == "test"; a value nobody has yields None.
assert len(loader.find_by_config_key("recipient", "test")) == 5
assert loader.find_by_config_key("recipient", "hello") is None

In [None]:
# find_by_key works on any JSON property of a run, here the `experiment`
# document: all five runs share the same name; a non-matching value yields None.
assert len(loader.find_by_key("experiment", "name", "test-lake-obs")) == 5
assert loader.find_by_key("experiment", "mainfile", "extest.py") is None