In [3]:
# default_exp lake_experiment_loader

# Sacred Data Lake Experiment Loader

> This class extends the `incense` project so that you can load `sacred` experiments from a data lake store such as S3. It assumes that an ODBC SQL driver exists for the lake source.

> NOTE: initially this class supports S3 & turbodbc only

In [4]:
# Load IPython's autoreload extension and set it to mode 2.
%load_ext autoreload
%autoreload 2

In [5]:
import json
from functools import lru_cache
from typing import Tuple

import pandas as pd
from incense.artifact import CSVArtifact
from incense.experiment import Experiment
from sciflow.utils import load_dremio_access
from text_discovery.lake_experiment import LakeExperiment
from tinydb import Query, TinyDB
from tinydb.storages import MemoryStorage
from turbodbc.exceptions import DatabaseError

In [6]:
# `maxsize` passed to every `lru_cache` decorator on LakeExperimentLoader's find methods.
MAX_CACHE_SIZE = 32

In [7]:
# Settings used by the example/test cells below to build a LakeExperimentLoader.
dremio_access = load_dremio_access()  # object the loader uses to run SQL (`read_sql_to_dataframe`)
bucket_name = "s3bawspprwe1chatbotunpub01"
# NOTE(review): project_dir is not referenced in the visible cells and equals experiments_dir — confirm it is needed.
project_dir = "discovery/experiments/test"
experiments_dir = "discovery/experiments/test"
experiment_name = "lake_observer"
# Prefix of the `<experiment_name>.runs` table that the loader queries.
table_context = '"chatbot_unpublish_s3".discovery.experiments.test'

In [8]:
# export
class LakeExperimentLoader:
    """Load `sacred` experiments whose run records are queryable through SQL.

    Run rows are read from the table ``{table_context}.{experiment_name}.runs``
    via ``dremio_access.read_sql_to_dataframe`` and each row is wrapped in a
    ``LakeExperiment``.

    Caching: the ``find*`` methods are memoised with ``functools.lru_cache``.
    Because the decorator is applied at class level, the caches are shared by
    all instances (``self`` is part of the cache key) and they keep cached
    instances alive. Arguments must be hashable (pass ids as a tuple, not a
    list). Returned lists are the cached objects themselves, so callers should
    not mutate them. Use ``cache_clear`` to force fresh queries.

    Args:
        dremio_access: object exposing ``read_sql_to_dataframe(query)``.
        table_context: SQL path prefix in front of ``<experiment_name>.runs``.
        bucket_name: forwarded unchanged to ``LakeExperiment``.
        experiments_dir: forwarded unchanged to ``LakeExperiment``.
        experiment_name: default experiment to query when none is given.
    """

    def __init__(
        self,
        dremio_access,
        table_context,
        bucket_name,
        experiments_dir,
        experiment_name,
    ):
        self.dremio_access = dremio_access
        self.table_context = table_context
        self.bucket_name = bucket_name
        self.experiments_dir = experiments_dir
        self.experiment_name = experiment_name

    @lru_cache(maxsize=MAX_CACHE_SIZE)
    def _find(
        self,
        experiment_name=None,
        experiment_ids=None,
        experiment_id: int = None,
        order_by: str = None,
        limit: int = None,
    ) -> list:
        """Run one query against the runs table and wrap each row.

        Args:
            experiment_name: experiment to query; defaults to ``self.experiment_name``.
            experiment_ids: hashable sequence of ids matched with ``dir0 IN (...)``.
            experiment_id: single id matched with ``dir0 = ...``.
            order_by: column to sort by, always in descending order.
            limit: maximum number of rows to return.

        Returns:
            A list of ``LakeExperiment`` objects, one per returned row.

        NOTE(review): every argument is interpolated directly into the SQL
        text, so values must come from trusted code, not from user input.
        """
        if experiment_name is None:
            experiment_name = self.experiment_name
        query = f"select * from {self.table_context}.{experiment_name}.runs"

        # Collect filters first so that several of them are combined into a
        # single WHERE clause instead of emitting "where ... where ...".
        conditions = []
        if experiment_ids:
            ids = ", ".join(str(i) for i in experiment_ids)
            conditions.append(f"dir0 IN ({ids})")
        # `is not None` so that an id of 0 is still treated as a filter.
        if experiment_id is not None:
            conditions.append(f"dir0 = {experiment_id}")
        if conditions:
            query += " where " + " and ".join(conditions)

        if order_by:
            query += f" order by {order_by} desc"
        # `is not None` so that limit=0 yields no rows rather than all rows.
        if limit is not None:
            query += f" limit {limit}"

        # Use the access object this loader was built with, not a global.
        data = self.dremio_access.read_sql_to_dataframe(query)
        return [
            LakeExperiment(
                self.bucket_name,
                self.experiments_dir,
                experiment_name,
                ex_id,
                data.iloc[i, :].to_dict(),
            )
            for i, ex_id in enumerate(data.dir0.tolist())
        ]

    @lru_cache(maxsize=MAX_CACHE_SIZE)
    def find_by_id(self, experiment_id):
        """Return the experiment whose ``dir0`` equals ``experiment_id``, or None."""
        experiments = self._find(experiment_id=experiment_id)
        return experiments[0] if experiments else None

    @lru_cache(maxsize=MAX_CACHE_SIZE)
    def find_by_ids(self, experiment_ids: Tuple[int, ...]):
        """Return the experiments whose ``dir0`` is in ``experiment_ids``.

        Args:
            experiment_ids: tuple of ids (must be hashable for the cache).

        Returns:
            A list of experiments; empty when ``experiment_ids`` is empty.

        Raises:
            ValueError: if exactly one id is given; use ``find_by_id`` instead.
        """
        if len(experiment_ids) == 1:
            raise ValueError("Use find_by_id for a single experiment")
        # Without this guard an empty tuple adds no filter and returns every run.
        if not experiment_ids:
            return []
        return self._find(experiment_ids=experiment_ids)

    @lru_cache(maxsize=MAX_CACHE_SIZE)
    def find_latest(self, n=5):
        """Return up to ``n`` experiments ordered by ``dir0`` descending."""
        return self._find(order_by="dir0", limit=n)

    @lru_cache(maxsize=MAX_CACHE_SIZE)
    def find_all(self):
        """Return every experiment of the default experiment name."""
        return self._find()

    @lru_cache(maxsize=MAX_CACHE_SIZE)
    def find_by_name(self, experiment_name):
        """Return all runs of ``experiment_name``.

        Returns None (after printing a hint) when the query raises
        ``PermissionError``; other exceptions propagate to the caller.
        """
        result = None
        try:
            result = self._find(experiment_name=experiment_name)
        except PermissionError:
            print("File not found or access not granted; check path information")
        return result

    def insert_docs(self, db, prop_name):
        """Insert one document per experiment into ``db``.

        For every experiment returned by ``find_all`` the JSON string stored
        under ``prop_name`` in its row data is parsed, tagged with the
        experiment's id under the key ``"experiment_id"``, and inserted.
        """
        for ex in self.find_all():
            document = json.loads(ex._data[prop_name])
            document["experiment_id"] = ex.experiment_id
            db.insert(document)

    def find_by_key(self, prop_name, key, value):
        """Find experiments whose ``prop_name`` document has ``key == value``.

        Returns:
            None when nothing matches, a single experiment for exactly one
            match, otherwise a list of experiments.
        """
        db = TinyDB(storage=MemoryStorage)
        self.insert_docs(db, prop_name)
        # Named `field` so the imported `Experiment` class is not shadowed.
        field = Query()
        docs = list(db.search(field[key] == value))
        if not docs:
            return None
        if len(docs) == 1:
            return self.find_by_id(docs[0]["experiment_id"])
        return self.find_by_ids(tuple(d["experiment_id"] for d in docs))

    def find_by_config_key(self, key, value):
        """Shortcut for ``find_by_key`` on the ``"config"`` document."""
        return self.find_by_key("config", key, value)

    def cache_clear(self):
        """Clear all caches of all find functions.

        Useful when you want to see the updates to your database. The caches
        live on the class-level functions, so this clears them for every
        loader instance, not only this one.
        """
        self._find.cache_clear()
        self.find_all.cache_clear()
        self.find_by_id.cache_clear()
        self.find_by_ids.cache_clear()
        self.find_by_name.cache_clear()
        self.find_latest.cache_clear()

In [9]:
# Build the loader used by all following cells from the configuration above.
loader = LakeExperimentLoader(
    dremio_access=dremio_access,
    table_context=table_context,
    bucket_name=bucket_name,
    experiments_dir=experiments_dir,
    experiment_name=experiment_name,
)

In [10]:
ex1 = loader.find_by_id(1)
# find_by_id returns None when no run matches; fail here with a clear message
# rather than with an AttributeError on the next line.
assert ex1 is not None, "experiment 1 not found"
assert len(ex1.metrics) == 2
assert isinstance(ex1.metrics, pd.DataFrame)
assert len(ex1.artifacts.values()) == 2
assert all(isinstance(art, CSVArtifact) for art in ex1.artifacts.values())

In [11]:
# A one-element tuple must be rejected: find_by_ids raises ValueError and
# directs callers to find_by_id. The `else` branch makes the cell fail when
# no exception is raised, instead of passing silently.
try:
    loader.find_by_ids((1,))
except ValueError:
    pass
else:
    raise AssertionError("find_by_ids should raise ValueError for a single id")
ex_ids = (1, 3)
assert len(loader.find_by_ids(ex_ids)) == 2

In [12]:
# find_latest sorts by dir0 in descending order and defaults to n=5,
# so the highest ids are expected first.
assert [int(ex.experiment_id) for ex in loader.find_latest()] == [5, 4, 3, 2, 1]
assert [int(ex.experiment_id) for ex in loader.find_latest(n=2)] == [5, 4]

In [13]:
# find_all applies no filter; the test experiment is expected to hold runs 1..5.
all_runs = loader.find_all
assert len(all_runs()) == 5
assert sorted(int(run.experiment_id) for run in all_runs()) == list(range(1, 6))

In [14]:
# find_all() was already called in the previous cell, so this timing shows a cached call.
%time assert len(loader.find_all()) == 5

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 8.34 µs


In [15]:
# Drop the memoised results of all find methods so the next call runs the query again.
loader.cache_clear()

In [16]:
# Same call as before, timed again after the caches were cleared.
%time assert len(loader.find_all()) == 5

CPU times: user 248 ms, sys: 24 ms, total: 272 ms
Wall time: 1.44 s


In [17]:
# "laketest" has no runs table. find_by_name returns None when the loader
# handles a PermissionError; a DatabaseError from the driver propagates and is
# reported here. (The previous `len(...) is None` check could never be true.)
try:
    assert loader.find_by_name("laketest") is None
except DatabaseError:
    print("Table not found")

ERROR IS: `ODBC error
state: HY000
native error code: 1040
message: [Dremio][Connector] (1040) Dremio failed to execute the query: select * from "chatbot_unpublish_s3".discovery.experiments.test.laketest.runs
[30038]Query execution error. Details:[ 
VALIDATION ERROR: Table 'chatbot_unpublish_s3.discovery.experiments.test.laketest.runs' not found

SQL Query select * from "chatbot_unpublish_s3".discovery.experiments.test.laketest.runs
startLine 1
startColumn 15
endLine 1
endColumn 36

[Error Id: f5f5...[see log]`
Table not found


In [18]:
# Querying by the existing experiment name is expected to return all five runs.
assert len(loader.find_by_name("lake_observer")) == 5

In [19]:
# find_by_config_key matches on each run's parsed "config" JSON document;
# it returns None when no run has the requested value.
assert len(loader.find_by_config_key("recipient", "test")) == 5
assert loader.find_by_config_key("recipient", "hello") is None

In [20]:
# find_by_key does the same lookup on an arbitrary JSON property, here "experiment".
assert len(loader.find_by_key("experiment", "name", "test-lake-obs")) == 5
assert loader.find_by_key("experiment", "mainfile", "extest.py") is None