In [None]:
# default_exp experiment.lake_experiment

# `incense` Experiment - adapted for Data Lakes

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
# export
import os
from typing import Dict

import boto3
import pandas as pd
from pyrsistent import freeze, thaw

from sciflow.s3_utils import S3File, s3_join

In [None]:
# export
# Maps a file extension (lowercase, including the leading dot) to the MIME
# content type that is used to select an Artifact class for the file.
file_to_mime_type_map = {
    # Plain-text files are mapped to CSV so they are handled by the CSV artifact class.
    ".txt": "text/csv",
    ".csv": "text/csv",
    ".png": "image/png",
    ".jpg": "image/jpeg",
    ".jpeg": "image/jpeg",
    ".mp4": "video/mp4",
    ".pdf": "application/pdf",
    ".pickle": "application/octet-stream",
}

In [None]:
# export

import os
import pickle
import warnings
from copy import copy
from typing import Set

import pandas as pd
from IPython import display
from IPython.display import HTML


class Artifact:
    """Base class for an experiment artifact that can be rendered or saved.

    Subclasses list the content types they handle in ``can_render`` and
    implement ``_render``.

    Args:
        name: Display name of the artifact.
        file: Readable file-like object holding the artifact. ``save`` also
            needs it to expose a ``name`` attribute of the form
            ``.../<experiment id>/<artifact name>``.
        content_type: Optional MIME type such as ``"image/png"``.
    """

    # Content types this class can render; the base class renders none.
    can_render: Set[str] = set()

    def __init__(self, name: str, file, content_type: str = None):
        self.name = name
        self.file = file
        self.content_type = content_type
        # The extension is the MIME subtype, e.g. "image/png" -> "png".
        self.extension = (
            "" if self.content_type is None else self.content_type.split("/")[-1]
        )
        self._content = None  # cache filled by `content`
        self._rendered = None  # cache filled by `render`

    def __repr__(self):
        return f"{self.__class__.__name__}(name={self.name})"

    def render(self):
        """Render the artifact according to its content-type.

        The result of ``_render`` is cached, so repeated calls return the
        same object.
        """
        if self._rendered is None:
            self._rendered = self._render()
        return self._rendered

    def _render(self):
        """Return the object that represents the rendered artifact.

        Raises:
            NotImplementedError: Always, in the base class; subclasses override this.
        """
        raise NotImplementedError

    def show(self):
        """Deprecated alias of ``render``; emits a ``DeprecationWarning``."""
        warnings.warn(
            "`show` is deprecated in favor of `render` and will removed in a future release.",
            DeprecationWarning,
            stacklevel=2,
        )
        return self.render()

    def save(self, to_dir: str = "") -> None:
        """
        Save artifact to disk.

        Args:
            to_dir: Directory in which to save the artifact. Defaults to the current working directory.
                It is created if it does not exist.

        """
        if to_dir:
            os.makedirs(str(to_dir), exist_ok=True)
        with open(os.path.join(str(to_dir), self._make_filename()), "wb") as file:
            file.write(self.content)

    def as_content_type(self, content_type: str) -> "Artifact":
        """Interpret artifact as being of content-type.

        Raises:
            ValueError: If no artifact class is registered for ``content_type``.
        """
        try:
            artifact_type = content_type_to_artifact_cls[content_type]
        except KeyError as err:
            raise ValueError(
                f"Incense does not have a class that maps to content-type {content_type}"
            ) from err
        return self.as_type(artifact_type)

    def as_type(self, artifact_type) -> "Artifact":
        """Return a new ``artifact_type`` instance over the same, rewound file."""
        self.file.seek(0)
        return artifact_type(self.name, self.file)

    @property
    def content(self):
        """Access the raw bytes of the artifact.

        The bytes are read once and cached.
        """
        if self._content is None:
            # The handle may already have been consumed (e.g. by a renderer that
            # reads `self.file` directly); rewind so the whole artifact is read
            # instead of caching an empty or truncated payload.
            try:
                self.file.seek(0)
            except (AttributeError, OSError):
                # Non-seekable stream: fall back to reading from the current position.
                pass
            self._content = self.file.read()
        return self._content

    def _make_filename(self):
        """Build ``<experiment id>_<artifact name>`` from the last two path parts of the file name.

        The extension derived from the content type is appended unless the
        artifact name already ends with it.
        """
        # TODO does this work on gridfs file?
        exp_id, artifact_name = self.file.name.split("/")[-2:]
        return f"{exp_id}_{artifact_name}" + (
            "" if artifact_name.endswith(self.extension) else f".{self.extension}"
        )


class ImageArtifact(Artifact):
    """Artifact holding PNG or JPEG image data."""

    can_render = {"image/png", "image/jpeg"}

    def _render(self):
        """Wrap the artifact's raw bytes in an IPython ``Image`` display object."""
        raw_bytes = self.content
        return display.Image(data=raw_bytes)


class MP4Artifact(Artifact):
    """Artifact holding an MP4 video."""

    can_render = {"video/mp4"}

    def _render(self):
        """Save the video to the current directory and return an HTML player for it."""
        # The <video> tag points at a file path, so the artifact is written to disk first.
        self.save()
        video_src = self._make_filename()
        markup = f"""
        <video width="640" height="480" controls autoplay>
          <source src="{video_src}" type="video/mp4">
        </video>
        """
        return HTML(markup)


class CSVArtifact(Artifact):
    """Displays and saves a CSV artifact."""

    can_render = {"text/csv"}

    def _render(self):
        """Parse the artifact with ``pandas.read_csv`` and return the result.

        Parsing goes through the cached ``content`` rather than the shared file
        handle. Reading the handle directly would leave it exhausted, so a
        later ``content``/``save`` would see no data, and rendering after
        ``content`` had been accessed would parse an empty stream.
        """
        from io import BytesIO, StringIO

        data = self.content
        # Support both binary and text-mode underlying files.
        buffer = StringIO(data) if isinstance(data, str) else BytesIO(data)
        return pd.read_csv(buffer)


class PickleArtifact(Artifact):
    """Displays and saves a Pickle artifact."""

    # No content type is registered for pickles; this class has to be selected
    # explicitly (e.g. through ``as_type``).
    can_render: Set[str] = set()

    def __init__(self, name: str, file, content_type: str = None):
        super().__init__(name, file, content_type)
        # Always use the "pickle" extension, whatever the content type says.
        self.extension = "pickle"

    def _render(self):
        """Unpickle the artifact and return the resulting object.

        The object is loaded from the cached ``content`` instead of the shared
        file handle, so rendering and ``content``/``save`` work in either order.
        """
        # SECURITY: unpickling can execute arbitrary code. Only render pickle
        # artifacts that come from a trusted source.
        return pickle.loads(self.content)


class PDFArtifact(Artifact):
    """Represents a PDF artifact.

    No ``_render`` is defined here, so rendering falls back to the base class
    implementation; saving is inherited unchanged.
    """

    can_render = {"application/pdf"}

    # TODO probably needs jupyter extension to be able to display pdf.
    # def _render(self):
    #     return IFrame(self._make_filename(), width=600, height=300)
    #


# Registry from MIME content type to the Artifact subclass that renders it,
# built by scanning the current namespace for Artifact subclasses.
content_type_to_artifact_cls = {}
# Iterate over a snapshot of the values: the loop variables are themselves
# added to the namespace while the loop runs.
for cls in list(locals().values()):
    if not isinstance(cls, type):
        continue
    if not issubclass(cls, Artifact):
        continue
    for content_type in cls.can_render:
        content_type_to_artifact_cls[content_type] = cls

In [None]:
# export
class LakeExperiment:
    """View of a single experiment whose artifacts and metrics live in S3.

    Constructing an instance lists the experiment's artifact objects in the
    bucket and reads its ``metrics.csv``. Attributes that are not found on the
    instance are looked up on the frozen ``data`` mapping.

    Args:
        bucket_name: Name of the S3 bucket.
        experiments_dir: Key prefix under which experiments are stored.
        name: Experiment (project) name.
        experiment_id: Identifier of the run; used as a path component.
        start_time: Start time of the run, stored as given.
        data: Experiment data; frozen with ``pyrsistent.freeze``.
    """

    def __init__(
        self, bucket_name, experiments_dir, name, experiment_id, start_time, data
    ):
        self.bucket_name = bucket_name
        self.experiments_dir = experiments_dir
        self.name = name
        self.experiment_id = experiment_id
        self.project_name = s3_join(experiments_dir, name)
        self.project_dir = s3_join(self.bucket_name, self.project_name)
        self.metrics_dir = s3_join(self.project_dir, "metrics", str(experiment_id))
        self.artifacts_dir = s3_join(self.project_dir, "artifacts", str(experiment_id))
        self.start_time = start_time
        self._data = freeze(data)
        self.s3 = boto3.client("s3")
        self.artifacts = self._load_artifacts()
        self.metrics = self._load_metrics()

    def __repr__(self):
        return f"{self.__class__.__name__}(id={self.experiment_id}, name={self.name})"

    def __getattr__(self, item):
        """Relay unknown attribute access to the frozen data, to allow dotted access.

        Raises:
            AttributeError: If ``_data`` itself is not set yet, or the data has
                no such attribute.
        """
        # `__getattr__` only runs when normal lookup fails. If `_data` is the
        # missing attribute (instance created without `__init__`, e.g. during
        # copy/unpickle), relaying to `self._data` would recurse forever.
        if item == "_data":
            raise AttributeError(
                f"{type(self).__name__!r} object has no attribute {item!r}"
            )
        return getattr(self._data, item)

    def to_dict(self) -> dict:
        """Return the experiment data as plain (thawed) Python containers."""
        return thaw(self._data)

    def _load_artifacts(self) -> Dict[str, "Artifact"]:
        """List the experiment's artifact objects and wrap each in an Artifact.

        Returns:
            Mapping from the artifact's base file name to its Artifact instance.
            Files whose extension has no registered artifact class are wrapped
            in the plain ``Artifact`` without a content type.
        """
        s3 = boto3.resource("s3")
        bucket = s3.Bucket(self.bucket_name)

        # The trailing slash matters: S3 prefixes are plain string prefixes, so
        # ".../artifacts/1" would also match ".../artifacts/10/..." and so on.
        artifacts_key_prefix = (
            s3_join(self.project_name, "artifacts", str(self.experiment_id)).rstrip("/")
            + "/"
        )

        artifacts = {}
        for obj in bucket.objects.filter(Prefix=artifacts_key_prefix):
            artifact_key = obj.key
            name = os.path.basename(artifact_key)
            if not name:
                # Key ends with "/" (folder placeholder object): not an artifact.
                continue

            s3_object = s3.Object(bucket_name=self.bucket_name, key=artifact_key)
            artifact_file = S3File(s3_object)

            # Resolve the class with lookups only, so that a KeyError raised
            # while constructing an artifact is not silently swallowed.
            extension = os.path.splitext(name)[-1].lower()
            content_type = file_to_mime_type_map.get(extension)
            artifact_type = content_type_to_artifact_cls.get(content_type)
            if artifact_type is None:
                artifacts[name] = Artifact(name, artifact_file)
            else:
                artifacts[name] = artifact_type(
                    name, artifact_file, content_type=content_type
                )

        return artifacts

    def _load_metrics(self) -> pd.DataFrame:
        """Read ``metrics.csv`` from the experiment's metrics directory in S3."""
        metrics_path = f"s3://{self.metrics_dir}/metrics.csv"
        return pd.read_csv(metrics_path)

    def delete(self, confirmed: bool = False):
        """Not implemented."""
        raise NotImplementedError

    def _delete(self):
        raise NotImplementedError

    def _delete_metrics(self):
        raise NotImplementedError

    def _delete_artifacts(self):
        raise NotImplementedError