In [None]:
# | default_exp s3_utils

In [None]:
# | export

import json
import os
import re
import socket
import uuid
from pathlib import Path

import boto3
from botocore.client import Config
from botocore.errorfactory import ClientError
from botocore.exceptions import ConnectTimeoutError
from nbdev.export import get_config

from sciflow.utils import lib_path, prepare_env

In [None]:
# | export


def is_valid_bucket(bucket_name):
    """Return True if `bucket_name` passes the S3 bucket naming checks below.

    See https://docs.aws.amazon.com/awscloudtrail/latest/userguide/
    cloudtrail-s3-bucket-naming-requirements.html

    Checks performed:
    * total length is between 3 and 63 characters;
    * the name is made of period-separated labels, each non-empty and
      neither starting nor ending with a hyphen;
    * labels contain only ASCII lowercase letters, ASCII digits and hyphens;
    * the name is not parseable as an IPv4 address.

    Always returns a real bool (never None).
    """
    if not 3 <= len(bucket_name) <= 63:
        return False

    # str.islower()/str.isdigit() also accept non-ASCII characters such as
    # "ü" or "²", so membership in an explicit ASCII set is used instead.
    allowed_chars = set("abcdefghijklmnopqrstuvwxyz0123456789-")

    # A bucket name consists of "labels" separated by periods
    for label in bucket_name.split("."):
        if not label or label[0] == "-" or label[-1] == "-":
            # Labels must be of nonzero length,
            # and cannot begin or end with a hyphen
            return False
        if not set(label) <= allowed_chars:
            # Labels can only contain digits, lowercase letters, or hyphens.
            return False

    try:
        # If a name is a valid IP address, it cannot be a bucket name
        socket.inet_aton(bucket_name)
    except socket.error:
        return True
    return False

In [None]:
# Period-separated and plain lowercase names are accepted ...
assert is_valid_bucket("some.bucket.name")
assert is_valid_bucket("somebucketname")
# ... while names containing "/" or "_" are rejected.
assert not is_valid_bucket("path/sep")
assert not is_valid_bucket("snake_case")

In [None]:
# | export


def s3_join(*args):
    """Join path components into a key that uses "/" as its separator.

    The components are combined with `os.path.join`, then every backslash
    in the result is replaced by a forward slash.
    """
    joined = os.path.join(*args)
    return joined.replace("\\", "/")

In [None]:
# Backslashes are written as "\\" so the literal contains no invalid escape
# sequences; the runtime value is unchanged.
assert "windows\\path\\key".replace("\\", "/") == "windows/path/key"
# The helper itself must normalise backslash-separated input the same way.
assert s3_join("windows\\path\\key") == "windows/path/key"
# Trailing slashes on intermediate components must not produce "//".
path_a = s3_join("some/path", "artifacts", "runs.json")
path_b = s3_join("some/path/", "artifacts", "runs.json")
path_c = s3_join("some/path/", "artifacts/", "runs.json")
expected = "some/path/artifacts/runs.json"
assert path_a == path_b == path_c == expected

In [None]:
# | export


def objects_exist_in_dir(s3_res, bucket_name, prefix):
    """Return True if at least one object key in the bucket starts with `prefix`.

    Parameters
    ----------
    s3_res : object exposing `Bucket(name)` (a boto3 S3 resource in this notebook)
    bucket_name : name of the bucket to inspect
    prefix : key prefix to look under

    Only the first matching object is requested (`limit(1)`), so the check
    does not enumerate every key under a large prefix.
    """
    bucket = s3_res.Bucket(bucket_name)
    first_match = bucket.objects.filter(Prefix=prefix).limit(1)
    return any(True for _ in first_match)

In [None]:
# Bucket names used by the bucket_exists checks further down.
missing_bucket = "nobuckethere"
invalid_bucket = "invalid_bucket_name"

# The last 12 hex digits of a random UUID give every run its own key prefix.
test_stem = "sciflow_testing_" + uuid.uuid4().hex[-12:]
test_root = s3_join("sciflow", test_stem)
test_dir = s3_join(test_root, "s3_utils")
lib_dir = s3_join(test_dir, "sciflow")
local_dir = f"/tmp/{test_root}/sciflow"
test_dir

'sciflow/sciflow_testing_14087f979d48/s3_utils'

In [None]:
# Short timeouts and no retries are applied to the resource handle.
config = Config(
    connect_timeout=5,
    read_timeout=5,
    retries={"max_attempts": 0},
)
s3_res = boto3.resource("s3", config=config)
# NOTE(review): the client is created without `config`, so it uses botocore's
# default timeouts/retries - confirm this difference is intended.
s3_client = boto3.client("s3")

In [None]:
# | export


def delete_dir(s3_res, bucket_name, prefix):
    """Delete every object in `bucket_name` whose key starts with `prefix`.

    Returns None; the deletion is delegated to the filtered object
    collection's `delete()` method.
    """
    matching_objects = s3_res.Bucket(bucket_name).objects.filter(Prefix=prefix)
    matching_objects.delete()

In [None]:
# NOTE(review): prepare_env() presumably populates os.environ (including
# SCIFLOW_BUCKET) - confirm in sciflow.utils.
prepare_env()
# Raises KeyError if SCIFLOW_BUCKET is not set; every S3 check below uses it.
bucket_name = os.environ["SCIFLOW_BUCKET"]

In [None]:
import pandas as pd

# Read the artifact from the bucket configured via SCIFLOW_BUCKET rather than
# a hard-coded bucket name, so the cell runs in any configured environment.
# NOTE(review): assumes sciflow/dataframe_artifact.csv exists in that bucket.
df = pd.read_csv(f"s3://{bucket_name}/sciflow/dataframe_artifact.csv")

In [None]:
# Nothing has been uploaded yet: neither an arbitrary prefix nor this run's
# unique test prefix should contain any objects.
assert not objects_exist_in_dir(s3_res, bucket_name, "/non")
assert not objects_exist_in_dir(s3_res, bucket_name, test_dir)

In [None]:
# Upload the local index.ipynb under the key <test_dir>/index.json; later
# cells list it and parse it as JSON.
s3_client.upload_file(
    str(Path("index.ipynb").resolve()), bucket_name, s3_join(test_dir, "index.json")
)

In [None]:
# After the upload above, the test prefix must contain at least one object.
assert objects_exist_in_dir(s3_res, bucket_name, test_dir)

In [None]:
# | export


def bucket_exists(s3_res, bucket_name):
    """Return whether `bucket_name` exists, using a HEAD request on the bucket.

    Parameters
    ----------
    s3_res : S3 resource whose `meta.client` provides `head_bucket`
    bucket_name : name of the bucket to probe

    Returns
    -------
    bool
        False when the service answers 404. True when the request succeeds,
        or when it answers 403 (bucket exists but access is denied) or
        301 (bucket exists in another region).

    Raises
    ------
    ValueError
        If `bucket_name` fails `is_valid_bucket`.
    ClientError
        For any other error code. Such a response says nothing about whether
        the bucket exists, so it is surfaced instead of being reported as True.
    """
    if not is_valid_bucket(bucket_name):
        raise ValueError("Bucket name does not follow AWS bucket naming rules")
    try:
        s3_res.meta.client.head_bucket(Bucket=bucket_name)
    except ClientError as er:
        error_code = str(er.response.get("Error", {}).get("Code", ""))
        if error_code == "404":
            return False
        if error_code in ("403", "301"):
            return True
        raise
    return True

In [None]:
%%time
assert bucket_exists(s3_res, bucket_name)
try:
    bucket_exists(s3_res, missing_bucket)
except ConnectTimeoutError:
    pass
try:
    assert bucket_exists(s3_res, invalid_bucket)
except ValueError:
    pass

CPU times: user 20.4 ms, sys: 5.85 ms, total: 26.3 ms
Wall time: 77.6 ms


In [None]:
# | export


def list_s3_subdirs(s3_res, bucket_name, prefix):
    """Return the sorted, distinct directory paths found below `prefix`.

    For every key of the form ``<prefix>/<dirs>/<file>`` the ``<dirs>`` part
    is collected. The match is greedy, so a key ``<prefix>/a/b/f.txt``
    contributes ``"a/b"``. Keys directly under the prefix
    (``<prefix>/f.txt``) contribute nothing.

    `prefix` is matched literally: it is passed through `re.escape`, so
    characters such as ``+``, ``.`` or ``(`` are not read as regex syntax.
    """
    bucket = s3_res.Bucket(bucket_name)
    subdir_pattern = re.compile(
        r"{prefix}\/(.*)\/".format(prefix=re.escape(prefix))
    )
    distinct_subdirs = set()
    for obj in bucket.objects.filter(Prefix=prefix):
        match_obj = subdir_pattern.match(obj.key)
        if match_obj is not None:
            distinct_subdirs.add(match_obj.group(1))
    return sorted(distinct_subdirs)

In [None]:
# index.json was uploaded under <test_root>/s3_utils/, so at least one
# sub-directory must be reported below test_root.
assert len(list_s3_subdirs(s3_res, bucket_name, test_root)) > 0

In [None]:
# A prefix with no objects yields an empty list.
assert len(list_s3_subdirs(s3_res, bucket_name, "blabla/somekey/nonsense")) == 0

In [None]:
# | export


def list_bucket(bucket_name, prefix, s3_res=None):
    """Return the keys of all objects in `bucket_name` that start with `prefix`.

    When `s3_res` is None a new resource is created with
    `boto3.resource("s3")`; otherwise the supplied resource is used.
    """
    if s3_res is None:
        s3_res = boto3.resource("s3")
    matching = s3_res.Bucket(bucket_name).objects.filter(Prefix=prefix)
    return [summary.key for summary in matching]

In [None]:
# At this point in the run only index.json has been uploaded under test_root,
# so exactly one key is expected (this depends on cell order).
listed_keys = list_bucket(bucket_name, test_root, s3_res)
assert len(listed_keys) == 1
assert listed_keys[0].split("/")[-1] == "index.json"

In [None]:
# | export


def put_data(s3_res, bucket_name, key, binary_data):
    """Store `binary_data` as the body of the object `key` in `bucket_name`.

    Returns None.
    """
    target = s3_res.Object(bucket_name, key)
    target.put(Body=binary_data)

In [None]:
# Write a small UTF-8 payload under <test_root>/put-test; the next cell reads it back.
put_data(s3_res, bucket_name, s3_join(test_root, "put-test"), "value".encode("utf-8"))

In [None]:
# Read the object back directly through the resource and check the round trip.
result = (
    s3_res.Object(bucket_name, s3_join(test_root, "put-test"))
    .get()["Body"]
    .read()
    .decode("utf-8")
)
assert "value" == result

In [None]:
# | export


def load_json(s3_res, bucket_name, key):
    """Fetch the object `key` from `bucket_name` and parse its body as JSON.

    Returns whatever `json.load` produces for the object's "Body" stream.
    """
    response = s3_res.Object(bucket_name, key).get()
    body_stream = response["Body"]
    return json.load(body_stream)

In [None]:
# Parse the notebook uploaded earlier (the single key found by list_bucket).
index_nb_json = load_json(s3_res, bucket_name, listed_keys[0])

In [None]:
# The parsed notebook must be a mapping with a top-level "cells" entry.
# isinstance is the idiomatic type check (type(x) == dict is flagged as E721).
assert isinstance(index_nb_json, dict)
assert "cells" in index_nb_json

In [None]:
# | export


def upload_directory(s3_client, path, bucket_name, prefix):
    """Upload every ``*.py`` file below `path` to `bucket_name` under `prefix`.

    Parameters
    ----------
    s3_client : object exposing `upload_file(filename, bucket, key)`
    path : local directory to walk (str or path-like)
    bucket_name : destination bucket
    prefix : key prefix; the file's path relative to `path` is appended to it

    Directories whose path contains ``ipynb_checkpoints`` or ``__pycache__``
    are skipped. One progress line is printed per uploaded file.
    """
    path = os.fspath(path)
    for root, _dirs, files in os.walk(path):
        # Ignore IPython checkpoint and bytecode cache directories
        if "ipynb_checkpoints" in root or "__pycache__" in root:
            continue

        # Use the path relative to the walk root. A plain str.replace would
        # also strip later occurrences of `path` inside `root`.
        rel_dir = os.path.relpath(root, path)
        key_parts = [prefix]
        if rel_dir != os.curdir:
            # S3 keys always use "/" regardless of the local OS separator
            key_parts.append(rel_dir.replace(os.sep, "/"))

        for file in files:
            # Ignore non-python source files; requiring the ".py" suffix also
            # excludes a file that is literally named "py".
            if not file.endswith(".py"):
                continue
            local_file = os.path.join(root, file)
            upload_key = "/".join(key_parts + [file])
            print(f"Uploading file: {local_file} to: {bucket_name}/{upload_key}")
            s3_client.upload_file(local_file, bucket_name, upload_key)

In [None]:
lib_name = get_config().get("lib_name")

# Before the upload, neither of these library files exists under the test prefix.
assert not (
    objects_exist_in_dir(s3_res, bucket_name, f"{test_dir}/{lib_name}/_modidx.py")
)
assert not (
    objects_exist_in_dir(
        s3_res, bucket_name, f"{test_dir}/{lib_name}/experiment/__init__.py"
    )
)

# Upload the library's source directory to lib_dir.
upload_directory(
    s3_client,
    str(Path(lib_path(), get_config().lib_path)),
    bucket_name,
    lib_dir,
)

# NOTE(review): lib_dir hard-codes "sciflow" while the checks above use
# lib_name - these only agree when lib_name == "sciflow".
assert objects_exist_in_dir(s3_res, bucket_name, f"{lib_dir}/_modidx.py")

Uploading file: /home/sagemaker-user/git/sciflow/sciflow/utils.py to: prosandboxpdlras3/sciflow/sciflow_testing_14087f979d48/s3_utils/sciflow/utils.py
Uploading file: /home/sagemaker-user/git/sciflow/sciflow/export_step.py to: prosandboxpdlras3/sciflow/sciflow_testing_14087f979d48/s3_utils/sciflow/export_step.py
Uploading file: /home/sagemaker-user/git/sciflow/sciflow/data_handler.py to: prosandboxpdlras3/sciflow/sciflow_testing_14087f979d48/s3_utils/sciflow/data_handler.py
Uploading file: /home/sagemaker-user/git/sciflow/sciflow/_modidx.py to: prosandboxpdlras3/sciflow/sciflow_testing_14087f979d48/s3_utils/sciflow/_modidx.py
Uploading file: /home/sagemaker-user/git/sciflow/sciflow/init.py to: prosandboxpdlras3/sciflow/sciflow_testing_14087f979d48/s3_utils/sciflow/init.py
Uploading file: /home/sagemaker-user/git/sciflow/sciflow/params.py to: prosandboxpdlras3/sciflow/sciflow_testing_14087f979d48/s3_utils/sciflow/params.py
Uploading file: /home/sagemaker-user/git/sciflow/sciflow/__init_

In [None]:
# | export


def download_directory(s3_client, s3_res, bucket_name, remote_key, local_dir):
    """Download every object under `remote_key` into `local_dir`.

    Parameters
    ----------
    s3_client : object exposing `download_file(bucket, key, filename)`
    s3_res : S3 resource used to list the objects under `remote_key`
    bucket_name : source bucket
    remote_key : key prefix to download
    local_dir : destination directory; created if missing

    The part of each key after `remote_key` becomes the path below
    `local_dir`. Keys ending in "/" (empty "folder" placeholders) are
    skipped, and an object whose key equals `remote_key` is stored under its
    own base name.
    """
    local_root = Path(local_dir)
    local_root.mkdir(parents=True, exist_ok=True)
    all_keys = [
        obj.key for obj in s3_res.Bucket(bucket_name).objects.filter(Prefix=remote_key)
    ]
    for key in all_keys:
        if key.endswith("/"):
            # Placeholder object representing a directory: nothing to download
            continue
        # Strip only the leading prefix. str.replace would also remove later
        # occurrences of `remote_key` inside the key.
        relative = key[len(remote_key):] if key.startswith(remote_key) else key
        relative = relative.lstrip("/")
        if not relative:
            # remote_key names a single object rather than a directory
            relative = key.rsplit("/", 1)[-1]
        local_path = local_root / relative
        local_path.parent.mkdir(parents=True, exist_ok=True)
        s3_client.download_file(bucket_name, key, f"{local_path}")
        assert local_path.exists(), f"Download of {key} did not create {local_path}"

In [None]:
# The local target must not already contain the file we are about to download.
assert not Path(f"{local_dir}/_modidx.py").exists()

download_directory(s3_client, s3_res, bucket_name, lib_dir, local_dir)

# shutil is used by the cleanup cell that follows.
import shutil

assert Path(local_dir, "_modidx.py").exists()

In [None]:
# Clean up: remove the downloaded copy and every object under this run's test_dir.
shutil.rmtree(local_dir)
delete_dir(s3_res, bucket_name, test_dir)

In [None]:
# | export


import io


# Copied from: https://alexwlchan.net/2019/02/working-with-large-s3-objects/
class S3File(io.RawIOBase):
    """Read-only, seekable file-like view over an S3 object.

    `s3_object` must expose `content_length` and a `get(Range=...)` method
    returning a mapping whose "Body" entry has `read()`. Each `read` call
    fetches only the requested byte range via the `Range` argument.
    """

    def __init__(self, s3_object):
        self.s3_object = s3_object
        # Current offset, in bytes, from the start of the object
        self.position = 0

    def __repr__(self):
        return "<%s s3_object=%r>" % (type(self).__name__, self.s3_object)

    @property
    def size(self):
        """Total size of the object in bytes, as reported by `content_length`."""
        return self.s3_object.content_length

    def tell(self):
        """Return the current offset."""
        return self.position

    def seek(self, offset, whence=io.SEEK_SET):
        """Move the current offset and return the new position.

        `whence` must be `io.SEEK_SET`, `io.SEEK_CUR` or `io.SEEK_END`;
        any other value raises ValueError.
        """
        if whence == io.SEEK_SET:
            self.position = offset
        elif whence == io.SEEK_CUR:
            self.position += offset
        elif whence == io.SEEK_END:
            self.position = self.size + offset
        else:
            raise ValueError(
                f"invalid whence ({whence!r}, should be "
                f"{io.SEEK_SET}, {io.SEEK_CUR}, {io.SEEK_END})"
            )

        return self.position

    def seekable(self):
        return True

    def read(self, size=-1):
        """Read up to `size` bytes from the current offset and advance it.

        `size` of None or any negative value reads to the end of the object.
        Returns ``b""`` at or past the end of the object and for ``size == 0``,
        without issuing a request.
        """
        if self.position >= self.size:
            # Nothing left to read; a Range starting at or beyond the object
            # size would not be a valid request.
            return b""

        if size is None or size < 0:
            # Read to the end of the file
            range_header = "bytes=%d-" % self.position
            self.seek(offset=0, whence=io.SEEK_END)
        elif size == 0:
            # "bytes=p-(p-1)" would be an invalid range
            return b""
        else:
            new_position = self.position + size

            # If we're going to read beyond the end of the object, return
            # everything from the current offset to the end.
            if new_position >= self.size:
                return self.read()

            range_header = "bytes=%d-%d" % (self.position, new_position - 1)
            self.seek(offset=size, whence=io.SEEK_CUR)

        return self.s3_object.get(Range=range_header)["Body"].read()

    def readable(self):
        return True