In [1]:
# default_exp s3_utils

# S3 Utilities

> Provides utility functions for interacting with Amazon S3

In [2]:
# export

import os
import re
import socket
from pathlib import Path

import boto3
from botocore.client import Config
from botocore.errorfactory import ClientError
from botocore.exceptions import ConnectTimeoutError

from sciflow.utils import prepare_env

ModuleNotFoundError: No module named 'sciflow'

In [6]:
# export


def is_valid_bucket(bucket_name):
    """Check a bucket name against the S3 naming rules enforced here.

    The rules checked are: total length between 3 and 63 characters; the
    name is made of period-separated labels; every label is non-empty,
    does not start or end with a hyphen, and contains only ASCII lowercase
    letters, ASCII digits, or hyphens; and the whole name is not parseable
    as an IPv4 address.

    Args:
        bucket_name: Candidate bucket name (a ``str``).

    Returns:
        ``True`` when every rule passes, ``False`` otherwise. The result is
        always a real ``bool``.
    """
    # See https://docs.aws.amazon.com/awscloudtrail/latest/userguide/
    # cloudtrail-s3-bucket-naming-requirements.html
    if len(bucket_name) < 3 or len(bucket_name) > 63:
        return False

    # Spelled out explicitly: str.islower()/str.isdigit() also accept
    # non-ASCII characters (e.g. "é", "²"), which must be rejected here.
    allowed_chars = set("abcdefghijklmnopqrstuvwxyz0123456789-")

    # A bucket name consists of "labels" separated by periods
    for label in bucket_name.split("."):
        if len(label) == 0 or label[0] == "-" or label[-1] == "-":
            # Labels must be of nonzero length,
            # and cannot begin or end with a hyphen
            return False
        if not allowed_chars.issuperset(label):
            # Labels can only contain digits, lowercase letters, or hyphens.
            return False

    try:
        # If a name is a valid IP address, it cannot be a bucket name
        socket.inet_aton(bucket_name)
    except socket.error:
        return True
    # inet_aton succeeded, so the name parses as an IP address.
    return False

In [7]:
# Valid names: period-separated labels of lowercase letters, digits and hyphens.
assert is_valid_bucket("some.bucket.name")
assert is_valid_bucket("somebucketname")
# Characters outside the allowed set are rejected.
assert not is_valid_bucket("path/sep")
assert not is_valid_bucket("snake_case")
assert not is_valid_bucket("UpperCase")
# Length must be between 3 and 63 characters.
assert not is_valid_bucket("ab")
assert not is_valid_bucket("a" * 64)
# Labels may not be empty, nor start or end with a hyphen.
assert not is_valid_bucket("-leading-hyphen")
assert not is_valid_bucket("trailing-hyphen-")
assert not is_valid_bucket("empty..label")
# A name that parses as an IP address is not a bucket name.
assert not is_valid_bucket("192.168.5.4")

In [8]:
# export


def s3_join(*args):
    """Join path components into a single key that uses ``/`` separators.

    The components are combined with ``os.path.join`` and every backslash in
    the result is then turned into a forward slash.

    Args:
        *args: Path components, as accepted by ``os.path.join``.

    Returns:
        The joined path as a ``str`` containing no backslashes.
    """
    joined = os.path.join(*args)
    # Splitting on the backslash and re-joining with "/" swaps every separator.
    return "/".join(joined.split("\\"))

In [9]:
# Backslash separators must come out of s3_join as forward slashes.
# The backslashes are escaped explicitly: "\p" and "\k" are invalid escape
# sequences and trigger a warning on recent Python versions.
assert s3_join("windows\\path\\key") == "windows/path/key"
# Trailing slashes on intermediate components must not produce double slashes.
path_a = s3_join("some/path", "artifacts", "runs.json")
path_b = s3_join("some/path/", "artifacts", "runs.json")
path_c = s3_join("some/path/", "artifacts/", "runs.json")
expected = "some/path/artifacts/runs.json"
assert path_a == path_b == path_c == expected

In [10]:
# export
def objects_exist_in_dir(s3_res, bucket_name, prefix):
    """Report whether at least one object key in the bucket starts with ``prefix``.

    Args:
        s3_res: Object exposing ``Bucket(name)``; the notebook passes a
            ``boto3.resource("s3")``.
        bucket_name: Name of the bucket to inspect.
        prefix: Key prefix to look for.

    Returns:
        ``True`` if at least one matching object is found, else ``False``.
    """
    bucket = s3_res.Bucket(bucket_name)
    # Only one match is needed to answer the question, so cap the collection
    # at a single item instead of listing every key under the prefix.
    first_match = bucket.objects.filter(Prefix=prefix).limit(1)
    return any(True for _ in first_match)

In [11]:
# Fixture values for the checks in the cells below.
# A syntactically valid name; presumably no such bucket exists — TODO confirm.
missing_bucket = "nobuckethere"
# Contains underscores, which is_valid_bucket rejects.
invalid_bucket = "invalid_bucket_name"
# Key prefix used by the object/sub-directory checks; they assert that objects
# exist under it in the test bucket.
experiment_dir = "discovery/experiments/s3_test"

In [12]:
# 5-second connect/read timeouts and max_attempts=0 for retries; presumably
# chosen so the checks below fail fast rather than hang — TODO confirm.
config = Config(connect_timeout=5, read_timeout=5, retries={"max_attempts": 0})
# Shared S3 resource handed to the helper functions in the cells below.
s3_res = boto3.resource("s3", config=config)

In [13]:
# export
def delete_dir(s3_res, bucket_name, prefix):
    """Delete every object in ``bucket_name`` whose key starts with ``prefix``.

    Args:
        s3_res: Object exposing ``Bucket(name)``; the notebook passes a
            ``boto3.resource("s3")``.
        bucket_name: Name of the bucket to delete from.
        prefix: Key prefix selecting the objects to remove.

    Returns:
        None.
    """
    matching_objects = s3_res.Bucket(bucket_name).objects.filter(Prefix=prefix)
    matching_objects.delete()

In [15]:
# prepare_env is imported from sciflow.utils; presumably it populates
# os.environ with the project settings — TODO confirm.
prepare_env()
# SCIFLOW_BUCKET must be set in the environment; if it is missing this line
# raises KeyError and the cells below that use bucket_name cannot run.
bucket_name = os.environ['SCIFLOW_BUCKET']

KeyError: 'SCIFLOW_BUCKET'

In [14]:
# These checks run against the bucket named by SCIFLOW_BUCKET.
# No key in the test bucket is expected to start with "/non".
assert not objects_exist_in_dir(s3_res, bucket_name, "/non")
# The test bucket is expected to hold at least one object under experiment_dir.
assert objects_exist_in_dir(s3_res, bucket_name, experiment_dir)

NameError: name 'bucket_name' is not defined

In [None]:
# export
def bucket_exists(s3_res, bucket_name):
    """Check whether a bucket exists by issuing a HEAD request for it.

    Args:
        s3_res: Object exposing ``meta.client.head_bucket``; the notebook
            passes a ``boto3.resource("s3")``.
        bucket_name: Name of the bucket to probe.

    Returns:
        ``False`` when the service answers 404 for the bucket. ``True`` when
        the HEAD request succeeds, or when it fails with 403 (access denied)
        or 301 (redirect), since both mean the bucket name is taken.

    Raises:
        ValueError: If ``bucket_name`` fails ``is_valid_bucket``.
        ClientError: For any other error code. These were previously
            swallowed and reported as "bucket exists".
    """
    if not is_valid_bucket(bucket_name):
        raise ValueError("Bucket name does not follow AWS bucket naming rules")
    try:
        s3_res.meta.client.head_bucket(Bucket=bucket_name)
    except ClientError as er:
        error_code = str(er.response["Error"]["Code"])
        if error_code == "404":
            return False
        if error_code in ("301", "403"):
            # The bucket exists but lives elsewhere or is not accessible
            # with the current credentials.
            return True
        # Anything else says nothing about existence; do not hide it.
        raise
    return True

In [None]:
%%time
assert bucket_exists(s3_res, bucket_name)
try:
    bucket_exists(s3_res, missing_bucket)
except ConnectTimeoutError:
    pass
try:
    assert bucket_exists(s3_res, invalid_bucket)
except ValueError:
    pass

In [None]:
# export
def list_s3_subdirs(s3_res, bucket_name, prefix):
    """List the distinct "sub-directory" paths found below ``prefix``.

    Every key under ``prefix`` is matched against ``<prefix>/(.*)/``. The
    capture is greedy, so for a key such as ``<prefix>/a/b/file`` the value
    collected is ``"a/b"`` (everything between the prefix and the last
    slash). Keys that sit directly under the prefix contribute nothing.

    Args:
        s3_res: Object exposing ``Bucket(name)``; the notebook passes a
            ``boto3.resource("s3")``.
        bucket_name: Name of the bucket to inspect.
        prefix: Key prefix, without a trailing slash.

    Returns:
        Sorted list of the distinct captured paths (empty if none match).
    """
    bucket = s3_res.Bucket(bucket_name)
    # The prefix is data, not a pattern: escape it so characters such as
    # ".", "+", "(" or "[" are matched literally instead of being read as
    # regex syntax.
    subdir_pattern = re.compile(
        r"{prefix}\/(.*)\/".format(prefix=re.escape(prefix))
    )
    subdirs = set()
    for obj in bucket.objects.filter(Prefix=prefix):
        match_obj = subdir_pattern.match(obj.key)
        if match_obj is not None:
            subdirs.add(match_obj.group(1))
    return sorted(subdirs)

In [None]:
# The test bucket is expected to contain at least one sub-directory under experiment_dir.
assert len(list_s3_subdirs(s3_res, bucket_name, experiment_dir)) > 0

In [None]:
# A prefix with no matching keys must give an empty list.
assert len(list_s3_subdirs(s3_res, bucket_name, "blabla/somekey/nonsense")) == 0

In [None]:
# export

def upload_directory(s3_res, path, bucket_name, prefix):
    """Upload every Python source file found under ``path`` to a bucket.

    The tree is walked recursively. Only files ending in ``.py`` are
    uploaded, and any directory whose path contains ``ipynb_checkpoints`` is
    skipped. Each file is stored under the key ``f"{prefix}{file}"``: the
    prefix is used verbatim (no separator is inserted) and the directory
    structure below ``path`` is not kept, so files with the same name in
    different folders end up on the same key.

    Args:
        s3_res: Either an object with its own ``upload_file`` method (such
            as an S3 client) or a resource-style object exposing
            ``meta.client.upload_file``.
        path: Local directory to walk.
        bucket_name: Destination bucket.
        prefix: String prepended to each file name to form the object key.
    """
    # A boto3 service resource has no upload_file of its own; the method
    # lives on its low-level client. Accept both shapes.
    upload_file = getattr(s3_res, "upload_file", None)
    if upload_file is None:
        upload_file = s3_res.meta.client.upload_file

    for root, _dirs, files in os.walk(path):
        # Ignore IPython checkpoint directories
        if "ipynb_checkpoints" in root:
            continue
        for file in files:
            # Ignore non-python source files. endswith avoids matching a
            # file that is literally named "py".
            if file.endswith(".py"):
                upload_file(os.path.join(root, file), bucket_name, f"{prefix}{file}")

In [None]:
# TODO: add a test for `upload_directory`.

In [None]:
# export

def download_directory(s3_client, bucketname, remote_key, local_dir):
    """Download every object under a key prefix into one local directory.

    ``local_dir`` is created (including parents) if it does not exist. Each
    object is saved as ``<local_dir>/<last path segment of its key>``, so
    the remote directory structure is flattened and objects sharing a file
    name overwrite each other.

    Args:
        s3_client: S3 client providing ``get_paginator`` and
            ``download_file``. It is used for both listing and downloading,
            so its credentials and configuration apply throughout.
        bucketname: Bucket to download from.
        remote_key: Key prefix selecting the objects to fetch.
        local_dir: Destination directory on the local file system.
    """
    Path(local_dir).mkdir(parents=True, exist_ok=True)

    # List through the client that was passed in rather than a fresh default
    # resource, which would ignore the caller's client configuration.
    paginator = s3_client.get_paginator("list_objects_v2")
    for page in paginator.paginate(Bucket=bucketname, Prefix=remote_key):
        # "Contents" is absent from a page when nothing matches.
        for entry in page.get("Contents", []):
            file = entry["Key"]
            file_name = file.split('/')[-1]
            if not file_name:
                # Keys ending in "/" are folder placeholders with no file
                # name to write to.
                continue
            s3_client.download_file(bucketname, file, f'{local_dir}/{file_name}')

In [None]:
# TODO: add a test for `download_directory`.

In [None]:
# export


import io


# Copied from: https://alexwlchan.net/2019/02/working-with-large-s3-objects/
class S3File(io.RawIOBase):
    """Seekable, read-only file-like wrapper around an S3 object.

    Bytes are fetched on demand through ``s3_object.get(Range=...)``, so the
    object never has to be downloaded in full. The wrapped object must
    expose ``content_length`` and ``get(Range=...)`` returning a mapping
    whose ``"Body"`` has a ``read()`` method.
    """

    def __init__(self, s3_object):
        """Wrap ``s3_object`` and start reading at offset 0."""
        self.s3_object = s3_object
        # Current read offset, in bytes from the start of the object.
        self.position = 0

    def __repr__(self):
        return "<%s s3_object=%r>" % (type(self).__name__, self.s3_object)

    @property
    def size(self):
        """Total size of the object in bytes, as reported by ``content_length``."""
        return self.s3_object.content_length

    def tell(self):
        """Return the current read offset."""
        return self.position

    def seek(self, offset, whence=io.SEEK_SET):
        """Move the read offset and return the new position.

        Args:
            offset: Byte offset, interpreted relative to ``whence``.
            whence: ``io.SEEK_SET`` (start), ``io.SEEK_CUR`` (current
                position) or ``io.SEEK_END`` (end of the object).

        Raises:
            ValueError: If ``whence`` is not one of the three constants.
        """
        if whence == io.SEEK_SET:
            self.position = offset
        elif whence == io.SEEK_CUR:
            self.position += offset
        elif whence == io.SEEK_END:
            self.position = self.size + offset
        else:
            raise ValueError(
                f"invalid whence ({whence!r}, should be "
                f"{io.SEEK_SET}, {io.SEEK_CUR}, {io.SEEK_END})"
            )

        return self.position

    def seekable(self):
        return True

    def read(self, size=-1):
        """Read up to ``size`` bytes from the current position.

        Args:
            size: Number of bytes to read. ``None`` or any negative value
                reads to the end of the object.

        Returns:
            The bytes read. ``b""`` is returned, without issuing a request,
            when ``size`` is 0 or the position is at or past the end.
        """
        read_to_end = size is None or size < 0
        remaining = self.size - self.position

        # A ranged GET that starts at or beyond the end of the object, or
        # that asks for zero bytes, is not a valid request; answer locally.
        if remaining <= 0 or size == 0:
            return b""

        if read_to_end or size >= remaining:
            # Read to the end of the file
            range_header = "bytes=%d-" % self.position
            self.seek(offset=0, whence=io.SEEK_END)
        else:
            # HTTP byte ranges are inclusive on both ends.
            last_byte = self.position + size - 1
            range_header = "bytes=%d-%d" % (self.position, last_byte)
            self.seek(offset=size, whence=io.SEEK_CUR)

        return self.s3_object.get(Range=range_header)["Body"].read()

    def readable(self):
        return True