In [None]:
# AIStore Python SDK ETL Tutorial

### Initialize ETLs


In [None]:
from aistore import Client
from aistore.client.etl_templates import MD5
import hashlib
from itertools import cycle

BUCKET_NAME = "bucket-demo"

# Endpoint the client connects to; adjust it to match your own deployment.
# Note: AIS-ETLs require Kubernetes.
AIS_ENDPOINT = "http://192.168.49.2:8080"
client = Client(AIS_ENDPOINT)


# ETL w/ Code
def transform(input_bytes):
    """Return the MD5 hex digest of ``input_bytes``, encoded as bytes."""
    return hashlib.md5(input_bytes).hexdigest().encode()


# Both ETLs are registered through the same ETL API handle.
etl_api = client.etl()

# ETL w/ Code: registers the `transform` function defined above
etl_api.init_code(etl_name="etl-code-demo", transform=transform)

# ETL w/ Spec: registers the MD5 template, formatted with the "hpush" communication type
template = MD5.format(communication_type="hpush")
etl_api.init_spec(etl_name="etl-spec-demo", template=template)

### List ETLs

In [None]:
# Verify ETLs are running: fetch the ETL list and show it as the cell output
running_etls = client.etl().list()
running_etls

### View ETLs

In [None]:
# Show the details reported for the code-based ETL
code_etl_details = client.etl().view(etl_name="etl-code-demo")
code_etl_details

In [None]:
# Show the details reported for the spec-based ETL
spec_etl_details = client.etl().view(etl_name="etl-spec-demo")
spec_etl_details

### Get Object w/ ETL Transformation

In [None]:
import random
import string
import tempfile


def create_and_put_object(
    client: Client,
    bck_name: str,
    obj_name: str,
    provider: str = "ais",
    obj_size: int = 0,
):
    """
    Upload an object filled with random ASCII letters and return its payload.

    Args:
        client: Client used to reach the bucket.
        bck_name: Name of the destination bucket.
        obj_name: Name given to the uploaded object.
        provider: Bucket provider forwarded to ``client.bucket`` (default "ais").
        obj_size: Number of characters to generate; a falsy value (e.g. 0)
            selects a random size in the range [10, 20).

    Returns:
        The UTF-8 encoded bytes that were written to the temp file and uploaded.
    """
    if not obj_size:
        obj_size = random.randrange(10, 20)
    letters = random.choices(string.ascii_letters, k=obj_size)
    content = "".join(letters).encode("utf-8")

    # put() is handed a file path, so the payload is staged in a named temp
    # file, which is removed automatically when the ``with`` block exits.
    with tempfile.NamedTemporaryFile() as staged:
        staged.write(content)
        staged.flush()  # push buffered bytes to the file before its path is used
        target = client.bucket(bck_name, provider=provider).object(obj_name)
        target.put(staged.name)
    return content

In [None]:
# Create the demo bucket, then upload one randomly generated object into it
demo_bucket = client.bucket(bck_name=BUCKET_NAME)
demo_bucket.create()
content = create_and_put_object(client, BUCKET_NAME, "object-demo.jpg")

In [None]:
# Get object w/ ETL code transformation
demo_object = client.bucket(BUCKET_NAME).object("object-demo.jpg")
demo_object.get(etl_name="etl-code-demo").read_all()

In [None]:
# Get object w/ ETL spec transformation
demo_object = client.bucket(BUCKET_NAME).object("object-demo.jpg")
demo_object.get(etl_name="etl-spec-demo").read_all()

### Transform Bucket w/ ETL Transformation

In [None]:
# Create bucket to store transformed objects
destination_bucket = "transform-destination-bucket"
client.bucket(destination_bucket).create()

# Transform bucket contents (w/ on-the-fly object renames).
# `prepend` and `ext` drive the renames; see the SDK docs for their exact semantics.
source_bucket = client.bucket(BUCKET_NAME)
source_bucket.transform(
    etl_name="etl-spec-demo",
    to_bck=destination_bucket,
    prepend="transformed-",
    ext={"jpg": "txt"},
)

In [None]:
# Verify rename operations for transformed objects
transformed_listing = client.bucket("transform-destination-bucket").list_objects()
transformed_listing.get_entries()

### Stop ETLs

In [None]:
# Stop both demo ETLs through one ETL API handle
etl_api = client.etl()
etl_api.stop(etl_name="etl-code-demo")
etl_api.stop(etl_name="etl-spec-demo")

### Restart Stopped ETLs

In [None]:
# Start the two previously stopped ETLs again
etl_api = client.etl()
etl_api.start(etl_name="etl-code-demo")
etl_api.start(etl_name="etl-spec-demo")

### Stop & Delete ETLs

In [None]:
etl_api = client.etl()

# Stop both ETLs first ...
etl_api.stop(etl_name="etl-code-demo")
etl_api.stop(etl_name="etl-spec-demo")

# ... then delete them
etl_api.delete(etl_name="etl-code-demo")
etl_api.delete(etl_name="etl-spec-demo")

### Starting Deleted ETL Raises Exception

In [None]:
# Starting a deleted ETL is expected to fail. The error is caught and shown so
# that "Restart Kernel -> Run All" can continue past this demonstration cell.
try:
    client.etl().start(etl_name="etl-code-demo")
except Exception as err:  # the SDK's concrete error class is not imported in this notebook
    print(f"Starting deleted ETL failed as expected: {type(err).__name__}: {err}")
else:
    print("Unexpected: deleted ETL 'etl-code-demo' was started")

In [None]:
# Starting a deleted ETL is expected to fail. The error is caught and shown so
# that "Restart Kernel -> Run All" can continue past this demonstration cell.
try:
    client.etl().start(etl_name="etl-spec-demo")
except Exception as err:  # the SDK's concrete error class is not imported in this notebook
    print(f"Starting deleted ETL failed as expected: {type(err).__name__}: {err}")
else:
    print("Unexpected: deleted ETL 'etl-spec-demo' was started")

### Initialize ETL XOR+Checksum with streaming data

In [None]:
# Upload a 256-character object to run through the streaming XOR+checksum ETL
content = create_and_put_object(
    client, BUCKET_NAME, "object-xor-demo.jpg", obj_size=256
)

In [None]:
def transform(reader, writer):
    """
    Streaming transform: XOR the input with a repeating key, then append an MD5 digest.

    Args:
        reader: Iterable yielding the input as ``bytes`` chunks.
        writer: Sink exposing ``write(bytes)`` that receives the output.

    The XOR-ed chunks are written in order, followed by the hex MD5 digest
    (32 characters, encoded as bytes) of all XOR-ed data.
    """
    checksum = hashlib.md5()
    # One key stream is shared by all chunks, so the key position carries over
    # chunk boundaries and the output does not depend on how the input is chunked.
    key_stream = cycle(b"AISTORE")
    for chunk in reader:
        # ``chunk`` is zip()'s first argument: zip stops as soon as the chunk is
        # exhausted, so no key byte is consumed and lost between chunks.
        out = bytes(
            data_byte ^ key_byte for data_byte, key_byte in zip(chunk, key_stream)
        )
        writer.write(out)
        checksum.update(out)
    writer.write(checksum.hexdigest().encode())


# Register the streaming transform defined above. chunk_size=32 is forwarded to
# init_code (presumably the size of the chunks handed to `reader`; confirm in the SDK docs).
client.etl().init_code(etl_name="xor-md5-stream", transform=transform, chunk_size=32)

In [None]:
# Get object w/ XOR+Checksum ETL and verify checksum.
# The transform writes the MD5 hex digest (32 characters) after the XOR-ed payload,
# so the last 32 bytes are split off and compared with a locally computed digest.
xor_source = client.bucket(BUCKET_NAME).object("object-xor-demo.jpg")
xor_obj = xor_source.get(etl_name="xor-md5-stream").read_all()

MD5_HEX_LEN = 32
data = xor_obj[:-MD5_HEX_LEN]
checksum = xor_obj[-MD5_HEX_LEN:]
computed_checksum = hashlib.md5(data).hexdigest().encode()
computed_checksum == checksum

In [None]:
# Stop the streaming ETL, then delete it
etl_api = client.etl()
etl_api.stop(etl_name="xor-md5-stream")
etl_api.delete(etl_name="xor-md5-stream")

In [None]:
# Cleanup buckets: delete only the buckets this tutorial created. Looping over
# client.cluster().list_buckets() would also delete unrelated buckets in the cluster.
for bucket_name in (BUCKET_NAME, "transform-destination-bucket"):
    client.bucket(bucket_name).delete()