# AIStore Python SDK ETL Tutorial

### Set up constants and initialize the client


In [None]:
from aistore import Client
from aistore.sdk.etl_templates import MD5
import hashlib
from itertools import cycle

# Name of the bucket that holds the demo objects.
BUCKET_NAME = "bucket-demo"
# Names under which the spec-initialized and code-initialized ETLs are registered.
SPEC_ETL_NAME = "etl-spec-demo"
CODE_ETL_NAME = "etl-code-demo"

# Note: AIS-ETLs require Kubernetes.
# NOTE(review): the endpoint is hard-coded; presumably it points at a local
# Kubernetes deployment of AIS — replace it with your own cluster endpoint.
client = Client("http://192.168.49.2:8080")
# Create the demo bucket; exist_ok=True presumably tolerates a bucket that
# already exists, so the cell can be re-run — confirm against the SDK docs.
client.bucket(bck_name=BUCKET_NAME).create(exist_ok=True)

### We can initialize ETLs with either [code](https://aiatscale.org/docs/etl#init-code-request) or [spec](https://aiatscale.org/docs/etl#init-spec-request).

#### Initialize an ETL with code:

In [None]:
# Defining ETL transformation code
def transform(input_bytes):
    """Return the MD5 hex digest of ``input_bytes``, encoded as bytes."""
    digest = hashlib.md5(input_bytes).hexdigest()
    return digest.encode()


# Get a handle for the ETL registered under CODE_ETL_NAME
md5_code_etl = client.etl(etl_name=CODE_ETL_NAME)
# Initialize the ETL with the transform() function defined above
md5_code_etl.init_code(transform=transform)

#### Initialize an ETL with spec:

In [None]:
# Use the provided MD5 template and substitute in the communication type
template = MD5.format(communication_type="hpush")
# Get a handle for the ETL registered under SPEC_ETL_NAME and initialize it
# from the filled-in template
md5_spec_etl = client.etl(etl_name=SPEC_ETL_NAME)
md5_spec_etl.init_spec(template=template)

Refer to more ETL templates [here](https://github.com/NVIDIA/aistore/blob/main/python/aistore/sdk/etl/etl_templates.py).

### List ETLs
Once initialized, we can verify the ETLs are running:

In [None]:
# Ask the cluster for its running ETLs; the ETLs initialized above should be listed
client.cluster().list_running_etls()

### View ETLs

In [None]:
# View the ETL that was initialized from code
md5_code_etl.view()

In [None]:
# View the ETL that was initialized from the spec template
md5_spec_etl.view()

## Get an object with ETL transformation applied

### First, create some objects to transform

In [None]:
import random
import string
import tempfile


def create_and_put_object(
    client: Client,
    bck_name: str,
    obj_name: str,
    provider: str = "ais",
    obj_size: int = 0,
):
    """Upload an object filled with random ASCII letters and return its bytes.

    Args:
        client: AIS client used to reach the cluster.
        bck_name: Name of the bucket that receives the object.
        obj_name: Name to store the object under.
        provider: Bucket provider, "ais" by default.
        obj_size: Number of characters to generate; a falsy value (the
            default 0) picks a random size from 10 to 19.

    Returns:
        The UTF-8 encoded bytes that were uploaded.
    """
    size = obj_size or random.randrange(10, 20)
    letters = random.choices(string.ascii_letters, k=size)
    payload = "".join(letters).encode("utf-8")

    with tempfile.NamedTemporaryFile() as tmp:
        tmp.write(payload)
        # Flush so the bytes are in the file before it is uploaded by path.
        tmp.flush()
        bucket = client.bucket(bck_name, provider=provider)
        bucket.object(obj_name).put_file(tmp.name)

    return payload

In [None]:
# Upload one randomly generated object and keep the uploaded bytes in `content`.
# The ".jpg" name lets the bucket transform further down demonstrate an
# extension rename.
content = create_and_put_object(
    client=client, bck_name=BUCKET_NAME, obj_name="object-demo.jpg"
)

### Get single object with ETL code transformation

In [None]:
# Fetch the object through the code-initialized ETL and read the whole
# transformed result
client.bucket(BUCKET_NAME).object("object-demo.jpg").get(
    etl_name=md5_code_etl.name
).read_all()

### Get single object with ETL spec transformation

In [None]:
# Fetch the same object through the spec-initialized ETL and read the whole
# transformed result
client.bucket(BUCKET_NAME).object("object-demo.jpg").get(
    etl_name=md5_spec_etl.name
).read_all()

## Transform entire bucket with ETL

In [None]:
# Create bucket to store transformed objects
# NOTE(review): this relies on create() returning the bucket object — confirm
# for the SDK version in use.
dest_bucket = client.bucket("transform-destination-bucket").create(exist_ok=True)

# Transform bucket contents (with on-the-fly object renames)
client.bucket(BUCKET_NAME).transform(
    etl_name=md5_spec_etl.name,
    to_bck=dest_bucket,
    # Presumably prepended to each destination object name — confirm in the SDK docs.
    prepend="transformed-",
    # Presumably maps source extensions to destination extensions (jpg -> txt).
    ext={"jpg": "txt"},
)

In [None]:
# Verify rename operations for transformed objects by listing the entries
# of the destination bucket
dest_bucket.list_objects().get_entries()

### Stop ETLs
If an ETL is stopped, any Kubernetes pods created for the ETL are *stopped*, but *not deleted*. Any transforms being run by the stopped ETL are terminated. A stopped ETL can be resumed with the `start()` method:


In [None]:
# Stop both ETLs, then list the running ETLs to confirm they are no longer running
md5_code_etl.stop()
md5_spec_etl.stop()
client.cluster().list_running_etls()

### Restart Stopped ETLs

In [None]:
# Resume both stopped ETLs, then list the running ETLs to confirm they are back
md5_code_etl.start()
md5_spec_etl.start()
client.cluster().list_running_etls()

### Stop & Delete ETLs
Once we are completely finished with the ETLs, we clean up (to free storage) by stopping them with `stop()` and then deleting them with `delete()`.
Deleting an ETL deletes all pods created by Kubernetes for the ETL as well as any specifications for the ETL on Kubernetes. Consequently, deleted ETLs cannot be started again and will need to be re-initialized.

In [None]:
# Stop the ETLs first ...
md5_code_etl.stop()
md5_spec_etl.stop()

# ... then delete them; a deleted ETL must be re-initialized before reuse
md5_code_etl.delete()
md5_spec_etl.delete()

### Starting Deleted ETL Raises Exception

In [None]:
# Expected to raise: this ETL was deleted above and cannot be started again
md5_code_etl.start()

In [None]:
# Expected to raise: this ETL was deleted above and cannot be started again
md5_spec_etl.start()

### Initialize ETL XOR+Checksum with streaming data

In [None]:
# Upload a 256-character object for the streaming ETL demo; `content` keeps
# the original (untransformed) bytes
content = create_and_put_object(
    client=client, bck_name=BUCKET_NAME, obj_name="object-xor-demo.jpg", obj_size=256
)

In [None]:
def transform(reader, writer):
    """XOR a stream of byte chunks with a repeating key and append an MD5 digest.

    Each chunk read from ``reader`` is XOR-ed with the repeating key
    ``b"AISTORE"`` and written to ``writer``. After the last chunk, the MD5 hex
    digest (32 ASCII characters) of all XOR-ed output is written as a trailer.

    The key position carries over from one chunk to the next, so the output
    is the same no matter how the input is split into chunks.

    Args:
        reader: Iterable yielding the input as ``bytes`` chunks.
        writer: Object with a ``write(bytes)`` method that receives the output.
    """
    checksum = hashlib.md5()
    # A single key stream for the whole object: building a new cycle() per
    # chunk would restart the key at every chunk boundary and make the result
    # depend on the chunk size.
    key_stream = cycle(b"AISTORE")
    for chunk in reader:
        # zip() pulls from `chunk` first, so no key byte is consumed (and lost)
        # when the chunk runs out.
        out = bytes(
            data_byte ^ key_byte for data_byte, key_byte in zip(chunk, key_stream)
        )
        writer.write(out)
        checksum.update(out)
    writer.write(checksum.hexdigest().encode())


# Get a handle for an ETL named "xor-md5-stream" and initialize it with the
# streaming transform(reader, writer) defined above.
xor_stream_etl = client.etl("xor-md5-stream")
xor_stream_etl.init_code(
    transform=transform,
    # Presumably the size in bytes of each chunk handed to transform — confirm
    # against the init_code documentation.
    chunk_size=32,
)

### Get object with XOR+Checksum ETL and verify checksum

In [None]:
# Read the whole transformed object: the XOR-ed payload followed by the
# checksum that the streaming transform appends at the end
xor_obj = client.bucket(BUCKET_NAME).object("object-xor-demo.jpg").get(
    etl_name=xor_stream_etl.name
).read_all()

# An MD5 hex digest is 32 characters long, so the last 32 bytes are the checksum
data = xor_obj[:-32]
checksum = xor_obj[-32:]

# Recompute the digest locally and compare it with the one that was received
computed_checksum = hashlib.md5(data).hexdigest().encode()
checksum == computed_checksum

In [None]:
# Stop the streaming ETL, then delete it now that the demo is done
xor_stream_etl.stop()
xor_stream_etl.delete()

### Cleanup buckets

In [None]:
# Delete only the buckets this tutorial created. Looping over
# client.cluster().list_buckets() would destroy every bucket it returns,
# including ones that have nothing to do with this demo, and would look them
# up by name only, ignoring their provider.
for bucket_name in (BUCKET_NAME, "transform-destination-bucket"):
    # missing_ok=True keeps cleanup from failing if a cell that creates one of
    # the buckets was skipped.
    client.bucket(bucket_name).delete(missing_ok=True)