# AIStore Python SDK ETL Tutorial

### Initialize ETLs


In [1]:
from aistore import Client
from aistore.client.etl_templates import MD5
import hashlib
from itertools import cycle

# Bucket that holds the tutorial's source objects.
BUCKET_NAME = "bucket-demo"

# Note: AIS-ETLs require Kubernetes.
# NOTE(review): hard-coded endpoint, presumably that of a local test cluster -
# point this at your own AIS cluster before running.
client = Client("http://192.168.49.2:8080")


# ETL w/ Code
def transform(input_bytes):
    """Return the MD5 hex digest of ``input_bytes`` as encoded bytes.

    Args:
        input_bytes: Bytes-like content to hash.

    Returns:
        The 32-character hexadecimal MD5 digest, encoded to ``bytes``.
    """
    return hashlib.md5(input_bytes).hexdigest().encode()


# Register the Python function defined above as an ETL named "etl-code-demo".
client.etl().init_code(transform=transform, etl_id="etl-code-demo")


# ETL w/ Spec
# Fill in the MD5 template's communication type, then register the resulting
# spec as an ETL named "etl-spec-demo".
template = MD5.format(communication_type="hpush")
client.etl().init_spec(template=template, etl_id="etl-spec-demo")

'etl-spec-demo'

### List ETLs

In [2]:
# Verify ETLs are running
# Both ETLs initialized earlier ("etl-code-demo" and "etl-spec-demo") should be listed.
client.etl().list()

[ETL(id='etl-code-demo', obj_count=0, in_bytes=0, out_bytes=0),
 ETL(id='etl-spec-demo', obj_count=0, in_bytes=0, out_bytes=0)]

### View ETLs

In [3]:
# Show the stored details of the code-based ETL: communication type, timeout,
# the serialized transform code and its dependencies.
client.etl().view(etl_id="etl-code-demo")

ETLDetails(id='etl-code-demo', communication='hpush://', timeout='5m', code=b"\nimport pickle\nimport base64\n\ntransform = pickle.loads(base64.b64decode('gAWVMwIAAAAAAACMF2Nsb3VkcGlja2xlLmNsb3VkcGlja2xllIwNX2J1aWx0aW5fdHlwZZSTlIwKTGFtYmRhVHlwZZSFlFKUKGgCjAhDb2RlVHlwZZSFlFKUKEsBSwBLAEsCSwNLQ0MedACgAaEAfQF8AaACfAChAQEAfAGgA6EAoAShAFMAlE6FlCiMB2hhc2hsaWKUjANtZDWUjAZ1cGRhdGWUjAloZXhkaWdlc3SUjAZlbmNvZGWUdJSMC2lucHV0X2J5dGVzlGgMhpSMIy90bXAvaXB5a2VybmVsXzM0NTc4Ni8xNDEwMzkyNTA4LnB5lIwJdHJhbnNmb3JtlEsNQwYIAQoBDAGUKSl0lFKUfZQojAtfX3BhY2thZ2VfX5ROjAhfX25hbWVfX5SMCF9fbWFpbl9flHVOTk50lFKUjBxjbG91ZHBpY2tsZS5jbG91ZHBpY2tsZV9mYXN0lIwSX2Z1bmN0aW9uX3NldHN0YXRllJOUaB19lH2UKGgaaBSMDF9fcXVhbG5hbWVfX5RoFIwPX19hbm5vdGF0aW9uc19flH2UjA5fX2t3ZGVmYXVsdHNfX5ROjAxfX2RlZmF1bHRzX1+UTowKX19tb2R1bGVfX5RoG4wHX19kb2NfX5ROjAtfX2Nsb3N1cmVfX5ROjBdfY2xvdWRwaWNrbGVfc3VibW9kdWxlc5RdlIwLX19nbG9iYWxzX1+UfZRoC2gAjAlzdWJpbXBvcnSUk5RoC4WUUpRzdYaUhlIwLg=='))\n\n", spec=None, dependencies='Y2xvdWRwaWNrbGU9PTIuMC4w', runtime='python

In [4]:
# Show the stored details of the spec-based ETL; here `spec` holds the pod
# definition and `code` is None.
client.etl().view(etl_id="etl-spec-demo")

ETLDetails(id='etl-spec-demo', communication='hpush://', timeout='5m', code=None, spec=b'\napiVersion: v1\nkind: Pod\nmetadata:\n  name: transformer-md5\n  annotations:\n    # Values it can take ["hpull://","hrev://","hpush://"]\n    communication_type: "hpush://"\n    wait_timeout: 5m\nspec:\n  containers:\n    - name: server\n      image: aistore/transformer_md5:latest\n      imagePullPolicy: IfNotPresent\n      ports:\n        - name: default\n          containerPort: 80\n      command: [\'/code/server.py\', \'--listen\', \'0.0.0.0\', \'--port\', \'80\']\n      readinessProbe:\n        httpGet:\n          path: /health\n          port: default\n', dependencies=None, runtime='python3.8v2', chunk_size=0)

### Get Object w/ ETL Transformation

In [5]:
import random
import string
import tempfile


def create_and_put_object(
    client: Client,
    bck_name: str,
    obj_name: str,
    provider: str = "ais",
    obj_size: int = 0,
):
    """Upload an object made of random ASCII letters and return its bytes.

    Args:
        client: Client used to reach the bucket.
        bck_name: Name of the destination bucket.
        obj_name: Name to store the object under.
        provider: Provider passed through to ``client.bucket``.
        obj_size: Number of letters to generate; a falsy value (the default
            ``0``) picks a random size from ``range(10, 20)``.

    Returns:
        The UTF-8 encoded payload that was written to the object.
    """
    length = obj_size or random.randrange(10, 20)
    letters = random.choices(string.ascii_letters, k=length)
    payload = "".join(letters).encode("utf-8")
    # Stage the payload in a temporary file because `put` is given a file path.
    with tempfile.NamedTemporaryFile() as staged:
        staged.write(payload)
        # Push buffered bytes out so a reader opening the path sees them.
        staged.flush()
        target = client.bucket(bck_name, provider=provider).object(obj_name)
        target.put(staged.name)
    return payload

In [6]:
# Create the source bucket, then upload one randomly generated object into it.
client.bucket(bck_name=BUCKET_NAME).create()
# Despite the ".jpg" name, the body is random ASCII letters (see
# create_and_put_object); `content` keeps the uploaded bytes.
content = create_and_put_object(
    client=client, bck_name=BUCKET_NAME, obj_name="object-demo.jpg"
)

In [7]:
# Get object w/ ETL code transformation
# The cell's last expression displays the bytes returned by the ETL.
demo_object = client.bucket(BUCKET_NAME).object("object-demo.jpg")
demo_object.get(etl_id="etl-code-demo").read_all()

b'8bd7cac0c8eb6ee6eccc23ac0fc1ad60'

In [8]:
# Get object w/ ETL spec transformation
# The cell's last expression displays the bytes returned by the ETL.
demo_object = client.bucket(BUCKET_NAME).object("object-demo.jpg")
demo_object.get(etl_id="etl-spec-demo").read_all()

b'8bd7cac0c8eb6ee6eccc23ac0fc1ad60'

### Transform Bucket w/ ETL Transformation

In [9]:
# Create bucket to store transformed objects
client.bucket("transform-destination-bucket").create()

# Transform bucket contents (w/ on-the-fly object renames)
# `prefix` is prepended to each destination object name and `ext` maps the
# "jpg" extension to "txt" (see the listing in the next cell).
# NOTE(review): the returned string looks like a job ID, which suggests the
# transform runs asynchronously - confirm it has finished before listing.
client.bucket(BUCKET_NAME).transform(
    etl_id="etl-spec-demo",
    to_bck="transform-destination-bucket",
    prefix="transformed-",
    ext={"jpg": "txt"},
)

'kCBPKCheX'

In [10]:
# Verify rename operations for transformed objects
# Expect "object-demo.jpg" to appear as "transformed-object-demo.txt".
client.bucket("transform-destination-bucket").list_objects().get_entries()

[BucketEntry(name='transformed-object-demo.txt', size=32, checksum='3d8a221c47994975', atime='26 Sep 22 22:21 UTC', version='', target_url='', copies=0, flags=64)]

### Stop ETLs

In [11]:
# Stop both demo ETLs: code-based first, then spec-based.
for etl_id in ("etl-code-demo", "etl-spec-demo"):
    client.etl().stop(etl_id=etl_id)

### Restart Stopped ETLs

In [12]:
# Start the previously stopped ETLs again: code-based first, then spec-based.
for etl_id in ("etl-code-demo", "etl-spec-demo"):
    client.etl().start(etl_id=etl_id)

### Stop & Delete ETLs

In [13]:
demo_etl_ids = ("etl-code-demo", "etl-spec-demo")

# Stop both ETLs first ...
for etl_id in demo_etl_ids:
    client.etl().stop(etl_id=etl_id)

# ... then delete both, in the same order.
for etl_id in demo_etl_ids:
    client.etl().delete(etl_id=etl_id)

### Starting Deleted ETL Raises Exception

In [14]:
# Starting a deleted ETL is expected to fail. Catch and print the error so the
# failure is shown without aborting "Restart Kernel -> Run All".
try:
    client.etl().start(etl_id="etl-code-demo")
except Exception as err:  # broad on purpose: show whatever error the SDK raises
    print(f"{type(err).__name__}: {err}")

AISError: STATUS:404, MESSAGE:p[RGCuHzVH]: etl UUID etl-code-demo does not exist

In [15]:
# Starting a deleted ETL is expected to fail. Catch and print the error so the
# failure is shown without aborting "Restart Kernel -> Run All".
try:
    client.etl().start(etl_id="etl-spec-demo")
except Exception as err:  # broad on purpose: show whatever error the SDK raises
    print(f"{type(err).__name__}: {err}")

AISError: STATUS:404, MESSAGE:p[RGCuHzVH]: etl UUID etl-spec-demo does not exist

### Initialize ETL XOR+Checksum with Streaming Data

In [16]:
# Upload a 256-letter object for the streaming demo; this rebinds `content`
# to the new object's bytes.
content = create_and_put_object(
    client=client, bck_name=BUCKET_NAME, obj_name="object-xor-demo.jpg", obj_size=256
)

In [17]:
def transform(reader, writer):
    """Stream-XOR the input with a repeating key and append an MD5 trailer.

    Args:
        reader: Iterable yielding the input as ``bytes`` chunks.
        writer: Sink exposing ``write(bytes)``. It receives every XOR-ed chunk,
            followed by the hex MD5 digest (32 ASCII bytes) of all XOR-ed output.
    """
    checksum = hashlib.md5()
    # Use one key stream for the whole object. Restarting the key on every
    # chunk would make the output depend on where the chunk boundaries fall.
    key_stream = cycle(b"AISTORE")
    for chunk in reader:
        # `chunk` must be zip's first argument: zip stops as soon as the chunk
        # is exhausted, so no key byte is consumed and lost between chunks.
        out = bytes(
            data_byte ^ key_byte for data_byte, key_byte in zip(chunk, key_stream)
        )
        writer.write(out)
        checksum.update(out)
    writer.write(checksum.hexdigest().encode())


# Register the streaming `transform(reader, writer)` defined in this cell
# (it replaces the earlier single-argument `transform`) as "xor-md5-stream".
# NOTE(review): chunk_size=32 is presumably the number of bytes per chunk
# handed to `reader` - confirm against the SDK documentation.
client.etl().init_code(
    transform=transform,
    etl_id="xor-md5-stream",
    chunk_size=32,
)

'xor-md5-stream'

In [18]:
# Get object w/ XOR+Checksum ETL and verify checksum
# The ETL output is the XOR-ed payload followed by an MD5 hex digest trailer.
digest_len = 32  # length of an MD5 hex digest
xor_reader = client.bucket(BUCKET_NAME).object("object-xor-demo.jpg").get(
    etl_id="xor-md5-stream"
)
xor_obj = xor_reader.read_all()
data = xor_obj[:-digest_len]
checksum = xor_obj[-digest_len:]
computed_checksum = hashlib.md5(data).hexdigest().encode()
checksum == computed_checksum

True

In [19]:
# Stop the streaming ETL, then delete it (same stop-then-delete order used for
# the other demo ETLs).
client.etl().stop(etl_id="xor-md5-stream")
client.etl().delete(etl_id="xor-md5-stream")

In [20]:
# Cleanup buckets
# Delete only the two buckets this tutorial created. Iterating over every
# bucket returned by the cluster would also destroy unrelated data.
for bucket_name in (BUCKET_NAME, "transform-destination-bucket"):
    client.bucket(bucket_name).delete()