# kafka-streams-flows-as-source

The cells needed to run your application are included below. Make any changes and add your sources, analytics and outputs.

### Documentation
   - [Streams Python development guide](https://ibmstreams.github.io/streamsx.documentation/docs/latest/python/)
   - [Streams Python API](https://streamsxtopology.readthedocs.io/)

## Install Python packages
Installs the required Python packages with pip.

In [1]:
!pip install --user streamsx.kafka>=1.9.0
!pip install --user streamsx==1.14.13



## Setup 

Sets up the Streams instance name and extracts the resources required for the Streams application to a local directory.

In order to submit a Streams application you need to provide the name of the Streams instance.
To change the instance for the Streams application:
1. From the navigation menu, click **My instances**.
2. Click the **Provisioned Instances** tab.
3. In the setup cell below, set `streams_instance_name` to the name of your Streams instance.


In [2]:
from project_lib import Project
import os, shutil, tarfile
from icpd_core import icpd_util    

def setup(archive, resource_path):
    """Extract a project archive into a clean directory and change into it.

    Args:
        archive: Name of the gzip-compressed tar file, as passed to
            ``Project.get_file``.
        resource_path: Directory that receives the extracted files. An
            existing directory at this path is deleted first.
    """
    def extract_project_file(file, path):
        project = Project.access()
        # Start from an empty directory so files of an earlier run cannot linger.
        if os.path.exists(path):
            shutil.rmtree(path)
        os.makedirs(path)
        buffio = project.get_file(file, direct_storage=True)
        # The context manager closes the tar handle even if extraction fails.
        with tarfile.open(fileobj=buffio, mode="r:gz") as tar:
            tar.extractall(path)
    extract_project_file(archive, resource_path)
    # Later cells open 'connections/...' and 'lib' relative to this directory.
    os.chdir(resource_path)

In [3]:
# Name of the Streams instance the job is submitted to; change it to match your instance.
streams_instance_name = "streams"
# Details of that instance; submit_app() passes this dict as the submission config.
cfg = icpd_util.get_service_instance_details(streams_instance_name)

# Unpack the flow resources and make the extracted directory the working directory.
resource_path = "streams_flows_notebooks/kafka_streams_flows_as_source_1597265335685"
setup(
    archive="streams_flows_notebooks/kafka_streams_flows_as_source_1597265335685.tar.gz",
    resource_path=resource_path,
)

## Create the flow

In [4]:
%%writefile flow_schemas

from typing import NamedTuple


class KafkaSchema(NamedTuple):
    """Tuple layout requested from the Kafka consumer.

    The field names match the ``*_attribute_name`` arguments that the
    notebook passes to ``KafkaConsumer``.
    """
    event_key: str = ""
    event_topic: str = ""
    # The integer fields default to an int so the default agrees with the annotation.
    event_offset: int = 0
    event_partition: int = 0
    event_timestamp: int = 0
    event_message: str = ""


class SchemaMapper1Schema(NamedTuple):
    """Tuple layout emitted by the 'SchemaMapper1' map stage.

    Passed as the ``schema`` of that stage when the flow is built; every
    field has a default, so the tuple can be constructed without arguments.
    """
    key: str = ""
    topic: str = ""
    # offset and time are carried as strings and partition as a float; the
    # notebook's _map_schema_for_kafka performs these conversions.
    offset: str = ""
    partition: float = 0.0
    time: str = ""
    message: str = ""


Writing flow_schemas


In [5]:

from streamsx.topology.topology import Topology
import flow_schemas

from lib.error_utils import TupleError
import lib.file_utils as file_utils
import os
import streamsx.kafka as kafka
import typing


# ================================================================================
# MAIN

def build_flow():
    """Assemble the ``kafka_streams_flows_as_source`` topology.

    The namespace is taken from the ``USER`` environment variable and falls
    back to ``'flow'`` when it is not set.

    Returns:
        The topology with the "Kafka" and "Debug" nodes added and the views
        attached by ``add_views``.
    """
    namespace = os.environ.get('USER', 'flow')
    topo = Topology(name='kafka_streams_flows_as_source', namespace=namespace)
    topo.name_to_runtime_id = name_mapping().get

    # Register the Kafka package requirement on the topology.
    topo.add_pip_package('streamsx.kafka>=1.9.0')

    # Node "Kafka" feeds node "Debug".
    add_debug(add_kafka(topo))

    add_views(topo)
    return topo


# ================================================================================
# Function for top-level operator: Kafka
def add_kafka(topo):
    """Add the "Kafka" source and its schema-mapping stages to ``topo``.

    Connection settings (brokers, security protocol, SASL mechanism, user name
    and API key) are read from a JSON file under ``connections/``.

    Args:
        topo: Topology that receives the source.

    Returns:
        The stream produced by the 'CompositeOutput1' filter.
    """
    connection_file = os.path.abspath("connections/kafka_4560768a-c25f-49e7-9333-23726b8ae71e.json")
    connection = file_utils.read_from_json(connection_file)

    consumer_config = {
        'bootstrap.servers': connection['brokers'],
        'security.protocol': connection['security_protocol'],
        'sasl.mechanism': connection['sasl_mechanism'],
        'sasl.jaas.config': f'org.apache.kafka.common.security.plain.PlainLoginModule required username="{connection["username"]}" password="{connection["api_key"]}";',
        'auto.offset.reset': 'latest'
    }
    consumer = kafka.KafkaConsumer(
        config=consumer_config,
        topic="clicks",
        message_attribute_name='event_message',
        key_attribute_name='event_key',
        topic_attribute_name='event_topic',
        offset_attribute_name='event_offset',
        partition_attribute_name='event_partition',
        timestamp_attribute_name='event_timestamp',
        schema=flow_schemas.KafkaSchema)

    raw_events = topo.source(consumer, name='Kafka')
    mapped_events = raw_events.map(
        _map_schema_for_kafka,
        name='SchemaMapper1',
        schema=flow_schemas.SchemaMapper1Schema)

    # The predicate accepts every tuple. The stage exists for its name:
    # name_mapping() maps 'CompositeOutput1' to an id ending in
    # '__Composite_Output_Id', which add_views() uses to attach a view.
    return mapped_events.filter(
        lambda event: True,
        name='CompositeOutput1')


# ================================================================================
# Function for top-level operator: Debug
def add_debug(stream):
    """Terminate ``stream`` in the "Debug" node.

    Args:
        stream: Stream whose tuples are handed to ``debug``.

    Returns:
        Whatever ``stream.for_each`` returns for the new 'Debug' node.
    """
    debug_sink = stream.for_each(debug, name='Debug')
    return debug_sink


# ================================================================================
# Operator-specific global code, such as filter classes:

def _map_schema_for_kafka(event):
    """Convert a Kafka source tuple into a ``SchemaMapper1Schema`` tuple.

    Offset and timestamp are converted to strings and the partition to a
    float; key, topic and message are copied unchanged.

    Args:
        event: Tuple exposing the ``event_*`` attributes of ``KafkaSchema``.

    Returns:
        The mapped tuple, or ``None`` when reading or converting a field raises.
    """
    try:
        key = event.event_key
        topic = event.event_topic
        offset = str(event.event_offset)
        partition = float(event.event_partition)
        time = str(event.event_timestamp)
        message = event.event_message
        return flow_schemas.SchemaMapper1Schema(
            key=key,
            topic=topic,
            offset=offset,
            partition=partition,
            time=time,
            message=message,
        )
    except Exception as err:
        # NOTE(review): the TupleError instance is created but neither raised
        # nor returned; presumably its constructor records the error --
        # confirm in lib.error_utils.
        TupleError(operation_id='Kafka', message=str(err))
        return None


def debug(event):
    """Callback registered with ``for_each`` for the 'Debug' node; a no-op by default.

    Args:
        event: The tuple handed to the callback.
    """
    # Add debugging/logging code here.
    return None


# ================================================================================
# Utils:

def add_views(topo):
    """Attach a view to every stream whose runtime id marks a composite output.

    A stream qualifies when ``name_mapping()`` has an entry for its name and
    that entry ends in ``'__Composite_Output_Id'``; the view is named after
    the runtime id with ``'__output'`` appended.

    Args:
        topo: Topology whose ``streams`` mapping is scanned.
    """
    runtime_ids = name_mapping()
    composite_suffix = '__Composite_Output_Id'
    for stream_name, stream in topo.streams.items():
        runtime_id = runtime_ids.get(stream_name)
        if not runtime_id:
            # Unmapped stream (or empty id): nothing to expose.
            continue
        if runtime_id.endswith(composite_suffix):
            stream.view(name=runtime_id + "__output")


def name_mapping():
    """Return a new dict that maps operator names to their runtime ids.

    Only 'CompositeOutput1' maps to an id that differs from its name.
    """
    names_and_ids = (
        ('Kafka', 'Kafka'),
        ('SchemaMapper1', 'SchemaMapper1'),
        ('CompositeOutput1', 'Kafka__Composite_Output_Id'),
        ('Debug', 'Debug'),
    )
    return dict(names_and_ids)


## Submit the application

In [6]:
import streamsx
import datetime
from streamsx.topology.context import ContextTypes, JobConfig
from streamsx.topology import context

def submit_app():
    """Build the flow and submit it to the Streams instance as a distributed job.

    Uses the notebook-level ``cfg`` dict as the submission configuration and
    modifies it in place. Copies the local ``lib`` directory to
    ``python/modules/lib`` and adds ``python`` to the application as a file
    dependency. Prints the id and name of the submitted job.
    """
    # NOTE(review): certificate verification is switched off for the
    # submission; confirm this is acceptable for the target cluster.
    cfg[context.ConfigParams.SSL_VERIFY] = False
    app = build_flow()

    # Spelled-out form of '%F_%T', which relies on platform-specific strftime codes.
    dt = datetime.datetime.now().strftime('%Y-%m-%d_%H:%M:%S')

    job_config = JobConfig(job_name=f'{app.namespace}:{app.name}:{dt}', tracing='info')
    job_config.add(cfg)

    # copytree raises FileExistsError when the destination exists, so remove
    # the copy left behind by an earlier run of this cell first.
    modules_lib = 'python/modules/lib'
    if os.path.exists(modules_lib):
        shutil.rmtree(modules_lib)
    shutil.copytree('lib', modules_lib)
    app.add_file_dependency('python', 'opt')

    submission_result = context.submit(ContextTypes.DISTRIBUTED, app, config=cfg)
    streams_job = submission_result.job
    print("JobId: ", streams_job.id, "\nJob name: ", streams_job.name)
submit_app()

properties file /tmp/wsuser/consumer-5yyttcd9.properties generated.
Properties file etc/consumer-5yyttcd9.properties added to the topology kafka_streams_flows_as_source


IntProgress(value=0, bar_style='info', description='Initializing', max=10, style=ProgressStyle(description_wid…

2020-08-12 20:49:58,499 streamsx.topology.context [INFO] Generating SPL and submitting application.


JobId:  18 
Job name:  flow:kafka_streams_flows_as_source:2020-08-12_20:49:58


## Delete the resource directory (Optional)
Cleans up the resource folders used in this application.

In [None]:
# To clean up, uncomment the lines below:
# import shutil
# os.chdir(os.environ['PWD'])
# if os.path.exists(resource_path):
#     shutil.rmtree(resource_path)
