In [15]:
# Load dependencies and set constants
from pyflink.datastream.functions import ProcessFunction
from pyflink.common.serialization import SimpleStringSchema
from pyflink.common.typeinfo import Types
from pyflink.datastream import StreamExecutionEnvironment, TimeCharacteristic
from pyflink.datastream.connectors import FlinkKafkaConsumer
from matplotlib import pyplot as plt
import time
import sys


# Connection settings handed to the Flink Kafka consumer.
kafka_props = dict([
    ("bootstrap.servers", "localhost:9092"),
    ("group.id", "twitter-consumers"),
])

# Topic the tweets are published to.
KAFKA_TOPIC = 'twitter-stream'
# Location of the Kafka connector JAR registered with the execution environment.
KAFKA_CONNECTOR_JAR = 'file:///home/ubuntu/flink-sql-connector-kafka_2.11-1.12.2.jar'
# Size of the dataset in tweets (10 MB).
NUMBER_OF_TWEETS = 14484
# How many times Kafka produces the whole dataset.
NUMBER_OF_PRODUCTIONS = 10

In [None]:
# Per-record latencies in milliseconds, appended by MyProcessFunction.process_element.
latencies = []
# Running count of records handled by MyProcessFunction.process_element.
records_received = 0


def collect_stats():
    """Plot every latency recorded so far, in arrival order, and show the figure.

    Reads the module-level ``latencies`` list; takes no arguments and returns
    nothing.
    """
    # Explicit Axes interface so the figure carries its own title and labels
    # and can be read without the surrounding notebook.
    fig, ax = plt.subplots()
    ax.plot(latencies)
    ax.set_title("Latency per record")
    ax.set_xlabel("Record index")
    ax.set_ylabel("Latency (ms)")
    plt.show()


class MyProcessFunction(ProcessFunction):
    """Measure per-record latency and plot it once the expected volume has arrived.

    Relies on the module-level ``latencies`` list and ``records_received``
    counter.
    NOTE(review): that shared state is only meaningful while one Python process
    handles every record; with parallelism > 1 each instance would presumably
    keep its own count -- confirm before scaling out.
    """

    def process_element(self, value, ctx: 'ProcessFunction.Context'):
        """Emit ``"Latency: <ms>"`` for one record and update the statistics.

        :param value: the record payload; not inspected here.
        :param ctx: Flink context; ``ctx.timestamp()`` supplies the record
            timestamp that is compared against wall-clock milliseconds.
        :return: generator yielding one latency string, or nothing when the
            record carries no timestamp.
        """
        global records_received

        # Take the clock reading first so the bookkeeping below does not
        # inflate the measurement.
        now_ms = time.time() * 1000
        records_received += 1

        record_ts = ctx.timestamp()
        if record_ts is not None:
            latency = now_ms - record_ts
            # Record before yielding so the sample is kept even if the
            # generator is never resumed.
            latencies.append(latency)
            yield "Latency: {}".format(str(latency))
        # A record without a timestamp cannot be measured; it is still counted
        # so the completion check below keeps working.

        # Plot exactly once, when the expected total is reached. A ">=" test
        # would re-plot for every record that arrives afterwards.
        if records_received == NUMBER_OF_TWEETS * NUMBER_OF_PRODUCTIONS:
            collect_stats()


env = StreamExecutionEnvironment.get_execution_environment()
# Use event time; MyProcessFunction reads ctx.timestamp() to compute latency.
env.set_stream_time_characteristic(TimeCharacteristic.EventTime)
# Add the Kafka Connector Dependency
env.add_jars(KAFKA_CONNECTOR_JAR)
env.add_classpaths(KAFKA_CONNECTOR_JAR)

# Subscribe through the KAFKA_TOPIC constant instead of repeating the literal,
# so the topic is configured in exactly one place.
kafka_consumer = FlinkKafkaConsumer(KAFKA_TOPIC, SimpleStringSchema(), kafka_props)

stream = env.add_source(kafka_consumer)
# Each record becomes a "Latency: ..." string that is printed by the sink.
stream_latency = stream.process(MyProcessFunction(), output_type=Types.STRING()).print()

# Submit the job.
env.execute()


# TODO: Write function for plotting and call it from process-function


