In [4]:
# Load dependencies and set constants
from pyflink.datastream.functions import ProcessFunction
from pyflink.common.serialization import SimpleStringSchema, SerializationSchema
from pyflink.common.typeinfo import Types
from pyflink.datastream import StreamExecutionEnvironment, TimeCharacteristic
from pyflink.datastream.connectors import FlinkKafkaConsumer, FlinkKafkaProducer
from matplotlib import pyplot as plt

import time
import sys

# Kafka client properties passed to FlinkKafkaProducer (the result sink).
# NOTE(review): 'queue.buffering.max.messages' looks like a librdkafka option; the
# Flink Kafka connector may simply ignore it - confirm.
producer_props = {
    'bootstrap.servers': 'localhost:9092',
    'queue.buffering.max.messages': '1000000'
}

# Kafka client properties passed to FlinkKafkaConsumer (the tweet source).
consumer_props = {
    "bootstrap.servers": "localhost:9092",
    "group.id": "twitter-consumers",
    "client.id": "client-1",
}

KAFKA_TOPIC = "twitter-stream"
KAFKA_CONNECTOR_JAR = "file:///home/ubuntu/flink-sql-connector-kafka_2.11-1.12.2.jar"

# Number of tweets in the dataset (10 MB)
NUMBER_OF_TWEETS = 14485

# Number of times the dataset is produced by Kafka.
# The cell previously assigned 10 and then immediately overwrote it with 100;
# only the effective value is kept.
NUMBER_OF_PRODUCTIONS = 100

# Size of one copy of the dataset in MB (used for the MB/s throughput figures)
DATASET_SIZE = 10

# Number of completed measurement runs after which the statistics are collected
NUMBER_OF_EXECUTIONS = 10

In [5]:
# Mutable benchmark state; MyProcessFunction updates these through `global` statements.
iterations = 0         # number of completed measurement runs
start_time = -1        # time.time() of the first record of the current run (-1 = not started)
records_received = 0   # records counted in the current run
measured_times = []    # duration in seconds of every completed run

# NOTE(review): not referenced by any of the visible cells (the connectors use
# consumer_props / producer_props) - confirm whether it is still needed.
kafka_props = {
    'bootstrap.servers': 'localhost:9092',
    'group.id': 'twitter_consumers',
}


def collect_stats():
    """Print per-run throughput and save a max/avg/min bar chart to ``throughput.png``.

    Reads the module-level ``measured_times`` list (seconds per completed run).
    Every run covers ``NUMBER_OF_PRODUCTIONS`` copies of the dataset, i.e.
    ``DATASET_SIZE * NUMBER_OF_PRODUCTIONS`` MB and
    ``NUMBER_OF_TWEETS * NUMBER_OF_PRODUCTIONS`` messages.

    Returns:
        None. When no run has been recorded yet, only a notice is printed and
        no chart is written.
    """
    # Local import: `mean` is not imported in the notebook's import cell.
    from statistics import mean

    if not measured_times:
        # min()/max()/mean() all raise on an empty sequence.
        print("No measurements recorded; nothing to report.")
        return

    megabytes_per_run = DATASET_SIZE * NUMBER_OF_PRODUCTIONS
    # A run ends after NUMBER_OF_TWEETS * NUMBER_OF_PRODUCTIONS records, so the
    # message rate needs the same multiplier as the MB rate.
    messages_per_run = NUMBER_OF_TWEETS * NUMBER_OF_PRODUCTIONS

    throughput_mbs = [megabytes_per_run / x for x in measured_times]
    throughput_messages = [messages_per_run / x for x in measured_times]
    min_throughput = min(throughput_mbs)
    max_throughput = max(throughput_mbs)
    avg_throughput = mean(throughput_mbs)

    print("============ Throughput Results ============")
    for i, (mbs, msgs) in enumerate(zip(throughput_mbs, throughput_messages)):
        print("Iteration {}: {} MB/s | {} Msgs/s".format(i, mbs, msgs))

    # Plot Max, Min, Avg MB/s
    plt.figure(figsize=(10, 6), dpi=80)
    plt.bar(0, max_throughput, width=1, color='navy')
    plt.bar(1, avg_throughput, width=1, color='darkcyan')
    plt.bar(2, min_throughput, width=1, color='skyblue')

    # The x positions 0/1/2 carry no meaning, so hide the bottom tick labels
    # and identify the bars through the legend instead.
    plt.tick_params(labelleft=True, labelbottom=False)
    plt.legend(["Maximum MB/s", "Average MB/s", "Minimum MB/s"], prop={'size': 12}, bbox_to_anchor=(1.05, 0.8))
    plt.title("Summarization of Throughput in MB/s", fontsize=14, pad=12)
    plt.savefig('throughput.png')


class MyProcessFunction(ProcessFunction):
    """Emit a latency string per record and time each full pass over the produced data.

    A measurement run consists of ``NUMBER_OF_TWEETS * NUMBER_OF_PRODUCTIONS``
    records. The duration of every completed run is appended to the module-level
    ``measured_times``; after ``NUMBER_OF_EXECUTIONS`` runs ``collect_stats()``
    is called once.
    """

    def process_element(self, value, ctx: 'ProcessFunction.Context'):
        """Yield the latency of one record and update the run bookkeeping.

        Args:
            value: The incoming record; its content is not inspected.
            ctx: Flink context; ``ctx.timestamp()`` supplies the record timestamp.

        Yields:
            str: ``time.time() * 1000 - ctx.timestamp()`` rendered with ``str()``.
        """
        # global might not be ideal here (parallelism possible?) but currently nothing else comes to mind
        # NOTE(review): if this function executes in a separate Python worker process,
        # the notebook kernel's copies of these globals are never updated - confirm.
        global records_received
        global start_time
        global iterations
        cur_time = time.time()
        if records_received == 0:
            # First record of a run starts the clock.
            start_time = cur_time
        records_received += 1
        # assumes ctx.timestamp() is epoch milliseconds, hence the * 1000 - TODO confirm
        latency = (cur_time * 1000) - ctx.timestamp()
        yield str(latency)
        if records_received % (NUMBER_OF_TWEETS * NUMBER_OF_PRODUCTIONS) == 0:
            # cur_time is already a float timestamp; the former `cur_time()` call
            # raised TypeError at the end of every run.
            total_time = cur_time - start_time
            measured_times.append(total_time)
            records_received = 0
            iterations += 1
            # Checked only when a run has just completed, so the statistics are
            # collected exactly once instead of again for every later record.
            if iterations == NUMBER_OF_EXECUTIONS:
                collect_stats()

# An execution environment defines a default parallelism for all operators,
# data sources, and data sinks it executes.
env = StreamExecutionEnvironment.get_execution_environment()
env.set_stream_time_characteristic(TimeCharacteristic.EventTime)

# Add the Kafka Connector Dependency
env.add_jars(KAFKA_CONNECTOR_JAR)
env.add_classpaths(KAFKA_CONNECTOR_JAR)

# Source: the tweet topic named by KAFKA_TOPIC (previously repeated as a literal).
kafka_consumer = FlinkKafkaConsumer(KAFKA_TOPIC, SimpleStringSchema(), consumer_props)
# Sink: the latency strings produced by MyProcessFunction.
kafka_producer = FlinkKafkaProducer("twitter-stream-results", SimpleStringSchema(), producer_props)

stream = env.add_source(kafka_consumer)
stream.process(MyProcessFunction(), output_type=Types.STRING()) \
      .add_sink(kafka_producer)

# NOTE(review): the recorded cell output ends in KeyboardInterrupt, so this call
# presumably blocks until the job is stopped manually - confirm.
env.execute()

'''
Weird stuff:
    - Sometimes more records arrive than Kafka emitted (and sometimes fewer)
    - The printed output was sometimes "Latency: {}   Record-Number: {}   Total Number of Records: {}".format(str(latency), str(len(latencies)), str(NUMBER_OF_TWEETS * NUMBER_OF_PRODUCTIONS))
      and sometimes just str(latency)!?!
    - what value for parallelism?
    
Notes:
    - Flink parallelism: https://stackoverflow.com/questions/61139187/what-is-the-difference-between-parallelism-and-parallel-computing-in-flink
'''




KeyboardInterrupt: 

In [None]:
# Disabled cell: everything below sits inside one triple-quoted string, so none of it runs.
# The quoted code subscribes a confluent_kafka Consumer to "twitter-stream-results"
# and prints a running counter together with each polled message value.
'''
# Check the results by consuming the result stream from Kafka
import platform, socket, json, psutil, logging, multiprocessing

from time import time, perf_counter
from confluent_kafka import Consumer, KafkaError

# Create Kafka Consumer
consumer_config = {
    "bootstrap.servers": "localhost:9092",
    "group.id": "twitter-consumers",
    "client.id": "client-1",
    "enable.auto.commit": True,
    "session.timeout.ms": 6000,
    "default.topic.config": {"auto.offset.reset": "smallest"}
}
c = Consumer(consumer_config)

msg_counter = 0

c.subscribe(["twitter-stream-results"])
while True:             
    msg = c.poll(0.5)

    if msg is None:
        continue
    elif not msg.error():
        msg_counter += 1
        print("Message: {} Latency: {}".format(msg_counter, msg.value()))
#         msg_counter += 1
        # Start the timer once the first message was received
#         if msg_counter == 1:
#             start = perf_counter()
#         latencies.append(latency)
    elif msg.error().code() == KafkaError._PARTITION_EOF:
        print("End of partition reached {}/{}".format(msg.topic(), msg.partition()))
        print("Messages consumed: {}".format(msg_counter))
    else:
        print("Error occured: {}".format(msg.error().str()))
'''

In [6]:
# Show the run counter as seen by this notebook kernel.
# NOTE(review): the recorded output is 0; presumably process_element updated its
# globals in a different process, or the job was interrupted before a run completed - confirm.
print(iterations)

0
