In [1]:
%%bash
python3 -m pip install kafka-python rx



In [2]:
# Initialize Kafka Topics
from kafka import KafkaClient
from kafka.admin import KafkaAdminClient, NewTopic

TOPICS = ["clicks"]

kafka_client = KafkaClient("localhost:9092")
existing_topics = kafka_client.topic_partitions

admin_client = KafkaAdminClient(bootstrap_servers="localhost:9092")
topic_list = []
for topicname in TOPICS:
    if topicname not in existing_topics:
        print("Creating topic: {}".format(topicname))
        topic_list.append(NewTopic(name=topicname, num_partitions=1, replication_factor=1))
admin_client.create_topics(new_topics=topic_list, validate_only=False)

CreateTopicsResponse_v0(topic_errors=[])

In [3]:
# Extract archive of click data
import zipfile

ARCHIVEF = "20180113-6-9-with-nulls.json.zip"
CLICKSF = "20180113-6-9-with-nulls.json"

with zipfile.ZipFile(ARCHIVEF,"r") as zip_ref:
    zip_ref.extractall(".")

In [4]:
# Start sending click data
import json
import time
from datetime import datetime

from kafka import KafkaProducer
from kafka.errors import KafkaError

producer = KafkaProducer(bootstrap_servers=['localhost:9092'])

orig_start = 0
replay_start = time.time() * 1000

counter = 0

with open(CLICKSF, 'r') as f:
    first = True
    start_ts = 0
    for line in f:
        click = json.loads(line)

        orig_event = click["ts_ingest"] or 0
        
        if orig_event:
            curr_ts = time.time() * 1000

            if first:
                first = False
                orig_start = click["ts_ingest"]

            difference = (orig_event - orig_start) - (curr_ts - replay_start)

            if difference > 0:
                print("sleeping {0:.2f} seconds".format(difference/1000.0))
                time.sleep(difference/1000.0)


        future = producer.send('clicks', line.encode())
        try:
            record_metadata = future.get(timeout=10)
            print("✓ {}".format(datetime.fromtimestamp(orig_event/1000.0)))
        except KafkaError:
            log.exception()
            pass

✓ 2018-01-13 05:04:04.345000
✓ 2018-01-13 05:04:04.349000
sleeping 0.00 seconds
✓ 2018-01-13 05:04:04.359000
sleeping 0.05 seconds
✓ 2018-01-13 05:04:04.409000
✓ 1970-01-01 00:00:00
sleeping 0.00 seconds
✓ 2018-01-13 05:04:04.416000
sleeping 0.05 seconds
✓ 2018-01-13 05:04:04.469000
sleeping 0.02 seconds
✓ 2018-01-13 05:04:04.487000
sleeping 0.11 seconds
✓ 2018-01-13 05:04:04.598000
sleeping 0.02 seconds
✓ 2018-01-13 05:04:04.616000
sleeping 0.01 seconds
✓ 2018-01-13 05:04:04.625000
sleeping 0.02 seconds
✓ 2018-01-13 05:04:04.652000
sleeping 0.03 seconds
✓ 2018-01-13 05:04:04.687000
✓ 2018-01-13 05:04:04.689000
✓ 1970-01-01 00:00:00
sleeping 0.01 seconds
✓ 2018-01-13 05:04:04.703000
sleeping 0.05 seconds
✓ 2018-01-13 05:04:04.758000
sleeping 0.03 seconds
✓ 2018-01-13 05:04:04.790000
sleeping 0.05 seconds
✓ 2018-01-13 05:04:04.842000
sleeping 0.08 seconds
✓ 2018-01-13 05:04:04.927000
sleeping 0.00 seconds
✓ 2018-01-13 05:04:04.931000
sleeping 0.02 seconds
✓ 2018-01-13 05:04:04.958000
sl

KeyboardInterrupt: 