In [1]:
%%bash
# Install the Python 3 packages this notebook imports.
# `requests` is included because the click-replay cell imports it for the HTTP send mode.
python3 -m pip install kafka-python pyarrow influxdb rx requests

Collecting influxdb
  Downloading https://files.pythonhosted.org/packages/b7/eb/ff503c3195c2bfcf53be1deca10396e9e7b55096ccb51ee94aef54434f33/influxdb-5.2.2-py2.py3-none-any.whl (70kB)
Installing collected packages: influxdb
Successfully installed influxdb-5.2.2


In [2]:
# Make sure every topic listed in TOPICS exists on the local Kafka broker.
from kafka import KafkaClient
from kafka.admin import KafkaAdminClient, NewTopic

TOPICS = ["smoke-test"]

# Topic names already present, read from the client's `topic_partitions`.
# NOTE(review): confirm this attribute exists on the installed kafka-python version.
kafka_client = KafkaClient("localhost:9092")
existing_topics = kafka_client.topic_partitions

admin_client = KafkaAdminClient(bootstrap_servers="localhost:9092")

# Queue a one-partition, replication-factor-1 topic for each missing name.
topic_list = []
for topicname in TOPICS:
    if topicname in existing_topics:
        continue
    print(f"Creating topic: {topicname}")
    topic_list.append(
        NewTopic(name=topicname, num_partitions=1, replication_factor=1)
    )

# Left as the final expression so the broker's response is displayed as the cell output.
admin_client.create_topics(new_topics=topic_list, validate_only=False)

Creating topic: smoke-test


CreateTopicsResponse_v0(topic_errors=[(topic='smoke-test', error_code=0)])

In [3]:
# Unpack the zipped click data into the current working directory.
import zipfile

ARCHIVEF = "20180113-6-9-with-nulls.json.zip"
# File opened by the replay cell.
# NOTE(review): presumably the member stored inside ARCHIVEF — confirm.
CLICKSF = "20180113-6-9-with-nulls.json"

with zipfile.ZipFile(ARCHIVEF, mode="r") as archive:
    archive.extractall(path=".")

In [4]:
# Transport used by the click-replay cell: "kafka" publishes each record through
# the Kafka producer; any other value (e.g. "http") POSTs it to the local HTTP endpoint.
SENDMODE = "kafka"
# Alternative: uncomment to send over HTTP instead.
#SENDMODE = "http"

In [5]:
# Replay recorded click events, pacing them by their original `ts_ingest` offsets.
#
# Each line of CLICKSF is one JSON document. `ts_ingest` is handled as epoch
# milliseconds (it is compared against `time.time() * 1000`). The raw line is
# forwarded either to Kafka or to the local HTTP endpoint, depending on SENDMODE.
import json
import logging
import time
from datetime import datetime

import requests
from kafka import KafkaProducer
from kafka.errors import KafkaError

log = logging.getLogger(__name__)

producer = KafkaProducer(bootstrap_servers=['localhost:9092'])

# Timestamp (ms) of the first recorded event, and wall-clock time (ms) the replay began.
orig_start = 0
replay_start = time.time() * 1000

print("Simulating website traffic...")

with open(CLICKSF, 'r', encoding='utf-8') as f:
    first = True
    for line in f:
        click = json.loads(line)

        # A null or missing ts_ingest is treated as "no timestamp": no pacing is applied.
        orig_event = click.get("ts_ingest") or 0

        if orig_event:
            curr_ts = time.time() * 1000

            if first:
                first = False
                orig_start = orig_event

            # Positive when the replay is ahead of the recorded schedule.
            difference = (orig_event - orig_start) - (curr_ts - replay_start)

            if difference > 0:
                print("sleeping {0:.2f} seconds".format(difference/1000.0))
                time.sleep(difference/1000.0)

        delivered = True
        if SENDMODE == "kafka":
            # NOTE(review): publishes to 'clicks', while the topic-setup cell only
            # creates 'smoke-test' — confirm the broker auto-creates topics.
            future = producer.send('clicks', line.encode())
            try:
                # Block until the broker acknowledges the record (or 10 s elapse).
                future.get(timeout=10)
            except KafkaError:
                # Log the failure with its traceback and do not report success.
                log.exception("Failed to deliver click to Kafka topic 'clicks'")
                delivered = False
        else:
            #print(f"sending {line}")
            # Bounded timeout so an unresponsive endpoint cannot hang the cell forever.
            requests.post(
                "http://localhost:5000/clicks",
                data=line.encode("utf-8"),
                headers={'Content-Type':'application/json'},
                timeout=10,
            )

        if delivered:
            print("✓ {}".format(datetime.fromtimestamp(orig_event/1000.0)))
        # NOTE(review): stops after the first record, matching the recorded cell
        # output; remove this `break` to replay the whole file.
        break

Simulating website traffic...
✓ 2018-01-13 05:04:04.345000


In [6]:
# Create an InfluxDB client pointed at the local server.
# Adapted from: https://www.influxdata.com/blog/getting-started-python-influxdb/
from influxdb import InfluxDBClient

INFLUX_HOST = 'localhost'
INFLUX_PORT = 8086
client = InfluxDBClient(host=INFLUX_HOST, port=INFLUX_PORT)

In [7]:
# Create the example database, then list all databases; the listing is the
# final expression so it is displayed as the cell output.
client.create_database(dbname='pyexample')
client.get_list_database()

[{'name': '_internal'}, {'name': 'pyexample'}]

In [8]:
# Write three sample "brushEvents" points into the 'pyexample' database.
client.switch_database('pyexample')

# Every sample point shares the same measurement and tags; only the timestamp
# and the duration field differ, so the points are generated from one table.
BRUSH_TAGS = {
    "user": "Carol",
    "brushId": "6c89f539-71c6-490d-a28d-6c5d84c0ee2f",
}

# (timestamp, duration) pairs. Hours are zero-padded so each timestamp is valid
# RFC 3339; the unpadded "T8:01:00Z" form is not.
BRUSH_SAMPLES = [
    ("2018-03-28T08:01:00Z", 127),
    ("2018-03-29T08:04:00Z", 132),
    ("2018-03-30T08:02:00Z", 129),
]

json_body = [
    {
        "measurement": "brushEvents",
        "tags": dict(BRUSH_TAGS),  # fresh copy per point so points never share a dict
        "time": timestamp,
        "fields": {
            "duration": duration
        }
    }
    for timestamp, duration in BRUSH_SAMPLES
]

# Left as the final expression so the write result is displayed as the cell output.
client.write_points(json_body)

True

In [9]:
# Select brush durations older than four days, grouped by the "user" tag.
# Adjacent literals are joined by Python into one query string.
brush_query = (
    'SELECT "duration" '
    'FROM "pyexample"."autogen"."brushEvents" '
    'WHERE time < now() - 4d '
    'GROUP BY "user"'
)
results = client.query(brush_query)
# Final expression: display the raw response as the cell output.
results.raw

{'statement_id': 0,
 'series': [{'name': 'brushEvents',
   'tags': {'user': 'Carol'},
   'columns': ['time', 'duration'],
   'values': [['2018-03-28T08:01:00Z', 127],
    ['2018-03-29T08:04:00Z', 132],
    ['2018-03-30T08:02:00Z', 129]]}]}