# Fake Page View generator

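This notebook replays page views from a recorded three-hour click dataset, preserving the original spacing between events, and sends them either to a Kafka topic or to an HTTP endpoint.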

In [1]:
%%bash
# Install the Kafka client library (the `kafka` package imported by the cells below).
# NOTE(review): no version is pinned, so the installed API depends on the run date.
python3 -m pip install kafka-python



In [1]:
# Initialize Kafka Topics
#
# Creates every topic in TOPICS that the broker does not already have, so the
# cell can be re-run safely against a broker that is partially set up.
from kafka import KafkaConsumer
from kafka.admin import KafkaAdminClient, NewTopic

BOOTSTRAP_SERVERS = "localhost:9092"
TOPICS = ["clicks", "clicks-cleaned", "clicks-calculated", "clicks-calculated-ddos", "clicks-calculated-forbidden"]

# Ask the broker which topics exist. KafkaConsumer.topics() is used instead of
# the legacy `KafkaClient(...).topic_partitions`, which is not available in
# kafka-python 2.x (the unpinned install above may pull either line).
metadata_consumer = KafkaConsumer(bootstrap_servers=BOOTSTRAP_SERVERS)
try:
    existing_topics = metadata_consumer.topics()
finally:
    metadata_consumer.close()

admin_client = KafkaAdminClient(bootstrap_servers=BOOTSTRAP_SERVERS)
create_response = None
try:
    topic_list = []
    for topicname in TOPICS:
        if topicname not in existing_topics:
            print("Creating topic: {}".format(topicname))
            # Single partition / single replica: sized for a one-broker local setup.
            topic_list.append(NewTopic(name=topicname, num_partitions=1, replication_factor=1))

    # Skip the round trip entirely when everything already exists (e.g. on re-run).
    if topic_list:
        create_response = admin_client.create_topics(new_topics=topic_list, validate_only=False)
finally:
    # Release the admin connection even if topic creation raised.
    admin_client.close()

# Last expression: shows the broker's response, or nothing when no topic was created.
create_response

Creating topic: clicks
Creating topic: clicks-cleaned
Creating topic: clicks-calculated
Creating topic: clicks-calculated-ddos
Creating topic: clicks-calculated-forbidden


CreateTopicsResponse_v0(topic_errors=[(topic='clicks', error_code=0), (topic='clicks-calculated', error_code=0), (topic='clicks-cleaned', error_code=0), (topic='clicks-calculated-ddos', error_code=0), (topic='clicks-calculated-forbidden', error_code=0)])

In [2]:
# Unpack the zipped click log into the current working directory
import zipfile

ARCHIVEF = "20180113-6-9-with-nulls.json.zip"
CLICKSF = "20180113-6-9-with-nulls.json"

# Every member of the archive is extracted next to the notebook;
# presumably that includes CLICKSF, which the replay cell opens - TODO confirm.
with zipfile.ZipFile(ARCHIVEF, mode="r") as click_archive:
    click_archive.extractall(path=".")

In [3]:
# Transport used by the replay cell to deliver each click:
#   "kafka" - publish the click to the 'clicks' topic through a KafkaProducer
#   any other value (e.g. "http") - POST the click as JSON to http://localhost:5000/clicks
# SENDMODE = "kafka"
SENDMODE = "http"

In [4]:
# Send click data
#
# Replays the recorded clicks in CLICKSF with their original relative timing:
# each record is delayed until the same offset from the start of the replay as
# it had from the first timestamped record in the file, its `ts_ingest` is
# shifted into "now", and it is delivered through the transport chosen by
# SENDMODE ("kafka" -> 'clicks' topic, anything else -> HTTP POST).
import json
import logging
import time
from datetime import datetime

import requests
from kafka import KafkaProducer
from kafka.errors import KafkaError
from IPython.display import clear_output

log = logging.getLogger("click-replay")

CLICKS_URL = "http://localhost:5000/clicks"
HTTP_TIMEOUT_SECONDS = 10   # fail instead of hanging forever on a stuck endpoint
KAFKA_ACK_TIMEOUT_SECONDS = 10
CLEAR_OUTPUT_EVERY = 1000   # keep the cell output from growing without bound

# Open only the transport that is actually selected, so HTTP mode does not
# require a reachable Kafka broker (and vice versa).
producer = None
http_session = None
if SENDMODE == "kafka":
    producer = KafkaProducer(bootstrap_servers=['localhost:9092'],
                             value_serializer=lambda v: json.dumps(v).encode('utf-8'))
else:
    # A session reuses the TCP connection instead of reconnecting per click.
    http_session = requests.Session()

orig_start = None                   # ts_ingest (ms) of the first timestamped record
replay_start = time.time() * 1000   # wall-clock ms at which the replay began

i = 0

print("Simulating website traffic...")

try:
    with open(CLICKSF, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue

            click = json.loads(line)

            # The dataset contains records whose ts_ingest is null (or absent).
            orig_event = click.get("ts_ingest") or 0

            if orig_event:
                if orig_start is None:
                    orig_start = orig_event

                # How far ahead of the original schedule the replay currently is.
                curr_ts = time.time() * 1000
                difference = (orig_event - orig_start) - (curr_ts - replay_start)

                if difference > 0:
                    print("sleeping {0:.2f} seconds".format(difference/1000.0))
                    time.sleep(difference/1000.0)

                # Shift the recorded timestamp by the same offset into the present.
                click['ts_ingest'] = round((orig_event - orig_start) + replay_start)
                shown = datetime.fromtimestamp(orig_event/1000.0)
            else:
                # No usable source timestamp: there is nothing to shift, so stamp
                # the record with the moment it is sent rather than deriving a
                # meaningless value from a zero timestamp.
                click['ts_ingest'] = round(time.time() * 1000)
                shown = "(no ts_ingest in source record)"

            if SENDMODE == "kafka":
                future = producer.send('clicks', click)
                try:
                    # Block until the broker acknowledges, so failures surface here.
                    future.get(timeout=KAFKA_ACK_TIMEOUT_SECONDS)
                except KafkaError:
                    # Best effort: report the failed record and keep replaying.
                    log.exception("Failed to publish click to Kafka topic 'clicks'")
            else:
                #print(f"sending {line}")
                response = http_session.post(
                    CLICKS_URL,
                    data=json.dumps(click).encode("utf-8"),
                    headers={'Content-Type': 'application/json'},
                    timeout=HTTP_TIMEOUT_SECONDS,
                )
                if not response.ok:
                    log.warning("POST %s returned HTTP %s", CLICKS_URL, response.status_code)

            print("✓ {}".format(shown))
            if i > CLEAR_OUTPUT_EVERY:
                clear_output()
                i = 0
            i += 1
finally:
    # Release network resources whether the replay finished or was interrupted.
    if producer is not None:
        producer.close()
    if http_session is not None:
        http_session.close()


sleeping 0.01 seconds
✓ 2018-01-13 05:22:13.268000
sleeping 0.02 seconds
✓ 2018-01-13 05:22:13.295000
sleeping 0.04 seconds
✓ 2018-01-13 05:22:13.342000
sleeping 0.01 seconds
✓ 2018-01-13 05:22:13.366000
sleeping 0.03 seconds
✓ 2018-01-13 05:22:13.399000
sleeping 0.03 seconds
✓ 2018-01-13 05:22:13.434000
✓ 2018-01-13 05:22:13.436000
sleeping 0.05 seconds
✓ 2018-01-13 05:22:13.498000
sleeping 0.00 seconds
✓ 2018-01-13 05:22:13.507000
✓ 2018-01-13 05:22:13.507000
✓ 2018-01-13 05:22:13.515000
✓ 2018-01-13 05:22:13.519000
✓ 2018-01-13 05:22:13.529000
✓ 2018-01-13 05:22:13.530000
✓ 2018-01-13 05:22:13.553000
sleeping 0.01 seconds
✓ 2018-01-13 05:22:13.600000
✓ 2018-01-13 05:22:13.603000
✓ 2018-01-13 05:22:13.603000
✓ 2018-01-13 05:22:13.603000
✓ 2018-01-13 05:22:13.613000
✓ 2018-01-13 05:22:13.633000
✓ 2018-01-13 05:22:13.648000
✓ 2018-01-13 05:22:13.649000
✓ 2018-01-13 05:22:13.677000
✓ 2018-01-13 05:22:13.681000
✓ 2018-01-13 05:22:13.686000
sleeping 0.02 seconds
✓ 2018-01-13 05:22:13.7620

✓ 2018-01-13 05:22:22.384000
sleeping 0.01 seconds
✓ 2018-01-13 05:22:22.402000
sleeping 0.15 seconds
✓ 2018-01-13 05:22:22.563000
sleeping 0.40 seconds
✓ 2018-01-13 05:22:22.989000
sleeping 0.01 seconds
✓ 2018-01-13 05:22:23.009000
sleeping 0.11 seconds
✓ 2018-01-13 05:22:23.128000
✓ 2018-01-13 05:22:23.131000
sleeping 0.08 seconds
✓ 2018-01-13 05:22:23.230000
sleeping 0.10 seconds
✓ 2018-01-13 05:22:23.343000
sleeping 0.02 seconds
✓ 2018-01-13 05:22:23.375000
sleeping 0.02 seconds
✓ 2018-01-13 05:22:23.400000
✓ 1970-01-01 00:00:00
✓ 2018-01-13 05:22:23.406000
✓ 2018-01-13 05:22:23.436000
sleeping 0.02 seconds
✓ 2018-01-13 05:22:23.479000
sleeping 0.07 seconds
✓ 2018-01-13 05:22:23.569000
sleeping 0.41 seconds
✓ 2018-01-13 05:22:23.985000
sleeping 0.08 seconds
✓ 2018-01-13 05:22:24.071000
sleeping 0.07 seconds
✓ 2018-01-13 05:22:24.148000
sleeping 0.04 seconds
✓ 2018-01-13 05:22:24.197000
sleeping 0.01 seconds
✓ 2018-01-13 05:22:24.235000
sleeping 0.05 seconds
✓ 2018-01-13 05:22:24.29

✓ 2018-01-13 05:22:28.834000
sleeping 0.01 seconds
✓ 2018-01-13 05:22:28.863000
sleeping 0.01 seconds
✓ 2018-01-13 05:22:28.889000
sleeping 0.01 seconds
✓ 2018-01-13 05:22:28.918000
✓ 2018-01-13 05:22:28.924000
✓ 2018-01-13 05:22:28.924000
sleeping 0.01 seconds
✓ 2018-01-13 05:22:28.956000
sleeping 0.01 seconds
✓ 2018-01-13 05:22:28.977000
✓ 2018-01-13 05:22:28.980000
sleeping 0.04 seconds
✓ 2018-01-13 05:22:29.037000
✓ 2018-01-13 05:22:29.039000
✓ 2018-01-13 05:22:29.044000
✓ 2018-01-13 05:22:29.047000
✓ 2018-01-13 05:22:29.055000
✓ 2018-01-13 05:22:29.069000
✓ 2018-01-13 05:22:29.073000
sleeping 0.04 seconds
✓ 2018-01-13 05:22:29.146000
✓ 2018-01-13 05:22:29.152000
sleeping 0.01 seconds
✓ 2018-01-13 05:22:29.180000
sleeping 0.03 seconds
✓ 2018-01-13 05:22:29.221000
sleeping 0.01 seconds
✓ 2018-01-13 05:22:29.244000
sleeping 0.00 seconds
✓ 2018-01-13 05:22:29.256000
✓ 2018-01-13 05:22:29.261000
sleeping 0.01 seconds
✓ 2018-01-13 05:22:29.287000
sleeping 0.02 seconds
✓ 2018-01-13 05:22

ConnectionError: HTTPConnectionPool(host='localhost', port=5000): Max retries exceeded with url: /clicks (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7f779a9cf320>: Failed to establish a new connection: [Errno 111] Connection refused'))