# Sending NYC taxi rides as Kafka messages

## Install some Python libraries

In [6]:
!pip install confluent_kafka sseclient

In [1]:
from datetime import datetime
import csv, json, time, sys, random

from confluent_kafka import SerializingProducer
from confluent_kafka.serialization import StringSerializer
from confluent_kafka.admin import AdminClient

## Kafka producer setup

In [2]:
brokers = "kafka1:9092,kafka2:9093"

pconf = {
    'bootstrap.servers': brokers,
    'partitioner': 'murmur2_random',
    'key.serializer': StringSerializer('utf_8'),
    'value.serializer':  StringSerializer('utf_8')
}
p = SerializingProducer(pconf)

stocks = "sample.csv"

## Construct Kafka message object

In [7]:
def to_dictionary(row_values):
    time_stamp = time.time()
    date_time = datetime.fromtimestamp(time_stamp)
    str_date_time = date_time.strftime("%Y-%m-%dT%H:%M:%SZ")
    
    return {
        "medallion":                row_values[0],
        "hack_licence":             row_values[1],
        "vendor_id":                row_values[2],
        "rate_code":            int(row_values[3]),
        "store_and_fwd_flag":       row_values[4],
        "pickup_datetime":          row_values[5],
        "dropoff_datetime":         row_values[6],
        "passenger_count":      int(row_values[7]),
        "trip_time_in_secs":    int(row_values[8]),
        "trip_distance":      float(row_values[9]),
        "pickup_longitude":   float(row_values[10]),
        "pickup_latitude":    float(row_values[11]),
        "dropoff_longitude":  float(row_values[12]),
        "dropoff_latitude":   float(row_values[13]),
        "timestamp":          str_date_time
    }

## Start the stream

While this is running (or after), run the other notebook `kafka_consumer.ipynb`. Change the `n_messages` to configure how many messages are sent.

In [8]:
n_messages = 3

n = 0
with open(stocks) as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    next(csv_reader) # Skip header row
    try:
        for row in csv_reader:
            if n == n_messages:
                break
            stock = to_dictionary(row)
            print(stock)
            p.produce('stock', value=json.dumps(stock))
            p.poll(0)
            p.flush()
            time.sleep(0.5)
            n = n + 1
    except BufferError:
        sys.stderr.write(
            '%% Local producer queue is full (%d messages awaiting delivery): try again\n' % len(p)
        )

{'medallion': '89D227B655E5C82AECF13C3F540D4CF4', 'hack_licence': 'BA96DE419E711691B9445D6A6307C170', 'vendor_id': 'CMT', 'rate_code': 1, 'store_and_fwd_flag': 'N', 'pickup_datetime': '2013-01-01 15:11:48', 'dropoff_datetime': '2013-01-01 15:18:10', 'passenger_count': 4, 'trip_time_in_secs': 382, 'trip_distance': 1.0, 'pickup_longitude': -73.978165, 'pickup_latitude': 40.757977, 'dropoff_longitude': -73.989838, 'dropoff_latitude': 40.751171, 'timestamp': '2024-05-23T12:11:54Z'}
{'medallion': '0BD7C8F5BA12B88E0B67BED28BEA73D8', 'hack_licence': '9FD8F69F0804BDB5549F40E9DA1BE472', 'vendor_id': 'CMT', 'rate_code': 1, 'store_and_fwd_flag': 'N', 'pickup_datetime': '2013-01-06 00:18:35', 'dropoff_datetime': '2013-01-06 00:22:54', 'passenger_count': 1, 'trip_time_in_secs': 259, 'trip_distance': 1.5, 'pickup_longitude': -74.006683, 'pickup_latitude': 40.731781, 'dropoff_longitude': -73.994499, 'dropoff_latitude': 40.75066, 'timestamp': '2024-05-23T12:11:54Z'}
{'medallion': '0BD7C8F5BA12B88E0B67

## Use only if you need to purge all the messages in the queue

In [5]:
admin_client = AdminClient({"bootstrap.servers":brokers})
admin_client.delete_topics(topics=['stock'])

{'stock': <Future at 0xffff948baed0 state=running>}