# Kafka Producer for Processor Data

## 1. Producing the data
### 1.1. Process Event Producer

#### Import Libraries

In [1]:
# Import libraries
from time import sleep
from json import dumps
from kafka import KafkaProducer
import random
import time
from pprint import pprint
import pandas as pd

#### Define a function to read CSV file

* The CSV data file is read using `pandas`.

In [2]:
# Define the CSV reading function
def read_csv(fileName):
    # Read the file using Pandas 
    data = pd.read_csv(fileName)
    return data

#### Define Kafka functions

In [3]:
# Define the function to publish messages
def publish_message(producer_instance, topic_name, data):
    # Send message containing data to the specified topic
    try:
        producer_instance.send(topic_name, data)
        print('Message successfully sent. Sent ', len(data['data']), ' rows. At ', data['ts'])
    
    # If encounters error then print out the error message
    except Exception as ex:
        print('Exception in publishing message.')
        print(str(ex)) 

* The producer is created at port 9092 and would send messages to the specified topic.

In [4]:
# Define the function to generate Kafka producer
def connect_kafka_producer():
    _producer = None
    
    # Generate the connection at port 9092
    try:
        _producer = KafkaProducer(bootstrap_servers=['localhost:9092'],
                                  value_serializer=lambda x: dumps(x).encode('ascii'),
                                  api_version=(0, 10))
        
    # If encounters error then print out the error message
    except Exception as ex:
        print('Exception while connecting Kafka.')
        print(str(ex))
        
    # Return the producer
    finally:
        return _producer

#### Generate and Run the producer

* The Process Producer is generated at port 9092 and would produce messages to the `process` topic. Data included in each message is extracted from the `Streaming_Linux_process.csv` file. In each iteration, a random number of records for each machine are extracted and included in the message together with the current timestamp. The timestamp comes in the form of Unix format.


* The execution of this producer is carried out infinitely and could only be stopped with a keyboard interruption from user.

In [5]:
# Run the main program
if __name__ == '__main__':

    # Specify the topic
    topic = 'process'
    
    # Read data from the CSV file
    all_machines = read_csv('data/Streaming_Linux_process.csv')
    
    # Split the dataframe into sub-dataframes by machine number
    machine_4 = all_machines[all_machines.machine==4].sort_values(by=['sequence'])
    machine_5 = all_machines[all_machines.machine==5].sort_values(by=['sequence'])
    machine_6 = all_machines[all_machines.machine==6].sort_values(by=['sequence'])
    machine_7 = all_machines[all_machines.machine==7].sort_values(by=['sequence'])
    machine_8 = all_machines[all_machines.machine==8].sort_values(by=['sequence'])
    
    # Combine sub-dataframes with their sequence tracking counter
    data_tracking = [ [machine_4, 0], [machine_5, 0], [machine_6, 0], [machine_7, 0], [machine_8, 0] ]
    
    # Create the producer
    producer = connect_kafka_producer()
    
    # Start sending messages
    while True:
        
        # Store data for each message in a list
        rows = list()
        
        # Get records for each machine
        for idx in range(len(data_tracking)):
            
            # Take the dataset and its sequence tracking counter
            dataset = data_tracking[idx][0]
            counter = data_tracking[idx][1]
            
            # Get size of the dataset
            dataset_size = len(dataset)
            
            # Create a random number X from 10 to 50
            X = random.randint(10, 50)
            
            # If the data is exhausted then restart from the beginning
            if(counter+X >= dataset_size):
                # Take the remaining rows and the ones in the beginning
                records = pd.concat([dataset.iloc[counter:dataset_size], dataset.iloc[0:counter+X-dataset_size]])
                # Reset sequence tracking counter
                data_tracking[idx][1] = counter+X-dataset_size
                
            # If the data is not exhausted 
            else:
                # Take X rows
                records = dataset.iloc[counter:(counter+X)]
                # Update sequence tracking counter
                data_tracking[idx][1] = counter + X
            
            # Add all rows to the list
            rows = rows + records.to_dict('records')
        
        # Send data together with timestamp
        data = {
            'ts': int(time.time()), 
            'data': rows
        }
        publish_message(producer, topic, data)
        
        # Rest for 5 seconds
        sleep(5)

Message successfully sent. Sent  155  rows. At  1603783048
Message successfully sent. Sent  121  rows. At  1603783053
Message successfully sent. Sent  152  rows. At  1603783058
Message successfully sent. Sent  145  rows. At  1603783063
Message successfully sent. Sent  127  rows. At  1603783068
Message successfully sent. Sent  176  rows. At  1603783073
Message successfully sent. Sent  137  rows. At  1603783078
Message successfully sent. Sent  150  rows. At  1603783083
Message successfully sent. Sent  130  rows. At  1603783088
Message successfully sent. Sent  143  rows. At  1603783093
Message successfully sent. Sent  104  rows. At  1603783098
Message successfully sent. Sent  145  rows. At  1603783103
Message successfully sent. Sent  133  rows. At  1603783108
Message successfully sent. Sent  173  rows. At  1603783113
Message successfully sent. Sent  154  rows. At  1603783118
Message successfully sent. Sent  153  rows. At  1603783123
Message successfully sent. Sent  146  rows. At  16037831

KeyboardInterrupt: 