# Kafka Producer for Memory Data

## 1. Producing the data
### 1.2. Memory Event Producer

In [2]:
# Import libraries
from time import sleep
from json import dumps
from kafka import KafkaProducer
import random
import time
from pprint import pprint
import pandas as pd

#### Define a function to read CSV file

* The CSV data file is read using `pandas`.

In [3]:
# Define the CSV reading function
def read_csv(fileName):
    # Read the file using Pandas 
    data = pd.read_csv(fileName)
    return data

#### Define Kafka functions

In [4]:
# Define the function to publish messages
def publish_message(producer_instance, topic_name, data):
    # Send message containing data to the specified topic
    try:
        producer_instance.send(topic_name, data)
        print('Message successfully sent. Sent ', len(data['data']), ' rows. At ', data['ts'])
        
    # If encounters error then print out the error message
    except Exception as ex:
        print('Exception in publishing message.')
        print(str(ex))  

* The producer is created at port 9092 and would send messages to the specified topic.

In [5]:
# Define the function to generate Kafka producer        
def connect_kafka_producer():
    _producer = None
    
    # Generate the connection at port 9092
    try:
        _producer = KafkaProducer(bootstrap_servers=['localhost:9092'],
                                  value_serializer=lambda x: dumps(x).encode('ascii'),
                                  api_version=(0, 10))
        
    # If encounters error then print out the error message
    except Exception as ex:
        print('Exception while connecting Kafka.')
        print(str(ex))
        
    # Return the producer
    finally:
        return _producer

#### Generate and Run the producer

* The Memory Producer is generated at port 9092 and would produce messages to the `memory` topic. Data included in each message is extracted from the `Streaming_Linux_memory.csv` file. In each iteration, a random number of new records for each machine together with a small number of records coming from the previous iteration are extracted and included in the message together with the current timestamp. The timestamp comes in the form of Unix format.


* The execution of this producer is carried out infinitely and could only be stopped with a keyboard interruption from user.

In [7]:
# Run the main program
if __name__ == '__main__':

    # Specify the topic
    topic = 'memory'
    
     # Read data from the CSV file
    all_machines = read_csv('data/Streaming_Linux_memory.csv')
    
    # Split the dataframe into sub-dataframes by machine number
    machine_4 = all_machines[all_machines.machine==4].sort_values(by=['sequence'])
    machine_5 = all_machines[all_machines.machine==5].sort_values(by=['sequence'])
    machine_6 = all_machines[all_machines.machine==6].sort_values(by=['sequence'])
    machine_7 = all_machines[all_machines.machine==7].sort_values(by=['sequence'])
    machine_8 = all_machines[all_machines.machine==8].sort_values(by=['sequence'])
    
    # Combine sub-dataframes with their sequence tracking counter
    data_tracking = [ [machine_4, 0], [machine_5, 0], [machine_6, 0], [machine_7, 0], [machine_8, 0] ]
    
    # Create the producer
    producer = connect_kafka_producer()
    
    # Create the list of Y rows containing data from previous iteration
    rows_Y = list()

    # Start sending messages
    while True:
        # Move all data from the rows_Y list to the rows_X list
        rows_X = rows_Y
        # Empry the rows_Y list
        rows_Y = list()
        
        # Get records for each machine
        for idx in range(len(data_tracking)):
            
            # Take the dataset and its sequence tracking counter
            dataset = data_tracking[idx][0]
            counter = data_tracking[idx][1]
            
            # Get size of the dataset
            dataset_size = len(dataset)
            
            # Create a random number X from 20 to 80
            X = random.randint(20, 80)
            # Create a random number Y from 0 to 5
            Y = random.randint(0, 5)
            
            # If the data is exhausted then restart from the beginning
            if(counter+X >= dataset_size):
                # Take the remaining rows and the ones in the beginning
                records_X = pd.concat([dataset.iloc[counter:dataset_size], dataset.iloc[0:counter+X-dataset_size]])
                # Take Y rows from X
                records_Y = records_X.sample(n=Y)
                # Reset sequence tracking counter
                data_tracking[idx][1] = counter+X-dataset_size
            else:
                # Take X rows
                records_X = dataset.iloc[counter:(counter+X)]
                # Take Y rows from X
                records_Y = records_X.sample(n=Y)
                # Update sequence tracking counter
                data_tracking[idx][1] = counter + X
                
            # Add X rows to the sending list
            rows_X = rows_X + records_X.to_dict('records')
            # Keep Y rows in rows_Y
            rows_Y = rows_Y + records_Y.to_dict('records')
            
        # Send data together with timestamp
        data = {
            'ts': int(time.time()), 
            'data': rows_X
        }
        publish_message(producer, topic, data)
        
        # Rest for 10 seconds
        sleep(10)

Message successfully sent. Sent  299  rows. At  1635066270
Message successfully sent. Sent  295  rows. At  1635066280


KeyboardInterrupt: 