# FIT3182 Major Assignment Part B Task 1c (Event Producer 3)
## George Tan Juan Sheng (30884128)
### Part B Task 1c
#### Write a Python program that loads all the data from hotspot_TERRA_streaming.csv and randomly (with replacement) feeds the data to the stream every 2 seconds. TERRA is another NASA satellite that reports 4 attributes of a location: latitude, longitude, confidence and surface temperature. You will need to append additional information, such as producer information to identify the producer and the created date & time.

First, we connect to MongoDB through MongoClient and access the collection we created in Part A (so that we can get the latest date).

In [3]:
import pymongo
from pymongo import MongoClient
from pprint import pprint
import pandas as pd
from datetime import datetime,timedelta

# Create the MongoDB client; no arguments are passed, so the library's
# default connection settings are used.
client = MongoClient () 
# Database and collection queried by get_latest_date() for the most recent date.
db = client.fit3182_assignment_db
collection = db.partA


Next, we write the program that feeds our data to Kafka.

In [None]:
from time import sleep
from json import dumps
from kafka import KafkaProducer
import random

# Reads the TERRA hotspot CSV and turns each row into a document (dict).
def read_hotspot_TERRA_streaming(csv_path='hotspot_TERRA_streaming.csv'):
    """Load TERRA hotspot readings from a CSV file as a list of documents.

    Args:
        csv_path: Path of the CSV file to read. Defaults to
            'hotspot_TERRA_streaming.csv', resolved by pandas relative to
            the current working directory.

    Returns:
        A list with one dict per CSV row, in file order. Each dict holds
        'latitude' and 'longitude' converted with float(), and 'confidence'
        and 'surface_temperature_celcius' converted with int(). An input
        with a header but no rows yields an empty list.
    """
    readings = pd.read_csv(csv_path)

    # Walk the four required columns in lockstep rather than iterrows(),
    # which builds a whole Series object for every row.
    rows = zip(
        readings['latitude'],
        readings['longitude'],
        readings['confidence'],
        readings['surface_temperature_celcius'],
    )
    return [
        {
            'latitude': float(latitude),
            'longitude': float(longitude),
            'confidence': int(confidence),
            'surface_temperature_celcius': int(temperature),
        }
        for latitude, longitude, confidence, temperature in rows
    ]

# Gets the latest date in our collection
def get_latest_date():
    """Return the most recent 'date' value stored in the collection.

    The pipeline sorts documents by 'date' in descending order, keeps only
    the 'date' field and limits the result to a single document.

    Returns:
        The 'date' field of the newest document (presumably a datetime,
        since the caller adds a timedelta to it -- confirm against the
        Part A schema).

    Raises:
        ValueError: If the aggregation returns no document, i.e. the
            collection is empty. Previously the cursor object itself was
            returned in that case, which failed later with an unrelated
            TypeError.
    """
    cursor = collection.aggregate([
                {"$sort":{"date":-1}},
                {"$project":{"_id":0,"date":1}},
                {"$limit":1}
                ])
    # The pipeline yields at most one document; take it explicitly instead of
    # rebinding the cursor variable inside a loop over that same cursor.
    newest = next(cursor, None)
    if newest is None:
        raise ValueError('No documents found in the collection; cannot determine the latest date.')
    return newest['date']
    
# Sends one record to a Kafka topic and reports the outcome on stdout.
def publish_message(producer_instance, topic_name, data):
    """Send `data` to `topic_name` through `producer_instance`.

    Args:
        producer_instance: Object exposing send(topic, value=...).
        topic_name: Name of the topic to send to.
        data: Payload handed to the producer as the message value.

    Prints a success line containing str(data) once send() returns without
    raising. Any Exception is caught and reported on stdout instead of being
    propagated. Returns None in both cases.
    """
    try:
        producer_instance.send(topic_name, value=data)
        success_line = 'Message published successfully. Data: ' + str(data)
        print(success_line)
    except Exception as error:
        # Two lines of output: the fixed notice, then the error text.
        print('Exception in publishing message.', str(error), sep='\n')
        
def connect_kafka_producer():
    """Create a KafkaProducer for the broker at localhost:9092.

    Values given to the producer are serialized with json.dumps and encoded
    as ASCII bytes.

    Returns:
        The KafkaProducer instance, or None if constructing it raised an
        Exception (the error is printed rather than propagated).
    """
    # Return from the try/except branches rather than from a `finally`
    # clause: a `return` inside `finally` discards any in-flight exception,
    # including KeyboardInterrupt and SystemExit.
    try:
        return KafkaProducer(bootstrap_servers=['localhost:9092'],
                             value_serializer=lambda x: dumps(x).encode('ascii'),
                             api_version=(0, 10))
    except Exception as ex:
        print('Exception while connecting Kafka.')
        print(str(ex))
        return None
    
if __name__ == '__main__':

    topic = 'PartB'
    print('Publishing records..')
    producer03 = connect_kafka_producer()
    if producer03 is None:
        # Without a producer every publish attempt would fail, and the loop
        # below would print errors forever.
        raise SystemExit('Could not connect to Kafka; no records were published.')

    try:
        data = read_hotspot_TERRA_streaming()  # All documents built from hotspot_TERRA_streaming.csv
        if not data:
            # random.choice() raises IndexError on an empty list.
            raise SystemExit('hotspot_TERRA_streaming.csv contains no rows to publish.')

        # The stream starts on the day after the latest date stored in Part A.
        latest_date = get_latest_date() + timedelta(days=1)
        secondsPassed = 0  # Simulated seconds elapsed since latest_date

        while True:
            # Pick a random row (with replacement) and copy it, so the extra
            # fields added below do not modify the cached rows in `data`.
            chosenData = dict(random.choice(data))
            curr_date = latest_date + timedelta(seconds=secondsPassed)
            chosenData['producer'] = "hotspot_TERRA_streaming"
            chosenData["created_datetime"] = curr_date.strftime("%d/%m/%Y %H:%M:%S")
            publish_message(producer03, topic, chosenData)
            # 17280 s = 24 h / 5: five TERRA hotspot records are spread evenly
            # over each simulated day.
            secondsPassed += 17280
            # One record every 2 seconds gives 5 records per climate record;
            # per the original notes, climate data is published every 10
            # seconds by a separate producer.
            sleep(2)
    finally:
        # Runs when the loop is interrupted or an error escapes; lets the
        # producer shut down instead of being abandoned.
        producer03.close(timeout=10)




Publishing records..
Message published successfully. Data: {'latitude': -36.4624, 'longitude': 141.0446, 'confidence': 78, 'surface_temperature_celcius': 51, 'producer': 'hotspot_TERRA_streaming', 'created_datetime': '01/01/2022 00:00:00'}
Message published successfully. Data: {'latitude': -38.1185, 'longitude': 143.9514, 'confidence': 71, 'surface_temperature_celcius': 46, 'producer': 'hotspot_TERRA_streaming', 'created_datetime': '01/01/2022 04:48:00'}
Message published successfully. Data: {'latitude': -37.5052, 'longitude': 142.8636, 'confidence': 62, 'surface_temperature_celcius': 54, 'producer': 'hotspot_TERRA_streaming', 'created_datetime': '01/01/2022 09:36:00'}
Message published successfully. Data: {'latitude': -37.45, 'longitude': 148.097, 'confidence': 70, 'surface_temperature_celcius': 37, 'producer': 'hotspot_TERRA_streaming', 'created_datetime': '01/01/2022 14:24:00'}
Message published successfully. Data: {'latitude': -38.116, 'longitude': 143.81799999999998, 'confidence':