# FIT5202 - Data Processing for Big Data S2 2021 
# Assessment 2B-Task1: Flight Producer


Student information
- Family Name: Aggarwal
- Given Name: Naval
- Student ID: 31153054
- Student email: nagg0001@student.monash.edu

### Import Libraries

In [1]:
from time import sleep
from json import dumps
from kafka import KafkaProducer
import random
import datetime as dt
from datetime import datetime,timezone
from pytz import timezone
import csv
import glob

### Analysing and Creating Producer

In [2]:
# This method is used to get flight records
def getFlightRecords():
    """Load every ``datasets/flight*.csv`` file into one list of row dicts.

    Only the columns listed in ``columns`` below are kept; any other column
    present in a file is dropped. Columns named in ``int_columns`` are parsed
    with ``int``; all remaining columns keep the raw string read from the CSV
    (so an empty cell stays ``''``).

    Returns:
        list[dict]: one dict per CSV row, files taken in sorted path order
        and rows in file order. The total count is also printed.

    Raises:
        KeyError: if a file's header lacks one of the expected columns.
        ValueError: if an integer column holds text ``int`` cannot parse.
    """
    # Columns copied into each record, in the order the keys are emitted.
    columns = (
        'YEAR', 'MONTH', 'DAY', 'DAY_OF_WEEK', 'AIRLINE', 'FLIGHT_NUMBER',
        'TAIL_NUMBER', 'ORIGIN_AIRPORT', 'DESTINATION_AIRPORT',
        'SCHEDULED_DEPARTURE', 'DEPARTURE_TIME', 'DEPARTURE_DELAY',
        'TAXI_OUT', 'WHEELS_OFF', 'SCHEDULED_TIME', 'ELAPSED_TIME',
        'AIR_TIME', 'DISTANCE', 'WHEELS_ON', 'TAXI_IN', 'SCHEDULED_ARRIVAL',
        'ARRIVAL_TIME', 'ARRIVAL_DELAY', 'DIVERTED', 'CANCELLED',
    )
    # Columns converted to int; everything else is passed through as text.
    int_columns = frozenset((
        'YEAR', 'MONTH', 'DAY', 'DAY_OF_WEEK', 'FLIGHT_NUMBER',
        'SCHEDULED_DEPARTURE', 'SCHEDULED_ARRIVAL', 'DIVERTED', 'CANCELLED',
    ))

    # Path of files
    path = r'datasets/flight*'
    # glob returns matches in arbitrary order; sort so that the record order
    # is the same on every run and on every machine.
    all_flight_files = sorted(glob.glob(path + ".csv"))

    flight_data = []

    for each_file in all_flight_files:
        # newline='' leaves line-ending handling to the csv module, as its
        # documentation requires for correct parsing of quoted fields.
        with open(each_file, 'rt', newline='') as f:
            for row in csv.DictReader(f):
                flight_data.append({
                    name: int(row[name]) if name in int_columns else row[name]
                    for name in columns
                })

    print("The total number of records are", len(flight_data))

    return flight_data

In [3]:
#function to publish message
def publish_message(producer_instance, topic_name, data):
    """Pass ``data`` to the producer for ``topic_name`` without ever raising.

    Args:
        producer_instance: object exposing ``send(topic, value)``.
        topic_name: topic the value is sent to.
        data: value handed to ``send`` unchanged.

    Returns:
        None. If ``send`` raises an ``Exception`` the error is printed and
        swallowed so the caller can carry on.
    """
    try:
        producer_instance.send(topic_name, data)
    except Exception as error:
        # Best effort: report the failure on two lines and keep going.
        print('Exception in publishing message.', str(error), sep='\n')


In [4]:
#function to connect kafka with producer
def connect_kafka_producer():
    """Create a KafkaProducer for the broker at ``localhost:9092``.

    Each value sent through the producer is serialised with ``json.dumps``
    and encoded as ASCII bytes (``dumps`` escapes non-ASCII characters by
    default, so the encode step cannot fail on them).

    Returns:
        The connected ``KafkaProducer``, or ``None`` when construction raised
        an ``Exception``; in that case the error is printed.

    Note:
        The return statements deliberately live outside any ``finally``
        clause. A ``return`` inside ``finally`` discards whatever exception
        is in flight, including ``KeyboardInterrupt`` and ``SystemExit``,
        which must be allowed to propagate.
    """
    try:
        return KafkaProducer(bootstrap_servers=['localhost:9092'],
                             value_serializer=lambda x: dumps(x).encode('ascii'),
                             api_version=(0, 10))
    except Exception as ex:
        print('Exception while connecting Kafka.')
        print(str(ex))
        return None
    
    

In [5]:
day_list = [
    'Monday',
    'Tuesday',
    'Wednesday',
    'Thursday',
    'Friday',
    'Saturday',
    'Sunday',
]

# This method is used to fetch keys based on DAY_OF_WEEK
def fetchKeyFlights(data):
    """Group flight records by their ``DAY_OF_WEEK`` value.

    Args:
        data: records (dicts) that each carry a ``DAY_OF_WEEK`` key.

    Returns:
        dict: keys 1 to 7, each mapped to the list of records whose
        ``DAY_OF_WEEK`` equals that key, in their original order. Records
        with any other value are left out. One count line per day is printed,
        labelled with the matching ``day_list`` name (1 is Monday).
    """
    flights_by_day = {}

    for day_number, day_name in enumerate(day_list, start=1):
        matching = [record for record in data if record['DAY_OF_WEEK'] == day_number]
        flights_by_day[day_number] = matching
        print("Records on ", day_name, " are ", len(matching))

    return flights_by_day

In [None]:
%time

def _take_day_rows(rows, start_index, on_time_count, late_count):
    """Cut one on-time slice and the following late slice out of ``rows``.

    Args:
        rows: list of records for a single day.
        start_index: position of the first row to take.
        on_time_count: number of rows for the on-time slice.
        late_count: number of rows for the late slice that follows it.

    Returns:
        tuple: ``(on_time_rows, late_rows, next_start_index)``. Slices that
        run past the end of ``rows`` are simply shorter (possibly empty).
        ``next_start_index`` wraps to 0 once it reaches the end of ``rows``.
    """
    late_start = start_index + on_time_count
    next_start_index = late_start + late_count

    on_time_rows = rows[start_index:late_start]
    late_rows = rows[late_start:next_start_index]

    # Start again from the beginning once this day's rows are used up.
    if next_start_index >= len(rows):
        next_start_index = 0

    return on_time_rows, late_rows, next_start_index


if __name__ == '__main__':

    #topic name
    topic = 'flightTopic'

    # rows of the csv file
    flightRecords = getFlightRecords()

    # connecting
    flightProducer = connect_kafka_producer()
    if flightProducer is None:
        # Without a producer every publish would fail; stop instead of
        # looping forever and printing the same error for each batch.
        raise SystemExit('Could not connect to Kafka; no records were published.')

    # Creating dictionary based flights occuring on Days
    KeyFlights = fetchKeyFlights(flightRecords)

    # Next row position to read from, per day key
    start_index_dict = dict()

    # Late rows drawn during the previous batch, waiting to be sent
    pending_late_data = []

    print('Publishing records..')

    # Runs until interrupted: each day's rows wrap around when exhausted
    while True:

        # List of rows send in the current batch
        publish = []

        # Late rows drawn in this pass; they go out with the NEXT batch
        publish_late_data = []

        # One on-time slice and one late slice per day of the week
        for key, value in KeyFlights.items():

            # Resume where this day stopped last time (0 on the first pass)
            start_index = start_index_dict.get(key, 0)

            # number of on-time rows for this day (drawn first, as before)
            A_random = random.randint(70, 100)

            # number of late rows for this day
            B_random = random.randint(5, 10)

            # Current UTC time in whole epoch seconds, stamped on both slices.
            # dt.timezone.utc is used because the bare name `timezone` is
            # rebound by a later import in this file.
            ts = {'ts': int(dt.datetime.now(dt.timezone.utc).timestamp())}

            on_time_rows, late_rows, start_index_dict[key] = _take_day_rows(
                value, start_index, A_random, B_random)

            # Copy each row with the timestamp added; source rows stay untouched
            publish.extend(dict(item, **ts) for item in on_time_rows)
            publish_late_data.extend(dict(item, **ts) for item in late_rows)

            # Sleeping the thread, so that each day's slice gets a different timestamp
            sleep(1)

        # Late rows from the previous pass travel with this batch. They keep
        # the older timestamp they were stamped with, so they arrive late.
        publish.extend(pending_late_data)

        # kafka.publish
        publish_message(flightProducer, topic, publish)

        # Hold this pass's late rows back for the next batch
        pending_late_data = publish_late_data

        # Sleeping the thread for 10 seconds between different batches
        sleep(10)


CPU times: user 2 µs, sys: 1 µs, total: 3 µs
Wall time: 5.01 µs
The total number of records are 582184
Records on  Monday  are  86317
Records on  Tuesday  are  84449
Records on  Wednesday  are  85607
Records on  Thursday  are  87683
Records on  Friday  are  86253
Records on  Saturday  are  70453
Records on  Sunday  are  81422
Publishing records..
