# 1. Producing the data
In this task, we will implement Apache Kafka producers to simulate real-time data streaming. Spark is not allowed in this part since it’s simulating a streaming data source.  

1.	Your program should send one batch of browsing behaviour data every 5 seconds. One batch consists of a random 500-1000 rows from the browsing behaviour dataset. The CSV shouldn’t be loaded into memory all at once, to conserve memory (i.e. read rows as needed). Keep track of the start and end event_time. (You can assume the dataset is sorted by event_time.)  
2.	Add an integer column named ‘ts’ for each row, a Unix timestamp in seconds since the epoch. Spread your batch out evenly over 5 seconds.  
a.	For example, if you send a batch of 600 records at 2023-09-01 00:00:00 (ISO format: YYYY-MM-DD HH:MM:SS) -> (ts = 1693526400):  
-	Record 1-120: ts = 1693526400   
-	Record 121-240: ts = 1693526401   
-	Record 241-360: ts = 1693526402  
-	….  
3.	Read the transactions between the start and end event_time in 1.1 every 5 seconds (the same frequency as browsing behaviour) and create a batch.  
4.	Send your two batches from 1.1 and 1.3 to Kafka topics with an appropriate name.  
Note 1: In 1.1, “random 500-1000” means the number of rows is random, and the data file is still read sequentially.  
Note 2: All the data except for the ‘ts’ column should be sent in the original String type without changing to any other type. This is because we are simulating a streaming access log and need to reduce the required processing at the source.


In [None]:
import csv
import random
from datetime import datetime
from itertools import islice
from time import sleep

from kafka3 import KafkaProducer

# Kafka configuration
# Hostname of the Kafka broker; port 9092 is appended when the producer is created.
hostip = "kafka"
# Topic that receives the browsing-behaviour batches.
browsing_topic = "browsing_behaviour"
# Topic that receives the transaction batches.
transaction_topic = "transactions"

# Build the Kafka producer used by the streaming simulation
def connect_kafka_producer():
    """Create a KafkaProducer for the broker at ``hostip`` on port 9092.

    Message values are expected to be ``str`` and are encoded as UTF-8
    before being sent.

    Returns:
        The connected producer, or ``None`` if creating it raised any
        exception (the error is printed rather than propagated).
    """
    try:
        return KafkaProducer(
            bootstrap_servers=[f'{hostip}:9092'],
            value_serializer=lambda text: text.encode('utf-8'),
            api_version=(0, 10),
        )
    except Exception as ex:
        # Best-effort connect: report the problem and let the caller
        # decide what to do with a missing producer.
        print("Error connecting to Kafka:", str(ex))
    return None

# Function to add 'ts' and send batch data to Kafka
def send_to_kafka(producer, topic, data, start_ts):
    """Stamp every row of a batch with a 'ts' value and send it to Kafka.

    The batch is spread evenly over a 5-second window that starts at
    ``start_ts``: row ``i`` (0-based) receives
    ``start_ts + (i * 5) // len(data)``. Timestamps are therefore
    non-decreasing and always fall in ``[start_ts, start_ts + 4]``; for a
    600-row batch, rows 0-119 get ``start_ts``, rows 120-239 get
    ``start_ts + 1``, and so on. Batches smaller than five rows and empty
    batches are handled without error.

    Args:
        producer: Object with a ``send(topic, value=...)`` method.
        topic: Name of the Kafka topic to send to.
        data: Sequence of row dicts. Each dict is updated in place with an
            integer ``'ts'`` entry.
        start_ts: Unix timestamp (seconds) of the first second of the window.
    """
    window_seconds = 5
    batch_size = len(data)

    for i, row in enumerate(data):
        # Integer arithmetic keeps 'ts' an int and maps the batch onto
        # seconds 0..4 of the window in order.
        row['ts'] = start_ts + (i * window_seconds) // batch_size
        # Each row is sent as one comma-joined line; original values are
        # already strings and only 'ts' needs converting.
        producer.send(topic, value=','.join([str(v) for v in row.values()]))

    # Print the number of records sent instead of individual records
    print(f"Sent {batch_size} records to {topic}")

# Read a batch of consecutive rows from a CSV without loading the whole file
def read_random_batch_from_csv(file_path, batch_size, start_row=0):
    """Return up to ``batch_size`` consecutive data rows from a CSV file.

    Rows are taken in file order (they are not sampled); "random" in the
    name refers to the caller choosing a random ``batch_size``. The file is
    streamed row by row, so only the requested rows are held in memory.

    Args:
        file_path: Path of the CSV file; its first line is the header.
        batch_size: Maximum number of rows to return. Values <= 0 yield an
            empty list.
        start_row: 0-based index of the first data row to return. Defaults
            to 0 (the first row after the header); negative values are
            treated as 0.

    Returns:
        A list of dicts mapping column name to the original string value.
        The list is shorter than ``batch_size`` if the file runs out.
    """
    first = max(start_row, 0)
    count = max(batch_size, 0)
    # newline='' lets the csv module handle line endings itself, so quoted
    # fields containing line breaks are returned unchanged.
    with open(file_path, 'r', newline='') as file:
        reader = csv.DictReader(file)
        return list(islice(reader, first, first + count))

# Main function to simulate the browsing and transaction data stream
def simulate_data_streams(browsing_file, transaction_file):
    """Stream both CSV files to Kafka, one batch per file every 5 seconds.

    Both files are opened once and read sequentially, so each cycle sends
    the *next* 500-1000 rows of each file instead of re-sending the first
    rows. Only the current batch is held in memory. The loop ends when both
    files are exhausted, after which pending messages are flushed.

    NOTE(review): the task description asks for the transaction batch to
    cover the event_time range of the browsing batch. That is not done
    here (transactions are batched by row count) because the timestamp
    column names are not visible in this file - confirm and extend.

    Args:
        browsing_file: Path of the browsing-behaviour CSV.
        transaction_file: Path of the transactions CSV.
    """
    producer = connect_kafka_producer()
    if producer is None:
        return

    # newline='' lets the csv module handle line endings inside quoted fields.
    with open(browsing_file, 'r', newline='') as browsing_fh, \
            open(transaction_file, 'r', newline='') as transaction_fh:
        browsing_reader = csv.DictReader(browsing_fh)
        transaction_reader = csv.DictReader(transaction_fh)

        while True:
            # Pull the next random-sized slice from each reader; the readers
            # keep their position between iterations.
            browsing_batch = list(
                islice(browsing_reader, random.randint(500, 1000)))
            transaction_batch = list(
                islice(transaction_reader, random.randint(500, 1000)))

            if not browsing_batch and not transaction_batch:
                print("Reached the end of both input files; stopping.")
                break

            # Both batches share the same 5-second window start.
            start_ts = int(datetime.now().timestamp())

            # Send browsing data batch to Kafka
            send_to_kafka(producer, browsing_topic, browsing_batch, start_ts)

            # Send transaction data batch to Kafka
            send_to_kafka(producer, transaction_topic, transaction_batch, start_ts)

            # Sleep for 5 seconds to simulate real-time streaming
            sleep(5)

    # send() is asynchronous; make sure buffered records leave before returning.
    producer.flush()

# Script entry point: start the streaming simulation when run directly.
if __name__ == '__main__':
    # File paths to the browsing behaviour and transaction datasets
    # (relative paths, resolved against the current working directory)
    browsing_file = 'browsing_behaviour.csv'
    transaction_file = 'transactions.csv'
    
    simulate_data_streams(browsing_file, transaction_file)


Sent 672 records to browsing_behaviour
Sent 751 records to transactions
Sent 565 records to browsing_behaviour
Sent 870 records to transactions
Sent 714 records to browsing_behaviour
Sent 971 records to transactions
Sent 533 records to browsing_behaviour
Sent 767 records to transactions
Sent 738 records to browsing_behaviour
Sent 926 records to transactions
Sent 918 records to browsing_behaviour
Sent 530 records to transactions
Sent 590 records to browsing_behaviour
Sent 825 records to transactions
Sent 865 records to browsing_behaviour
Sent 635 records to transactions
Sent 660 records to browsing_behaviour
Sent 552 records to transactions
Sent 833 records to browsing_behaviour
Sent 734 records to transactions
Sent 575 records to browsing_behaviour
Sent 538 records to transactions
Sent 849 records to browsing_behaviour
Sent 921 records to transactions
Sent 943 records to browsing_behaviour
Sent 892 records to transactions
Sent 796 records to browsing_behaviour
Sent 696 records to trans