In [1]:
import pandas as pd
from datetime import datetime
import time
import random

from kafka import KafkaProducer

In [2]:
from pyspark.sql import SparkSession

# Obtain the notebook's Spark session (reuses an already-running one if present).
spark = (
    SparkSession.builder
    .appName("Pandas to Spark")
    .getOrCreate()
)

In [3]:
# Load the account master extract; the first line of the file supplies the
# column names. No schema is given or inferred here.
account_csv_reader = spark.read.option("header", True)
df = account_csv_reader.csv('fc_account_master.csv')

# Preview the first five rows.
df.show(n=5)

+-------------------+-------------+-------+----------------+-------------+---------------+-----------+
|     account_number|customer_code|product|product_category|acc_open_date|acc_closed_date|active_flag|
+-------------------+-------------+-------+----------------+-------------+---------------+-----------+
|02XYZXYZ10015592101|    KL0255921|    SBA|           SBPPS|   1999-03-10|           NULL|          0|
|02XYZXYZ10015593701|    KL0255937|    SBA|           SBPPS|   1999-03-10|           NULL|          0|
|02XYZXYZ10015593801|    KL0255938|    SBA|           SBPPS|   1999-03-10|           NULL|          0|
|02XYZXYZ10015594801|    KL0255948|    SBA|           SBANU|   1999-03-22|           NULL|          0|
|02XYZXYZ10015597601|    KL0255976|    SBA|           SBPPS|   1999-04-22|           NULL|          0|
+-------------------+-------------+-------+----------------+-------------+---------------+-----------+
only showing top 5 rows



In [4]:
import os

from pyspark.sql.functions import base64, expr

# The AES key is read from the environment instead of being written into the
# notebook, so it is not committed or shared together with the code.
# Export ACCOUNT_AES_KEY before starting the kernel.
AES_KEY_ENV_VAR = "ACCOUNT_AES_KEY"
super_secret_key = os.environ.get(AES_KEY_ENV_VAR)
if not super_secret_key:
    raise RuntimeError(
        f"Environment variable {AES_KEY_ENV_VAR} is not set; "
        "it must hold the AES key used to encrypt account_number."
    )

# The key must be 16, 24, or 32 bytes long. Check that here so a bad key fails
# immediately with a clear message rather than later inside a Spark job.
_aes_key_bytes = super_secret_key.encode('utf-8')
if len(_aes_key_bytes) not in (16, 24, 32):
    raise ValueError(
        f"{AES_KEY_ENV_VAR} must be 16, 24, or 32 bytes long, "
        f"got {len(_aes_key_bytes)} bytes."
    )

# The key is embedded as hex and decoded with unhex(), so quotes or backslashes
# in the key can neither break nor alter the SQL expression. The resulting key
# bytes are the UTF-8 bytes of the key string.
# NOTE(review): ECB mode is kept so that existing consumers can still decrypt,
# but ECB is deterministic -- identical 16-byte plaintext blocks produce
# identical ciphertext blocks, so account numbers sharing a prefix share a
# ciphertext prefix. Consider moving producer and consumers to GCM together.
df_encrypted = df.withColumn(
    'account_number',
    base64(expr(f"aes_encrypt(account_number, unhex('{_aes_key_bytes.hex()}'), 'ECB')")),
)

In [5]:
# Preview the first five rows; account_number now holds base64-encoded ciphertext.
df_encrypted.show(n=5)

+--------------------+-------------+-------+----------------+-------------+---------------+-----------+
|      account_number|customer_code|product|product_category|acc_open_date|acc_closed_date|active_flag|
+--------------------+-------------+-------+----------------+-------------+---------------+-----------+
|+Y7DNNNCRLxa9tHil...|    KL0255921|    SBA|           SBPPS|   1999-03-10|           NULL|          0|
|U7RRmweke5W8oVRNv...|    KL0255937|    SBA|           SBPPS|   1999-03-10|           NULL|          0|
|U7RRmweke5W8oVRNv...|    KL0255938|    SBA|           SBPPS|   1999-03-10|           NULL|          0|
|Khed5di6hfdejAOOB...|    KL0255948|    SBA|           SBANU|   1999-03-22|           NULL|          0|
|wt0lfnWsGTalCHEtZ...|    KL0255976|    SBA|           SBPPS|   1999-04-22|           NULL|          0|
+--------------------+-------------+-------+----------------+-------------+---------------+-----------+
only showing top 5 rows



In [7]:
# Materialise the encrypted DataFrame as a Python list of Row objects.
rows = df_encrypted.collect()

# Turn every Row into a plain dict keyed by column name.
df_list = list(map(lambda record: record.asDict(), rows))

In [8]:
# Kafka destination: topic name and broker address.
KAFKA_TOPIC = "am"
KAFKA_BOOTSTRAP_SERVERS = "localhost:9092"

# Message values are handed to the producer as str and encoded to UTF-8 bytes
# by the serializer below.
producer = KafkaProducer(
    bootstrap_servers=KAFKA_BOOTSTRAP_SERVERS,
    value_serializer=lambda payload: payload.encode("utf-8"),
)

In [None]:
print("Kafka Producer Application Started ... ")

# The column order is the same for every record, so look it up once.
column_names = df.columns

try:
    for message in df_list:
        # One comma-separated line per record, fields in DataFrame column order.
        # None values (e.g. a NULL acc_closed_date) are sent as the text "None".
        message_data = ','.join(str(message[column]) for column in column_names)
        print("Message Type:", type(message_data))
        print("Message:", message_data)
        producer.send(KAFKA_TOPIC, message_data)
        # Throttle to one record per second.
        time.sleep(1)
finally:
    # send() only queues the record for a background sender; flush() blocks
    # until everything queued so far has been delivered, so trailing records
    # are not lost when the loop ends or is interrupted.
    producer.flush()

print("Kafka Producer Application Completed.")