'''
Author: Ngawang Gurung
Date: 2024/07/24
'''

In [17]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.ml.feature import Normalizer, StandardScaler
import random, time

import os
from dotenv import load_dotenv

In [18]:
# Create (or reuse) the Spark session for this notebook. It runs in local
# mode on all available cores, and the Kafka source connector is requested
# through spark.jars.packages so the "kafka" stream format can be used.
spark = (
    SparkSession.builder
    .appName("Spark Streaming")
    .master("local[*]")
    .config(
        "spark.jars.packages",
        "org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.1",
    )
    .getOrCreate()
)

# Only log at ERROR level to keep the notebook output readable.
spark.sparkContext.setLogLevel("ERROR")

In [19]:
# Kafka connection settings used by the streaming reader.
KAFKA_TOPIC = "am"  # topic the stream subscribes to
KAFKA_BOOTSTRAP_SERVERS = "localhost:9092"  # broker address (host:port)

In [20]:
# Streaming DataFrame over the Kafka topic. "earliest" makes a fresh query
# start from the oldest offsets still available on the topic.
df = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", KAFKA_BOOTSTRAP_SERVERS)
    .option("subscribe", KAFKA_TOPIC)
    .option("startingOffsets", "earliest")
    .load()
)

In [21]:
# Print the schema of the raw Kafka stream (key/value arrive as binary).
df.printSchema()

root
 |-- key: binary (nullable = true)
 |-- value: binary (nullable = true)
 |-- topic: string (nullable = true)
 |-- partition: integer (nullable = true)
 |-- offset: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- timestampType: integer (nullable = true)



In [24]:
# Keep only the message payload, cast from binary to string, together with
# the Kafka record timestamp; then print the resulting schema.
df1 = df.selectExpr(
    "CAST(value AS STRING)",
    "timestamp",
)
df1.printSchema()

root
 |-- value: string (nullable = true)
 |-- timestamp: timestamp (nullable = true)



In [25]:

# Parse each record's CSV payload into typed columns, then decrypt the
# base64-encoded, AES-encrypted account number into a readable string column.

# DDL-style schema passed to from_csv; fields are matched by position.
schema_string = "account_number STRING, customer_code STRING, product STRING, \
                product_category STRING, acc_open_date DATE, acc_closed_date DATE, active_flag INT"

# from_csv produces one struct column; "mytable.*" flattens it back into
# top-level columns next to the Kafka timestamp.
df2 = df1.select(from_csv(col("value"), schema_string).alias("mytable"), "timestamp")
df3 = df2.select("mytable.*", "timestamp")

load_dotenv()

# AES key read from the environment (.env is loaded just above).
SUPER_SECRET_KEY = os.getenv('SUPER_SECRET_KEY')

# Fail fast here: a missing key would otherwise be used as the literal text
# "None", and a wrong-sized key would only surface once the stream is running.
if SUPER_SECRET_KEY is None:
    raise RuntimeError("SUPER_SECRET_KEY is not set in the environment or .env file")
if len(SUPER_SECRET_KEY.encode("utf-8")) not in (16, 24, 32):
    raise ValueError("SUPER_SECRET_KEY must be 16, 24, or 32 bytes long")

# The key is passed as a literal column instead of being spliced into a SQL
# expression string, so quotes or backslashes in the key cannot break or
# alter the expression.
df_decrypted = df3.withColumn(
    'account_number_decrypted',
    aes_decrypt(
        unbase64(col("account_number")),
        lit(SUPER_SECRET_KEY),
        lit("ECB"),
    ).cast('string'),
)

# Expose the decrypted stream as a temp view and read it back through SQL.
df_decrypted.createOrReplaceTempView("table_view")
df4 = spark.sql("SELECT * FROM table_view")

In [26]:
# Start the streaming query. Results are appended to an in-memory table named
# "testedTable", with a micro-batch triggered every 5 seconds.
my_write_stream = (
    df4.writeStream
    .format("memory")
    .queryName("testedTable")
    .outputMode("append")
    .option("truncate", "false")
    .trigger(processingTime='5 seconds')
    .start()
)

# Wait at most 1 second; the call returns False while the query is still running.
my_write_stream.awaitTermination(1)

False

In [27]:
# Snapshot of the rows the streaming query has written to the in-memory
# table so far. Note: this rebinds `df`, which previously held the Kafka
# streaming DataFrame.
df = spark.sql("SELECT * FROM testedTable")

# Display the first 20 rows.
df.show(n=20)

+--------------------+-------------+-------+----------------+-------------+---------------+-----------+--------------------+------------------------+
|      account_number|customer_code|product|product_category|acc_open_date|acc_closed_date|active_flag|           timestamp|account_number_decrypted|
+--------------------+-------------+-------+----------------+-------------+---------------+-----------+--------------------+------------------------+
|+Y7DNNNCRLxa9tHil...|    KL0255921|    SBA|           SBPPS|   1999-03-10|           NULL|          0|2024-05-14 13:35:...|     02XYZXYZ10015592101|
|U7RRmweke5W8oVRNv...|    KL0255937|    SBA|           SBPPS|   1999-03-10|           NULL|          0|2024-05-14 13:35:...|     02XYZXYZ10015593701|
|U7RRmweke5W8oVRNv...|    KL0255938|    SBA|           SBPPS|   1999-03-10|           NULL|          0|2024-05-14 13:35:...|     02XYZXYZ10015593801|
|Khed5di6hfdejAOOB...|    KL0255948|    SBA|           SBANU|   1999-03-22|           NULL|         

In [28]:
# Count the rows currently visible through `df` (the "testedTable" query).
df_count = df.count()
df_count  # bare expression so the notebook displays the value

31

In [31]:
# Re-run of the row count. NOTE(review): the result presumably grows between
# runs because the streaming query keeps appending to the table - confirm.
df_count = df.count()
df_count  # bare expression so the notebook displays the value

36