In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
import random, time

In [2]:
# Build (or reuse) the SparkSession for this notebook.
# - master "local[*]": run Spark locally, using all available cores.
# - spark.jars.packages: request the Kafka connector for Structured Streaming
#   (artifact built for Scala 2.12, version 3.5.1 -- presumably matching the
#   installed Spark version; verify if the environment changes).
spark = (
    SparkSession.builder
    .appName("Spark Streaming")
    .master("local[*]")
    .config(
        "spark.jars.packages",
        "org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.1",
    )
    .getOrCreate()
)

# Raise the log threshold to ERROR so lower-severity messages are not logged.
spark.sparkContext.setLogLevel("ERROR")

In [3]:
# Kafka connection settings consumed by the streaming reader below.
KAFKA_BOOTSTRAP_SERVERS: str = "localhost:9092"  # broker address (host:port)
KAFKA_TOPIC: str = "fc_txn_base"                 # topic the stream subscribes to
# Alternative topic, currently disabled:
# KAFKA_TOPIC = "am"

In [4]:
# Streaming DataFrame over the Kafka topic. "earliest" asks the source to start
# from the oldest offsets available in the topic rather than only new records.
df = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", KAFKA_BOOTSTRAP_SERVERS)
    .option("subscribe", KAFKA_TOPIC)
    .option("startingOffsets", "earliest")
    .load()
)

In [5]:
# Column layout of the CSV text carried in each Kafka message value, written as
# a Spark DDL schema string.
schema_string = "tran_date DATE, account_number STRING, branch STRING, \
                product STRING, lcy_amount DOUBLE, transaction_code STRING, \
                description1 STRING, dc_indicator STRING, is_salary INT"

# Alternative schema, currently disabled:
# schema_string = "account_number STRING, customer_code STRING, product STRING, \
#                 product_category STRING, acc_open_date DATETIME, acc_closed_date DATETIME, active_flag INT"

# Step 1: cast the message value to a string and keep the record's "timestamp" column.
df1 = df.selectExpr("CAST(value AS STRING)", "timestamp")

# Step 2: parse the CSV text into a single struct column named "mytable".
parsed_row = from_csv(col("value"), schema_string).alias("mytable")
df2 = df1.select(parsed_row, "timestamp")

# Step 3: flatten the struct so every CSV field becomes a top-level column.
df3 = df2.select("mytable.*", "timestamp")

# Step 4: register the flattened stream as a temp view and read it back via SQL.
df3.createOrReplaceTempView("table_view")
df4 = spark.sql("SELECT * FROM table_view")

In [6]:
# Stream the parsed rows into an in-memory table named "testedTable" so that
# later cells can query it with spark.sql.
#
# Spark refuses to start a query whose name is already active in the session,
# so stop any query left over from a previous run of this cell first. This
# keeps the cell re-runnable without restarting the kernel.
for active_query in spark.streams.active:
    if active_query.name == "testedTable":
        active_query.stop()

# No sink options are set: display options such as `truncate` belong to the
# console sink and are not read by the memory sink.
my_write_stream = (
    df4.writeStream
    .trigger(processingTime="5 seconds")  # run a micro-batch every 5 seconds
    .outputMode("append")                 # only newly arrived rows are written
    .format("memory")
    .queryName("testedTable")             # name of the resulting in-memory table
    .start()
)

# Block for at most 1 second, then hand control back to the notebook. The call
# returns whether the query terminated within that time, so False means the
# stream is still running in the background.
my_write_stream.awaitTermination(1)

False

In [8]:
# Query the in-memory table fed by the streaming query and print its first rows.
# NOTE(review): this rebinds `df`, which the reader cell uses for the Kafka
# streaming DataFrame. Re-running the parsing cell after this one would pick up
# this DataFrame instead, so re-run the reader cell first in that case.
snapshot_query = "SELECT * FROM testedTable"
df = spark.sql(snapshot_query)
df.show()

+----------+-------------------+------+-------+----------+----------------+--------------------+------------+---------+--------------------+
| tran_date|     account_number|branch|product|lcy_amount|transaction_code|        description1|dc_indicator|is_salary|           timestamp|
+----------+-------------------+------+-------+----------+----------------+--------------------+------------+---------+--------------------+
|2020-06-09|02XYZXYZ10017529992|    15|    SBA|   14500.0|              CI|ATM WDL /00463701...|    withdraw|        0|2024-06-05 12:42:...|
|2020-06-14|02XYZXYZ10017529992|    15|    SBA|   20000.0|              CI|ATM WDL /00463701...|    withdraw|        0|2024-06-05 12:42:...|
|2020-06-01|02XYZXYZ10017517823|    15|    SBA|    4000.0|              CI|ATM WDL /NICAV701...|    withdraw|        0|2024-06-05 12:42:...|
|2020-06-05|02XYZXYZ10017517823|    15|    SBA|     500.0|              CI|ATM WDL /NICAV701...|    withdraw|        0|2024-06-05 12:42:...|
|2020-06-12|0

In [9]:
# Count the rows currently returned by `df` (the query over the `testedTable`
# memory table) and display the number as the cell output.
df_count = df.count()
df_count

31

In [11]:
# Same count as the previous count cell, evaluated again. The streaming query
# writes to `testedTable` in append mode, so the count is re-computed on each run
# and can be higher than before if more records have arrived.
df_count = df.count()
df_count

35