## Package Imports

In [19]:
# kafka-python client; among the cells visible here it is only used by the
# standalone consumer cell at the end of the notebook.
from kafka import KafkaConsumer
import os
# Scala build and Spark release used to assemble the Kafka connector coordinate below.
# NOTE(review): the connector release is expected to match the Spark release that
# actually runs, and the coordinate passed to SparkSession.builder should agree
# with it too - confirm '3.1.2' against the local installation.
SCALA_VERSION = '2.12'
SPARK_VERSION = '3.1.2'
import findspark
# Request the Structured Streaming Kafka connector via spark-submit's --packages flag.
# This is assigned before findspark.init() and before any Spark session is created.
os.environ['PYSPARK_SUBMIT_ARGS'] = f'--packages org.apache.spark:spark-sql-kafka-0-10_{SCALA_VERSION}:{SPARK_VERSION} pyspark-shell'
# os.environ['PYSPARK_SUBMIT_ARGS'] = f'--packages org.apache.spark:spark-sql-kafka-0-10_{SCALA_VERSION}:{SPARK_VERSION} pyspark-shell'
# Presumably makes the local Spark installation importable; the pyspark imports
# are deliberately placed after this call.
findspark.init()
import pyspark
# sc = pyspark.SparkContext(appName="myAppName")
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, IntegerType, StringType, TimestampType, StructField
# SparkContext is imported but not referenced by the cells visible here.
from pyspark.context import SparkContext
from pyspark.sql import functions as F

## Creating the Spark session with the Kafka streaming package and supporting configs

In [20]:
# The Kafka connector has to be the build that belongs to the Spark release that
# is actually running, so its version is taken from the imported pyspark package
# instead of a hard-coded literal that can drift from the installation (and from
# the coordinate requested through PYSPARK_SUBMIT_ARGS in the imports cell).
kafka_connector = (
    f"org.apache.spark:spark-sql-kafka-0-10_{SCALA_VERSION}:{pyspark.__version__}"
)

# Local session on all cores. getOrCreate() returns an already running session
# unchanged, so restart the kernel for a changed package list to take effect.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("myAppName")
    .config("spark.streaming.stopGracefullyOnShutdown", True)
    .config("spark.jars.packages", kafka_connector)
    .config("spark.sql.shuffle.partitions", 4)
    .getOrCreate()
)

## The config-dict approach below was replaced in later cells (options are passed inline)

In [21]:
## Kafka configs
# kafka_source_config = {
#     # "kafka.sasl.jaas.config": jaas_config,
#     "kafka.bootstrap.servers" : "settled-terrapin-12518-eu2-kafka.upstash.io:9092",
#     "kafka.sasl.mechanism": "SCRAM-SHA-256",
#     "kafka.security.protocol" : "SASL_SSL",
#     "subscribe": "sec_topic",
#     "startingOffsets": "earliest",
#     "failOnDataLoss": "false"
# }

## Only Needed if sink is Kafka

In [22]:
# kafka_sink_config = {
#     "kafka.sasl.jaas.config": jaas_config,
#     "kafka.bootstrap.servers": 'settled-terrapin-12518-eu2-kafka.upstash.io:9092',
#     "kafka.sasl.mechanism": "SCRAM-SHA-256",
#     "kafka.security.protocol": "SASL_SSL",
#     "topic": "sink",
#     "checkpointLocation" : "./checkpoint.txt"
# }

## Read stream from the Upstash Kafka server

In [23]:
# Upstash credentials are read from the environment so the secret never lives in
# the notebook. A KeyError here means UPSTASH_KAFKA_USERNAME / UPSTASH_KAFKA_PASSWORD
# have not been exported. Credentials that were previously committed in this
# notebook should be rotated.
source_username = os.environ["UPSTASH_KAFKA_USERNAME"]
source_password = os.environ["UPSTASH_KAFKA_PASSWORD"]

# The mechanism configured below is SCRAM-SHA-256, so the JAAS entry must name
# Kafka's ScramLoginModule; PlainLoginModule belongs to the PLAIN mechanism and
# leaves the client unable to authenticate against a SCRAM listener.
source_jaas_config = (
    "org.apache.kafka.common.security.scram.ScramLoginModule required "
    f'username="{source_username}" password="{source_password}";'
)

# Streaming read of the "mysql" topic from the earliest available offset.
streaming_df = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "settled-terrapin-12518-eu2-kafka.upstash.io:9092")
    .option("kafka.sasl.mechanism", "SCRAM-SHA-256")
    .option("kafka.security.protocol", "SASL_SSL")
    .option("kafka.sasl.jaas.config", source_jaas_config)
    .option("startingOffsets", "earliest")
    .option("subscribe", "mysql")
    .load()
)

In [24]:
# df = spark.read\
#     .format("kafka")\
#     .option("kafka.bootstrap.servers",'settled-terrapin-12518-eu2-kafka.upstash.io:9092')\
#     .option("subscribe","sec_topic")\
#     .option("startingOffsets","earliest")\
#     .load()

## Read Stream
# streaming_df = spark\
#     .readStream\
#     .format("kafka")\
#     .options(**kafka_source_config)\
#     .option("kafka.sasl.jaas.config","""org.apache.kafka.common.security.plain.PlainLoginModule required username="<UPSTASH_KAFKA_USERNAME>" password="<UPSTASH_KAFKA_PASSWORD>";""")\
#     .load()

## Source schema used to parse the JSON payload

In [25]:
## Source Schema
# Schema applied to the JSON carried in the Kafka message value.
# Every field is declared as a nullable string.
df_schema = (
    StructType()
    .add("customer_id", StringType(), True)
    .add("store_id", StringType(), True)
    .add("first_name", StringType(), True)
    .add("email", StringType(), True)
    .add("address_id", StringType(), True)
)

## Casting and exploding value column

In [26]:
# Cast the Kafka `value` column to a string, then parse that JSON text into a
# struct column (still named "value") using df_schema.
value_df = streaming_df.select(
    F.from_json(
        F.col("value").cast("string"),
        df_schema,
    ).alias("value")
)

In [27]:
# Print the column layout to confirm `value` is a struct carrying the df_schema fields.
value_df.printSchema()

root
 |-- value: struct (nullable = true)
 |    |-- customer_id: string (nullable = true)
 |    |-- store_id: string (nullable = true)
 |    |-- first_name: string (nullable = true)
 |    |-- email: string (nullable = true)
 |    |-- address_id: string (nullable = true)



In [28]:
# Keep only the store_id field of the parsed struct; the resulting column is
# named "store_id".
explode_df = value_df.select("value.store_id")

## Schema check for the DataFrame to be written

In [29]:
# explode_df.printSchema()

## Writing stream to local

In [30]:
# Running number of records seen per store_id (columns: store_id, count).
store_count = (
    explode_df
    .groupBy(F.col("store_id"))
    .agg(F.count(F.lit(1)).alias("count"))
)

In [31]:
# explode_df.createOrReplaceTempView("updates")
# spark.sql("select store_id, count(*) from updates")  # returns another streaming DF

In [32]:
# explode_df.isStreaming

In [33]:
# query = store_count \
#     .writeStream \
#     .outputMode("complete") \
#     .format("console") \
#     .start()

# query.awaitTermination()


# query = store_count \
#     .writeStream \
#     .format("append") \
#     .outputMode("append").option("path", "C:/Users/Lenovo/Desktop/spark/New folder/spark_stream_job_6/stream_output") \
#     .option("checkpointLocation", "C:/Users/Lenovo/Desktop/spark/New folder/spark_stream_job_6/checkpoint") \
#     .start()

# query.awaitTermination()

In [34]:
# windowedCounts = streaming_df \
#     .withWatermark("timestamp", "1 minutes") \
#     .groupBy(
#         window(streaming_df.timestamp, "10 minutes", "5 minutes"),
#         words.word) \
#     .count()

In [35]:
# Upstash credentials are read from the environment so the secret never lives in
# the notebook. A KeyError here means UPSTASH_KAFKA_USERNAME / UPSTASH_KAFKA_PASSWORD
# have not been exported. Credentials that were previously committed in this
# notebook should be rotated.
sink_username = os.environ["UPSTASH_KAFKA_USERNAME"]
sink_password = os.environ["UPSTASH_KAFKA_PASSWORD"]

# SCRAM-SHA-256 requires Kafka's ScramLoginModule in the JAAS entry;
# PlainLoginModule only serves the PLAIN mechanism.
sink_jaas_config = (
    "org.apache.kafka.common.security.scram.ScramLoginModule required "
    f'username="{sink_username}" password="{sink_password}";'
)

# The checkpoint directory can be overridden with SPARK_CHECKPOINT_DIR; the
# default is the original location so existing checkpoint state keeps working.
checkpoint_dir = os.environ.get(
    "SPARK_CHECKPOINT_DIR",
    "C:/Users/Lenovo/Desktop/spark/New folder/spark_stream_job_11/checkpoint",
)

# The Kafka sink requires a `value` column (and optionally `key`); store_count
# only has store_id/count, so each row is serialised to JSON as the value and
# store_id is used as the message key.
kafka_records = store_count.select(
    F.col("store_id").alias("key"),
    F.to_json(F.struct("store_id", "count")).alias("value"),
)

# Complete mode re-emits the whole aggregate table on every trigger.
baboli = (
    kafka_records.writeStream
    .format("kafka")
    .outputMode("complete")
    .option("kafka.bootstrap.servers", "settled-terrapin-12518-eu2-kafka.upstash.io:9092")
    .option("kafka.sasl.mechanism", "SCRAM-SHA-256")
    .option("kafka.security.protocol", "SASL_SSL")
    .option("kafka.sasl.jaas.config", sink_jaas_config)
    .option("topic", "mysql_write")
    .option("checkpointLocation", checkpoint_dir)
    .start()
)

# Blocks this cell until the query stops or fails.
baboli.awaitTermination()

StreamingQueryException: [STREAM_FAILED] Query [id = 595c0039-4530-4b47-a3b9-fde7f1043d9e, runId = e82225a6-6de0-4f43-af6b-71008a64d899] terminated with exception: org.apache.kafka.common.errors.TimeoutException: Call(callName=describeTopics, deadlineMs=1719501080816, tries=1, nextAllowedTryMs=1719501080931) timed out at 1719501080831 after 1 attempt(s)

## Only needed to be used 

In [None]:
# NOTE(review): `name_writer` is not defined in any cell visible in this notebook,
# so this line would raise NameError on a fresh run - confirm which streaming
# query it was meant to wait on.
name_writer.awaitTermination()

## Upstash's own method for reading from a topic can be used as an alternative

In [17]:
# Plain kafka-python consumer for the 'dingilaz' topic, independent of Spark.
# Credentials are read from the environment so the secret never lives in the
# notebook; a KeyError means UPSTASH_KAFKA_USERNAME / UPSTASH_KAFKA_PASSWORD have
# not been exported. Credentials that were previously committed here should be
# rotated.
consumer = KafkaConsumer(
    'dingilaz',
    bootstrap_servers='settled-terrapin-12518-eu2-kafka.upstash.io:9092',
    sasl_mechanism='SCRAM-SHA-256',
    security_protocol='SASL_SSL',
    sasl_plain_username=os.environ['UPSTASH_KAFKA_USERNAME'],
    sasl_plain_password=os.environ['UPSTASH_KAFKA_PASSWORD'],
    # NOTE(review): looks like a template placeholder - confirm the intended group id.
    group_id='YOUR_CONSUMER_GROUP',
    auto_offset_reset='earliest'
)

try:
    # Iterating the consumer blocks and yields records as they arrive.
    for message in consumer:
        print(f"Received message: {message.value}")
except KeyboardInterrupt:
    # Interrupting the kernel is the intended way to stop the loop.
    pass
finally:
    # Always release the connection, however the loop ended.
    consumer.close()

Received message: b'{"name": "Thomas Lopez"}'
Received message: b'{"name": "Sharon Rivera"}'
Received message: b'{"name": "Jonathan Rodriguez"}'
Received message: b'{"name": "Melissa Hammond"}'
Received message: b'{"name": "Daniel Frederick"}'
Received message: b'{"name": "Henry Herman"}'
Received message: b'{"name": "Sherry Cantrell"}'
Received message: b'{"name": "Tommy Smith"}'
Received message: b'{"name": "Tammy Nguyen"}'
Received message: b'{"name": "Edward Estrada"}'
Received message: b'{"name": "Dustin Thomas"}'
Received message: b'{"name": "Katrina Lee"}'
Received message: b'{"name": "Gary Hill"}'
Received message: b'{"name": "Kristin Gonzalez"}'
Received message: b'{"name": "Joel Jackson"}'
Received message: b'{"name": "Timothy Fernandez"}'
Received message: b'{"name": "Marcus Diaz"}'
Received message: b'{"name": "Stephanie Green"}'
Received message: b'{"name": "Brianna Jones"}'
