In [7]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json, col
from pyspark.sql.types import StructType, StringType, IntegerType, StructField

In [2]:
# Build (or reuse) the Spark session used by the Kafka -> HDFS streaming job.
# spark.jars.packages pulls in the Kafka source connector for Structured Streaming.
# NOTE(review): spark.streaming.stopGracefullyOnShutdown looks like a legacy
# DStreams setting; confirm it has any effect on a Structured Streaming query.
spark = (
    SparkSession.builder
    .appName("KafkaToHDFS")
    .config("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.0")
    .config("spark.sql.shuffle.partitions", 4)
    .config("spark.streaming.stopGracefullyOnShutdown", True)
    .getOrCreate()
)

# Keep the notebook output readable: only warnings and errors from Spark.
spark.sparkContext.setLogLevel("WARN")

In [3]:
# Streaming source: subscribe to the credit_card_trans topic on the kafka_v2 broker.
kafka_df = (
    spark.readStream
    .format("kafka")
    .options(**{
        "kafka.bootstrap.servers": "kafka_v2:9092",
        "subscribe": "credit_card_trans",
        # Read the topic from its earliest offsets rather than only new records.
        "startingOffsets": "earliest",
        # Keep the query running instead of failing when data may have been lost.
        "failOnDataLoss": "false",
    })
    .load()
)

In [4]:
# Inspect the columns exposed by the Kafka source. Note that `key` and `value`
# arrive as binary, so the payload has to be cast before it can be read as text.
kafka_df.printSchema()

root
 |-- key: binary (nullable = true)
 |-- value: binary (nullable = true)
 |-- topic: string (nullable = true)
 |-- partition: integer (nullable = true)
 |-- offset: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- timestampType: integer (nullable = true)



In [5]:
# Keep only the Kafka payload, converted from binary to a text column `json_str`.
df = kafka_df.select(col("value").cast("string").alias("json_str"))

In [8]:
# Schema of one transaction record: every field is declared as a nullable string.
# NOTE(review): this schema is not referenced by any other cell visible here —
# confirm whether the payload was meant to be parsed with it before writing.
schema = StructType([
    StructField(field_name, StringType(), True)
    for field_name in (
        # presumably the unnamed index column of the source data — TODO confirm
        "",
        "trans_date_trans_time",
        "cc_num",
        "merchant",
        "category",
        "amt",
        "first",
        "last",
        "gender",
        "street",
        "city",
        "state",
        "zip",
        "lat",
        "long",
        "city_pop",
        "job",
        "dob",
        "trans_num",
        "unix_time",
        "merch_lat",
        "merch_long",
        "is_fraud",
        "event_time",
    )
])

In [11]:
# Sink: append the stream to Parquet files on HDFS, triggering a micro-batch
# every hour. The checkpoint location is where the query records its progress.
query = (
    df.writeStream
    .format("parquet")
    .outputMode("append")
    .trigger(processingTime="1 hour")
    .options(
        path="hdfs://hadoop-namenode:9000/user/jovyan/output/kafka_parquet",
        checkpointLocation="hdfs://hadoop-namenode:9000/user/jovyan/checkpoints/kafka_hdfs_job",
    )
    .start()
)

# Block this cell until the query ends; interrupting the kernel stops the stream.
try:
    query.awaitTermination()
except KeyboardInterrupt:
    print("Gracefully stopping the stream...")
    query.stop()

ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/usr/local/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
                          ^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/conda/lib/python3.11/socket.py", line 706, in readinto
    return self._sock.recv_into(b)
           ^^^^^^^^^^^^^^^^^^^^^^^
KeyboardInterrupt


Gracefully stopping the stream...


In [51]:
# Check that the data lake works: list the files written to the HDFS output folder.

# from hdfs import InsecureClient

# client = InsecureClient('http://hadoop-namenode:9870', user='jovyan')

# try:
#     files = client.list('/user/jovyan/output/kafka_parquet')
#     print("Folder exists. Files:")
#     for f in files:
#         print(" -", f)
# except Exception as e:
#     print("Folder not found yet or no files written.")

In [54]:
# Delete the output folder created in HDFS.

# from hdfs import InsecureClient

# client = InsecureClient('http://hadoop-namenode:9870', user='jovyan')

# target_path = '/user/jovyan/output/kafka_parquet'

# try:
#     if client.status(target_path, strict=False):
#         client.delete(target_path, recursive=True)
#         print(f"Deleted HDFS folder: {target_path}")
#     else:
#         print(f" Folder does not exist: {target_path}")
# except Exception as e:
#     print(f" Error deleting folder: {e}")
