## Work with Kafka

### Install neccessary packages

In [1]:
!pip install delta-spark==3.2.0

Collecting delta-spark==3.2.0
  Downloading delta_spark-3.2.0-py3-none-any.whl.metadata (2.0 kB)
Collecting py4j==0.10.9.7 (from pyspark<3.6.0,>=3.5.0->delta-spark==3.2.0)
  Downloading py4j-0.10.9.7-py2.py3-none-any.whl.metadata (1.5 kB)
Downloading delta_spark-3.2.0-py3-none-any.whl (21 kB)
Downloading py4j-0.10.9.7-py2.py3-none-any.whl (200 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.5/200.5 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
[?25hInstalling collected packages: py4j, delta-spark
Successfully installed delta-spark-3.2.0 py4j-0.10.9.7


### Import neccessary packages

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, from_json
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType, TimestampType
from delta import *

### Initialization Spark-session with Kafka support

In [3]:
# Ініціалізація SparkSession з підтримкою Kafka
spark = SparkSession.builder \
                    .appName("HealthDataStreaming") \
                    .config("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.0,io.delta:delta-spark_2.12:3.2.0") \
                    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
                    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") \
                    .getOrCreate()

### Create schema for creating DataFrame from retrieved data

In [4]:
# Схема для даних
schema = StructType([
    StructField("patient_id", StringType(), nullable=False),
    StructField("heart_rate", IntegerType(), nullable=True),
    StructField("blood_pressure", StringType(), nullable=True),
    StructField("glucose_level", FloatType(), nullable=True),
    StructField("timestamp", TimestampType(), nullable=True)
])

### Work with data and DeltaLake

In [6]:
# Читання даних з Kafka
df = spark.readStream.format("kafka") \
    .option("kafka.bootstrap.servers", "192.168.50.52:39092") \
    .option("subscribe", "health-data") \
    .load()

# Перетворення даних
df = df.selectExpr("CAST(value AS STRING)")
df = df.select(from_json(col("value"), schema).alias("data")).select("data.*") 

# Збереження оброблених даних у Delta Lake
query = df.writeStream.format("delta") \
                      .outputMode("append") \
                      .option("checkpointLocation", "/tmp/checkpoints") \
                      .toTable("health_stream_data")

query.awaitTermination()

ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/usr/local/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
                          ^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/conda/lib/python3.11/socket.py", line 706, in readinto
    return self._sock.recv_into(b)
           ^^^^^^^^^^^^^^^^^^^^^^^
KeyboardInterrupt


KeyboardInterrupt: 

In [9]:
# Читаєм дані з Delta таблиці
delta_df = spark.read.format("delta").table("health_stream_data")
delta_df.show(delta_df.count(), truncate=False)

+----------+----------+--------------+-------------+--------------------------+
|patient_id|heart_rate|blood_pressure|glucose_level|timestamp                 |
+----------+----------+--------------+-------------+--------------------------+
|23        |71        |116/73        |119.47       |2025-01-07 00:27:36.05879 |
|24        |86        |110/71        |110.97       |2025-01-07 00:27:37.080012|
|25        |95        |113/70        |112.71       |2025-01-07 00:27:38.088398|
|26        |100       |116/77        |70.38        |2025-01-07 00:27:39.095191|
|88        |65        |120/83        |124.56       |2025-01-07 00:28:41.524436|
|32        |88        |133/89        |113.44       |2025-01-07 00:27:45.157241|
|67        |64        |110/82        |75.89        |2025-01-07 00:28:20.370604|
|56        |94        |133/84        |116.04       |2025-01-07 00:28:09.293046|
|16        |77        |114/86        |136.4        |2025-01-07 00:27:29.026485|
|58        |74        |121/88        |10

In [28]:
spark.stop()