In [2]:
from pyspark.sql import SparkSession, Row
from pyspark.sql import types as T
from pyspark.sql import window as W
from pyspark.sql import functions as F

# Create (or reuse) a local Spark session that runs on every available core.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("Streaming Process Files")
    .config("spark.streaming.stopGracefullyOnShutdown", True)
    .getOrCreate()
)

# Session-level SQL settings: the streaming file source below is loaded without
# an explicit schema, so schema inference is switched on; shuffles use 8 partitions.
spark.conf.set("spark.sql.streaming.schemaInference", True)
spark.conf.set("spark.sql.shuffle.partitions", 8)

In [3]:
# Batch-read one sample file to inspect the event layout before streaming.
df_01 = spark.read.format("json").load("./dataset/device_01.json")
# show() defaults spelled out: at most 20 rows, long cell values truncated.
df_01.show(n=20, truncate=True)

+----------+--------------------+--------------------+-----------+--------------+--------------------+
|customerId|                data|             eventId|eventOffset|eventPublisher|           eventTime|
+----------+--------------------+--------------------+-----------+--------------+--------------------+
|   CI00103|{[{D001, C, ERROR...|e3cb26d3-41b2-49a...|      10001|        device|2023-01-05 11:13:...|
+----------+--------------------+--------------------+-----------+--------------+--------------------+



In [4]:
# Print the schema Spark inferred for the sample file (nested `data.devices` array of structs).
df_01.printSchema()

root
 |-- customerId: string (nullable = true)
 |-- data: struct (nullable = true)
 |    |-- devices: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- deviceId: string (nullable = true)
 |    |    |    |-- measure: string (nullable = true)
 |    |    |    |-- status: string (nullable = true)
 |    |    |    |-- temperature: long (nullable = true)
 |-- eventId: string (nullable = true)
 |-- eventOffset: long (nullable = true)
 |-- eventPublisher: string (nullable = true)
 |-- eventTime: string (nullable = true)



In [5]:
# Explode data.devices into one row per array element, kept as the struct
# column `data_devices`; the original nested `data` column is then dropped.
df_02 = (
    df_01
    .withColumn("data_devices", F.explode(F.col("data.devices")))
    .drop("data")
)
df_02.printSchema()

root
 |-- customerId: string (nullable = true)
 |-- eventId: string (nullable = true)
 |-- eventOffset: long (nullable = true)
 |-- eventPublisher: string (nullable = true)
 |-- eventTime: string (nullable = true)
 |-- data_devices: struct (nullable = true)
 |    |-- deviceId: string (nullable = true)
 |    |-- measure: string (nullable = true)
 |    |-- status: string (nullable = true)
 |    |-- temperature: long (nullable = true)



In [6]:
# Promote each field of the exploded struct to a top-level column of the same
# name, then discard the struct itself.
df_03 = df_02
for device_field in ("deviceId", "measure", "status", "temperature"):
    df_03 = df_03.withColumn(device_field, F.col(f"data_devices.{device_field}"))
df_03 = df_03.drop("data_devices")

df_03.printSchema()

root
 |-- customerId: string (nullable = true)
 |-- eventId: string (nullable = true)
 |-- eventOffset: long (nullable = true)
 |-- eventPublisher: string (nullable = true)
 |-- eventTime: string (nullable = true)
 |-- deviceId: string (nullable = true)
 |-- measure: string (nullable = true)
 |-- status: string (nullable = true)
 |-- temperature: long (nullable = true)



In [10]:
# Preview the flattened rows (show() defaults spelled out: 20 rows, truncated cells).
df_03.show(n=20, truncate=True)

+----------+--------------------+-----------+--------------+--------------------+--------+-------+-------+-----------+
|customerId|             eventId|eventOffset|eventPublisher|           eventTime|deviceId|measure| status|temperature|
+----------+--------------------+-----------+--------------+--------------------+--------+-------+-------+-----------+
|   CI00103|e3cb26d3-41b2-49a...|      10001|        device|2023-01-05 11:13:...|    D001|      C|  ERROR|         15|
|   CI00103|e3cb26d3-41b2-49a...|      10001|        device|2023-01-05 11:13:...|    D002|      C|SUCCESS|         16|
+----------+--------------------+-----------+--------------+--------------------+--------+-------+-------+-----------+



## Stream Start

option("cleanSource", "archive") : 처리가 끝난 소스 파일을 어떻게 정리할지 지정하는 옵션
    - off : 아무런 변화 없음 (기본값)
    - delete : 처리 완료된 파일 삭제
    - archive : 처리 완료된 파일을 "sourceArchiveDir" 옵션으로 지정한 경로로 이동

In [7]:
# Treat ./dataset/ as a streaming JSON source, picking up at most one new file
# per micro-batch. No schema is passed here, so the source schema is inferred.
df_01 = (
    spark.readStream
    .format("json")
    .option("maxFilesPerTrigger", 1)
    # .option("cleanSource", "archive")  # one of: off, delete, archive
    # .option("sourceArchiveDir", "archive/02_file_streaming")  # destination for archived files
    .load("./dataset/")
)

In [8]:
# One row per element of data.devices; the nested `data` column is dropped.
df_02 = df_01.withColumn("data_devices", F.explode("data.devices")).drop("data")

# Lift the four struct fields to top-level columns and drop the struct.
df_03 = (
    df_02
    .withColumn("deviceId", F.col("data_devices").getField("deviceId"))
    .withColumn("measure", F.col("data_devices").getField("measure"))
    .withColumn("status", F.col("data_devices").getField("status"))
    .withColumn("temperature", F.col("data_devices").getField("temperature"))
    .drop("data_devices")
)

In [5]:
## Print the processed results to the console

# Each streaming query needs its own checkpoint directory. Sharing one with the
# memory and CSV queries in this notebook would make them resume from (or
# clash over) each other's recorded offsets.
stream = (
    df_03
    .writeStream
    .format("console")
    .outputMode("append")
    .option("checkpointLocation", "checkpoint_dir/02_file_streaming_console")
    .start()
)

try:
    # Blocks until the query terminates; interrupt the kernel to end the demo.
    stream.awaitTermination()
finally:
    # Without this the query keeps running in the JVM after the interrupt.
    stream.stop()

ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/spark/python/lib/py4j-0.10.9.5-src.zip/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/spark/python/lib/py4j-0.10.9.5-src.zip/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/usr/local/lib/python3.7/socket.py", line 589, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt


KeyboardInterrupt: 

In [9]:
## Store the processed results in memory

# No checkpointLocation is set on purpose. The memory sink in append mode cannot
# recover from an existing checkpoint, so a fixed directory (here one that was
# also shared with the console and CSV queries) makes a re-run fail. Spark
# falls back to a temporary checkpoint for this sink.
stream = (
    df_03
    .writeStream
    .format("memory")
    .queryName("memory")  # name of the in-memory table to query with spark.sql
    .outputMode("append")
    .start()
)

try:
    # Blocks until the query terminates; interrupt the kernel once enough data
    # has been collected.
    stream.awaitTermination()
finally:
    # Stop the background query so it does not keep running after the
    # interrupt; rows already collected should remain queryable under the
    # query name.
    stream.stop()

In [5]:
## Save the processed results as CSV

# Each streaming query needs its own checkpoint directory. Reusing the one from
# the console or memory query would make this query resume from their offsets
# and skip files they already consumed.
stream = (
    df_03
    .writeStream
    .format("csv")
    .outputMode("append")
    .option("header", "true")
    .option("path", "output/02_file_streaming")  # output directory for the CSV files
    .option("checkpointLocation", "checkpoint_dir/02_file_streaming_csv")
    .start()
)

try:
    # Blocks until the query terminates; interrupt the kernel to end the demo.
    stream.awaitTermination()
finally:
    # Without this the query keeps running in the JVM after the interrupt.
    stream.stop()

ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/spark/python/lib/py4j-0.10.9.5-src.zip/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/spark/python/lib/py4j-0.10.9.5-src.zip/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/usr/local/lib/python3.7/socket.py", line 589, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt


KeyboardInterrupt: 

In [10]:
# Read back what the memory-sink query collected; "memory" is its query name.
memory_rows = spark.sql("SELECT * FROM memory")
memory_rows.show(n=20, truncate=True)

+----------+--------------------+-----------+--------------+--------------------+--------+-------+-------+-----------+
|customerId|             eventId|eventOffset|eventPublisher|           eventTime|deviceId|measure| status|temperature|
+----------+--------------------+-----------+--------------+--------------------+--------+-------+-------+-----------+
|   CI00101|1450324a-c546-417...|      10038|        device|2023-01-05 11:13:...|    D004|      C|SUCCESS|         20|
|   CI00101|1450324a-c546-417...|      10038|        device|2023-01-05 11:13:...|    D004|      C|SUCCESS|          1|
|   CI00101|1450324a-c546-417...|      10038|        device|2023-01-05 11:13:...|    D002|      D|SUCCESS|         21|
|   CI00108|aa90011f-3967-496...|      10003|        device|2023-01-05 11:13:...|    D004|      C|SUCCESS|         16|
|   CI00103|e3cb26d3-41b2-49a...|      10001|        device|2023-01-05 11:13:...|    D001|      C|  ERROR|         15|
|   CI00103|e3cb26d3-41b2-49a...|      10001|   