In [1]:
# Create the Spark Session
from pyspark.sql import SparkSession
from pyspark.sql import types as T
from pyspark.sql import window as W
from pyspark.sql import functions as F

spark = (
    SparkSession 
    .builder 
    .appName("Streaming from Kafka") 
    .config("spark.streaming.stopGracefullyOnShutdown", True) 
    .config('spark.jars.packages', 'org.apache.spark:spark-sql-kafka-0-10_2.12:3.3.0')
    .config("spark.sql.shuffle.partitions", 8)
    .master("local[*]") 
    .getOrCreate()
)

In [2]:
# kdf = spark.read \
#     .format("kafka") \
#     .option("kafka.bootstrap.servers", "kafka1:19091,kafka2:19092,kafka3:19093") \
#     .option("subscribe", "demo") \
#     .option("startingOffsets", "earliest") \
#     .load()

kafka_df = spark.readStream \
    .format("kafka") \
    .option("kafka.bootstrap.servers", "kafka1:19091,kafka2:19092,kafka3:19093") \
    .option("subscribe", "demo") \
    .option("startingOffsets", "earliest") \
    .load()

# earliest, latest

In [7]:
kdf.rdd.getNumPartitions()

8

#### schema 1 example

In [None]:
schema_1 = T.StructType([
    T.StructField("order_id", T.IntegerType()),
    T.StructField("user_id", T.StringType()),
    T.StructField("total_cost", T.IntegerType()),
    ])

In [None]:
value_df = kafka_df.select(F.from_json(F.col("value").cast("string"), schema_1).alias("value"))

explode_df = value_df.selectExpr("value.order_id", "value.user_id", "value.total_cost")

processed_df = \
    explode_df \
        .withColumn(
            "filter_cost", 
            F.when(F.col("order_id")%2 == 0, F.col("total_cost")*2).otherwise(F.col("total_cost"))
            )

#### schema 2 example

In [3]:
schema_2 = T.StructType([
    T.StructField("birthdate", T.StringType()),
    T.StructField("blood_group", T.StringType()),
    T.StructField("job", T.StringType()),
    T.StructField("name", T.StringType()),
    T.StructField("residence", T.StringType()),
    T.StructField("sex", T.StringType()),
    T.StructField("ssn", T.StringType()),
    T.StructField("uuid", T.StringType()),
    T.StructField("timestamp", T.TimestampType()),
    ])

In [4]:
value_df = kafka_df.select(F.from_json(F.col("value").cast("string"), schema_2).alias("value"))

processed_df = value_df.selectExpr(
    "value.birthdate",
    "value.blood_group",
    "value.job",
    "value.name",
    "value.residence",
    "value.sex",
    "value.ssn",
    "value.uuid",
    "value.timestamp"
	)

# processed_df = \
#     explode_df \
#         .withColumn(
#             "filter_cost", 
#             F.when(F.col("order_id")%2 == 0, F.col("total_cost")*2).otherwise(F.col("total_cost"))
#             )

#### 1) write to console
- trigger options
    - trigger(once=True) : 한 번만 Batch되고 종료
    - trigger(processingTime = ) : 특정 시간/기간을 주기로 실시간 데이터를 읽어 작업을 처리

In [5]:
df_console = processed_df.writeStream \
    .format("console") \
    .queryName("Flattened Invoice Writer") \
    .outputMode("append") \
    .option("path", "output/03_kafka_streaming") \
    .option("checkpointLocation", "checkpoint_dir/03_kafka_streaming") \
    .trigger(processingTime="5 seconds") \
    .start()

df_console.awaitTermination()

ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/usr/local/lib/python3.8/dist-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/usr/local/lib/python3.8/dist-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/usr/lib/python3.8/socket.py", line 669, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt


KeyboardInterrupt: 

#### 2) write to csv

In [None]:
df_csv = processed_df.writeStream \
    .format("csv") \
    .outputMode("append") \
    .option("header", "true") \
    .option("path", "output/03_kafka_streaming") \
    .option("checkpointLocation", "checkpoint_dir/03_kafka_streaming") \
    .trigger(processingTime="5 seconds") \
    .start()

df_csv.awaitTermination()

#### 3) write to memory

In [7]:
df_memory = processed_df.writeStream \
    .format("memory") \
    .queryName("kafka_memory") \
    .outputMode("append") \
    .option("header", "true") \
    .option("path", "output/03_kafka_streaming") \
    .option("checkpointLocation", "checkpoint_dir/03_kafka_streaming") \
    .trigger(processingTime="5 seconds") \
    .start()

df_memory.awaitTermination()

ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/spark/python/lib/py4j-0.10.9.5-src.zip/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/spark/python/lib/py4j-0.10.9.5-src.zip/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/usr/local/lib/python3.7/socket.py", line 589, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt


KeyboardInterrupt: 

In [11]:
df = spark.sql("SELECT * FROM kafka_memory")

df.show(5)

+---------+-----------+--------------------+----------------+--------------------+---+-----------+--------------------+
|birthdate|blood_group|                 job|            name|           residence|sex|        ssn|                uuid|
+---------+-----------+--------------------+----------------+--------------------+---+-----------+--------------------+
| 19411206|         O-|Building control ...|       Ann Noble|59842 Moyer Strea...|  F|655-88-0066|EgWAwTYj7SfruZiRZ...|
| 19920928|        AB-| Information officer|   Don Maldonado|767 Nathan Knolls...|  M|312-60-8204|MX2s5iQUsrhGxMoDn...|
| 19790612|        AB+|   Transport planner|    John Webster|6512 Rich Rest Su...|  M|455-24-8570|SUGTQvBWBZCweTVAy...|
| 19100604|         O-|     Engineer, water|     Tracy Lucas|6021 Doyle Turnpi...|  F|195-44-2851|6zoCyKmQtXHuziycU...|
| 20190928|         O-|Teacher, primary ...|Katherine Miller|Unit 5842 Box 487...|  F|393-90-3763|5vJX2eUDwYZ27LKeb...|
+---------+-----------+-----------------