In [1]:
# Create the Spark Session
from pyspark.sql import SparkSession
from pyspark.sql import types as T
from pyspark.sql import window as W
from pyspark.sql import functions as F

# Build (or reuse) the Spark session shared by the cells below.
#  - spark.jars.packages: the spark-sql-kafka connector artifact.
#  - spark.driver.extraClassPath: the MySQL Connector/J jar shipped under ./jdbc.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("Streaming from Kafka")
    .config("spark.streaming.stopGracefullyOnShutdown", True)
    .config("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.12:3.3.0")
    .config("spark.driver.extraClassPath", "./jdbc/mysql-connector-j-8.4.0.jar")
    .config("spark.sql.shuffle.partitions", 8)
    .getOrCreate()
)

In [3]:
# Batch (non-streaming) read of the "demo1" topic, starting from the earliest offsets.
kdf = (
    spark.read
    .format("kafka")
    .option("kafka.bootstrap.servers", "kafka1:19091,kafka2:19092,kafka3:19093")
    .option("subscribe", "demo1")
    .option("startingOffsets", "earliest")
    .load()
)

# Partition count of the DataFrame's underlying RDD (shown as the cell output).
kdf.rdd.getNumPartitions()

5

In [4]:
# Print a tabular preview of the Kafka records read into kdf.
kdf.show()

+----+--------------------+-----+---------+------+--------------------+-------------+
| key|               value|topic|partition|offset|           timestamp|timestampType|
+----+--------------------+-----+---------+------+--------------------+-------------+
|null|[7B 22 6E 61 6D 6...|demo1|        0|     0|2024-05-13 12:54:...|            0|
|null|[7B 22 6E 61 6D 6...|demo1|        0|     1|2024-05-13 12:54:...|            0|
|null|[7B 22 6E 61 6D 6...|demo1|        1|     0|2024-05-13 12:54:...|            0|
|null|[7B 22 6E 61 6D 6...|demo1|        2|     0|2024-05-13 12:54:...|            0|
|null|[7B 22 6E 61 6D 6...|demo1|        4|     0|2024-05-13 12:54:...|            0|
|null|[7B 22 6E 61 6D 6...|demo1|        3|     0|2024-05-13 12:54:...|            0|
|null|[7B 22 6E 61 6D 6...|demo1|        3|     1|2024-05-13 12:54:...|            0|
|null|[7B 22 6E 61 6D 6...|demo1|        3|     2|2024-05-13 12:54:...|            0|
|null|[7B 22 6E 61 6D 6...|demo1|        3|     3|2024

In [2]:
# Streaming source: subscribe to the "user" topic, starting from the latest offsets.
kafka_df = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "kafka1:19091,kafka2:19092,kafka3:19093")
    .option("subscribe", "user")
    .option("startingOffsets", "latest")
    .load()
)

In [3]:
# Schema used to parse the JSON message value: every field is a string
# except "timestamp", which is parsed as a timestamp.
schema = T.StructType(
    [
        T.StructField(field_name, T.StringType())
        for field_name in (
            "birthdate",
            "blood_group",
            "job",
            "name",
            "residence",
            "sex",
            "ssn",
            "uuid",
        )
    ]
    + [T.StructField("timestamp", T.TimestampType())]
)

In [4]:
# Cast the Kafka "value" column to a string and parse it as JSON using `schema`;
# the parsed struct is kept under the column name "value".
value_df = kafka_df.select(
    F.from_json(
        F.col("value").cast("string"),
        schema,
    ).alias("value")
)

# Flatten the parsed struct into one top-level column per field.
flattened_columns = [
    "value.birthdate",
    "value.blood_group",
    "value.job",
    "value.name",
    "value.residence",
    "value.sex",
    "value.ssn",
    "value.uuid",
    "value.timestamp",
]
processed_df = value_df.selectExpr(*flattened_columns)

In [5]:
# Count records per uuid in 5-second windows over the "timestamp" column,
# with a 5-second watermark declared on that same column.
df_agg = (
    processed_df
    .withWatermark("timestamp", "5 seconds")
    .groupBy(F.window("timestamp", "5 seconds"), "uuid")
    .count()
)

# df_order = df_agg.groupby("window").count().orderBy('window')

# Expose the window bounds as plain columns and sort by window start.
df_final = (
    df_agg
    .selectExpr(
        "window.start as start_time",
        "window.end as end_time",
        "uuid",
        "count",
    )
    .orderBy("start_time")
)



In [None]:
# Write the full aggregated result to the console every 5 seconds.
# NOTE(review): df_final is sorted; presumably that is why the output mode is "complete" — confirm.
df_console = (
    df_final.writeStream
    .format("console")
    .outputMode("complete")
    .option("checkpointLocation", "checkpoint_dir/05_window_operations_and_watermarks")
    .trigger(processingTime="5 seconds")
    .start()
)

# Blocks this cell until the streaming query stops.
df_console.awaitTermination()

In [8]:
# Disabled alternative sink: would append processed_df to an in-memory table
# named "kafka_memory" every 5 seconds.
# NOTE(review): this reuses the same checkpointLocation as the console query;
# presumably each query needs its own checkpoint directory — confirm before re-enabling.
# df_memory = processed_df.writeStream \
#     .format("memory") \
#     .queryName("kafka_memory") \
#     .outputMode("append") \
#     .option("checkpointLocation", "checkpoint_dir/05_window_operations_and_watermarks") \
#     .trigger(processingTime="5 seconds") \
#     .start()

# df_memory.awaitTermination()

ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/spark/python/lib/py4j-0.10.9.5-src.zip/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/spark/python/lib/py4j-0.10.9.5-src.zip/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/usr/local/lib/python3.7/socket.py", line 589, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt


KeyboardInterrupt: 

In [10]:
# Query the "kafka_memory" table.
# NOTE(review): presumably this is the table registered by the memory-sink query
# (queryName "kafka_memory"), which must have been started first — confirm.
df = spark.sql("SELECT * FROM kafka_memory")

# Print a tabular preview of the rows.
df.show()

+---------+-----------+--------------------+-----------------+--------------------+---+-----------+--------------------+-------------------+
|birthdate|blood_group|                 job|             name|           residence|sex|        ssn|                uuid|          timestamp|
+---------+-----------+--------------------+-----------------+--------------------+---+-----------+--------------------+-------------------+
| 19670613|         O+|  Analytical chemist| Michael Erickson|80800 Pamela Cany...|  M|254-75-7053|LJoSb6NWRWW27Zkpn...|2024-05-12 15:32:33|
| 20211103|         A-|Estate manager/la...|     Andrea Lopez|440 Wise Burgs\nP...|  F|822-34-7088|f9i6J3xv2FojTszsh...|2024-05-12 15:32:34|
| 20040509|         B+|Human resources o...|      Lisa Tanner|USNS Walker\nFPO ...|  F|764-39-7261|jnVioyWFLrei7qRMF...|2024-05-12 15:32:35|
| 19980121|         O+|        Retail buyer|     Karen Bryant|929 Natalie Mount...|  F|192-41-8135|SEcJmNfmbh3bjixAs...|2024-05-12 15:32:36|
| 19290418|  

In [11]:
# Batch (non-streaming) read of the "user" topic, starting from the earliest offsets.
df = (
    spark.read
    .format("kafka")
    .option("kafka.bootstrap.servers", "kafka1:19091,kafka2:19092,kafka3:19093")
    .option("subscribe", "user")
    .option("startingOffsets", "earliest")
    .load()
)

# Print a tabular preview of the raw Kafka records.
df.show()

+----+--------------------+-----+---------+------+--------------------+-------------+
| key|               value|topic|partition|offset|           timestamp|timestampType|
+----+--------------------+-----+---------+------+--------------------+-------------+
|null|[7B 22 6E 61 6D 6...| user|        0|     0|2024-05-10 13:06:...|            0|
|null|[7B 22 6E 61 6D 6...| user|        0|     1|2024-05-10 13:06:...|            0|
|null|[7B 22 6E 61 6D 6...| user|        0|     2|2024-05-10 13:06:...|            0|
|null|[7B 22 6E 61 6D 6...| user|        0|     3|2024-05-10 13:06:...|            0|
|null|[7B 22 6E 61 6D 6...| user|        0|     4|2024-05-10 13:06:...|            0|
|null|[7B 22 6E 61 6D 6...| user|        0|     5|2024-05-10 13:06:...|            0|
|null|[7B 22 6E 61 6D 6...| user|        0|     6|2024-05-10 13:06:...|            0|
|null|[7B 22 6E 61 6D 6...| user|        0|     7|2024-05-10 13:06:...|            0|
|null|[7B 22 6E 61 6D 6...| user|        0|     8|2024

In [9]:
# Schema used to parse the JSON message value: every field is a string
# except "timestamp", which is parsed as a timestamp.
schema = T.StructType(
    [
        T.StructField(field_name, T.StringType())
        for field_name in (
            "birthdate",
            "blood_group",
            "job",
            "name",
            "residence",
            "sex",
            "ssn",
            "uuid",
        )
    ]
    + [T.StructField("timestamp", T.TimestampType())]
)

In [12]:
# Cast the Kafka "value" column to a string and parse it as JSON using `schema`;
# the parsed struct is kept under the column name "value".
value_df = kafka_df.select(
    F.from_json(
        F.col("value").cast("string"),
        schema,
    ).alias("value")
)

# Flatten the parsed struct into one top-level column per field.
flattened_columns = [
    "value.birthdate",
    "value.blood_group",
    "value.job",
    "value.name",
    "value.residence",
    "value.sex",
    "value.ssn",
    "value.uuid",
    "value.timestamp",
]
processed_df = value_df.selectExpr(*flattened_columns)

In [1]:
from kafka import KafkaConsumer, KafkaAdminClient
from kafka import TopicPartition, OffsetAndMetadata
from kafka.admin import NewPartitions, NewTopic

# Kafka brokers to bootstrap from, and the topic the cells below operate on.
bootstrap_servers = [
    "kafka1:19091",
    "kafka2:19092",
    "kafka3:19093",
]
topic_name = "test"

In [2]:
# Consumer on `topic_name`, created with auto_offset_reset="earliest".
consumer = KafkaConsumer(
    topic_name,
    bootstrap_servers=bootstrap_servers,
    auto_offset_reset="earliest",
)

# Partition ids known for the topic (shown as the cell output).
consumer.partitions_for_topic(topic_name)

{0, 1, 2}

In [36]:
# Admin client connected to the same brokers as the consumer.
admin_client = KafkaAdminClient(bootstrap_servers=bootstrap_servers)
# List the topics (shown as the cell output).
admin_client.list_topics()

['test', 'user', '__consumer_offsets']

In [28]:
# Change the number of partitions of the topic (total of 6).
part = NewPartitions(total_count=6)
m = {topic_name: part}
admin_client.create_partitions(topic_partitions=m)

CreatePartitionsResponse_v1(throttle_time_ms=0, topic_errors=[(topic='user', error_code=0, error_message=None)])

In [32]:
# Create a topic.
new_topic = "test"

# Keyword arguments for NewTopic; adjust replication_factor as per your requirement.
new_topic_config = dict(
    name=new_topic,
    num_partitions=3,
    replication_factor=1,
)
admin_client.create_topics(new_topics=[NewTopic(**new_topic_config)])

CreateTopicsResponse_v3(throttle_time_ms=0, topic_errors=[(topic='test', error_code=0, error_message=None)])

In [71]:
# Subscribe the consumer to the topic.
# Uses `topic_name`, the variable every other cell uses; the original referenced
# an undefined name `topic`, which raises NameError on a fresh kernel.
consumer.subscribe(topics=[topic_name])

In [7]:
# Get the total offset: move the consumer to the end of every partition of the
# topic and sum the resulting positions.
total_offset = 0
# partitions_for_topic() may return None when no metadata is known for the topic,
# so fall back to an empty sequence; sorted() makes the print order deterministic.
for partition in sorted(consumer.partitions_for_topic(topic_name) or ()):
    tp = TopicPartition(topic_name, partition)
    consumer.seek_to_end(tp)
    # Read the position before printing it. The original printed `offset` first,
    # which raised NameError on the first iteration (or showed the previous
    # partition's value on later ones).
    offset = consumer.position(tp)
    print(f"Offset for topic '{topic_name}', partition {partition}: {offset}")
    total_offset += offset

print(f"Total offset for topic '{topic_name}': {total_offset}")

Offset for topic 'test', partition 0: 0
Offset for topic 'test', partition 1: 0
Offset for topic 'test', partition 2: 0
Total offset for topic 'test': 0
