In [1]:
# Create the Spark Session
from pyspark.sql import SparkSession
from pyspark.sql import types as T
from pyspark.sql import window as W
from pyspark.sql import functions as F

spark = (
    SparkSession 
    .builder 
    .appName("Streaming from Kafka") 
    .config("spark.streaming.stopGracefullyOnShutdown", True) 
    .config('spark.jars.packages', 'org.apache.spark:spark-sql-kafka-0-10_2.12:3.3.0')
    .config("spark.driver.extraClassPath", "./jdbc/mysql-connector-j-8.4.0.jar") \
    .config("spark.sql.shuffle.partitions", 8)
    .master("local[*]") 
    .getOrCreate()
)

In [2]:
kafka_df = spark.readStream \
    .format("kafka") \
    .option("kafka.bootstrap.servers", "kafka1:19091,kafka2:19092,kafka3:19093") \
    .option("subscribe", "user") \
    .option("startingOffsets", "latest") \
    .load()

In [3]:
schema_2 = T.StructType([
    T.StructField("birthdate", T.StringType()),
    T.StructField("blood_group", T.StringType()),
    T.StructField("job", T.StringType()),
    T.StructField("name", T.StringType()),
    T.StructField("residence", T.StringType()),
    T.StructField("sex", T.StringType()),
    T.StructField("ssn", T.StringType()),
    T.StructField("uuid", T.StringType()),
    ])

In [4]:
value_df = kafka_df.select(F.from_json(F.col("value").cast("string"), schema_2).alias("value"))

processed_df = value_df.selectExpr(
    "value.birthdate", 
    "value.blood_group", 
    "value.job",
    "value.name",
    "value.residence",
    "value.sex",
    "value.ssn",
    "value.uuid"
)

In [5]:
def write_file(df, fmt, path):
    df.write.format(fmt).mode("append").option("header", "true").save(path)
    
def write_db(df, db_name, table_name):
    df.write \
    .mode("append") \
    .format("jdbc") \
    .option("driver", "com.mysql.cj.jdbc.Driver") \
    .option("url", f"jdbc:mysql://dockercompose-mysql-1:3306/{db_name}") \
    .option("dbtable", table_name) \
    .option("user", "hyunsoo") \
    .option("password", "910506") \
    .save()

def data_output(df, batch_id):
    print(f"BATCH ID : {batch_id}")
    
    # write to MYSQL
    write_db(df, "dataops", "fakeuser")
    
    # write as csv
    write_file(df, "csv", "./csv")
    df.show()
    # write as parquet
    # write_file(df, "parquet", "./parquet")

In [6]:
df_batch = processed_df.writeStream \
    .foreachBatch(data_output) \
    .option("checkpointLocation", "checkpoint_dir/04_write_to_multiple_sinks") \
    .trigger(processingTime="5 seconds") \
    .start()

df_batch.awaitTermination()

BATCH ID : 0
+---------+-----------+--------------------+------------+--------------------+---+-----------+--------------------+
|birthdate|blood_group|                 job|        name|           residence|sex|        ssn|                uuid|
+---------+-----------+--------------------+------------+--------------------+---+-----------+--------------------+
| 19701213|        AB-|Chief Financial O...|Shari Thomas|9374 Carson Road\...|  F|828-03-6474|fNnhESpctSXNbf3WW...|
+---------+-----------+--------------------+------------+--------------------+---+-----------+--------------------+

BATCH ID : 1
+---------+-----------+--------------------+---------------+--------------------+---+-----------+--------------------+
|birthdate|blood_group|                 job|           name|           residence|sex|        ssn|                uuid|
+---------+-----------+--------------------+---------------+--------------------+---+-----------+--------------------+
| 19830703|         O-|Human resourc

ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/spark/python/lib/py4j-0.10.9.5-src.zip/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/spark/python/lib/py4j-0.10.9.5-src.zip/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/usr/local/lib/python3.7/socket.py", line 589, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt


KeyboardInterrupt: 

In [11]:
sql = "select * from fakeuser"

df = spark.read.format("jdbc") \
                    .option("url", f"jdbc:mysql://spark_streaming-db-1:3306/dataops") \
                    .option("driver", "com.mysql.cj.jdbc.Driver") \
                    .option("query", sql) \
                    .option("user", "hyunsoo") \
                    .option("password", "910506") \
                    .load()

df.show()

+-----+---------+-----------+--------------------+------------------+--------------------+---+-----------+--------------------+
|index|birthdate|blood_group|                 job|              name|           residence|sex|        ssn|                uuid|
+-----+---------+-----------+--------------------+------------------+--------------------+---+-----------+--------------------+
|    0| 19810916|        AB+|Psychologist, pri...|      James Murray|PSC 1544, Box 316...|  M|741-53-6179|Cb7TYKQGDVZrB84Fo...|
|    1| 19580109|        AB-|Child psychothera...|     Sydney Weaver|96697 Marissa Byp...|  F|576-98-7289|4huCF5xaAGPLHyEpN...|
|    2| 19880909|         O+|Production assist...|Dr. Valerie Wagner|4981 Smith Prairi...|  F|196-87-0323|8AdYZ52fqepNjcizR...|
+-----+---------+-----------+--------------------+------------------+--------------------+---+-----------+--------------------+



In [10]:
(
    df.write
    .mode("append")
    .format("jdbc")
    .option("driver", "com.mysql.cj.jdbc.Driver") \
    .option("url", f"jdbc:mysql://spark_streaming-db-1:3306/dataops") \
    .option("dbtable", "dataops")
    .option("user", "hyunsoo")
    .option("password", "910506")
    .save()

)