In [10]:
# Create the Spark Session
from pyspark.sql import SparkSession
from pyspark.sql import types as T
from pyspark.sql import window as W
from pyspark.sql import functions as F
import dotenv, os
from data_generator.fakedata import create_fakeuser

env_path = dotenv.find_dotenv()
dotenv.load_dotenv(env_path)

spark = (
    SparkSession 
    .builder 
    .appName("07_write_to_aws_s3") 
    .config('spark.jars.packages', 'org.apache.spark:spark-sql-kafka-0-10_2.12:3.3.0')
    .master("local[*]") 
    .getOrCreate()
)

spark.conf.set("spark.sql.shuffle.partitions", 8)
spark.conf.set("spark.sql.streaming.schemaInference", True)
spark.conf.set("spark.streaming.stopGracefullyOnShutdown", True)

In [2]:
sc = spark.sparkContext
access_key = os.environ.get("ACCESS_KEY_ID")
secret_key = os.environ.get("ACCESS_SECRET_KEY")

sc._jsc.hadoopConfiguration().set("fs.s3a.access.key", access_key)
sc._jsc.hadoopConfiguration().set("fs.s3a.secret.key", secret_key)
sc._jsc.hadoopConfiguration().set("fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
sc._jsc.hadoopConfiguration().set("fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider")
sc._jsc.hadoopConfiguration().set("fs.s3a.connection.ssl.enabled", "false")

In [11]:
topic_name = "demo"

kafka_df = spark.read \
    .format("kafka") \
    .option("kafka.bootstrap.servers", "kafka1:19091,kafka2:19092,kafka3:19093") \
    .option("subscribe", topic_name) \
    .option("startingOffsets", "earliest") \
    .load()

In [14]:
kafka_df.orderBy('timestamp').show()

+----+--------------------+-----+---------+------+--------------------+-------------+
| key|               value|topic|partition|offset|           timestamp|timestampType|
+----+--------------------+-----+---------+------+--------------------+-------------+
|null|[7B 22 6E 61 6D 6...| demo|        2|     0|2024-05-20 01:22:...|            0|
|null|[7B 22 6E 61 6D 6...| demo|        0|     0|2024-05-20 01:22:...|            0|
|null|[7B 22 6E 61 6D 6...| demo|        2|     1|2024-05-20 01:22:...|            0|
|null|[7B 22 6E 61 6D 6...| demo|        2|     2|2024-05-20 01:53:...|            0|
|null|[7B 22 6E 61 6D 6...| demo|        1|     0|2024-05-20 01:53:...|            0|
|null|[7B 22 6E 61 6D 6...| demo|        2|     3|2024-05-20 01:53:...|            0|
|null|[7B 22 6E 61 6D 6...| demo|        0|     1|2024-05-20 01:54:...|            0|
|null|[7B 22 6E 61 6D 6...| demo|        1|     1|2024-05-20 01:54:...|            0|
|null|[7B 22 6E 61 6D 6...| demo|        2|     4|2024

In [15]:
schema = T.StructType([
    T.StructField("birthdate", T.StringType()),
    T.StructField("blood_group", T.StringType()),
    T.StructField("job", T.StringType()),
    T.StructField("name", T.StringType()),
    T.StructField("residence", T.StringType()),
    T.StructField("sex", T.StringType()),
    T.StructField("ssn", T.StringType()),
    T.StructField("uuid", T.StringType()),
    T.StructField("timestamp", T.TimestampType()),
    ])

In [16]:
value_df = kafka_df.select(F.from_json(F.col("value").cast("string"), schema).alias("value"))

processed_df = value_df.selectExpr(
    "value.birthdate",
    "value.blood_group",
    "value.job",
    "value.name",
    "value.residence",
    "value.sex",
    "value.ssn",
    "value.uuid",
    "value.timestamp"
)

processed_df.show()
# final_df = processed_df.withColumn("age", (F.lit(2024) - F.substring("birthdate", 0, 4)).cast(T.IntegerType()))

+---------+-----------+--------------------+-----------------+--------------------+---+-----------+--------------------+-------------------+
|birthdate|blood_group|                 job|             name|           residence|sex|        ssn|                uuid|          timestamp|
+---------+-----------+--------------------+-----------------+--------------------+---+-----------+--------------------+-------------------+
| 20200724|         B+|Medical sales rep...|     Rachel Moore|166 Peters Valley...|  F|066-83-1189|9qA2gsGoA2mm9Ka9x...|2024-05-20 01:22:53|
| 19690318|         A+|     Arboriculturist| Jeffery Thompson|999 Simmons River...|  M|247-66-1932|PPtpVZVJvzm6TCByK...|2024-05-20 01:54:00|
| 19360621|         A+|Lecturer, further...|  Richard Trevino|USCGC Burnett\nFP...|  M|354-23-9711|LyF7YaaZhU3PZzHoP...|2024-05-20 01:54:06|
| 20240305|         A+|Accountant, chart...|Taylor Washington|2495 Nelson Field...|  M|796-40-1135|VSAhzdQLTcnkK6YsA...|2024-05-20 01:54:08|
| 19590717|  

In [6]:
df_parquet = final_df.writeStream \
    .format("parquet") \
    .outputMode("append") \
    .option("path", "s3a://personal-golight-image-bucket/sparkstreaming") \
    .option("checkpointLocation", "checkpoint_dir/07_write_to_aws_s3") \
    .trigger(processingTime="5 seconds") \
    .start()

df_parquet.awaitTermination()

ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/usr/local/lib/python3.8/dist-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/usr/local/lib/python3.8/dist-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/usr/lib/python3.8/socket.py", line 669, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt


KeyboardInterrupt: 

In [9]:
spark.read.parquet("s3a://personal-golight-image-bucket/sparkstreaming/").orderBy('age', ascending=False).show()

+---------+-----------+--------------------+----------------+--------------------+---+-----------+--------------------+-------------------+---+
|birthdate|blood_group|                 job|            name|           residence|sex|        ssn|                uuid|          timestamp|age|
+---------+-----------+--------------------+----------------+--------------------+---+-----------+--------------------+-------------------+---+
| 19091031|         A+|Horticultural con...|    Jason Harris|USNV Fischer\nFPO...|  M|297-01-9442|MSFy8sJtDfrQbN5kN...|2024-05-20 01:54:24|115|
| 19160424|         A+|          Geochemist| Jason Henderson|8539 Simon Loaf A...|  M|164-38-0923|cFpuxFbFs4iG7duti...|2024-05-20 01:54:12|108|
| 19200925|         O+| Hospital pharmacist|   Jill Santiago|5735 Montgomery F...|  F|036-84-2384|G5tV5kQhKSgdZoe4F...|2024-05-20 01:54:14|104|
| 19281122|         A+|  Professor Emeritus|Heather Castillo|8997 Marissa Tunn...|  F|199-41-8371|59Uv6mKSPA7z2SaA7...|2024-05-20 01:54: