In [11]:
import pyspark

In [12]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F


In [13]:
 # Build (or reuse) a local session on a single worker thread, named
 # "data preparation". The spark.jars.packages entry requests the Kafka SQL
 # connector artifact so the "kafka" data source can be used by this session.
 spark = SparkSession.builder \
     .master("local[1]") \
     .appName("data preparation") \
     .config("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.12:3.3.1") \
     .getOrCreate()

In [14]:
# Echo the session object so the notebook renders its representation; a quick
# sanity check that the session was created.
spark

In [15]:
# Batch-read every record of the "topic_nested" topic from the broker at
# localhost:9092 into a DataFrame.
df = (
    spark.read
    .format("kafka")
    .option("kafka.bootstrap.servers", "localhost:9092")
    .option("subscribe", "topic_nested")
    .load()
)
# The Kafka source exposes key and value as binary columns, so value has to be
# cast to string before the payload can be inspected or parsed.
df.printSchema()

root
 |-- key: binary (nullable = true)
 |-- value: binary (nullable = true)
 |-- topic: string (nullable = true)
 |-- partition: integer (nullable = true)
 |-- offset: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- timestampType: integer (nullable = true)



In [16]:
# Preview the raw Kafka records; value is still binary here, so it is printed
# as a byte listing rather than readable JSON.
df.show()
# spark.stop()  # kept disabled: the cells below still need the live session

+----+--------------------+------------+---------+------+--------------------+-------------+
| key|               value|       topic|partition|offset|           timestamp|timestampType|
+----+--------------------+------------+---------+------+--------------------+-------------+
|null|[7B 22 54 69 6D 6...|topic_nested|        0|     0|2023-02-23 13:20:...|            0|
|null|[7B 22 54 69 6D 6...|topic_nested|        0|     1|2023-02-23 13:20:...|            0|
|null|[7B 22 54 69 6D 6...|topic_nested|        0|     2|2023-02-23 13:20:...|            0|
|null|[7B 22 54 69 6D 6...|topic_nested|        0|     3|2023-02-23 13:20:...|            0|
|null|[7B 22 54 69 6D 6...|topic_nested|        0|     4|2023-02-23 13:20:...|            0|
|null|[7B 22 54 69 6D 6...|topic_nested|        0|     5|2023-02-23 13:20:...|            0|
|null|[7B 22 54 69 6D 6...|topic_nested|        0|     6|2023-02-23 13:20:...|            0|
|null|[7B 22 54 69 6D 6...|topic_nested|        0|     7|2023-02-23 13

In [17]:
# Number of records read from the topic (baseline for the later row counts).
df.count()

5346

In [43]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType, TimestampType, FloatType


# Schema of the JSON payload carried in the Kafka message value: a top-level
# "Timestamp" string plus two nested structs. Every field is nullable.
# Note that Age and Fare are declared as strings inside "string_columns".
schema = StructType([
    StructField("Timestamp", StringType(), True),
    StructField(
        "string_columns",
        StructType([
            StructField(field_name, StringType(), True)
            for field_name in ("Name", "Sex", "Age", "Ticket", "Fare", "Cabin", "Embarked")
        ]),
        True,
    ),
    StructField(
        "numeric_columns",
        StructType([
            StructField(field_name, IntegerType(), True)
            for field_name in ("PassengerId", "Survived", "Pclass", "SibSp", "Parch")
        ]),
        True,
    ),
])

In [19]:
# Decode the binary Kafka value as a string and parse it as JSON against
# `schema`; the parsed struct is added as a new "message_content" column next
# to the original Kafka metadata columns.
df2 = df.withColumn(
    "message_content",
    F.from_json(F.col("value").cast("string"), schema),
)


In [20]:
# Keep only the parsed payload: expand the "message_content" struct into its
# top-level fields and leave the Kafka metadata columns behind.
df_minimal = df2.select(F.col("message_content.*"))

In [21]:
# Preview the parsed payload; the two nested structs are still unexpanded here.
df_minimal.show()

+--------------------+--------------------+----------------+
|           Timestamp|      string_columns| numeric_columns|
+--------------------+--------------------+----------------+
|2020-01-01T13:45:...|{Braund, Mr. Owen...| {1, 0, 3, 1, 0}|
|2020-01-01T13:44:...|{Cumings, Mrs. Jo...| {2, 1, 1, 1, 0}|
|2020-01-01T13:38:...|{Heikkinen, Miss....| {3, 1, 3, 0, 0}|
|2020-01-01T13:32:...|{Futrelle, Mrs. J...| {4, 1, 1, 1, 0}|
|2020-01-01T13:36:...|{Allen, Mr. Willi...| {5, 0, 3, 0, 0}|
|2020-01-01T13:31:...|{Moran, Mr. James...| {6, 0, 3, 0, 0}|
|2020-01-01T13:37:...|{McCarthy, Mr. Ti...| {7, 0, 1, 0, 0}|
|2020-01-01T13:49:...|{Palsson, Master....| {8, 0, 3, 3, 1}|
|2020-01-01T13:33:...|{Johnson, Mrs. Os...| {9, 1, 3, 0, 2}|
|2020-01-01T13:32:...|{Nasser, Mrs. Nic...|{10, 1, 2, 1, 0}|
|2020-01-01T13:32:...|{Sandstrom, Miss....|{11, 1, 3, 1, 1}|
|2020-01-01T13:30:...|{Bonnell, Miss. E...|{12, 1, 1, 0, 0}|
|2020-01-01T13:33:...|{Saundercock, Mr....|{13, 0, 3, 0, 0}|
|2020-01-01T13:30:...|{A

In [31]:
# Flatten the payload into one row of plain columns: keep Timestamp and expand
# both nested structs in place. Note this rebinds df2 to the flattened frame.
df2 = df_minimal.select(
    "Timestamp",
    "string_columns.*",
    "numeric_columns.*",
)
df2.printSchema()

root
 |-- Timestamp: string (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: string (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: string (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)



In [32]:
# Preview the flattened passenger records.
df2.show() 

+--------------------+--------------------+------+----+----------------+-------+-----+--------+-----------+--------+------+-----+-----+
|           Timestamp|                Name|   Sex| Age|          Ticket|   Fare|Cabin|Embarked|PassengerId|Survived|Pclass|SibSp|Parch|
+--------------------+--------------------+------+----+----------------+-------+-----+--------+-----------+--------+------+-----+-----+
|2020-01-01T13:45:...|Braund, Mr. Owen ...|  male|  22|       A/5 21171|   7.25| null|       S|          1|       0|     3|    1|    0|
|2020-01-01T13:44:...|Cumings, Mrs. Joh...|female|  38|        PC 17599|71.2833|  C85|       C|          2|       1|     1|    1|    0|
|2020-01-01T13:38:...|Heikkinen, Miss. ...|female|  26|STON/O2. 3101282|  7.925| null|       S|          3|       1|     3|    0|    0|
|2020-01-01T13:32:...|Futrelle, Mrs. Ja...|female|  35|          113803|   53.1| C123|       S|          4|       1|     1|    1|    0|
|2020-01-01T13:36:...|Allen, Mr. Willia...|  mal

In [33]:
# Row count before any null filtering; flattening should not change it.
df2.count()

5346

In [34]:
# Discard every row that has a null in Cabin, Embarked or Age.
df2 = df2.dropna(subset=["Cabin", "Embarked", "Age"])

In [35]:
# Row count after removing rows with a null Cabin, Embarked or Age.
df2.count()

1098

In [38]:
# Remove the three numeric columns that are not carried into the output.
df2 = df2.drop(
    "Pclass",
    "SibSp",
    "Parch",
)

In [42]:
# Confirm that Pclass, SibSp and Parch are no longer part of the schema.
df2.printSchema()

root
 |-- Timestamp: string (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: string (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: string (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)



In [39]:
# Re-check the row count after the column drop.
df2.count()

1098

In [44]:
# Fare and Age were parsed as strings; convert them to numeric types
# (Fare -> float, Age -> int), replacing the columns in place.
df2 = (
    df2
    .withColumn("Fare", F.col("Fare").cast("float"))
    .withColumn("Age", F.col("Age").cast("int"))
)

In [45]:
# Preview the rows after the Fare/Age type conversion.
df2.show()

+--------------------+--------------------+------+---+-----------+--------+-----------+--------+-----------+--------+
|           Timestamp|                Name|   Sex|Age|     Ticket|    Fare|      Cabin|Embarked|PassengerId|Survived|
+--------------------+--------------------+------+---+-----------+--------+-----------+--------+-----------+--------+
|2020-01-01T13:44:...|Cumings, Mrs. Joh...|female| 38|   PC 17599| 71.2833|        C85|       C|          2|       1|
|2020-01-01T13:32:...|Futrelle, Mrs. Ja...|female| 35|     113803|    53.1|       C123|       S|          4|       1|
|2020-01-01T13:37:...|McCarthy, Mr. Tim...|  male| 54|      17463| 51.8625|        E46|       S|          7|       0|
|2020-01-01T13:32:...|Sandstrom, Miss. ...|female|  4|    PP 9549|    16.7|         G6|       S|         11|       1|
|2020-01-01T13:30:...|Bonnell, Miss. El...|female| 58|     113783|   26.55|       C103|       S|         12|       1|
|2020-01-01T13:33:...|Beesley, Mr. Lawr...|  male| 34|  

In [46]:
# Persist the prepared rows as JSON files under the "json_data" directory.
#
# The save mode is set explicitly to "overwrite": with the writer's default
# mode the cell raises once the directory exists, so the notebook could not be
# re-run from a fresh kernel without deleting the output by hand.
#
# save() returns None, so `json_data` never holds the written data. The
# binding is kept only so that any later reference to the name still resolves.
json_data = (
    df2.write
    .format('json')
    .mode('overwrite')
    .save('json_data')
)