# Data preparation

In [1]:
import pyspark


In [2]:
import json

from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import pyspark.sql.types as T

In [3]:
 spark = (
    SparkSession.builder.master("local[1]")
    .appName("Exam data preparation")
    .config("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.12:3.3.1")
    .getOrCreate()
)

In [4]:
# Echo the session object as the cell output to confirm it was created.
spark

In [5]:
# Explicit column layout for the Titanic CSV. Every field is nullable (the
# StructField default). The same schema is reused further down to parse the
# JSON payloads read back from Kafka.
titanic_schema = T.StructType(
    [
        T.StructField("PassengerId", T.IntegerType()),
        T.StructField("Survived", T.IntegerType()),
        T.StructField("Pclass", T.IntegerType()),
        T.StructField("Name", T.StringType()),
        T.StructField("Sex", T.StringType()),
        T.StructField("Age", T.IntegerType()),
        T.StructField("SibSp", T.IntegerType()),
        T.StructField("Parch", T.IntegerType()),
        T.StructField("Ticket", T.StringType()),
        T.StructField("Fare", T.FloatType()),
        T.StructField("Cabin", T.StringType()),
        T.StructField("Embarked", T.StringType()),
        T.StructField("Timestamp", T.TimestampType()),
    ]
)


In [6]:

# Load the CSV with the explicit schema, then serialise each row into a single
# JSON string column called "value" — the column handed to the Kafka sink.
df_source_batch = spark.read.schema(titanic_schema).csv("./data/titanic.csv")
row_as_json = F.to_json(F.struct(*df_source_batch.columns)).cast(T.StringType())
df_source_batch = df_source_batch.withColumn("value", row_as_json)

# Only configure the writer here; nothing is published until .save() is called.
dataframe_source_batch_writer = (
    df_source_batch.select("value")
    .write.format("kafka")
    .option("kafka.bootstrap.servers", "localhost:9092")
    .option("topic", "titanic_topic")
)


In [7]:
# Number of JSON rows in the batch, i.e. how many records each save() of the
# writer will publish.
df_source_batch.select("value").count()

891

In [8]:
# Publish the same batch twice on purpose so the topic holds duplicate records
# that can be dropped later.
for _publish_round in range(2):
    dataframe_source_batch_writer.save()

In [9]:
# Read the topic back as a bounded batch (spark.read, not a stream) and show
# the columns the Kafka source exposes.
df = (
    spark.read.format("kafka")
    .option("kafka.bootstrap.servers", "localhost:9092")
    .option("subscribe", "titanic_topic")
    .option("failOnDataLoss", "true")
    .load()
)
df.printSchema()

root
 |-- key: binary (nullable = true)
 |-- value: binary (nullable = true)
 |-- topic: string (nullable = true)
 |-- partition: integer (nullable = true)
 |-- offset: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- timestampType: integer (nullable = true)



In [10]:
# Count the records read back from the topic.
# NOTE(review): the saved output (3564) is four times the 891 source rows even
# though the batch is written only twice above — presumably the topic already
# held records from an earlier run; confirm before relying on the count.
df.count()

3564

In [11]:
# Peek at the last record to inspect the raw payload and Kafka metadata.
df.tail(1)

[Row(key=None, value=bytearray(b'{"PassengerId":891,"Survived":0,"Pclass":3,"Name":"Dooley, Mr. Patrick","Sex":"male","Age":32,"SibSp":0,"Parch":0,"Ticket":"370376","Fare":7.75,"Embarked":"Q","Timestamp":"2020-01-01T13:32:25.000Z"}'), topic='titanic_topic', partition=0, offset=3563, timestamp=datetime.datetime(2023, 2, 23, 16, 30, 52, 429000), timestampType=0)]

In [12]:
# The Kafka source exposes the payload as binary: cast it to text, then parse
# the JSON with the CSV schema into a single struct column.
payload_text = F.col("value").cast("string")
df = df.withColumn("message_content", F.from_json(payload_text, titanic_schema))

# Flatten the parsed struct so every field becomes a top-level column.
df_minimal = df.select("message_content.*")

In [13]:
# Confirm the flattened frame carries the fields defined in titanic_schema.
df_minimal.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: float (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)
 |-- Timestamp: timestamp (nullable = true)



In [14]:
# Turn Fare and Age into strings, in that order; withColumn replaces each
# column in place, so the column order of the frame is unchanged.
for column_name in ("Fare", "Age"):
    df_minimal = df_minimal.withColumn(
        column_name, F.col(column_name).cast(T.StringType())
    )

In [15]:
# Group the flat columns into two structs by data type.
# Columns are classified through the schema's DataType objects rather than by
# comparing dtype-name strings, so every numeric type (bigint, double,
# decimal, ...) is recognised and not only "int" and "float".
# Columns that are neither string nor numeric (here: Timestamp) stay top-level.
flat_fields = df_minimal.schema.fields
string_column_names = [
    field.name for field in flat_fields if isinstance(field.dataType, T.StringType)
]
numeric_column_names = [
    field.name for field in flat_fields if isinstance(field.dataType, T.NumericType)
]

df_to_kafka = (
    df_minimal
    .withColumn("string_columns", F.struct(*string_column_names))
    .withColumn("numeric_columns", F.struct(*numeric_column_names))
)

In [16]:
# Show the flat columns together with the two newly added struct columns.
df_to_kafka.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: string (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: string (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)
 |-- Timestamp: timestamp (nullable = true)
 |-- string_columns: struct (nullable = false)
 |    |-- Name: string (nullable = true)
 |    |-- Sex: string (nullable = true)
 |    |-- Age: string (nullable = true)
 |    |-- Ticket: string (nullable = true)
 |    |-- Fare: string (nullable = true)
 |    |-- Cabin: string (nullable = true)
 |    |-- Embarked: string (nullable = true)
 |-- numeric_columns: struct (nullable = false)
 |    |-- PassengerId: integer (nullable = true)
 |    |-- Survived: integer (nullable = true)
 |    |-- Pc

In [17]:
# Keep only the timestamp and the two grouped structs; the flat source columns
# are dropped because their values now live inside the structs.
kafka_payload_columns = ["Timestamp", "string_columns", "numeric_columns"]
df_to_kafka = df_to_kafka.select(*kafka_payload_columns)

In [18]:
# Verify that only Timestamp and the two struct columns remain.
df_to_kafka.printSchema()

root
 |-- Timestamp: timestamp (nullable = true)
 |-- string_columns: struct (nullable = false)
 |    |-- Name: string (nullable = true)
 |    |-- Sex: string (nullable = true)
 |    |-- Age: string (nullable = true)
 |    |-- Ticket: string (nullable = true)
 |    |-- Fare: string (nullable = true)
 |    |-- Cabin: string (nullable = true)
 |    |-- Embarked: string (nullable = true)
 |-- numeric_columns: struct (nullable = false)
 |    |-- PassengerId: integer (nullable = true)
 |    |-- Survived: integer (nullable = true)
 |    |-- Pclass: integer (nullable = true)
 |    |-- SibSp: integer (nullable = true)
 |    |-- Parch: integer (nullable = true)



In [19]:
# Serialise each row into one JSON string for the Kafka record value.
# The output column itself is excluded from the struct so that re-running this
# cell does not nest the previously packed JSON inside the new payload.
payload_source_columns = [
    name for name in df_to_kafka.columns if name != "data_packed_for_kafka"
]
df_to_kafka = df_to_kafka.withColumn(
    "data_packed_for_kafka", F.to_json(F.struct(*payload_source_columns))
)


In [20]:
# Confirm the packed JSON string column was appended to the frame.
df_to_kafka.printSchema()

root
 |-- Timestamp: timestamp (nullable = true)
 |-- string_columns: struct (nullable = false)
 |    |-- Name: string (nullable = true)
 |    |-- Sex: string (nullable = true)
 |    |-- Age: string (nullable = true)
 |    |-- Ticket: string (nullable = true)
 |    |-- Fare: string (nullable = true)
 |    |-- Cabin: string (nullable = true)
 |    |-- Embarked: string (nullable = true)
 |-- numeric_columns: struct (nullable = false)
 |    |-- PassengerId: integer (nullable = true)
 |    |-- Survived: integer (nullable = true)
 |    |-- Pclass: integer (nullable = true)
 |    |-- SibSp: integer (nullable = true)
 |    |-- Parch: integer (nullable = true)
 |-- data_packed_for_kafka: string (nullable = true)



In [21]:
# Publish the packed JSON as the Kafka record value to the "topic_nested" topic.
# DataFrameWriter.save() returns None, so its result is intentionally not bound
# to a name: this is a one-off batch write, not a streaming query handle.
(
    df_to_kafka.select(F.col("data_packed_for_kafka").alias("value"))
    .write.format("kafka")
    .option("kafka.bootstrap.servers", "localhost:9092")
    .option("topic", "topic_nested")
    .save()
)


In [22]:
# Bring the final row to the driver and take its packed JSON string.
last_packed_row = df_to_kafka.select("data_packed_for_kafka").tail(1)[0]
meant_to_be_json = last_packed_row["data_packed_for_kafka"]

In [23]:
# Prove the payload is well-formed JSON: json.loads raises on malformed input,
# and on success the parsed dict is shown as the cell output.
# (json is imported in the notebook's import cell at the top.)
json.loads(meant_to_be_json)

{'Timestamp': '2020-01-01T13:32:25.000Z',
 'string_columns': {'Name': 'Dooley, Mr. Patrick',
  'Sex': 'male',
  'Age': '32',
  'Ticket': '370376',
  'Fare': '7.75',
  'Embarked': 'Q'},
 'numeric_columns': {'PassengerId': 891,
  'Survived': 0,
  'Pclass': 3,
  'SibSp': 0,
  'Parch': 0}}

In [24]:
# Shut down the Spark session now that the notebook's work is done.
spark.stop()