# Data preparation

In [1]:
import json

from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import pyspark.sql.types as T

In [None]:
# Maven coordinates of the Spark-Kafka connector; the suffix pins the
# Scala 2.12 build for Spark 3.3.1.
# NOTE(review): presumably this has to match the installed PySpark version - confirm.
kafka_connector_package = "org.apache.spark:spark-sql-kafka-0-10_2.12:3.3.1"

# Local session on a single worker thread; the connector is requested through
# the spark.jars.packages setting.
session_builder = SparkSession.builder.master("local[1]").appName(
    "Exam data preparation"
)
spark = session_builder.config(
    "spark.jars.packages", kafka_connector_package
).getOrCreate()

In [None]:
# Echo the session object so the notebook renders it as the cell output
# (a quick check that the session was created).
spark

### Load data from files

In [None]:
# Column layout of a landslide record. It is also used further down to parse
# the JSON messages read back from Kafka.
landslides_schema = (
    T.StructType()
    .add("id", T.StringType())
    .add("date", T.StringType())
    .add("time", T.StringType())
    .add("country_code", T.StringType())
    .add("state/province", T.StringType())
    .add("population", T.IntegerType())
    .add("city/town", T.StringType())
    .add("distance", T.FloatType())
    .add("latitude", T.FloatType())
    .add("longitude", T.FloatType())
    .add("landslide_size", T.StringType())
    .add("injuries", T.FloatType())
    .add("fatalities", T.FloatType())
)

# DataFrameReader.parquet() does not take a `schema=` keyword (unknown keyword
# options are silently ignored), so none is passed: the column types come from
# the schema embedded in the Parquet file. `landslides_schema` is applied later,
# when the JSON messages are parsed with from_json.
df_source_batch_landslides = spark.read.parquet("./data/landslides.parquet")

# Pack every column of a row into one JSON string named "value", which is the
# column the Kafka writer sends as the message payload.
df_source_batch_landslides = df_source_batch_landslides.withColumn(
    "value",
    F.to_json(F.struct(*df_source_batch_landslides.columns)).cast(
        T.StringType()
    ),
)

# Only the writer is configured here; nothing is sent until save() is called.
dataframe_source_batch_writer_landslides = (
    df_source_batch_landslides.select("value")
    .write.format("kafka")
    .option("kafka.bootstrap.servers", "localhost:9092")
    .option("topic", "temp_topic")
)

# Number of messages a single save() will write.
print(df_source_batch_landslides.select("value").count())

# Saved twice on purpose: each record lands in the topic twice, so there are
# duplicates to drop later.
dataframe_source_batch_writer_landslides.save()
dataframe_source_batch_writer_landslides.save()

In [None]:
# Two string columns describing the country lookup file.
countries_schema = T.StructType(
    [
        T.StructField("country_code", T.StringType()),
        T.StructField("country_name", T.StringType()),
    ]
)

# No `header` option is set, so every line of the file is read as a data row.
# NOTE(review): confirm that countries.csv has no header line.
countries_raw = spark.read.csv("./data/countries.csv", schema=countries_schema)

# Serialise each row to a JSON string in a "value" column - the message payload
# for the Kafka writer.
countries_as_json = F.to_json(F.struct(*countries_raw.columns)).cast("string")
df_source_batch_countries = countries_raw.withColumn("value", countries_as_json)

countries_messages = df_source_batch_countries.select("value")
dataframe_source_batch_writer_countries = countries_messages.write.format(
    "kafka"
).options(
    **{
        "kafka.bootstrap.servers": "localhost:9092",
        "topic": "countries_topic",
    }
)

# Show how many messages are about to be written, then write them once.
message_count = countries_messages.count()
print(message_count)

dataframe_source_batch_writer_countries.save()

### Prepare landslides data

In [None]:
# Batch-read the messages of the staging topic.
df = (
    spark.read.format("kafka")
    .option("kafka.bootstrap.servers", "localhost:9092")
    .option("subscribe", "temp_topic")
    .option("failOnDataLoss", "true")
    .load()
)

# Cast the Kafka `value` payload to a string and parse it as JSON with the
# landslides schema, then flatten the parsed struct back into plain columns.
df = df.withColumn(
    "message_content",
    F.from_json(F.col("value").cast("string"), landslides_schema),
)
df_minimal = df.select("message_content.*")

# Columns that get nested into two sub-structs of the outgoing message.
location_columns = [
    "country_code",
    "state/province",
    "city/town",
    "population",
    "latitude",
    "longitude",
]
time_columns = ["date", "time"]

# Nest the location and time columns, keeping the remaining fields flat.
df_to_kafka = (
    df_minimal.withColumn("location_columns", F.struct(*location_columns))
    .withColumn("time_columns", F.struct(*time_columns))
    .select(
        "id",
        "distance",
        "landslide_size",
        "injuries",
        "fatalities",
        "location_columns",
        "time_columns",
    )
)

# Serialise the whole reshaped row into a single JSON string column.
df_to_kafka = df_to_kafka.withColumn(
    "data_packed_for_kafka", F.to_json(F.struct(*df_to_kafka.columns))
)
df_to_kafka.printSchema()

# save() is an action that returns None, so its result is not bound to a name.
(
    df_to_kafka.select(F.col("data_packed_for_kafka").alias("value"))
    .write.format("kafka")
    .option("kafka.bootstrap.servers", "localhost:9092")
    .option("topic", "landslides_topic")
    .save()
)

Check that the packed `data_packed_for_kafka` payload is valid JSON

In [None]:
# Take the last row of the packed column and pull out its string payload.
last_packed_row = df_to_kafka.select("data_packed_for_kafka").tail(1)[0]
meant_to_be_json = last_packed_row["data_packed_for_kafka"]

In [None]:
# json.loads raises on malformed input, so this cell finishing (and rendering
# the parsed object as its output) shows the sampled payload is valid JSON.
json.loads(meant_to_be_json)

In [None]:
# Shut the Spark session down now that all reads and writes are finished.
spark.stop()