In [38]:
import pyspark

In [39]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import pyspark.sql.types as T
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, IntegerType, FloatType, TimestampType, MapType

In [40]:
 # Create (or reuse) a Spark session running locally with master "local[1]".
 # The Kafka SQL connector is requested via spark.jars.packages so that
 # format("kafka") is available to the reader.
 # NOTE(review): the connector coordinates (Scala 2.12, 3.3.1) presumably need
 # to match the installed Spark build -- confirm.
 spark = SparkSession.builder \
     .master("local[1]") \
     .appName("pyspark solution") \
     .config("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.12:3.3.1") \
     .getOrCreate()

In [41]:
# Evaluate the session object so the notebook displays it (sanity check that
# the session was created).
spark

In [42]:
df = spark.read.format("kafka") \
    .option("kafka.bootstrap.servers", "localhost:9092") \
    .option("subscribe", "topic_nested") \
    .load() \
    .selectExpr("CAST(value AS STRING)") \
    .select(F.from_json("value", schema).alias("data")) \
    .select("data.*")

df = df.select("Timestamp", "string_columns.*", "numeric_columns.*")

In [43]:
schema = StructType([
    StructField("Timestamp", TimestampType(), True),
    StructField("string_columns", StructType([
        StructField("Name", StringType(), True),
        StructField("Sex", StringType(), True),
        StructField("Age", StringType(), True),
        StructField("Ticket", StringType(), True),
        StructField("Fare", StringType(), True),
        StructField("Cabin", StringType(), True),
        StructField("Embarked", StringType(), True)
    ]), True),
    StructField("numeric_columns", StructType([
        StructField("PassengerId", IntegerType(), True),
        StructField("Survived", IntegerType(), True),
        StructField("Pclass", IntegerType(), True),
        StructField("SibSp", IntegerType(), True),
        StructField("Parch", IntegerType(), True)
    ]), True)
])

In [44]:
# Verify the flattened layout: Timestamp followed by the string fields and
# then the integer fields, all as top-level columns.
df.printSchema()

root
 |-- Timestamp: timestamp (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: string (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: string (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)



In [45]:
# Remove rows that are exact duplicates across every column (Timestamp
# included); no column subset is given, so only fully identical rows collapse.
df = df.distinct()

In [46]:
# Number of rows left after de-duplication.
df.count()

891

In [47]:
# Preview the de-duplicated rows; nulls are visible in Age and Cabin here.
df.show()

+-------------------+--------------------+------+----+-----------------+-------+-----+--------+-----------+--------+------+-----+-----+
|          Timestamp|                Name|   Sex| Age|           Ticket|   Fare|Cabin|Embarked|PassengerId|Survived|Pclass|SibSp|Parch|
+-------------------+--------------------+------+----+-----------------+-------+-----+--------+-----------+--------+------+-----+-----+
|2020-01-01 13:46:21|"Lovell, Mr. John...|  male|  20|        A/5 21173|   7.25| null|       S|        228|       0|     3|    0|    0|
|2020-01-01 13:39:12|Kimball, Mr. Edwi...|  male|  42|            11753|52.5542|  D19|       S|        622|       1|     1|    1|    0|
|2020-01-01 13:36:29|Alexander, Mr. Wi...|  male|  26|             3474| 7.8875| null|       S|        811|       0|     3|    0|    0|
|2020-01-01 13:44:34|Marechal, Mr. Pierre|  male|null|            11774|   29.7|  C47|       C|        840|       1|     1|    0|    0|
|2020-01-01 13:43:40|Stead, Mr. Willia...|  male

In [48]:
# Keep only rows where Age, Cabin and Embarked are all non-null.
# Despite its name, df_na is the frame with the null-containing rows removed.
df_na = df.na.drop(how="any", subset=["Age", "Cabin", "Embarked"])

In [49]:
# Preview the rows that survived the null filter.
df_na.show()

+-------------------+--------------------+------+---+----------+-------+-----------+--------+-----------+--------+------+-----+-----+
|          Timestamp|                Name|   Sex|Age|    Ticket|   Fare|      Cabin|Embarked|PassengerId|Survived|Pclass|SibSp|Parch|
+-------------------+--------------------+------+---+----------+-------+-----------+--------+-----------+--------+------+-----+-----+
|2020-01-01 13:39:12|Kimball, Mr. Edwi...|  male| 42|     11753|52.5542|        D19|       S|        622|       1|     1|    1|    0|
|2020-01-01 13:43:40|Stead, Mr. Willia...|  male| 62|    113514|  26.55|        C87|       S|        253|       0|     1|    0|    0|
|2020-01-01 13:33:14|Soholt, Mr. Peter...|  male| 19|    348124|   7.65|      F G73|       S|        716|       0|     3|    0|    0|
|2020-01-01 13:33:36|Beesley, Mr. Lawr...|  male| 34|    248698|   13.0|        D56|       S|         22|       1|     2|    0|    0|
|2020-01-01 13:40:51|Carlsson, Mr. Fra...|  male| 33|       69

In [50]:
# Number of rows remaining after dropping rows with a null Age, Cabin or Embarked.
df_na.count()

183

In [51]:
# Remove the three columns that are not carried into the final output.
df_columns = df_na.drop(
    "Pclass",
    "SibSp",
    "Parch",
)

In [52]:
# Preview the first 5 rows to confirm Pclass, SibSp and Parch are gone.
df_columns.show(5)

+-------------------+--------------------+----+---+------+-------+-----------+--------+-----------+--------+
|          Timestamp|                Name| Sex|Age|Ticket|   Fare|      Cabin|Embarked|PassengerId|Survived|
+-------------------+--------------------+----+---+------+-------+-----------+--------+-----------+--------+
|2020-01-01 13:39:12|Kimball, Mr. Edwi...|male| 42| 11753|52.5542|        D19|       S|        622|       1|
|2020-01-01 13:43:40|Stead, Mr. Willia...|male| 62|113514|  26.55|        C87|       S|        253|       0|
|2020-01-01 13:33:14|Soholt, Mr. Peter...|male| 19|348124|   7.65|      F G73|       S|        716|       0|
|2020-01-01 13:33:36|Beesley, Mr. Lawr...|male| 34|248698|   13.0|        D56|       S|         22|       1|
|2020-01-01 13:40:51|Carlsson, Mr. Fra...|male| 33|   695|    5.0|B51 B53 B55|       S|        873|       0|
+-------------------+--------------------+----+---+------+-------+-----------+--------+-----------+--------+
only showing top 5 

In [53]:
# Age and Fare are declared as strings in the JSON schema; convert them to
# integer and single-precision float columns respectively.
df_final = (
    df_columns
    .withColumn("Age", F.col("Age").cast("int"))
    .withColumn("Fare", F.col("Fare").cast("float"))
)

In [54]:
# Confirm the casts took effect: Age should be 'int' and Fare 'float'.
df_final.dtypes

[('Timestamp', 'timestamp'),
 ('Name', 'string'),
 ('Sex', 'string'),
 ('Age', 'int'),
 ('Ticket', 'string'),
 ('Fare', 'float'),
 ('Cabin', 'string'),
 ('Embarked', 'string'),
 ('PassengerId', 'int'),
 ('Survived', 'int')]

In [None]:
# Persist the cleaned frame as JSON under the relative path 'json_data'.
# mode("overwrite") replaces the output of a previous run; with the default
# save mode the write fails when the path already exists, so the cell could
# not be re-run.
df_final.write.mode("overwrite").json('json_data')