In [31]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json,col, isnan, when, lower
from pyspark.sql.types import StructType, StringType, FloatType, IntegerType, StructField,DoubleType,T
from pyspark.sql import functions as F

In [5]:
spark = (
    SparkSession.builder.master("local[1]")
    .appName("solution")
    .config(
        "spark.jars.packages",
        "org.apache.spark:spark-sql-kafka-0-10_2.12:3.3.1",
    )
    .getOrCreate()
)

In [6]:
spark

In [7]:
landslides_schema = StructType([
    StructField("id", StringType()),
    StructField("distance", StringType()),
    StructField("landslide_size", StringType()),
    StructField("injuries", StringType()),
    StructField("fatalities", StringType()),
     StructField("location_columns", StructType([
         StructField("country_code", StringType()),
         StructField("state/province", StringType()),
         StructField("city/town", StringType()),
         StructField("population", StringType()),
         StructField("latitude", StringType()),
         StructField("longitude", StringType())
     ])),
        StructField("time_columns", StructType([
        StructField("date", StringType()),
        StructField("time", StringType()),
     ]))
])


In [8]:
data = spark \
  .read \
  .format("kafka") \
  .option("kafka.bootstrap.servers", "localhost:9092") \
  .option("subscribe", "landslides_topic") \
  .option("failOnDataLoss", "true") \
  .load()
data.printSchema()

root
 |-- key: binary (nullable = true)
 |-- value: binary (nullable = true)
 |-- topic: string (nullable = true)
 |-- partition: integer (nullable = true)
 |-- offset: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- timestampType: integer (nullable = true)



In [13]:
parse_data = data.selectExpr("CAST(value AS STRING)").select(from_json(col("value"), landslides_schema).alias("data")).select("data.*")
parse_data.show(5)
parse_data.printSchema()

+---+--------+--------------+--------+----------+--------------------+---------------+
| id|distance|landslide_size|injuries|fatalities|    location_columns|   time_columns|
+---+--------+--------------+--------+----------+--------------------+---------------+
| 34| 3.40765|         Small|    null|      null|{US, Virginia, Ch...|{3/2/07, Night}|
| 42| 3.33522|         Small|    null|      null|{US, Ohio, New Ph...|{3/22/07, null}|
| 56| 2.91977|         Small|    null|      null|{US, Pennsylvania...| {4/6/07, null}|
| 59| 2.98682|         Small|    null|      null|{CA, Quebec, Chât...|{4/14/07, null}|
| 61| 5.66542|         Small|    null|       0.0|{US, Kentucky, Pi...|{4/15/07, null}|
+---+--------+--------------+--------+----------+--------------------+---------------+
only showing top 5 rows

root
 |-- id: string (nullable = true)
 |-- distance: string (nullable = true)
 |-- landslide_size: string (nullable = true)
 |-- injuries: string (nullable = true)
 |-- fatalities: string (nu

In [14]:
parse_data.count()

3386

In [17]:
denest_df = parse_data.dropDuplicates()

In [19]:
denest_df.drop('time').show(10)

+----+--------+--------------+--------+----------+--------------------+------------------+
|  id|distance|landslide_size|injuries|fatalities|    location_columns|      time_columns|
+----+--------+--------------+--------+----------+--------------------+------------------+
| 277| 2.79113|        Medium|    null|       3.0|{GT, Guatemala, G...|   {9/22/07, null}|
| 984| 4.86398|        Medium|    null|       0.0|{DO, Santiago, Pe...|   {2/12/09, null}|
|2156| 4.00979|         Small|    null|       0.0|{TT, Diego Martin...|   {7/29/10, null}|
|3130|  3.7758|        Medium|    null|       0.0|{JM, Saint Mary, ...|   {2/16/11, null}|
|3825| 3.75018|        Medium|    null|       0.0|{US, Florida, Wil...|   {7/21/11, null}|
|4613| 4.23278|        Medium|    null|      null|{US, North Caroli...|  {11/14/12, null}|
|4715| 5.63904|        Medium|    null|      null|{US, North Caroli...|   {1/15/13, null}|
|4922| 1.41843|         Small|    null|       0.0|{US, New York, Mi...|{6/16/13, 0:00:00}|

In [22]:

dinest_df = parse_data.select(
    "id",
    "distance",
    "landslide_size",
    "injuries",
    "fatalities",
    "location_columns.*",
    "time_columns.*"
    
)
dinest_df.show(10)

+---+--------+--------------+--------+----------+------------+----------------+----------------+----------+--------+---------+-------+-----+
| id|distance|landslide_size|injuries|fatalities|country_code|  state/province|       city/town|population|latitude|longitude|   date| time|
+---+--------+--------------+--------+----------+------------+----------------+----------------+----------+--------+---------+-------+-----+
| 34| 3.40765|         Small|    null|      null|          US|        Virginia|     Cherry Hill|     16000| 38.6009| -77.2682| 3/2/07|Night|
| 42| 3.33522|         Small|    null|      null|          US|            Ohio|New Philadelphia|     17288| 40.5175| -81.4305|3/22/07| null|
| 56| 2.91977|         Small|    null|      null|          US|    Pennsylvania|     Wilkinsburg|     15930| 40.4377|  -79.916| 4/6/07| null|
| 59| 2.98682|         Small|    null|      null|          CA|          Quebec|     Châteauguay|     42786| 45.3226| -73.7771|4/14/07| null|
| 61| 5.66542

In [23]:
dinest_df = dinest_df.fillna({"latitude": dinest_df.agg({'latitude': 'avg'}).collect()[0][0]})
dinest_df = dinest_df.fillna({"longitude": dinest_df.agg({'longitude': 'avg'}).collect()[0][0]})
dinest_df = dinest_df.fillna({"distance": dinest_df.agg({'distance': 'avg'}).collect()[0][0]})


In [27]:

df_not_null = dinest_df.filter((F.col("date").isNotNull()) & \
            (F.col("state/province").isNotNull()) & \
             (F.col("city/town").isNotNull()))

df_not_null.show()

+---+--------+--------------+--------+----------+------------+-----------------+--------------------+----------+--------+---------+-------+-----+
| id|distance|landslide_size|injuries|fatalities|country_code|   state/province|           city/town|population|latitude|longitude|   date| time|
+---+--------+--------------+--------+----------+------------+-----------------+--------------------+----------+--------+---------+-------+-----+
| 34| 3.40765|         Small|    null|      null|          US|         Virginia|         Cherry Hill|     16000| 38.6009| -77.2682| 3/2/07|Night|
| 42| 3.33522|         Small|    null|      null|          US|             Ohio|    New Philadelphia|     17288| 40.5175| -81.4305|3/22/07| null|
| 56| 2.91977|         Small|    null|      null|          US|     Pennsylvania|         Wilkinsburg|     15930| 40.4377|  -79.916| 4/6/07| null|
| 59| 2.98682|         Small|    null|      null|          CA|           Quebec|         Châteauguay|     42786| 45.3226| -7

In [28]:
dinest_df=dinest_df.withColumn("id", F.col("injuries").cast(T.IntegerType()))\
           .withColumn("injuries", F.col("injuries").cast(T.IntegerType()))\
           .withColumn("fatalities", F.col("fatalities").cast(T.IntegerType()))\
           .withColumn("date", F.col("date").cast(T.DateType()))

NameError: name 'T' is not defined