# Consultas PySpark y limpieza de datos

Importamos pyspark.

In [4]:
# Importaciones
import pyspark
from pyspark.sql import SparkSession, functions as F, types as T

spark = SparkSession.builder.appName("Earthquake").getOrCreate()
spark

Realiza la limpieza de datos del siguiente CSV.

In [5]:
df = (spark.read
      .option("header", True)
      .option("inferSchema", True)
      .csv("earthquake_data_tsunami.csv"))

In [8]:
nulos = df.select([F.count(F.when(F.col(c).isNull(), c)).alias(c) for c in df.columns])
nulos.show()
nulos = nulos.dropna()

+---------+---+---+---+---+----+---+-----+--------+---------+----+-----+-------+
|magnitude|cdi|mmi|sig|nst|dmin|gap|depth|latitude|longitude|Year|Month|tsunami|
+---------+---+---+---+---+----+---+-----+--------+---------+----+-----+-------+
|        0|  0|  0|  0|  0|   0|  0|    0|       0|        0|   0|    0|      0|
+---------+---+---+---+---+----+---+-----+--------+---------+----+-----+-------+



In [7]:
df.dropDuplicates()

DataFrame[magnitude: double, cdi: int, mmi: int, sig: int, nst: int, dmin: double, gap: double, depth: double, latitude: double, longitude: double, Year: int, Month: int, tsunami: int]

Contar cuantos eventos suceden por mes.

In [13]:
eventos_mes = df.groupBy("Month").count().orderBy("Month")
eventos_mes.show()

+-----+-----+
|Month|count|
+-----+-----+
|    1|   70|
|    2|   63|
|    3|   63|
|    4|   77|
|    5|   58|
|    6|   42|
|    7|   56|
|    8|   68|
|    9|   80|
|   10|   69|
|   11|   80|
|   12|   56|
+-----+-----+



Máximo de magnitud de terremotos.

In [24]:
df.createOrReplaceTempView("Earthquake")
maximo_magnitud = spark.sql("SELECT MAX(magnitude) as max_magnitude FROM Earthquake")
maximo_magnitud.show()

+-------------+
|max_magnitude|
+-------------+
|          9.1|
+-------------+



Top 10 años que mas aparecen en el CSV.

In [None]:
#TODO
años_top10 = (df.groupBy("Year").count())
df.show(10)

+---------+---+---+----+---+-----+----+-------+--------+---------+----+-----+-------+
|magnitude|cdi|mmi| sig|nst| dmin| gap|  depth|latitude|longitude|Year|Month|tsunami|
+---------+---+---+----+---+-----+----+-------+--------+---------+----+-----+-------+
|      7.0|  8|  7| 768|117|0.509|17.0|   14.0| -9.7963|  159.596|2022|   11|      1|
|      6.9|  4|  4| 735| 99|2.229|34.0|   25.0| -4.9559|  100.738|2022|   11|      0|
|      7.0|  3|  3| 755|147|3.125|18.0|  579.0|-20.0508| -178.346|2022|   11|      1|
|      7.3|  5|  5| 833|149|1.865|21.0|   37.0|-19.2918| -172.129|2022|   11|      1|
|      6.6|  0|  2| 670|131|4.998|27.0|624.464|-25.5948|  178.278|2022|   11|      1|
|      7.0|  4|  3| 755|142|4.578|26.0|  660.0|-26.0442|  178.381|2022|   11|      1|
|      6.8|  1|  3| 711|136|4.678|22.0|630.379|-25.9678|  178.363|2022|   11|      1|
|      6.7|  7|  6| 797|145|1.151|37.0|   20.0|  7.6712| -82.3396|2022|   10|      1|
|      6.8|  8|  7|1179|175|2.137|92.0|   20.0|   18.3

Magnitud Media por año.

Máximo de magnitud por mes.