# Consultas PySpark y limpieza de datos

Importamos pyspark.

In [2]:
# Importaciones
import pyspark
from pyspark.sql import SparkSession, functions as F, types as T

# Get the active Spark session if there is one, otherwise start a new one
# registered under the application name "Earthquake".
spark = (
    SparkSession.builder
    .appName("Earthquake")
    .getOrCreate()
)
# Trailing expression: the notebook renders the session summary.
spark

Realiza la limpieza de datos del siguiente CSV.

In [3]:
# Load the earthquake CSV: the first line holds the column names and the
# column types are inferred from the data instead of defaulting to strings.
df = spark.read.csv(
    "earthquake_data_tsunami.csv",
    header=True,
    inferSchema=True,
)

In [4]:
# Build a one-row summary with the number of null values in each column:
# `when` yields a non-null value only for null cells, and `count` counts
# the non-null results.
nulos = df.select([F.count(F.when(F.col(c).isNull(), c)).alias(c) for c in df.columns])
nulos.show()
# Remove incomplete rows from the dataset itself. Calling dropna() on the
# `nulos` summary had no effect on the data: the summary only holds counts,
# which are never null, and `df` was left untouched.
df = df.dropna()

+---------+---+---+---+---+----+---+-----+--------+---------+----+-----+-------+
|magnitude|cdi|mmi|sig|nst|dmin|gap|depth|latitude|longitude|Year|Month|tsunami|
+---------+---+---+---+---+----+---+-----+--------+---------+----+-----+-------+
|        0|  0|  0|  0|  0|   0|  0|    0|       0|        0|   0|    0|      0|
+---------+---+---+---+---+----+---+-----+--------+---------+----+-----+-------+



In [5]:
# Spark DataFrames are immutable: dropDuplicates() returns a new DataFrame
# and leaves the original unchanged, so the result has to be assigned back
# for the later cells to work on de-duplicated rows.
df = df.dropDuplicates()
# Trailing expression: keep displaying the resulting schema as cell output.
df

DataFrame[magnitude: double, cdi: int, mmi: int, sig: int, nst: int, dmin: double, gap: double, depth: double, latitude: double, longitude: double, Year: int, Month: int, tsunami: int]

Contar cuántos eventos suceden por mes.

In [6]:
# Number of events per calendar month, listed in ascending month order.
eventos_mes = (
    df.groupBy("Month")
    .count()
    .orderBy("Month")
)
eventos_mes.show()

+-----+-----+
|Month|count|
+-----+-----+
|    1|   70|
|    2|   63|
|    3|   63|
|    4|   77|
|    5|   58|
|    6|   42|
|    7|   56|
|    8|   68|
|    9|   80|
|   10|   69|
|   11|   80|
|   12|   56|
+-----+-----+



Máximo de magnitud de terremotos.

In [7]:
# Expose the DataFrame to Spark SQL under the view name "Earthquake";
# the SQL queries in this notebook read from this view.
df.createOrReplaceTempView("Earthquake")

# Largest magnitude found in the whole dataset.
maximo_magnitud = spark.sql(
    "SELECT MAX(magnitude) as max_magnitude FROM Earthquake"
)
maximo_magnitud.show()

+-------------+
|max_magnitude|
+-------------+
|          9.1|
+-------------+



Top 10 de los años que más aparecen en el CSV.

In [21]:
# Rank the years by number of recorded events and keep the ten most frequent.
# Reads from the "Earthquake" temp view.
# `Year DESC` is a secondary sort key: several years share the same count,
# and without a tie-breaker the order of tied rows (and which of them
# survives the LIMIT) is not guaranteed to be the same between runs.
años_top10 = spark.sql(
    "SELECT Year, COUNT(Year) as Count FROM Earthquake "
    "GROUP BY Year ORDER BY Count DESC, Year DESC LIMIT 10"
)
años_top10.show()

+----+-----+
|Year|Count|
+----+-----+
|2015|   53|
|2013|   53|
|2014|   48|
|2018|   43|
|2016|   43|
|2021|   42|
|2010|   41|
|2022|   40|
|2007|   37|
|2017|   36|
+----+-----+



Magnitud Media por año.

In [None]:
# Average magnitude for each year, most recent year first.
# Reads from the "Earthquake" temp view.
magnitud_media = spark.sql(
    "SELECT Year, AVG(Magnitude) as Average FROM Earthquake GROUP BY Year ORDER BY Year DESC"
)
# show() with no arguments prints at most the first 20 rows.
magnitud_media.show()

+----+------------------+
|Year|           Average|
+----+------------------+
|2022|            6.8125|
|2021|  7.05238095238095|
|2020|  6.91111111111111|
|2019|6.8606060606060595|
|2018| 6.953488372093025|
|2017| 6.811111111111113|
|2016| 6.944186046511627|
|2015| 6.898113207547171|
|2014| 6.843749999999999|
|2013| 6.890566037735849|
|2012| 7.070967741935481|
|2011| 6.988235294117645|
|2010| 7.004878048780489|
|2009|  7.16153846153846|
|2008| 6.900000000000001|
|2007| 7.054054054054053|
|2006|  6.94230769230769|
|2005| 6.942857142857142|
|2004| 6.959374999999999|
|2003| 6.889032258064517|
+----+------------------+
only showing top 20 rows


Máximo de magnitud por mes.

In [29]:
# Strongest magnitude recorded in each calendar month, in month order.
# Reads from the "Earthquake" temp view.
max_magnitud_mes = spark.sql(
    "SELECT Month, MAX(Magnitude) as MaxMagnitude FROM Earthquake GROUP BY Month ORDER BY Month"
)
max_magnitud_mes.show()

+-----+------------+
|Month|MaxMagnitude|
+-----+------------+
|    1|         8.1|
|    2|         8.8|
|    3|         9.1|
|    4|         8.6|
|    5|         8.3|
|    6|         8.4|
|    7|         8.2|
|    8|         8.2|
|    9|         8.4|
|   10|         7.8|
|   11|         8.3|
|   12|         9.1|
+-----+------------+

