Spark Final Project

In [2]:
# import library
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [3]:
# Init SparkSession with HiveSupport
spark = SparkSession \
    .builder \
    .appName("Spark-Hive Connection") \
    .config("spark.sql.uris", "thrift://hive-metastore:9083") \
    .enableHiveSupport() \
    .getOrCreate()

In [24]:
# Move csv files to HDFS
!hdfs dfs -mkdir -p /tmp/final_spark_project/covid_br_data/
!hdfs dfs -put /mnt/notebooks/covid_br_data/HIST_PAINEL_COVIDBR_2020_Parte1_06jul2021.csv /tmp/final_spark_project/covid_br_data/
!hdfs dfs -put /mnt/notebooks/covid_br_data/HIST_PAINEL_COVIDBR_2020_Parte2_06jul2021.csv /tmp/final_spark_project/covid_br_data/
!hdfs dfs -put /mnt/notebooks/covid_br_data/HIST_PAINEL_COVIDBR_2021_Parte1_06jul2021.csv /tmp/final_spark_project/covid_br_data/
!hdfs dfs -put /mnt/notebooks/covid_br_data/HIST_PAINEL_COVIDBR_2021_Parte2_06jul2021.csv /tmp/final_spark_project/covid_br_data/

In [8]:
# Create dataframe from csv in hdfs
covid_df = spark.read.csv('/tmp/final_spark_project/covid_br_data/*.csv', sep=";", header=True)

In [9]:
# Check dataframe Schema
covid_df.dtypes

[('regiao', 'string'),
 ('estado', 'string'),
 ('municipio', 'string'),
 ('coduf', 'string'),
 ('codmun', 'string'),
 ('codRegiaoSaude', 'string'),
 ('nomeRegiaoSaude', 'string'),
 ('data', 'string'),
 ('semanaEpi', 'string'),
 ('populacaoTCU2019', 'string'),
 ('casosAcumulado', 'string'),
 ('casosNovos', 'string'),
 ('obitosAcumulado', 'string'),
 ('obitosNovos', 'string'),
 ('Recuperadosnovos', 'string'),
 ('emAcompanhamentoNovos', 'string'),
 ('interior/metropolitana', 'string')]

In [10]:
# Check dataframe data
covid_df.show(1,False,True)

-RECORD 0----------------------------
 regiao                 | Brasil     
 estado                 | null       
 municipio              | null       
 coduf                  | 76         
 codmun                 | null       
 codRegiaoSaude         | null       
 nomeRegiaoSaude        | null       
 data                   | 2020-02-25 
 semanaEpi              | 9          
 populacaoTCU2019       | 210147125  
 casosAcumulado         | 0          
 casosNovos             | 0          
 obitosAcumulado        | 0          
 obitosNovos            | 0          
 Recuperadosnovos       | null       
 emAcompanhamentoNovos  | null       
 interior/metropolitana | null       
only showing top 1 row



In [11]:
# Change dataframe Schema acording with the data
covid_df_1 = covid_df.select("regiao",
                             "estado",
                             "municipio",
                             col("coduf").cast("integer"),
                             col("codmun").cast("integer"),
                             col("codRegiaoSaude").cast("integer"),
                             "nomeRegiaoSaude",
                             col("data").cast("date"),
                             col("semanaEpi").cast("integer"),
                             col("populacaoTCU2019").cast("date"),
                             col("casosAcumulado").cast("integer"),
                             col("casosNovos").cast("integer"),
                             col("obitosAcumulado").cast("integer"),
                             col("obitosNovos").cast("integer"),
                             col("Recuperadosnovos").cast("integer"),
                             col("emAcompanhamentoNovos").cast("integer"),
                             col("interior/metropolitana").cast("integer"))


In [12]:
# Show the Hive Database
spark.sql("show databases").show()

+------------+
|databaseName|
+------------+
|     default|
+------------+



In [14]:
# Send the data from HDFS To Hive
covid_df_1.write.format("csv").partitionBy("municipio").saveAsTable("covid_br_data")

In [15]:
# Show the table created
spark.sql("show tables").show()

+--------+-------------+-----------+
|database|    tableName|isTemporary|
+--------+-------------+-----------+
| default|covid_br_data|      false|
+--------+-------------+-----------+



Creating dataframes for Visualizations

First View - Hive

In [13]:
Recuperados = spark.sql("select Recuperadosnovos as Casos_Recuperados from covid_br_data order by 1 desc limit 1")
Recuperados.show()

+-----------------+
|Casos_Recuperados|
+-----------------+
|         17262646|
+-----------------+



In [None]:
Acompanhamento = spark.sql("select emAcompanhamentoNovos as Em_Acompanhamento from covid_br_data order by 1 desc limit 1")
Acompanhamento.show()

Second View - HDFS

In [15]:
casosAcumulado = spark.sql("select casosAcumulado as Acumulado from covid_br_data order by 1 desc limit 1")
casosAcumulado.show()

+---------+
|Acumulado|
+---------+
| 18855015|
+---------+



In [18]:
casosNovos = spark.sql("select casosNovos as Casos_Novos from covid_br_data order by 1 desc limit 1")
casosNovos.show()

+-----------+
|Casos_Novos|
+-----------+
|     115228|
+-----------+



In [22]:
Incidencia = spark.sql("select ((casosAcumulado/210147125)*100000) as Incidencia from covid_br_data order by 1 desc limit 1")
Incidencia.show()

+-----------------+
|       Incidencia|
+-----------------+
|8972.292625940041|
+-----------------+



Third View - Kakfa

In [29]:
# Kafka only accept data from a string type value column 
# Put the dataframe result into a json format to further use
Obitos_Acumulados = spark.sql("select obitosAcumulado from covid_br_data order by 1 desc limit 1")
Obitos_Acumulados_json = Obitos_Acumulados.select(to_json(struct("obitosAcumulado")).alias("value"))

In [33]:
Obitos_Acumulados_json.show(1,False,False)

+--------------------------+
|value                     |
+--------------------------+
|{"obitosAcumulado":526892}|
+--------------------------+



In [58]:
obitosNovos = spark.sql("select obitosNovos from covid_br_data order by 1 desc limit 1")
obitosNovos_json = obitosNovos.select(to_json(struct("obitosNovos")).alias("value"))

In [59]:
obitosNovos_json.show(1,False,False)

+--------------------+
|               value|
+--------------------+
|{"obitosNovos":4249}|
+--------------------+



In [81]:
Mortalidade = spark.sql("select cast(((obitosAcumulado/210147125)*100000) as decimal) as Mortalidade from covid_br_data order by 1 desc limit 1")
Mortalidade_json = Mortalidade.select(to_json(struct("Mortalidade")).alias("value"))

In [82]:
Mortalidade_json.show(1,False,False)

+-------------------+
|value              |
+-------------------+
|{"Mortalidade":251}|
+-------------------+



In [95]:
Letalidade = spark.sql("select obitosNovos, casosNovos, cast((obitosNovos/casosNovos)*100 as decimal) as Letalidade from covid_br_data order by 1 desc limit 1")
Letalidade_json = Letalidade.select(to_json(struct("Letalidade")).alias("value"))

In [96]:
Letalidade_json.show(1,False,False)

+----------------+
|value           |
+----------------+
|{"Letalidade":5}|
+----------------+



Saving the dataframes

Save to Hive Table

In [None]:
Recuperados.write.format("csv").saveAsTable("Recuperados")

In [None]:
Acompanhamento.write.format("csv").saveAsTable("Acompanhamento")

In [14]:
# Check the created tables
spark.sql("Show tables").show()

+--------+--------------+-----------+
|database|     tableName|isTemporary|
+--------+--------------+-----------+
| default|acompanhamento|      false|
| default| covid_br_data|      false|
| default|   recuperados|      false|
+--------+--------------+-----------+



Save to HDFS as parquet with snappy compression

In [17]:
casosAcumulado.write.option("compression","snappy").parquet("/user/final_spark_project/casosAcumulado")

In [20]:
casosNovos.write.option("compression","snappy").parquet("/user/final_spark_project/casosNovos")

In [24]:
Incidencia.write.option("compression","snappy").parquet("/user/final_spark_project/Incidencia")

In [25]:
# Check the files in HDFS
!hdfs dfs -ls /user/final_spark_project/

Found 4 items
drwxr-xr-x   - root supergroup          0 2021-11-09 19:52 /user/final_spark_project/Incidencia
drwxr-xr-x   - root supergroup          0 2021-11-09 19:42 /user/final_spark_project/casosAcumulado
drwxr-xr-x   - root supergroup          0 2021-11-09 19:47 /user/final_spark_project/casosNovos
drwxr-xr-x   - root supergroup          0 2021-11-09 18:08 /user/final_spark_project/covid_br_data


Save to Kafka topic

In [31]:
# Saving the dataframe to a kafka topic
(Obitos_Acumulados_json.write
                .format("kafka") 
                .option("kafka.bootstrap.servers","kafka:9092") 
                .option("topic","topic-obitos_acumulados")                
                .save())

In [44]:
# Loading the topic saved to a temp dataframe and show the results
Obitos_Acumulados_df = (spark 
                            .read 
                            .format("kafka") 
                            .option("kafka.bootstrap.servers", "kafka:9092") 
                            .option("subscribe", "topic-obitos_acumulados")  
                            .load())

In [57]:
# Convert the binary result in the dataframe to integer for visualization
schema = StructType().add("obitosAcumulado", IntegerType())
Obitos_Acumulados_df.where("timestamp >= '2021-11-10 12:52'").select(from_json(col("value").cast("string"), schema)).show()

+------------------------------------+
|jsontostructs(CAST(value AS STRING))|
+------------------------------------+
|                            [526892]|
+------------------------------------+



In [66]:
# Saving the dataframe to a kafka topic
(obitosNovos_json.write
                        .format("kafka") 
                        .option("kafka.bootstrap.servers","kafka:9092") 
                        .option("topic","topic-obitos_novos")                
                        .save())

In [67]:
# Loading the topic saved to a temp dataframe and show the results
obitosNovos_df = (spark 
                        .read 
                        .format("kafka") 
                        .option("kafka.bootstrap.servers", "kafka:9092") 
                        .option("subscribe", "topic-obitos_novos")  
                        .load())

In [69]:
# Convert the binary result in the dataframe to integer for visualization
schema = StructType().add("obitosNovos", IntegerType())
obitosNovos_df.select(from_json(col("value").cast("string"), schema)).show()

+------------------------------------+
|jsontostructs(CAST(value AS STRING))|
+------------------------------------+
|                              [4249]|
+------------------------------------+



In [83]:
# Saving the dataframe to a kafka topic
(Mortalidade_json.write
                        .format("kafka") 
                        .option("kafka.bootstrap.servers","kafka:9092") 
                        .option("topic","topic-mortalidade")                
                        .save())

In [84]:
# Loading the topic saved to a temp dataframe and show the results
Mortalidade_df = (spark 
                        .read 
                        .format("kafka") 
                        .option("kafka.bootstrap.servers", "kafka:9092") 
                        .option("subscribe", "topic-mortalidade")  
                        .load())

In [87]:
# Convert the binary result in the dataframe to integer for visualization
schema = StructType().add("Mortalidade", IntegerType())
Mortalidade_df.where("timestamp >= '2021-11-10 13:52'").select(from_json(col("value").cast("string"), schema)).show()

+------------------------------------+
|jsontostructs(CAST(value AS STRING))|
+------------------------------------+
|                               [251]|
+------------------------------------+



In [97]:
# Saving the dataframe to a kafka topic
(Letalidade_json.write
                        .format("kafka") 
                        .option("kafka.bootstrap.servers","kafka:9092") 
                        .option("topic","topic-Letalidade")                
                        .save())

In [102]:
# Loading the topic saved to a temp dataframe and show the results
Letalidade_df = (spark 
                        .read 
                        .format("kafka") 
                        .option("kafka.bootstrap.servers", "kafka:9092") 
                        .option("subscribe", "topic-Letalidade")  
                        .load())

In [109]:
# Convert the binary result in the dataframe to integer for visualization
schema = StructType().add("Letalidade", IntegerType())
Letalidade_df.where("timestamp >= '2021-11-10 14:15'").select(from_json(col("value").cast("string"), schema)).show()

+------------------------------------+
|jsontostructs(CAST(value AS STRING))|
+------------------------------------+
|                                 [5]|
+------------------------------------+



Spark View

In [28]:
spark_df = spark.sql("""select regiao, 
         max(casosAcumulado) as Casos, 
         max(obitosAcumulado) as Obitos, 
         max(cast(((casosAcumulado/210147125)*100000) as decimal(18,2))) as Incidencia, 
         max(cast(((obitosAcumulado/210147125)*100000) as decimal(18,2))) as Mortalidade, 
         max(data) as Atualizacao
         from covid_br_data 
         group by regiao 
         order by regiao""")
spark_df.show()

+------------+--------+------+----------+-----------+-----------+
|      regiao|   Casos|Obitos|Incidencia|Mortalidade|Atualizacao|
+------------+--------+------+----------+-----------+-----------+
|      Brasil|18855015| 99572|   8972.29|     250.73| 2021-07-06|
|Centro-Oeste|  686433|  9980|    326.64|       9.27| 2021-07-06|
|    Nordeste| 1141612|  9993|    543.24|      11.62| 2021-07-06|
|       Norte|  557708|  9992|    265.39|       7.43| 2021-07-06|
|     Sudeste| 3809222| 99989|   1812.65|      62.05| 2021-07-06|
|         Sul| 1308643|   999|    622.73|      15.16| 2021-07-06|
+------------+--------+------+----------+-----------+-----------+



Elastic View

Create a topic in elastic

In [112]:
# Dataframe to Elastic - topic
obitos_acumulados_es = (Obitos_Acumulados_json.write
                            .format('org.elasticsearch.spark.sql')
                            .option('es.nodes', 'localhost')
                            .option('es.port', 9200)
                            .option('es.resource', '%s/%s' % ('index_name', 'doc_type_name'),
                            ).save("user/elastic"))


Py4JJavaError: An error occurred while calling o927.save.
: java.lang.ClassNotFoundException: Failed to find data source: org.elasticsearch.spark.sql. Please find packages at http://spark.apache.org/third-party-projects.html
	at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSource(DataSource.scala:657)
	at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:244)
	at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:228)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)
Caused by: java.lang.ClassNotFoundException: org.elasticsearch.spark.sql.DefaultSource
	at java.net.URLClassLoader.findClass(URLClassLoader.java:382)
	at java.lang.ClassLoader.loadClass(ClassLoader.java:424)
	at java.lang.ClassLoader.loadClass(ClassLoader.java:357)
	at org.apache.spark.sql.execution.datasources.DataSource$$anonfun$20$$anonfun$apply$12.apply(DataSource.scala:634)
	at org.apache.spark.sql.execution.datasources.DataSource$$anonfun$20$$anonfun$apply$12.apply(DataSource.scala:634)
	at scala.util.Try$.apply(Try.scala:192)
	at org.apache.spark.sql.execution.datasources.DataSource$$anonfun$20.apply(DataSource.scala:634)
	at org.apache.spark.sql.execution.datasources.DataSource$$anonfun$20.apply(DataSource.scala:634)
	at scala.util.Try.orElse(Try.scala:84)
	at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSource(DataSource.scala:634)
	... 13 more
