Spark Final Project

In [None]:
# Import libraries
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [None]:
# Init SparkSession with Hive support.
# The Hive metastore endpoint is configured through "hive.metastore.uris";
# "spark.sql.uris" is not a setting Spark or Hive reads, so the thrift URI
# given under that key would never reach the metastore client.
spark = (
    SparkSession.builder
    .appName("Spark-Hive Connection")
    .config("hive.metastore.uris", "thrift://hive-metastore:9083")
    .enableHiveSupport()
    .getOrCreate()
)

In [None]:
# Load every CSV file of the raw COVID dataset from HDFS into one DataFrame.
# The files are ';'-separated and their first line holds the column names.
covid_df = (
    spark.read
    .option("sep", ";")
    .option("header", True)
    .csv('/user/final_spark_project/covid_br_data/*.csv')
)

In [None]:
# Inspect the schema of the raw DataFrame as (column name, type) pairs.
# No schema inference was requested when reading the CSV, so this is the
# baseline the later explicit casts start from.
covid_df.dtypes

In [None]:
# Peek at a single record: untruncated values, one "column: value" line per field.
covid_df.show(n=1, truncate=False, vertical=True)

In [None]:
# Change the dataframe schema according to the data.
# Text columns are kept as they are; codes, counters and flags become integers
# and "data" becomes a date.
# "populacaoTCU2019" (a population count) and "obitosAcumulado" (a death count)
# are cast to integer like the other counters: a date cast would turn the
# population into nulls, and a string type makes ORDER BY / MAX on the death
# count compare text instead of numbers.
covid_df_1 = covid_df.select(
    "regiao",
    "estado",
    "municipio",
    col("coduf").cast("integer"),
    col("codmun").cast("integer"),
    col("codRegiaoSaude").cast("integer"),
    "nomeRegiaoSaude",
    col("data").cast("date"),
    col("semanaEpi").cast("integer"),
    col("populacaoTCU2019").cast("integer"),
    col("casosAcumulado").cast("integer"),
    col("casosNovos").cast("integer"),
    col("obitosAcumulado").cast("integer"),
    col("obitosNovos").cast("integer"),
    col("Recuperadosnovos").cast("integer"),
    col("emAcompanhamentoNovos").cast("integer"),
    col("interior/metropolitana").cast("integer"),
)


In [None]:
# List the databases visible through the Hive-enabled session.
(
    spark
    .sql("show databases")
    .show()
)

In [None]:
# Send the typed data from HDFS to a Hive table, partitioned by "municipio".
# mode("overwrite") keeps the cell re-runnable: with the default save mode a
# second run fails because the table already exists.
(
    covid_df_1.write
    .format("csv")
    .mode("overwrite")
    .partitionBy("municipio")
    .saveAsTable("Covid_br_data")
)

In [None]:
# Confirm that the table now shows up in the current database.
(
    spark
    .sql("show tables")
    .show()
)

Creating dataframes for Visualizations

First View

In [None]:
# Single-row result: the largest "Recuperadosnovos" value in the table
# ("order by 1 desc" sorts on the only selected column), exposed as Casos_Recuperados.
Recuperados = spark.sql(
    "select Recuperadosnovos as Casos_Recuperados "
    "from covid_br_data order by 1 desc limit 1"
)
Recuperados.show()

In [None]:
# Single-row result: the largest "emAcompanhamentoNovos" value in the table,
# exposed as Em_Acompanhamento.
Acompanhamento = spark.sql(
    "select emAcompanhamentoNovos as Em_Acompanhamento "
    "from covid_br_data order by 1 desc limit 1"
)
Acompanhamento.show()

Second View

In [None]:
# Single-row result: the largest accumulated case count, exposed as Acumulado.
casosAcumulado = spark.sql(
    "select casosAcumulado as Acumulado "
    "from covid_br_data order by 1 desc limit 1"
)
casosAcumulado.show()

In [None]:
# Single-row result: the largest "casosNovos" value, exposed as Casos_Novos.
casosNovos = spark.sql(
    "select casosNovos as Casos_Novos "
    "from covid_br_data order by 1 desc limit 1"
)
casosNovos.show()

In [None]:
# Single-row result: the highest value of accumulated cases per 100,000,
# computed against a fixed denominator.
# NOTE(review): 210147125 is presumably the national population — confirm the source.
Incidencia = spark.sql(
    "select ((casosAcumulado/210147125)*100000) as Incidencia "
    "from covid_br_data order by 1 desc limit 1"
)
Incidencia.show()

Third View

In [None]:
# The Kafka sink only accepts the payload from a string-typed column named "value",
# so the result is converted into a single "value" column.
# The sort is done on the numeric value of obitosAcumulado: when the column is
# stored as text, a plain "order by 1 desc" compares strings (e.g. "99" > "100000")
# and would not return the largest death count.
Obitos_Acumulados = spark.sql(
    "select obitosAcumulado from covid_br_data "
    "order by cast(obitosAcumulado as int) desc limit 1"
)
Obitos_Acumulados_string = Obitos_Acumulados.withColumn("value", col("obitosAcumulado").cast(StringType())).drop("obitosAcumulado")
Obitos_Acumulados_string.show()

In [None]:
# Largest "obitosNovos" value, reshaped into the single string column "value"
# so it can be written to Kafka later on.
obitosNovos = spark.sql(
    "select obitosNovos "
    "from covid_br_data order by 1 desc limit 1"
)
obitosNovos_string = obitosNovos.select(
    col("obitosNovos").cast(StringType()).alias("value")
)
obitosNovos_string.show()

In [None]:
# Highest value of accumulated deaths per 100,000 against a fixed denominator,
# reshaped into the single string column "value" for the Kafka write.
# NOTE(review): 210147125 is presumably the national population — confirm the source.
Mortalidade = spark.sql(
    "select ((obitosAcumulado/210147125)*100000) as Mortalidade "
    "from covid_br_data order by 1 desc limit 1"
)
Mortalidade_string = Mortalidade.select(
    col("Mortalidade").cast(StringType()).alias("value")
)
Mortalidade_string.show()

In [None]:
# "order by 1" sorts on obitosNovos (the first selected column), so this keeps the
# row with the most new deaths and reports that row's obitosNovos/casosNovos ratio
# as a percentage. Only the ratio survives, as the string column "value".
# NOTE(review): if the intent was the highest Letalidade (or accumulated deaths over
# accumulated cases), the query would need to change — confirm.
Letalidade = spark.sql(
    "select obitosNovos, casosNovos, (obitosNovos/casosNovos)*100 as Letalidade "
    "from covid_br_data order by 1 desc limit 1"
)
Letalidade_string = Letalidade.select(
    col("Letalidade").cast(StringType()).alias("value")
)
Letalidade_string.show()

Save to Hive Table

In [None]:
# Persist the result as the Hive table "Recuperados" (CSV storage).
# mode("overwrite") lets the notebook be re-run; the default save mode raises
# once the table exists.
Recuperados.write.format("csv").mode("overwrite").saveAsTable("Recuperados")

In [None]:
# Persist the result as the Hive table "Acompanhamento" (CSV storage).
# mode("overwrite") lets the notebook be re-run; the default save mode raises
# once the table exists.
Acompanhamento.write.format("csv").mode("overwrite").saveAsTable("Acompanhamento")

Save to HDFS as parquet with snappy compression

In [None]:
# Write the result to HDFS as snappy-compressed parquet.
# mode("overwrite") replaces the output of a previous run instead of failing
# because the target path already exists.
(
    casosAcumulado.write
    .mode("overwrite")
    .option("compression", "snappy")
    .parquet("/user/ivan/final_spark_project/casosAcumulado")
)

In [None]:
# Write the result to HDFS as snappy-compressed parquet.
# mode("overwrite") replaces the output of a previous run instead of failing
# because the target path already exists.
(
    casosNovos.write
    .mode("overwrite")
    .option("compression", "snappy")
    .parquet("/user/ivan/final_spark_project/casosNovos")
)

In [None]:
# Write the result to HDFS as snappy-compressed parquet.
# mode("overwrite") replaces the output of a previous run instead of failing
# because the target path already exists.
(
    Incidencia.write
    .mode("overwrite")
    .option("compression", "snappy")
    .parquet("/user/ivan/final_spark_project/Incidencia")
)

Save to Kafka topic

In [None]:
# Batch-publish the single "value" row to the Kafka topic "topic-Obitos_Acumulados".
(
    Obitos_Acumulados_string.write
    .format("kafka")
    .option("kafka.bootstrap.servers", "kafka:9092")
    .option("topic", "topic-Obitos_Acumulados")
    .save()
)

In [None]:
# Batch-publish the single "value" row to the Kafka topic "Obitos_Novos".
# A batch write to Kafka only needs the bootstrap servers and the topic.
# "checkpointLocation" belongs to streaming queries (writeStream) and the Kafka
# destination is the topic, not a "path", so both options were dropped to match
# the other batch Kafka write in this notebook.
(
    obitosNovos_string.write
    .format("kafka")
    .option("kafka.bootstrap.servers", "kafka:9092")
    .option("topic", "Obitos_Novos")
    .save()
)

In [None]:
# Batch-publish the single "value" row to the Kafka topic "Mortalidade".
# A batch write to Kafka only needs the bootstrap servers and the topic.
# "checkpointLocation" belongs to streaming queries (writeStream) and the Kafka
# destination is the topic, not a "path", so both options were dropped to match
# the other batch Kafka write in this notebook.
(
    Mortalidade_string.write
    .format("kafka")
    .option("kafka.bootstrap.servers", "kafka:9092")
    .option("topic", "Mortalidade")
    .save()
)

In [None]:
# Batch-publish the single "value" row to the Kafka topic "Letalidade".
# A batch write to Kafka only needs the bootstrap servers and the topic.
# "checkpointLocation" belongs to streaming queries (writeStream) and the Kafka
# destination is the topic, not a "path", so both options were dropped to match
# the other batch Kafka write in this notebook.
(
    Letalidade_string.write
    .format("kafka")
    .option("kafka.bootstrap.servers", "kafka:9092")
    .option("topic", "Letalidade")
    .save()
)

Spark View

In [None]:
# Per-region summary: peak accumulated cases and deaths, the corresponding
# per-100,000 rates (rounded to 2 decimals) and the most recent date.
# obitosAcumulado is cast to int before MAX so the aggregate compares numbers;
# on a text column MAX would return the lexicographically largest string.
# NOTE(review): both rates divide by the same constant (210147125) for every
# region — confirm that a per-region population is not expected here.
spark_df = spark.sql("""
    select regiao,
           max(casosAcumulado) as Casos,
           max(cast(obitosAcumulado as int)) as Obitos,
           max(cast(((casosAcumulado/210147125)*100000) as decimal(18,2))) as Incidencia,
           max(cast(((obitosAcumulado/210147125)*100000) as decimal(18,2))) as Mortalidade,
           max(data) as Atualizacao
    from covid_br_data
    group by regiao
    order by regiao
""")
spark_df.show()