Spark Final Project

In [5]:
# import library
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [6]:
# Init SparkSession with HiveSupport
spark = SparkSession \
    .builder \
    .appName("Spark-Hive Connection") \
    .config("spark.sql.uris", "thrift://hive-metastore:9083") \
    .enableHiveSupport() \
    .getOrCreate()

In [None]:
# Move csv files to HDFS
!hdfs dfs -put /mnt/notebooks/covid_br_data/HIST_PAINEL_COVIDBR_2020_Parte1_06jul2021.csv /user/final_spark_project/covid_br_data/
!hdfs dfs -put /mnt/notebooks/covid_br_data/HIST_PAINEL_COVIDBR_2020_Parte2_06jul2021.csv /user/final_spark_project/covid_br_data/
!hdfs dfs -put /mnt/notebooks/covid_br_data/HIST_PAINEL_COVIDBR_2021_Parte1_06jul2021.csv /user/final_spark_project/covid_br_data/
!hdfs dfs -put /mnt/notebooks/covid_br_data/HIST_PAINEL_COVIDBR_2021_Parte2_06jul2021.csv /user/final_spark_project/covid_br_data/

In [7]:
# Create dataframe from csv in hdfs
covid_df = spark.read.csv('/user/final_spark_project/covid_br_data/*.csv', sep=";", header=True)

In [8]:
# Check dataframe Schema
covid_df.dtypes

[('regiao', 'string'),
 ('estado', 'string'),
 ('municipio', 'string'),
 ('coduf', 'string'),
 ('codmun', 'string'),
 ('codRegiaoSaude', 'string'),
 ('nomeRegiaoSaude', 'string'),
 ('data', 'string'),
 ('semanaEpi', 'string'),
 ('populacaoTCU2019', 'string'),
 ('casosAcumulado', 'string'),
 ('casosNovos', 'string'),
 ('obitosAcumulado', 'string'),
 ('obitosNovos', 'string'),
 ('Recuperadosnovos', 'string'),
 ('emAcompanhamentoNovos', 'string'),
 ('interior/metropolitana', 'string')]

In [9]:
# Check dataframe data
covid_df.show(1,False,True)

-RECORD 0----------------------------
 regiao                 | Brasil     
 estado                 | null       
 municipio              | null       
 coduf                  | 76         
 codmun                 | null       
 codRegiaoSaude         | null       
 nomeRegiaoSaude        | null       
 data                   | 2020-02-25 
 semanaEpi              | 9          
 populacaoTCU2019       | 210147125  
 casosAcumulado         | 0          
 casosNovos             | 0          
 obitosAcumulado        | 0          
 obitosNovos            | 0          
 Recuperadosnovos       | null       
 emAcompanhamentoNovos  | null       
 interior/metropolitana | null       
only showing top 1 row



In [10]:
# Change dataframe Schema acording with the data
covid_df_1 = covid_df.select("regiao",
                             "estado",
                             "municipio",
                             col("coduf").cast("integer"),
                             col("codmun").cast("integer"),
                             col("codRegiaoSaude").cast("integer"),
                             "nomeRegiaoSaude",
                             col("data").cast("date"),
                             col("semanaEpi").cast("integer"),
                             col("populacaoTCU2019").cast("date"),
                             col("casosAcumulado").cast("integer"),
                             col("casosNovos").cast("integer"),
                             col("obitosAcumulado").cast("string"),
                             col("obitosNovos").cast("integer"),
                             col("Recuperadosnovos").cast("integer"),
                             col("emAcompanhamentoNovos").cast("integer"),
                             col("interior/metropolitana").cast("integer"))


In [11]:
# Show the Hive Database
spark.sql("show databases").show()

+------------+
|databaseName|
+------------+
|     default|
+------------+



In [None]:
# Send the data from HDFS To Hive
covid_df_1.write.format("csv").partitionBy("municipio").saveAsTable("Covid_br_data")

In [12]:
# Show the table created
spark.sql("show tables").show()

+--------+--------------+-----------+
|database|     tableName|isTemporary|
+--------+--------------+-----------+
| default|acompanhamento|      false|
| default| covid_br_data|      false|
| default|   recuperados|      false|
+--------+--------------+-----------+



Creating dataframes for Visualizations

First View

In [13]:
Recuperados = spark.sql("select Recuperadosnovos as Casos_Recuperados from covid_br_data order by 1 desc limit 1")
Recuperados.show()

+-----------------+
|Casos_Recuperados|
+-----------------+
|         17262646|
+-----------------+



In [None]:
Acompanhamento = spark.sql("select emAcompanhamentoNovos as Em_Acompanhamento from covid_br_data order by 1 desc limit 1")
Acompanhamento.show()

Second View

In [15]:
casosAcumulado = spark.sql("select casosAcumulado as Acumulado from covid_br_data order by 1 desc limit 1")
casosAcumulado.show()

+---------+
|Acumulado|
+---------+
| 18855015|
+---------+



In [18]:
casosNovos = spark.sql("select casosNovos as Casos_Novos from covid_br_data order by 1 desc limit 1")
casosNovos.show()

+-----------+
|Casos_Novos|
+-----------+
|     115228|
+-----------+



In [22]:
Incidencia = spark.sql("select ((casosAcumulado/210147125)*100000) as Incidencia from covid_br_data order by 1 desc limit 1")
Incidencia.show()

+-----------------+
|       Incidencia|
+-----------------+
|8972.292625940041|
+-----------------+



Third View

In [26]:
# Kafka only accept data from a string type value column 
# Change the current result for a string type named value column 
Obitos_Acumulados = spark.sql("select obitosAcumulado from covid_br_data order by 1 desc limit 1")
Obitos_Acumulados_string = Obitos_Acumulados.withColumn("value", col("obitosAcumulado").cast(StringType())).drop("obitosAcumulado")
Obitos_Acumulados_string.show()

+-----+
|value|
+-----+
|99989|
+-----+



In [29]:
obitosNovos = spark.sql("select obitosNovos from covid_br_data order by 1 desc limit 1")
obitosNovos_string = obitosNovos.withColumn("value", col("obitosNovos").cast(StringType())).drop("obitosNovos")
obitosNovos_string.show()

+-----+
|value|
+-----+
| 4249|
+-----+



In [30]:
Mortalidade = spark.sql("select ((obitosAcumulado/210147125)*100000) as Mortalidade from covid_br_data order by 1 desc limit 1")
Mortalidade_string = Mortalidade.withColumn("value", col("Mortalidade").cast(StringType())).drop("Mortalidade")
Mortalidade_string.show()

+------------------+
|             value|
+------------------+
|250.72529543290204|
+------------------+



In [31]:
Letalidade = spark.sql("select obitosNovos, casosNovos, (obitosNovos/casosNovos)*100 as Letalidade from covid_br_data order by 1 desc limit 1")
Letalidade_string = Letalidade.withColumn("value", col("Letalidade").cast(StringType())).drop("obitosNovos", "casosNovos", "Letalidade")
Letalidade_string.show()

+-----------------+
|            value|
+-----------------+
|4.903522134515072|
+-----------------+



Save to Hive Table

In [None]:
Recuperados.write.format("csv").saveAsTable("Recuperados")

In [None]:
Acompanhamento.write.format("csv").saveAsTable("Acompanhamento")

In [14]:
# Check the created tables
spark.sql("Show tables").show()

+--------+--------------+-----------+
|database|     tableName|isTemporary|
+--------+--------------+-----------+
| default|acompanhamento|      false|
| default| covid_br_data|      false|
| default|   recuperados|      false|
+--------+--------------+-----------+



Save to HDFS as parquet with snappy compression

In [17]:
casosAcumulado.write.option("compression","snappy").parquet("/user/final_spark_project/casosAcumulado")

In [20]:
casosNovos.write.option("compression","snappy").parquet("/user/final_spark_project/casosNovos")

In [24]:
Incidencia.write.option("compression","snappy").parquet("/user/final_spark_project/Incidencia")

In [25]:
# Check the files in HDFS
!hdfs dfs -ls /user/final_spark_project/

Found 4 items
drwxr-xr-x   - root supergroup          0 2021-11-09 19:52 /user/final_spark_project/Incidencia
drwxr-xr-x   - root supergroup          0 2021-11-09 19:42 /user/final_spark_project/casosAcumulado
drwxr-xr-x   - root supergroup          0 2021-11-09 19:47 /user/final_spark_project/casosNovos
drwxr-xr-x   - root supergroup          0 2021-11-09 18:08 /user/final_spark_project/covid_br_data


Save to Kafka topic

In [27]:
Obitos_Acumulados_string.write\
                .format("kafka") \
                .option("kafka.bootstrap.servers","kafka:9092") \
                .option("topic","topic-Obitos_Acumulados") \
                .option("checkpointLocation","user/kafka_checkpoint_Obitos_Acumulados")\
                .option("path","hdfs:///user/kafka/topic-Obitos_Novos") \
                .save()

Py4JJavaError: An error occurred while calling o205.save.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 22.0 failed 1 times, most recent failure: Lost task 0.0 in stage 22.0 (TID 11739, localhost, executor driver): org.apache.kafka.common.KafkaException: Failed to construct kafka producer
	at org.apache.kafka.clients.producer.KafkaProducer.<init>(KafkaProducer.java:433)
	at org.apache.kafka.clients.producer.KafkaProducer.<init>(KafkaProducer.java:270)
	at org.apache.spark.sql.kafka010.CachedKafkaProducer$.org$apache$spark$sql$kafka010$CachedKafkaProducer$$createKafkaProducer(CachedKafkaProducer.scala:67)
	at org.apache.spark.sql.kafka010.CachedKafkaProducer$$anon$1.load(CachedKafkaProducer.scala:46)
	at org.apache.spark.sql.kafka010.CachedKafkaProducer$$anon$1.load(CachedKafkaProducer.scala:43)
	at org.spark_project.guava.cache.LocalCache$LoadingValueReference.loadFuture(LocalCache.java:3599)
	at org.spark_project.guava.cache.LocalCache$Segment.loadSync(LocalCache.java:2379)
	at org.spark_project.guava.cache.LocalCache$Segment.lockedGetOrLoad(LocalCache.java:2342)
	at org.spark_project.guava.cache.LocalCache$Segment.get(LocalCache.java:2257)
	at org.spark_project.guava.cache.LocalCache.get(LocalCache.java:4000)
	at org.spark_project.guava.cache.LocalCache.getOrLoad(LocalCache.java:4004)
	at org.spark_project.guava.cache.LocalCache$LocalLoadingCache.get(LocalCache.java:4874)
	at org.apache.spark.sql.kafka010.CachedKafkaProducer$.getOrCreate(CachedKafkaProducer.scala:80)
	at org.apache.spark.sql.kafka010.KafkaWriteTask.execute(KafkaWriteTask.scala:44)
	at org.apache.spark.sql.kafka010.KafkaWriter$$anonfun$write$1$$anonfun$apply$1.apply$mcV$sp(KafkaWriter.scala:89)
	at org.apache.spark.sql.kafka010.KafkaWriter$$anonfun$write$1$$anonfun$apply$1.apply(KafkaWriter.scala:89)
	at org.apache.spark.sql.kafka010.KafkaWriter$$anonfun$write$1$$anonfun$apply$1.apply(KafkaWriter.scala:89)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
	at org.apache.spark.sql.kafka010.KafkaWriter$$anonfun$write$1.apply(KafkaWriter.scala:89)
	at org.apache.spark.sql.kafka010.KafkaWriter$$anonfun$write$1.apply(KafkaWriter.scala:87)
	at org.apache.spark.rdd.RDD$$anonfun$foreachPartition$1$$anonfun$apply$28.apply(RDD.scala:935)
	at org.apache.spark.rdd.RDD$$anonfun$foreachPartition$1$$anonfun$apply$28.apply(RDD.scala:935)
	at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2101)
	at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2101)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:121)
	at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:403)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:409)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.kafka.common.config.ConfigException: No resolvable bootstrap urls given in bootstrap.servers
	at org.apache.kafka.clients.ClientUtils.parseAndValidateAddresses(ClientUtils.java:88)
	at org.apache.kafka.clients.ClientUtils.parseAndValidateAddresses(ClientUtils.java:47)
	at org.apache.kafka.clients.producer.KafkaProducer.<init>(KafkaProducer.java:408)
	... 31 more

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1889)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1877)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1876)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1876)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:926)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:926)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:926)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2110)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2059)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2048)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:737)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2061)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2082)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2101)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2126)
	at org.apache.spark.rdd.RDD$$anonfun$foreachPartition$1.apply(RDD.scala:935)
	at org.apache.spark.rdd.RDD$$anonfun$foreachPartition$1.apply(RDD.scala:933)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
	at org.apache.spark.rdd.RDD.foreachPartition(RDD.scala:933)
	at org.apache.spark.sql.kafka010.KafkaWriter$.write(KafkaWriter.scala:87)
	at org.apache.spark.sql.kafka010.KafkaSourceProvider.createRelation(KafkaSourceProvider.scala:254)
	at org.apache.spark.sql.execution.datasources.SaveIntoDataSourceCommand.run(SaveIntoDataSourceCommand.scala:45)
	at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult$lzycompute(commands.scala:70)
	at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult(commands.scala:68)
	at org.apache.spark.sql.execution.command.ExecutedCommandExec.doExecute(commands.scala:86)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:131)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:127)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:155)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:152)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:127)
	at org.apache.spark.sql.execution.QueryExecution.toRdd$lzycompute(QueryExecution.scala:80)
	at org.apache.spark.sql.execution.QueryExecution.toRdd(QueryExecution.scala:80)
	at org.apache.spark.sql.DataFrameWriter$$anonfun$runCommand$1.apply(DataFrameWriter.scala:668)
	at org.apache.spark.sql.DataFrameWriter$$anonfun$runCommand$1.apply(DataFrameWriter.scala:668)
	at org.apache.spark.sql.execution.SQLExecution$$anonfun$withNewExecutionId$1.apply(SQLExecution.scala:78)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:125)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:73)
	at org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:668)
	at org.apache.spark.sql.DataFrameWriter.saveToV1Source(DataFrameWriter.scala:276)
	at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:267)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.kafka.common.KafkaException: Failed to construct kafka producer
	at org.apache.kafka.clients.producer.KafkaProducer.<init>(KafkaProducer.java:433)
	at org.apache.kafka.clients.producer.KafkaProducer.<init>(KafkaProducer.java:270)
	at org.apache.spark.sql.kafka010.CachedKafkaProducer$.org$apache$spark$sql$kafka010$CachedKafkaProducer$$createKafkaProducer(CachedKafkaProducer.scala:67)
	at org.apache.spark.sql.kafka010.CachedKafkaProducer$$anon$1.load(CachedKafkaProducer.scala:46)
	at org.apache.spark.sql.kafka010.CachedKafkaProducer$$anon$1.load(CachedKafkaProducer.scala:43)
	at org.spark_project.guava.cache.LocalCache$LoadingValueReference.loadFuture(LocalCache.java:3599)
	at org.spark_project.guava.cache.LocalCache$Segment.loadSync(LocalCache.java:2379)
	at org.spark_project.guava.cache.LocalCache$Segment.lockedGetOrLoad(LocalCache.java:2342)
	at org.spark_project.guava.cache.LocalCache$Segment.get(LocalCache.java:2257)
	at org.spark_project.guava.cache.LocalCache.get(LocalCache.java:4000)
	at org.spark_project.guava.cache.LocalCache.getOrLoad(LocalCache.java:4004)
	at org.spark_project.guava.cache.LocalCache$LocalLoadingCache.get(LocalCache.java:4874)
	at org.apache.spark.sql.kafka010.CachedKafkaProducer$.getOrCreate(CachedKafkaProducer.scala:80)
	at org.apache.spark.sql.kafka010.KafkaWriteTask.execute(KafkaWriteTask.scala:44)
	at org.apache.spark.sql.kafka010.KafkaWriter$$anonfun$write$1$$anonfun$apply$1.apply$mcV$sp(KafkaWriter.scala:89)
	at org.apache.spark.sql.kafka010.KafkaWriter$$anonfun$write$1$$anonfun$apply$1.apply(KafkaWriter.scala:89)
	at org.apache.spark.sql.kafka010.KafkaWriter$$anonfun$write$1$$anonfun$apply$1.apply(KafkaWriter.scala:89)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
	at org.apache.spark.sql.kafka010.KafkaWriter$$anonfun$write$1.apply(KafkaWriter.scala:89)
	at org.apache.spark.sql.kafka010.KafkaWriter$$anonfun$write$1.apply(KafkaWriter.scala:87)
	at org.apache.spark.rdd.RDD$$anonfun$foreachPartition$1$$anonfun$apply$28.apply(RDD.scala:935)
	at org.apache.spark.rdd.RDD$$anonfun$foreachPartition$1$$anonfun$apply$28.apply(RDD.scala:935)
	at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2101)
	at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2101)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:121)
	at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:403)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:409)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	... 1 more
Caused by: org.apache.kafka.common.config.ConfigException: No resolvable bootstrap urls given in bootstrap.servers
	at org.apache.kafka.clients.ClientUtils.parseAndValidateAddresses(ClientUtils.java:88)
	at org.apache.kafka.clients.ClientUtils.parseAndValidateAddresses(ClientUtils.java:47)
	at org.apache.kafka.clients.producer.KafkaProducer.<init>(KafkaProducer.java:408)
	... 31 more


In [None]:
obitosNovos_string.write\
                .format("kafka") \
                .option("kafka.bootstrap.servers","kafka:9092") \
                .option("topic", "Obitos_Novos") \
                .option("checkpointLocation","user/kafka_checkpoint_Obitos_Novos")\
                .option("path","hdfs:///user/kafka/topic-Obitos_Novos") \
                .save()

In [None]:
Mortalidade_string.write\
                .format("kafka") \
                .option("kafka.bootstrap.servers","kafka:9092") \
                .option("topic", "Mortalidade") \
                .option("checkpointLocation","user/kafka_checkpoint_Mortalidade")\
                .option("path","hdfs:///user/kafka/topic-Mortalidade") \
                .save()

In [None]:
Letalidade_string.write\
                .format("kafka") \
                .option("kafka.bootstrap.servers","kafka:9092") \
                .option("topic", "Letalidade") \
                .option("checkpointLocation","user/kafka_checkpoint_Letalidade")\
                .option("path","hdfs:///user/kafka/topic-Letalidade") \
                .save()

Spark View

In [28]:
spark_df = spark.sql("select regiao, \
         max(casosAcumulado) as Casos, \
         max(obitosAcumulado) as Obitos, \
         max(cast(((casosAcumulado/210147125)*100000) as decimal(18,2))) as Incidencia, \
         max(cast(((obitosAcumulado/210147125)*100000) as decimal(18,2))) as Mortalidade , \
         max(data) as Atualizacao\
         from covid_br_data \
         group by regiao \
         order by regiao")
spark_df.show()

+------------+--------+------+----------+-----------+-----------+
|      regiao|   Casos|Obitos|Incidencia|Mortalidade|Atualizacao|
+------------+--------+------+----------+-----------+-----------+
|      Brasil|18855015| 99572|   8972.29|     250.73| 2021-07-06|
|Centro-Oeste|  686433|  9980|    326.64|       9.27| 2021-07-06|
|    Nordeste| 1141612|  9993|    543.24|      11.62| 2021-07-06|
|       Norte|  557708|  9992|    265.39|       7.43| 2021-07-06|
|     Sudeste| 3809222| 99989|   1812.65|      62.05| 2021-07-06|
|         Sul| 1308643|   999|    622.73|      15.16| 2021-07-06|
+------------+--------+------+----------+-----------+-----------+

