<a href="https://colab.research.google.com/github/Isiumlord/ProjetoFinal-AcidentesTerrestres/blob/main/DataSet-CasosCovid-PySpark.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#1 INSTALANDO PYSPARK

!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.2.0.tar.gz (281.3 MB)
[K     |████████████████████████████████| 281.3 MB 31 kB/s 
[?25hCollecting py4j==0.10.9.2
  Downloading py4j-0.10.9.2-py2.py3-none-any.whl (198 kB)
[K     |████████████████████████████████| 198 kB 55.8 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.2.0-py2.py3-none-any.whl size=281805912 sha256=06c98746f6216b021c2b5b10ab885810b3b025c7f6135a18f454163abe9834df
  Stored in directory: /root/.cache/pip/wheels/0b/de/d2/9be5d59d7331c6c2a7c1b6d1a4f463ce107332b1ecd4e80718
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.2 pyspark-3.2.0


In [None]:
#2 IMPORTANDO DIVERSAS FUNÇÕES

from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType, DateType
import numpy as np
from pyspark.sql.functions import when, round
from pyspark.sql.functions import regexp_replace
from pyspark.sql.functions import col, avg, count, sum, min, max, mean
from pyspark.sql.functions import rank
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number
from pyspark.sql.functions import monotonically_increasing_id 

In [None]:
#3 INTEGRANDO COM A GCP

from google.colab import auth
auth.authenticate_user()
project_id = 'projeto-final-grupo03'
!gcloud config set project projeto-final-grupo03

Updated property [core/project].


In [None]:
#4 COPYING THE FILE FROM THE BUCKET INTO THE COLAB RUNTIME
# Downloads casos-covid_tratado.csv from the GCS bucket to /tmp.

!gsutil cp gs://notebooks_pandas_gp03/casos-covid_tratado.csv /tmp/casos-covid_tratado.csv

Copying gs://notebooks_pandas_gp03/casos-covid_tratado.csv...
/ [1 files][614.5 KiB/614.5 KiB]                                                
Operation completed over 1 objects/614.5 KiB.                                    


In [None]:
#5 STARTING THE SPARK SESSION
# Local-mode session named "dataframe_projeto" with the Spark UI on port 4050.
# The option key and its value are passed to .config() as two separate
# arguments; a single merged string would register a meaningless key and
# leave the UI port at its default.

spark = (SparkSession.builder
        .master("local")
        .appName("dataframe_projeto")
        .config("spark.ui.port", "4050")
        .getOrCreate())

In [None]:
#6 LOADING THE DATAFRAME FROM THE CSV PREVIOUSLY NORMALISED WITH PANDAS
# Comma-separated file with a header row; column types are inferred by Spark.

df = spark.read.csv(
    '/tmp/casos-covid_tratado.csv',
    sep=',',
    header=True,
    inferSchema=True,
)

In [None]:
#7 SHOWS THE DATAFRAME SCHEMA
# Prints the column names and the types inferred while reading the CSV.

df.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- Data: string (nullable = true)
 |-- Estado: string (nullable = true)
 |-- Novas_Mortes: integer (nullable = true)
 |-- Mortes: integer (nullable = true)
 |-- Novos_Casos: integer (nullable = true)
 |-- Total_De_Casos: integer (nullable = true)



In [None]:
# Preview the first rows of the raw DataFrame.
df.show()

+---+----------+------+------------+------+-----------+--------------+
|_c0|      Data|Estado|Novas_Mortes|Mortes|Novos_Casos|Total_De_Casos|
+---+----------+------+------------+------+-----------+--------------+
|  0|2020-02-25|    SP|           0|     0|          1|             1|
|  2|2020-02-26|    SP|           0|     0|          0|             1|
|  4|2020-02-27|    SP|           0|     0|          0|             1|
|  6|2020-02-28|    SP|           0|     0|          1|             2|
|  8|2020-02-29|    SP|           0|     0|          0|             2|
| 10|2020-03-01|    SP|           0|     0|          0|             2|
| 12|2020-03-02|    SP|           0|     0|          0|             2|
| 14|2020-03-03|    SP|           0|     0|          0|             2|
| 16|2020-03-04|    SP|           0|     0|          1|             3|
| 18|2020-03-05|    RJ|           0|     0|          1|             1|
| 19|2020-03-05|    SP|           0|     0|          3|             6|
| 21|2

In [None]:
#8 DROPPING THE INDEX
# '_c0' is the unnamed first column of the CSV; it is discarded here and the
# result is kept in df1.

df1 = df.drop('_c0')

In [None]:
#9 CHANGING THE TYPE OF 'Data' TO DATE WITH CAST
# The column is resolved by name against df1 itself rather than through the
# parent frame `df`: that name is rebound to other data further down the
# notebook, which would make this cell fail when re-run.

df1 = df1.withColumn("Data", F.col("Data").cast("date"))

In [None]:
#10 VERIFYING THE CHANGE
# After the cast, 'Data' should be reported as date in the schema.

df1.printSchema()
df1.show()

root
 |-- Data: date (nullable = true)
 |-- Estado: string (nullable = true)
 |-- Novas_Mortes: integer (nullable = true)
 |-- Mortes: integer (nullable = true)
 |-- Novos_Casos: integer (nullable = true)
 |-- Total_De_Casos: integer (nullable = true)

+----------+------+------------+------+-----------+--------------+
|      Data|Estado|Novas_Mortes|Mortes|Novos_Casos|Total_De_Casos|
+----------+------+------------+------+-----------+--------------+
|2020-02-25|    SP|           0|     0|          1|             1|
|2020-02-26|    SP|           0|     0|          0|             1|
|2020-02-27|    SP|           0|     0|          0|             1|
|2020-02-28|    SP|           0|     0|          1|             2|
|2020-02-29|    SP|           0|     0|          0|             2|
|2020-03-01|    SP|           0|     0|          0|             2|
|2020-03-02|    SP|           0|     0|          0|             2|
|2020-03-03|    SP|           0|     0|          0|             2|
|2020-03-0

In [None]:
#11 CREATING A NEW COLUMN WITH THE BRAZILIAN REGION
# Each state abbreviation is matched against the list for its region; any
# value not listed below falls through to 'Nordeste'.

df2 = df1.withColumn(
    'Regiao',
    F.when(df1.Estado.isin('PR', 'RS', 'SC'), 'Sul')
     .when(df1.Estado.isin('SP', 'RJ', 'MG', 'ES'), 'Sudeste')
     .when(df1.Estado.isin('MT', 'DF', 'GO', 'MS'), 'Centro-Oeste')
     .when(df1.Estado.isin('RR', 'AP', 'AM', 'PA', 'AC', 'RO', 'TO'), 'Norte')
     .otherwise('Nordeste'),
)
df2.show()

+----------+------+------------+------+-----------+--------------+------------+
|      Data|Estado|Novas_Mortes|Mortes|Novos_Casos|Total_De_Casos|      Regiao|
+----------+------+------------+------+-----------+--------------+------------+
|2020-02-25|    SP|           0|     0|          1|             1|     Sudeste|
|2020-02-26|    SP|           0|     0|          0|             1|     Sudeste|
|2020-02-27|    SP|           0|     0|          0|             1|     Sudeste|
|2020-02-28|    SP|           0|     0|          1|             2|     Sudeste|
|2020-02-29|    SP|           0|     0|          0|             2|     Sudeste|
|2020-03-01|    SP|           0|     0|          0|             2|     Sudeste|
|2020-03-02|    SP|           0|     0|          0|             2|     Sudeste|
|2020-03-03|    SP|           0|     0|          0|             2|     Sudeste|
|2020-03-04|    SP|           0|     0|          1|             3|     Sudeste|
|2020-03-05|    RJ|           0|     0| 

In [None]:
#12 ROW COUNTS GROUPED BY REGION AND BY STATE

# Regions, fewest rows first.
(df2.groupBy('Regiao')
    .count()
    .orderBy('count')
    .show())
# States, in alphabetical order of their abbreviation.
(df2.groupBy('Estado')
    .count()
    .orderBy('Estado')
    .show())


+------------+-----+
|      Regiao|count|
+------------+-----+
|         Sul| 1865|
|Centro-Oeste| 2479|
|     Sudeste| 2517|
|       Norte| 4304|
|    Nordeste| 5572|
+------------+-----+

+------+-----+
|Estado|count|
+------+-----+
|    AC|  616|
|    AL|  625|
|    AM|  620|
|    AP|  613|
|    BA|  627|
|    CE|  617|
|    DF|  626|
|    ES|  627|
|    GO|  621|
|    MA|  613|
|    MG|  625|
|    MS|  619|
|    MT|  613|
|    PA|  615|
|    PB|  615|
|    PE|  621|
|    PI|  614|
|    PR|  621|
|    RJ|  628|
|    RN|  621|
+------+-----+
only showing top 20 rows



In [None]:
#13 CREATING A NEW COLUMN WITH THE MONTH
# Derives the month number from the 'Data' date column.

df2 = df2.withColumn('Mes', F.month(F.col('Data')))
df2.show()

+----------+------+------------+------+-----------+--------------+------------+---+
|      Data|Estado|Novas_Mortes|Mortes|Novos_Casos|Total_De_Casos|      Regiao|Mes|
+----------+------+------------+------+-----------+--------------+------------+---+
|2020-02-25|    SP|           0|     0|          1|             1|     Sudeste|  2|
|2020-02-26|    SP|           0|     0|          0|             1|     Sudeste|  2|
|2020-02-27|    SP|           0|     0|          0|             1|     Sudeste|  2|
|2020-02-28|    SP|           0|     0|          1|             2|     Sudeste|  2|
|2020-02-29|    SP|           0|     0|          0|             2|     Sudeste|  2|
|2020-03-01|    SP|           0|     0|          0|             2|     Sudeste|  3|
|2020-03-02|    SP|           0|     0|          0|             2|     Sudeste|  3|
|2020-03-03|    SP|           0|     0|          0|             2|     Sudeste|  3|
|2020-03-04|    SP|           0|     0|          1|             3|     Sudes

In [None]:
#14 CREATING A NEW COLUMN WITH THE YEAR
# Derives the calendar year from the 'Data' date column.

df2 = df2.withColumn('Ano', F.year(F.col('Data')))

df2.show()

+----------+------+------------+------+-----------+--------------+------------+---+----+
|      Data|Estado|Novas_Mortes|Mortes|Novos_Casos|Total_De_Casos|      Regiao|Mes| Ano|
+----------+------+------------+------+-----------+--------------+------------+---+----+
|2020-02-25|    SP|           0|     0|          1|             1|     Sudeste|  2|2020|
|2020-02-26|    SP|           0|     0|          0|             1|     Sudeste|  2|2020|
|2020-02-27|    SP|           0|     0|          0|             1|     Sudeste|  2|2020|
|2020-02-28|    SP|           0|     0|          1|             2|     Sudeste|  2|2020|
|2020-02-29|    SP|           0|     0|          0|             2|     Sudeste|  2|2020|
|2020-03-01|    SP|           0|     0|          0|             2|     Sudeste|  3|2020|
|2020-03-02|    SP|           0|     0|          0|             2|     Sudeste|  3|2020|
|2020-03-03|    SP|           0|     0|          0|             2|     Sudeste|  3|2020|
|2020-03-04|    SP|  

In [None]:
#15 NUMBER OF RECORDS GROUPED BY MONTH AND BY YEAR

# Records per month number, in calendar order.
(df2.groupBy('Mes')
    .count()
    .orderBy('Mes')
    .show())

# Records per year, smallest count first.
(df2.groupBy('Ano')
    .count()
    .orderBy('count')
    .show())

+---+-----+
|Mes|count|
+---+-----+
|  1|  837|
|  2|  761|
|  3| 1342|
|  4| 1620|
|  5| 1674|
|  6| 1620|
|  7| 1674|
|  8| 1674|
|  9| 1620|
| 10| 1674|
| 11| 1404|
| 12|  837|
+---+-----+

+----+-----+
| Ano|count|
+----+-----+
|2020| 7935|
|2021| 8802|
+----+-----+



In [None]:
#16 HOW MANY TIMES DAILY DEATHS REACHED 200, PER STATE
# The predicate is resolved by column name on df2 itself. Referencing the
# column through the separate handle `df` ties this cell to a name that is
# rebound to other data further down the notebook.

df2.filter(F.col('Novas_Mortes') >= 200).groupBy('Estado').count().orderBy('count').show()

+------+-----+
|Estado|count|
+------+-----+
|    AM|    1|
|    PA|    2|
|    SC|    2|
|    GO|    7|
|    CE|   16|
|    RS|   29|
|    PR|   42|
|    MG|   83|
|    RJ|   99|
|    SP|  289|
+------+-----+



In [None]:
#17 HOW MANY TIMES DAILY DEATHS REACHED 200, PER REGION
# The predicate is resolved by column name on df2 itself. Referencing the
# column through the separate handle `df` ties this cell to a name that is
# rebound to other data further down the notebook.

df2.filter(F.col('Novas_Mortes') >= 200).groupBy('Regiao').count().orderBy('count').show()

+------------+-----+
|      Regiao|count|
+------------+-----+
|       Norte|    3|
|Centro-Oeste|    7|
|    Nordeste|   16|
|         Sul|   73|
|     Sudeste|  471|
+------------+-----+



In [None]:
# Inspect the schema and a sample of rows of df2 after the derived columns
# were added.
df2.printSchema()
df2.show()

root
 |-- Data: date (nullable = true)
 |-- Estado: string (nullable = true)
 |-- Novas_Mortes: integer (nullable = true)
 |-- Mortes: integer (nullable = true)
 |-- Novos_Casos: integer (nullable = true)
 |-- Total_De_Casos: integer (nullable = true)
 |-- Regiao: string (nullable = false)
 |-- Mes: integer (nullable = true)
 |-- Ano: integer (nullable = true)

+----------+------+------------+------+-----------+--------------+------------+---+----+
|      Data|Estado|Novas_Mortes|Mortes|Novos_Casos|Total_De_Casos|      Regiao|Mes| Ano|
+----------+------+------------+------+-----------+--------------+------------+---+----+
|2020-02-25|    SP|           0|     0|          1|             1|     Sudeste|  2|2020|
|2020-02-26|    SP|           0|     0|          0|             1|     Sudeste|  2|2020|
|2020-02-27|    SP|           0|     0|          0|             1|     Sudeste|  2|2020|
|2020-02-28|    SP|           0|     0|          1|             2|     Sudeste|  2|2020|
|2020-02-29| 

In [None]:
#18 AVERAGE NEW DEATHS PER STATE
# Mean of Novas_Mortes per state, rounded to zero decimal places and listed
# in ascending order; up to 27 rows are displayed.

(df2.groupBy('Estado')
    .agg(F.round(F.mean("Novas_Mortes")).alias("Media_Novas_Mortes_Estado"))
    .orderBy('Media_Novas_Mortes_Estado')
    .show(27))

+------+-------------------------+
|Estado|Media_Novas_Mortes_Estado|
+------+-------------------------+
|    AC|                      3.0|
|    RR|                      3.0|
|    AP|                      3.0|
|    TO|                      6.0|
|    SE|                     10.0|
|    AL|                     10.0|
|    RO|                     11.0|
|    PI|                     12.0|
|    RN|                     12.0|
|    PB|                     15.0|
|    MS|                     16.0|
|    MA|                     17.0|
|    DF|                     18.0|
|    ES|                     21.0|
|    AM|                     22.0|
|    MT|                     23.0|
|    PA|                     27.0|
|    PE|                     32.0|
|    SC|                     32.0|
|    GO|                     39.0|
|    CE|                     40.0|
|    BA|                     43.0|
|    RS|                     58.0|
|    PR|                     66.0|
|    MG|                     90.0|
|    RJ|            

In [None]:
#19 AVERAGE NEW DEATHS PER MONTH ACROSS 2020 / 2021
# One row per (month, year) pair, rounded to zero decimal places and sorted
# by the average, lowest first; up to 24 rows are displayed.

(df2.groupBy('Mes', 'Ano')
    .agg(F.round(F.mean("Novas_Mortes")).alias("Media_Novas_Mortes_Mes"))
    .orderBy('Media_Novas_Mortes_Mes')
    .show(24))

+---+----+----------------------+
|Mes| Ano|Media_Novas_Mortes_Mes|
+---+----+----------------------+
|  3|2020|                   0.0|
|  2|2020|                   0.0|
|  4|2020|                   7.0|
| 11|2021|                   8.0|
| 10|2021|                  13.0|
| 11|2020|                  16.0|
| 10|2020|                  19.0|
|  9|2021|                  20.0|
| 12|2020|                  26.0|
|  5|2020|                  28.0|
|  9|2020|                  28.0|
|  8|2021|                  29.0|
|  1|2021|                  35.0|
|  8|2020|                  35.0|
|  6|2020|                  38.0|
|  7|2020|                  39.0|
|  2|2021|                  40.0|
|  7|2021|                  46.0|
|  6|2021|                  68.0|
|  5|2021|                  70.0|
|  3|2021|                  80.0|
|  4|2021|                 102.0|
+---+----+----------------------+



In [None]:
#20 LARGEST VALUE OF 'Novos_Casos' GROUPED BY STATE
# The aggregate column keeps Spark's default name 'max(Novos_Casos)', which
# is also the sort key; up to 27 rows are displayed.

(df2.groupBy('Estado')
    .agg(F.max('Novos_Casos'))
    .orderBy('max(Novos_Casos)')
    .show(27))

+------+----------------+
|Estado|max(Novos_Casos)|
+------+----------------+
|    AC|             863|
|    AL|            1312|
|    TO|            1442|
|    PI|            1907|
|    RR|            2430|
|    RO|            2477|
|    SE|            2597|
|    MA|            2805|
|    AP|            3022|
|    MS|            3034|
|    DF|            3171|
|    MT|            3417|
|    ES|            3532|
|    PA|            4387|
|    AM|            5009|
|    PE|            6487|
|    PB|            8425|
|    GO|            8716|
|    BA|            8822|
|    CE|           12619|
|    MG|           16479|
|    SP|           27706|
|    SC|           30913|
|    RN|           36374|
|    PR|           45020|
|    RS|           64036|
|    RJ|          105200|
+------+----------------+



In [None]:
#21 COMPARING AVERAGE NEW DEATHS PER STATE AND YEAR
# Sorted by state and then by year so the rows of each state always appear
# in chronological order; sorting by state alone leaves the order of the
# years within a state unspecified.

df2.groupBy(F.col('Estado'), F.col('Ano')).agg(round(mean("Novas_Mortes")).alias("Media_Novas_Mortes_Ano")).orderBy('Estado', 'Ano').show(54)

+------+----+----------------------+
|Estado| Ano|Media_Novas_Mortes_Ano|
+------+----+----------------------+
|    AC|2021|                   3.0|
|    AC|2020|                   3.0|
|    AL|2021|                  12.0|
|    AL|2020|                   8.0|
|    AM|2020|                  18.0|
|    AM|2021|                  26.0|
|    AP|2020|                   3.0|
|    AP|2021|                   3.0|
|    BA|2021|                  56.0|
|    BA|2020|                  30.0|
|    CE|2020|                  34.0|
|    CE|2021|                  45.0|
|    DF|2021|                  21.0|
|    DF|2020|                  14.0|
|    ES|2020|                  17.0|
|    ES|2021|                  25.0|
|    GO|2021|                  54.0|
|    GO|2020|                  23.0|
|    MA|2020|                  16.0|
|    MA|2021|                  18.0|
|    MG|2020|                  40.0|
|    MG|2021|                 135.0|
|    MS|2021|                  22.0|
|    MS|2020|                   8.0|
|

In [None]:
#22 RANKING THE NUMBER OF NEW CASES, SHOWN WITH STATE, MONTH AND YEAR
# Every row of df2 receives a sequential position (row_number) over a single
# window ordered by Novos_Casos, highest first. The window has no partition
# columns, so the numbering runs across the whole DataFrame.

window1 = Window.orderBy(F.desc("Novos_Casos"))
df2 = df2.withColumn('RANK', F.row_number().over(window1))
df2.select('RANK', 'Estado', 'Mes', 'Ano', 'Novos_Casos').show()

+----+------+---+----+-----------+
|RANK|Estado|Mes| Ano|Novos_Casos|
+----+------+---+----+-----------+
|   1|    RJ|  9|2021|     105200|
|   2|    RS|  7|2021|      64036|
|   3|    PR|  3|2021|      45020|
|   4|    RN|  6|2021|      36374|
|   5|    PR|  1|2021|      32436|
|   6|    SC|  8|2020|      30913|
|   7|    SP|  6|2021|      27706|
|   8|    SP|  4|2021|      26567|
|   9|    SP|  9|2021|      23586|
|  10|    SP|  6|2021|      23227|
|  11|    SP|  3|2021|      23169|
|  12|    SP|  6|2021|      23122|
|  13|    SP|  6|2021|      23097|
|  14|    SP|  6|2021|      23033|
|  15|    SP|  6|2021|      22875|
|  16|    SP|  4|2021|      22794|
|  17|    SP|  9|2021|      22678|
|  18|    SP|  6|2021|      22582|
|  19|    SP|  4|2021|      21521|
|  20|    SP| 11|2020|      21515|
+----+------+---+----+-----------+
only showing top 20 rows



In [None]:
# Preview df2 in its current state.
df2.show()

In [None]:
#23 RANKING THE NUMBER OF NEW DEATHS, SHOWN WITH STATE, MONTH AND YEAR
# Replaces the existing 'RANK' column of df2 with a sequential position
# (row_number) over a single unpartitioned window ordered by Novas_Mortes,
# highest first. The first 100 rows are displayed.

window1 = Window.orderBy(F.desc("Novas_Mortes"))
df2 = df2.withColumn('RANK', F.row_number().over(window1))
df2.select('RANK', 'Estado', 'Mes', 'Ano', 'Novas_Mortes').show(100)

+----+------+---+----+------------+
|RANK|Estado|Mes| Ano|Novas_Mortes|
+----+------+---+----+------------+
|   1|    SP|  4|2021|        1389|
|   2|    SP|  4|2021|        1299|
|   3|    SP|  4|2021|        1282|
|   4|    SP|  3|2021|        1209|
|   5|    SP|  3|2021|        1193|
|   6|    SP|  3|2021|        1160|
|   7|    SP|  4|2021|        1122|
|   8|    SP|  4|2021|        1095|
|   9|    SP|  4|2021|        1082|
|  10|    SP|  4|2021|        1060|
|  11|    SP|  3|2021|        1051|
|  12|    SP|  4|2021|        1044|
|  13|    SP|  3|2021|        1021|
|  14|    SP|  4|2021|        1008|
|  15|    SP|  4|2021|         977|
|  16|    SP|  5|2021|         898|
|  17|    SP|  6|2021|         897|
|  18|    SP|  4|2021|         889|
|  19|    SP|  4|2021|         876|
|  20|    SP|  4|2021|         875|
|  21|    SP|  4|2021|         863|
|  22|    SP|  5|2021|         849|
|  23|    SP|  6|2021|         843|
|  24|    SP|  6|2021|         836|
|  25|    SP|  6|2021|      

In [None]:
#24 SALVANDO O ARQUIVO NORMALIZADO NO BUCKET

(df2.write.format("csv").option("header", "true")
                        .option("inferschema", "true")
                        .option("delimiter", ",")
                        .save("casos-covid_pyspark_"))

!gsutil cp -r casos-covid_pyspark_ gs://notebooks_pyspark_sql_gp03

Copying file://casos-covid_pyspark_/.part-00000-2306826b-e486-4e90-9219-a2472a4cbf82-c000.csv.crc [Content-Type=application/octet-stream]...
Copying file://casos-covid_pyspark_/part-00000-2306826b-e486-4e90-9219-a2472a4cbf82-c000.csv [Content-Type=text/csv]...
Copying file://casos-covid_pyspark_/._SUCCESS.crc [Content-Type=application/octet-stream]...
Copying file://casos-covid_pyspark_/_SUCCESS [Content-Type=application/octet-stream]...
\ [4 files][871.8 KiB/871.8 KiB]                                                
Operation completed over 4 objects/871.8 KiB.                                    


In [None]:
#25 CARREGA ARQUIVO
!gsutil cp gs://notebooks_pyspark_sql_gp03/casos-covid_pyspark_/part-00000-2306826b-e486-4e90-9219-a2472a4cbf82-c000.csv /tmp/casos-covid_pyspark.csv

df = spark.read.csv('/tmp/casos-covid_pyspark.csv', header=True, sep=",")
# REGISTRA UMA TABELA TEMPORÁRIA
df.createOrReplaceTempView("temp")
# SELECIONA TODOS OS DADOS DA TABELA TEMPORÁRIA
spark.sql("select * from temp limit 5").show()
# Select count of data in table
spark.sql("select count(*) as total_count from temp").show()

# CARREGA ARQUIVO
!gsutil cp gs://notebooks_pyspark_sql_gp03/pedagio_pyspark/part-00000-b847fefe-675e-4a0e-9424-246331a9e024-c000.csv /tmp/pedagio_pyspark.csv

df = spark.read.csv('/tmp/pedagio_pyspark.csv', header=True, sep=",")


Copying gs://notebooks_pyspark_sql_gp03/casos-covid_pyspark_/part-00000-2306826b-e486-4e90-9219-a2472a4cbf82-c000.csv...
/ [1 files][865.0 KiB/865.0 KiB]                                                
Operation completed over 1 objects/865.0 KiB.                                    
+----------+------+------------+------+-----------+--------------+-------+---+----+----+
|      Data|Estado|Novas_Mortes|Mortes|Novos_Casos|Total_De_Casos| Regiao|Mes| Ano|RANK|
+----------+------+------------+------+-----------+--------------+-------+---+----+----+
|2021-04-06|    SP|        1389| 78554|      22794|       2554841|Sudeste|  4|2021|   1|
|2021-04-08|    SP|        1299| 80742|      21004|       2597366|Sudeste|  4|2021|   2|
|2021-04-13|    SP|        1282| 84380|      18397|       2667241|Sudeste|  4|2021|   3|
|2021-03-30|    SP|        1209| 73492|      21360|       2446680|Sudeste|  3|2021|   4|
|2021-03-26|    SP|        1193| 70696|      21489|       2392374|Sudeste|  3|2021|   5|
+---

In [None]:
#26 SUM OF NEW CASES PER MONTH IN 2020, MONTHS 2 THROUGH 9 (FEBRUARY TO SEPTEMBER)
# NOTE(review): the original heading said January to September, but the
# query filters Mes BETWEEN 2 AND 9 — confirm which range is intended.

spark.sql("SELECT Mes, SUM(Novos_Casos) AS totalNovosCasos_2020 FROM temp WHERE ano = 2020 AND Mes BETWEEN 2 AND 9 GROUP BY Mes ORDER BY Mes ASC").show()


+---+--------------------+
|Mes|totalNovosCasos_2020|
+---+--------------------+
|  2|                 2.0|
|  3|              5822.0|
|  4|             81302.0|
|  5|            429011.0|
|  6|            896532.0|
|  7|           1257782.0|
|  8|           1244378.0|
|  9|            902536.0|
+---+--------------------+



In [None]:
#27 TOTAL NEW CASES PER STATE FOR EACH YEAR
# Runs the same aggregation once per year; the year appears both in the
# filter and in the alias of the summed column. Up to 27 rows are displayed.

for target_year in (2020, 2021):
    spark.sql(
        f"SELECT Estado, SUM(Novos_Casos) AS totalNovosCasos_{target_year} "
        f"FROM temp WHERE ano = {target_year} GROUP BY Estado ORDER BY Estado"
    ).show(27)

+------+--------------------+
|Estado|totalNovosCasos_2020|
+------+--------------------+
|    AC|             41620.0|
|    AL|            104818.0|
|    AM|            201013.0|
|    AP|             68201.0|
|    BA|            493400.0|
|    CE|            335992.0|
|    DF|            251701.0|
|    ES|            248251.0|
|    GO|            309110.0|
|    MA|            200938.0|
|    MG|            542909.0|
|    MS|            134750.0|
|    MT|            180451.0|
|    PA|            293802.0|
|    PB|            166484.0|
|    PE|            222166.0|
|    PI|            143179.0|
|    PR|            416566.0|
|    RJ|            434648.0|
|    RN|            118999.0|
|    RO|             95729.0|
|    RR|             68710.0|
|    RS|            449674.0|
|    SC|            492583.0|
|    SE|            112505.0|
|    SP|           1462297.0|
|    TO|             90536.0|
+------+--------------------+

+------+--------------------+
|Estado|totalNovosCasos_2021|
+------+-