In [1]:
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, avg, sum as sum_, first, round
from pyspark.sql.types import DoubleType, IntegerType

# Configurações do Spark
conf = SparkConf()
conf.set('spark.jars.packages', 'org.apache.hadoop:hadoop-aws:3.3.4,com.amazonaws:aws-java-sdk-bundle:1.11.901')
conf.set('spark.hadoop.fs.s3a.aws.credentials.provider', 'com.amazonaws.auth.InstanceProfileCredentialsProvider')

# Criar sessão Spark
spark = SparkSession.builder.config(conf=conf).getOrCreate()

# Ler o CSV, já tratando "NULL" como valor nulo
a = spark.read.option('delimiter', ',') \
              .option('header', 'true') \
              .option('nullValue', 'null') \
              .csv('s3a://bucket-raw-upa-connect/oximetria_paciente.csv')

# Converter 'valor' para DoubleType e 'fk_upa' para IntegerType
a = a.withColumn('valor', col('valor').cast(DoubleType())) \
     .withColumn('fk_paciente', col('fk_paciente').cast(IntegerType()))

a.show()

:: loading settings :: url = jar:file:/usr/local/lib/python3.7/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /root/.ivy2/cache
The jars for the packages stored in: /root/.ivy2/jars
org.apache.hadoop#hadoop-aws added as a dependency
com.amazonaws#aws-java-sdk-bundle added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-71ab9ef5-de6c-4fa5-b177-6c44edd47569;1.0
	confs: [default]
	found org.apache.hadoop#hadoop-aws;3.3.4 in central
	found com.amazonaws#aws-java-sdk-bundle;1.12.262 in central
	found org.wildfly.openssl#wildfly-openssl;1.0.7.Final in central
:: resolution report :: resolve 431ms :: artifacts dl 31ms
	:: modules in use:
	com.amazonaws#aws-java-sdk-bundle;1.12.262 from central in [default]
	org.apache.hadoop#hadoop-aws;3.3.4 from central in [default]
	org.wildfly.openssl#wildfly-openssl;1.0.7.Final from central in [default]
	:: evicted modules:
	com.amazonaws#aws-java-sdk-bundle;1.11.901 by [com.amazonaws#aws-java-sdk-bundle;1.12.262] in [default]
	---------------------------------------------------------------------
	|     

+-----------+-------------------+-----+-----------+
|id_oximetro|          data_hora|valor|fk_paciente|
+-----------+-------------------+-----+-----------+
|          1|2025-04-26 21:04:36| 96.0|         25|
|          2|2025-04-26 21:04:35| 95.0|         25|
|          3|2025-04-26 21:04:34| 97.0|         25|
|          4|2025-04-26 21:09:36| 95.0|         19|
|          5|2025-04-26 21:09:35| 95.0|         19|
|          6|2025-04-26 21:09:34| 94.0|         19|
|          7|2025-04-26 21:14:36| 96.0|         92|
|          8|2025-04-26 21:14:35| 96.0|         92|
|          9|2025-04-26 21:14:34| 95.0|         92|
|         10|2025-04-26 21:19:36| 99.0|        117|
|         11|2025-04-26 21:19:35| 99.0|        117|
|         12|2025-04-26 21:19:34| 99.0|        117|
|         13|2025-04-26 21:24:36| 97.0|         61|
|         14|2025-04-26 21:24:35| 97.0|         61|
|         15|2025-04-26 21:24:34| 97.0|         61|
|         16|2025-04-26 21:29:36| 97.0|        112|
|         17

In [2]:
nulls_valor = a.select(sum_(col("valor").isNull().cast("decimal")).alias("qtd_nulls"))

# Mostra o resultado
nulls_valor.show()

+---------+
|qtd_nulls|
+---------+
|       82|
+---------+



In [3]:
# Nulls
nulls = a.filter(col('valor').isNull())

nulls.show()

# Remover linhas onde a coluna 'valor' está nula
a = a.filter(col('valor').isNotNull())

a.show()

+-----------+-------------------+-----+-----------+
|id_oximetro|          data_hora|valor|fk_paciente|
+-----------+-------------------+-----+-----------+
|         25|2025-04-26 21:44:36| null|         33|
|         37|2025-04-26 22:04:36| null|         38|
|         64|2025-04-26 22:49:36| null|        105|
|         76|2025-04-26 23:09:36| null|        110|
|         79|2025-04-26 23:14:36| null|         25|
|         82|2025-04-26 23:19:36| null|        113|
|        100|2025-04-26 23:49:36| null|         32|
|        130|2025-04-27 00:39:36| null|        103|
|        145|2025-04-27 01:04:36| null|        119|
|        169|2025-04-27 01:44:36| null|        118|
|        175|2025-04-27 01:54:36| null|         80|
|        190|2025-04-27 02:19:36| null|         86|
|        199|2025-04-27 02:34:36| null|        114|
|        226|2025-04-27 03:19:36| null|         18|
|        250|2025-04-27 03:59:36| null|        106|
|        253|2025-04-27 04:04:36| null|         99|
|        259

In [4]:
# Mostrar possíveis outliers
outliers = a.filter((col('valor') <= 0) | (col('valor') > 100))
print('Outliers encontrados:')
outliers.show()

Outliers encontrados:
+-----------+-------------------+-----+-----------+
|id_oximetro|          data_hora|valor|fk_paciente|
+-----------+-------------------+-----+-----------+
|        103|2025-04-26 23:54:36|107.0|        109|
|        104|2025-04-26 23:54:35|107.0|        109|
|        105|2025-04-26 23:54:34|106.0|        109|
+-----------+-------------------+-----+-----------+



In [5]:
# Remover os outliers
a = a.filter((col('valor') >= 0) & (col('valor') <= 100))

# Mostrar resultado final (sem nulos e sem outliers)
print('DataFrame:')
a.show()

DataFrame:
+-----------+-------------------+-----+-----------+
|id_oximetro|          data_hora|valor|fk_paciente|
+-----------+-------------------+-----+-----------+
|          1|2025-04-26 21:04:36| 96.0|         25|
|          2|2025-04-26 21:04:35| 95.0|         25|
|          3|2025-04-26 21:04:34| 97.0|         25|
|          4|2025-04-26 21:09:36| 95.0|         19|
|          5|2025-04-26 21:09:35| 95.0|         19|
|          6|2025-04-26 21:09:34| 94.0|         19|
|          7|2025-04-26 21:14:36| 96.0|         92|
|          8|2025-04-26 21:14:35| 96.0|         92|
|          9|2025-04-26 21:14:34| 95.0|         92|
|         10|2025-04-26 21:19:36| 99.0|        117|
|         11|2025-04-26 21:19:35| 99.0|        117|
|         12|2025-04-26 21:19:34| 99.0|        117|
|         13|2025-04-26 21:24:36| 97.0|         61|
|         14|2025-04-26 21:24:35| 97.0|         61|
|         15|2025-04-26 21:24:34| 97.0|         61|
|         16|2025-04-26 21:29:36| 97.0|        112|
|

In [6]:
# Agrupa por paciente, pega a primeira data_hora e calcula a média do valor
a_agrupada = a.groupBy('fk_paciente').agg(
    first('data_hora').alias('data_hora'),
    round(avg('valor'), 1).alias('media_oximetria')
)

# Ordena pelo fk_paciente (se quiser)
a_agrupada = a_agrupada.orderBy('data_hora')

# Exibe o resultado
print('DataFrame final:')
a_agrupada.show()

DataFrame final:


[Stage 9:>                                                          (0 + 1) / 1]

+-----------+-------------------+---------------+
|fk_paciente|          data_hora|media_oximetria|
+-----------+-------------------+---------------+
|         25|2025-04-26 21:04:36|           96.8|
|         19|2025-04-26 21:09:36|           94.7|
|         92|2025-04-26 21:14:36|           97.7|
|        117|2025-04-26 21:19:36|           97.5|
|         61|2025-04-26 21:24:36|           96.6|
|        112|2025-04-26 21:29:36|           96.4|
|         33|2025-04-26 21:34:36|           96.5|
|         17|2025-04-26 21:39:36|           82.3|
|        105|2025-04-26 21:49:36|           96.6|
|         13|2025-04-26 21:54:36|           97.1|
|         88|2025-04-26 21:59:36|           97.1|
|         38|2025-04-26 22:04:35|           97.0|
|         44|2025-04-26 22:14:36|           95.6|
|         54|2025-04-26 22:19:36|           96.8|
|        107|2025-04-26 22:24:36|           98.4|
|         15|2025-04-26 22:29:36|           98.3|
|        127|2025-04-26 22:34:36|           96.2|


                                                                                

In [None]:
a_agrupada.coalesce(1) \
    .write \
    .option('header', 'true') \
    .mode('overwrite') \
    .csv('s3a://bucket-trusted-upa-connect/oximetria_paciente_tratado.csv')

spark.stop()

25/04/28 21:06:06 WARN AbstractS3ACommitterFactory: Using standard FileOutputCommitter to commit work. This is slow and potentially unsafe.
25/04/28 21:06:06 WARN AbstractS3ACommitterFactory: Using standard FileOutputCommitter to commit work. This is slow and potentially unsafe.
                                                                                