In [1]:
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, avg, when
from pyspark.sql.types import DoubleType, IntegerType

# Spark settings: request the hadoop-aws package and set the S3A credentials
# provider class to InstanceProfileCredentialsProvider.
conf = SparkConf().setAll([
    ('spark.jars.packages', 'org.apache.hadoop:hadoop-aws:3.2.0'),
    ('spark.hadoop.fs.s3a.aws.credentials.provider',
     'com.amazonaws.auth.InstanceProfileCredentialsProvider'),
])

# Build a Spark session with that configuration (or reuse an existing one)
spark = SparkSession.builder.config(conf=conf).getOrCreate()

# Read the CSV from S3: comma-delimited, first line is the header, and the
# string 'null' is treated as a missing value.
# NOTE(review): the previous comment here referred to "NULL" (upper-case) while
# the option value is lower-case 'null' -- confirm which marker the file uses.
leitor_csv = (
    spark.read
    .option('delimiter', ',')
    .option('header', 'true')
    .option('nullValue', 'null')
)
a = leitor_csv.csv('s3a://bucket-raw-upa-connect/umidade_ambiente.csv')

# Cast 'valor' to DoubleType and 'fk_upa' to IntegerType
a = (
    a.withColumn('valor', col('valor').cast(DoubleType()))
     .withColumn('fk_upa', col('fk_upa').cast(IntegerType()))
)

a.show()

:: loading settings :: url = jar:file:/usr/local/lib/python3.7/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /root/.ivy2/cache
The jars for the packages stored in: /root/.ivy2/jars
org.apache.hadoop#hadoop-aws added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-07abeda6-361f-40e8-8367-9d2a2e6331c5;1.0
	confs: [default]
	found org.apache.hadoop#hadoop-aws;3.2.0 in central
	found com.amazonaws#aws-java-sdk-bundle;1.11.375 in central
:: resolution report :: resolve 369ms :: artifacts dl 9ms
	:: modules in use:
	com.amazonaws#aws-java-sdk-bundle;1.11.375 from central in [default]
	org.apache.hadoop#hadoop-aws;3.2.0 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	---------------------------------------------------------------------
	|      default     |   2   |   0   |   0   |   0   ||   2   |   0   |
	----------------------------------------------

+----------+-------------------+-----+------+
|id_umidade|          data_hora|valor|fk_upa|
+----------+-------------------+-----+------+
|         1|2025-04-26 21:04:36| 40.6|     1|
|         2|2025-04-26 21:09:36| 40.7|     1|
|         3|2025-04-26 21:14:36| 40.7|     1|
|         4|2025-04-26 21:19:36| 40.5|     1|
|         5|2025-04-26 21:24:36| 40.3|     1|
|         6|2025-04-26 21:29:36| 40.3|     1|
|         7|2025-04-26 21:34:36| 40.7|     1|
|         8|2025-04-26 21:39:36| null|     1|
|         9|2025-04-26 21:44:36| null|     1|
|        10|2025-04-26 21:49:36| null|     1|
|        11|2025-04-26 21:54:36| null|     1|
|        12|2025-04-26 21:59:36| null|     1|
|        13|2025-04-26 22:04:36| null|     1|
|        14|2025-04-26 22:09:36| null|     1|
|        15|2025-04-26 22:14:36| null|     1|
|        16|2025-04-26 22:19:36| 58.2|     1|
|        17|2025-04-26 22:24:36| 57.7|     1|
|        18|2025-04-26 22:29:36| 58.1|     1|
|        19|2025-04-26 22:34:36| n

In [2]:
# Select the rows whose 'valor' is null, to inspect them before any filling
valor_ausente = col('valor').isNull()
nulls = a.where(valor_ausente)

nulls.show()

+----------+-------------------+-----+------+
|id_umidade|          data_hora|valor|fk_upa|
+----------+-------------------+-----+------+
|         8|2025-04-26 21:39:36| null|     1|
|         9|2025-04-26 21:44:36| null|     1|
|        10|2025-04-26 21:49:36| null|     1|
|        11|2025-04-26 21:54:36| null|     1|
|        12|2025-04-26 21:59:36| null|     1|
|        13|2025-04-26 22:04:36| null|     1|
|        14|2025-04-26 22:09:36| null|     1|
|        15|2025-04-26 22:14:36| null|     1|
|        19|2025-04-26 22:34:36| null|     1|
|        34|2025-04-26 23:49:36| null|     1|
|        52|2025-04-27 01:19:36| null|     1|
|        83|2025-04-27 03:54:36| null|     1|
|        88|2025-04-27 04:19:36| null|     1|
|       196|2025-04-27 13:19:36| null|     1|
|       197|2025-04-27 13:24:36| null|     1|
|       295|2025-04-26 21:34:36| null|     2|
|       296|2025-04-26 21:39:36| null|     2|
|       297|2025-04-26 21:44:36| null|     2|
|       307|2025-04-26 22:34:36| n

In [3]:
from pyspark.sql.window import Window
from pyspark.sql import functions as F

# Forward window: per 'fk_upa', ordered by 'data_hora', from the start of the
# partition up to the current row.
window_spec = (
    Window.partitionBy('fk_upa')
          .orderBy('data_hora')
          .rowsBetween(Window.unboundedPreceding, Window.currentRow)
)

# Backward window: same partitioning/ordering, from the current row to the end
# of the partition. Used only as a fallback for gaps at the start of a
# partition, where there is no earlier value to carry forward.
backfill_spec = (
    Window.partitionBy('fk_upa')
          .orderBy('data_hora')
          .rowsBetween(Window.currentRow, Window.unboundedFollowing)
)

# Fill nulls in 'valor':
#   1. forward-fill with the most recent non-null value in the partition;
#   2. if there is none (null at the start of a partition), fall back to the
#      next non-null value.
# A row stays null only when its whole 'fk_upa' partition has no non-null value.
a = a.withColumn(
    'valor_preenchido',
    F.coalesce(
        F.last('valor', ignorenulls=True).over(window_spec),
        F.first('valor', ignorenulls=True).over(backfill_spec),
    ),
)

# Replace the original column with the filled one (the column ends up last)
a = a.drop('valor').withColumnRenamed('valor_preenchido', 'valor')

# Show the result
a.show(30)

[Stage 3:>                                                          (0 + 1) / 1]

+----------+-------------------+------+-----+
|id_umidade|          data_hora|fk_upa|valor|
+----------+-------------------+------+-----+
|         1|2025-04-26 21:04:36|     1| 40.6|
|         2|2025-04-26 21:09:36|     1| 40.7|
|         3|2025-04-26 21:14:36|     1| 40.7|
|         4|2025-04-26 21:19:36|     1| 40.5|
|         5|2025-04-26 21:24:36|     1| 40.3|
|         6|2025-04-26 21:29:36|     1| 40.3|
|         7|2025-04-26 21:34:36|     1| 40.7|
|         8|2025-04-26 21:39:36|     1| 40.7|
|         9|2025-04-26 21:44:36|     1| 40.7|
|        10|2025-04-26 21:49:36|     1| 40.7|
|        11|2025-04-26 21:54:36|     1| 40.7|
|        12|2025-04-26 21:59:36|     1| 40.7|
|        13|2025-04-26 22:04:36|     1| 40.7|
|        14|2025-04-26 22:09:36|     1| 40.7|
|        15|2025-04-26 22:14:36|     1| 40.7|
|        16|2025-04-26 22:19:36|     1| 58.2|
|        17|2025-04-26 22:24:36|     1| 57.7|
|        18|2025-04-26 22:29:36|     1| 58.1|
|        19|2025-04-26 22:34:36|  

                                                                                

In [5]:
# List candidate outliers: rows whose 'valor' is below 0 or above 100
abaixo_do_minimo = col('valor') < 0
acima_do_maximo = col('valor') > 100
outliers = a.where(abaixo_do_minimo | acima_do_maximo)

print('Outliers encontrados:')
outliers.show()

Outliers encontrados:
+----------+-------------------+------+-----+
|id_umidade|          data_hora|fk_upa|valor|
+----------+-------------------+------+-----+
|       127|2025-04-27 07:34:36|     1|110.2|
|       438|2025-04-27 09:29:36|     2|139.8|
|       452|2025-04-27 10:39:36|     2|121.6|
|       525|2025-04-27 16:44:36|     2|146.4|
|       583|2025-04-26 21:34:36|     3|-27.2|
|       748|2025-04-27 11:19:36|     3|133.2|
|       749|2025-04-27 11:24:36|     3| -9.6|
|       869|2025-04-26 21:24:36|     4|134.1|
|       946|2025-04-27 03:49:36|     4|127.4|
|      1043|2025-04-27 11:54:36|     4|114.8|
|      1077|2025-04-27 14:44:36|     4| -2.9|
|      1298|2025-04-27 09:09:36|     5|-10.0|
|      1307|2025-04-27 09:54:36|     5| -9.4|
|      1341|2025-04-27 12:44:36|     5|142.1|
|      1408|2025-04-27 18:19:36|     5|107.7|
|      1427|2025-04-27 19:54:36|     5|111.9|
|      1453|2025-04-26 22:04:36|     6| -3.1|
|      1495|2025-04-27 01:34:36|     6| -9.6|
|      1504|

In [6]:
# Accepted range for 'valor'; anything outside it is treated as an outlier.
# NOTE(review): presumably percent relative humidity, given the source file
# name (umidade_ambiente.csv) -- confirm the unit.
VALOR_MIN, VALOR_MAX = 0, 100
fora_da_faixa = (col('valor') < VALOR_MIN) | (col('valor') > VALOR_MAX)

# Compute the approximate median (relative error 0.01) from in-range readings
# only, so the replacement value is not influenced by the very outliers it is
# going to replace. Null rows are dropped by the filter as well.
quantis = a.filter(~fora_da_faixa).approxQuantile('valor', [0.5], 0.01)
if not quantis:
    # approxQuantile returns an empty list when there is no non-null value;
    # fail with a clear message instead of an IndexError on quantis[0].
    raise ValueError('Nenhum valor válido em "valor" para calcular a mediana')
mediana = quantis[0]
print(f'Mediana calculada: {mediana}')

# Replace the outliers with the median; all other rows keep their value
a = a.withColumn(
    'valor',
    when(fora_da_faixa, mediana).otherwise(col('valor')),
)

# Show the final result (outliers replaced)
print('DataFrame final:')
a.show(70)

Mediana calculada: 50.6
DataFrame final:
+----------+-------------------+------+-----+
|id_umidade|          data_hora|fk_upa|valor|
+----------+-------------------+------+-----+
|         1|2025-04-26 21:04:36|     1| 40.6|
|         2|2025-04-26 21:09:36|     1| 40.7|
|         3|2025-04-26 21:14:36|     1| 40.7|
|         4|2025-04-26 21:19:36|     1| 40.5|
|         5|2025-04-26 21:24:36|     1| 40.3|
|         6|2025-04-26 21:29:36|     1| 40.3|
|         7|2025-04-26 21:34:36|     1| 40.7|
|         8|2025-04-26 21:39:36|     1| 40.7|
|         9|2025-04-26 21:44:36|     1| 40.7|
|        10|2025-04-26 21:49:36|     1| 40.7|
|        11|2025-04-26 21:54:36|     1| 40.7|
|        12|2025-04-26 21:59:36|     1| 40.7|
|        13|2025-04-26 22:04:36|     1| 40.7|
|        14|2025-04-26 22:09:36|     1| 40.7|
|        15|2025-04-26 22:14:36|     1| 40.7|
|        16|2025-04-26 22:19:36|     1| 58.2|
|        17|2025-04-26 22:24:36|     1| 57.7|
|        18|2025-04-26 22:29:36|     1|