In [1]:
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, avg, when
from pyspark.sql.types import DoubleType, IntegerType

# Spark configuration
conf = SparkConf()
# hadoop-aws 3.3.4 resolves aws-java-sdk-bundle 1.12.262 as its own dependency;
# requesting a different bundle version only gets it evicted during Ivy
# resolution, so pin the version that is actually loaded.
conf.set('spark.jars.packages', 'org.apache.hadoop:hadoop-aws:3.3.4,com.amazonaws:aws-java-sdk-bundle:1.12.262')
# S3A credentials come from the AWS SDK InstanceProfileCredentialsProvider.
conf.set('spark.hadoop.fs.s3a.aws.credentials.provider', 'com.amazonaws.auth.InstanceProfileCredentialsProvider')

# Create (or reuse) the Spark session with the configuration above
spark = SparkSession.builder.config(conf=conf).getOrCreate()

# Read the raw CSV (comma-delimited, with header), mapping the literal string
# 'null' to a null value.
# NOTE(review): the original comment mentioned "NULL" in upper case while the
# option is lower-case 'null' -- confirm which spelling the file really uses.
a = spark.read.option('delimiter', ',') \
              .option('header', 'true') \
              .option('nullValue', 'null') \
              .csv('s3a://bucket-raw-upa-connect/temperatura_ambiente.csv')

# No schema is supplied to the reader, so cast 'valor' to double and 'fk_upa'
# to integer explicitly.
a = a.withColumn('valor', col('valor').cast(DoubleType())) \
     .withColumn('fk_upa', col('fk_upa').cast(IntegerType()))

a.show()

:: loading settings :: url = jar:file:/usr/local/lib/python3.7/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /root/.ivy2/cache
The jars for the packages stored in: /root/.ivy2/jars
org.apache.hadoop#hadoop-aws added as a dependency
com.amazonaws#aws-java-sdk-bundle added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-e4570040-22aa-4375-a301-0bd9977b2a16;1.0
	confs: [default]
	found org.apache.hadoop#hadoop-aws;3.3.4 in central
	found com.amazonaws#aws-java-sdk-bundle;1.12.262 in central
	found org.wildfly.openssl#wildfly-openssl;1.0.7.Final in central
:: resolution report :: resolve 552ms :: artifacts dl 35ms
	:: modules in use:
	com.amazonaws#aws-java-sdk-bundle;1.12.262 from central in [default]
	org.apache.hadoop#hadoop-aws;3.3.4 from central in [default]
	org.wildfly.openssl#wildfly-openssl;1.0.7.Final from central in [default]
	:: evicted modules:
	com.amazonaws#aws-java-sdk-bundle;1.11.901 by [com.amazonaws#aws-java-sdk-bundle;1.12.262] in [default]
	---------------------------------------------------------------------
	|     

+-----------------------+-------------------+-----+------+
|id_temperatura_ambiente|          data_hora|valor|fk_upa|
+-----------------------+-------------------+-----+------+
|                      1|2025-04-26 21:04:36| 22.7|     1|
|                      2|2025-04-26 21:09:36| 22.5|     1|
|                      3|2025-04-26 21:14:36| 22.4|     1|
|                      4|2025-04-26 21:19:36| 22.5|     1|
|                      5|2025-04-26 21:24:36| 22.5|     1|
|                      6|2025-04-26 21:29:36| 22.5|     1|
|                      7|2025-04-26 21:34:36| 22.5|     1|
|                      8|2025-04-26 21:39:36| 22.4|     1|
|                      9|2025-04-26 21:44:36| 22.3|     1|
|                     10|2025-04-26 21:49:36| 22.2|     1|
|                     11|2025-04-26 21:54:36| 22.4|     1|
|                     12|2025-04-26 21:59:36| 22.5|     1|
|                     13|2025-04-26 22:04:36| 22.5|     1|
|                     14|2025-04-26 22:09:36| 22.5|     

In [2]:
# Rows whose 'valor' reading is missing
valor_ausente = col('valor').isNull()
nulls = a.where(valor_ausente)

nulls.show()

+-----------------------+-------------------+-----+------+
|id_temperatura_ambiente|          data_hora|valor|fk_upa|
+-----------------------+-------------------+-----+------+
|                     21|2025-04-26 22:44:36| null|     1|
|                    100|2025-04-27 05:19:36| null|     1|
|                    195|2025-04-27 13:14:36| null|     1|
|                    196|2025-04-27 13:19:36| null|     1|
|                    197|2025-04-27 13:24:36| null|     1|
|                    198|2025-04-27 13:29:36| null|     1|
|                    199|2025-04-27 13:34:36| null|     1|
|                    200|2025-04-27 13:39:36| null|     1|
|                    222|2025-04-27 15:29:36| null|     1|
|                    285|2025-04-27 20:44:36| null|     1|
|                    432|2025-04-27 08:59:36| null|     2|
|                    439|2025-04-27 09:34:36| null|     2|
|                    487|2025-04-27 13:34:36| null|     2|
|                    515|2025-04-27 15:54:36| null|     

In [3]:
from pyspark.sql.window import Window
from pyspark.sql import functions as F

# Forward-fill window: per UPA, ordered by data_hora, spanning from the first
# row of the partition up to the current row.
# NOTE(review): data_hora is not cast in the visible cells, so it is presumably
# ordered as text; the 'yyyy-MM-dd HH:mm:ss' layout shown in the output sorts
# chronologically as text -- confirm the format is uniform.
window_spec = Window.partitionBy('fk_upa').orderBy('data_hora') \
                    .rowsBetween(Window.unboundedPreceding, Window.currentRow)

# Backward-fill window: same partitioning/ordering, spanning from the current
# row to the end of the partition.
window_spec_seguinte = Window.partitionBy('fk_upa').orderBy('data_hora') \
                             .rowsBetween(Window.currentRow, Window.unboundedFollowing)

# Fill each missing reading with the most recent non-null value. Rows that come
# before the first non-null reading of their partition have nothing to carry
# forward, so they fall back to the next non-null value instead of staying
# null. A partition with no non-null reading at all remains null.
a = a.withColumn(
    'valor_preenchido',
    F.coalesce(
        F.last('valor', ignorenulls=True).over(window_spec),
        F.first('valor', ignorenulls=True).over(window_spec_seguinte),
    ),
)

# Replace the original column with the filled one (this moves 'valor' to the
# last column position)
a = a.drop('valor').withColumnRenamed('valor_preenchido', 'valor')

# Show the result
a.show(30)

[Stage 3:>                                                          (0 + 1) / 1]

+-----------------------+-------------------+------+-----+
|id_temperatura_ambiente|          data_hora|fk_upa|valor|
+-----------------------+-------------------+------+-----+
|                      1|2025-04-26 21:04:36|     1| 22.7|
|                      2|2025-04-26 21:09:36|     1| 22.5|
|                      3|2025-04-26 21:14:36|     1| 22.4|
|                      4|2025-04-26 21:19:36|     1| 22.5|
|                      5|2025-04-26 21:24:36|     1| 22.5|
|                      6|2025-04-26 21:29:36|     1| 22.5|
|                      7|2025-04-26 21:34:36|     1| 22.5|
|                      8|2025-04-26 21:39:36|     1| 22.4|
|                      9|2025-04-26 21:44:36|     1| 22.3|
|                     10|2025-04-26 21:49:36|     1| 22.2|
|                     11|2025-04-26 21:54:36|     1| 22.4|
|                     12|2025-04-26 21:59:36|     1| 22.5|
|                     13|2025-04-26 22:04:36|     1| 22.5|
|                     14|2025-04-26 22:09:36|     1| 22.

                                                                                

In [4]:
# List candidate outliers: readings that fall outside the closed range [10, 40]
dentro_da_faixa = col('valor').between(10, 40)
outliers = a.where(~dentro_da_faixa)

print('Outliers encontrados:')
outliers.show()

Outliers encontrados:
+-----------------------+-------------------+------+-----+
|id_temperatura_ambiente|          data_hora|fk_upa|valor|
+-----------------------+-------------------+------+-----+
|                     41|2025-04-27 00:24:36|     1|-14.8|
|                    110|2025-04-27 06:09:36|     1|-16.2|
|                    112|2025-04-27 06:19:36|     1| 96.7|
|                    190|2025-04-27 12:49:36|     1| -8.3|
|                    211|2025-04-27 14:34:36|     1|-24.9|
|                    235|2025-04-27 16:34:36|     1| -3.2|
|                    246|2025-04-27 17:29:36|     1| 66.6|
|                    256|2025-04-27 18:19:36|     1| 68.9|
|                    377|2025-04-27 04:24:36|     2|-10.2|
|                    483|2025-04-27 13:14:36|     2|-26.7|
|                    518|2025-04-27 16:09:36|     2|-13.5|
|                    664|2025-04-27 04:19:36|     3|-47.3|
|                    678|2025-04-27 05:29:36|     3| 68.6|
|                    719|2025-04-2

In [5]:
# Readings below 10 or above 40 are treated as outliers (same thresholds as the
# outlier listing earlier in the notebook).
is_outlier = (col('valor') < 10) | (col('valor') > 40)

# Compute the replacement median from in-range readings only, so the values
# being replaced cannot skew it. approxQuantile ignores nulls and returns an
# empty list when there is no non-null value to work with.
quantis = a.filter(~is_outlier).approxQuantile("valor", [0.5], 0.01)
if not quantis:
    raise ValueError("No in-range 'valor' readings available to compute the median")
mediana = quantis[0]
print(f'Mediana calculada: {mediana}')

# Replace the outliers with the median; every other row keeps its value
a = a.withColumn('valor',
                 when(is_outlier, mediana)
                 .otherwise(col('valor')))

# Show the final result (missing values filled, outliers replaced)
print('DataFrame final:')
a.show(70)

Mediana calculada: 22.9
DataFrame final:
+-----------------------+-------------------+------+-----+
|id_temperatura_ambiente|          data_hora|fk_upa|valor|
+-----------------------+-------------------+------+-----+
|                      1|2025-04-26 21:04:36|     1| 22.7|
|                      2|2025-04-26 21:09:36|     1| 22.5|
|                      3|2025-04-26 21:14:36|     1| 22.4|
|                      4|2025-04-26 21:19:36|     1| 22.5|
|                      5|2025-04-26 21:24:36|     1| 22.5|
|                      6|2025-04-26 21:29:36|     1| 22.5|
|                      7|2025-04-26 21:34:36|     1| 22.5|
|                      8|2025-04-26 21:39:36|     1| 22.4|
|                      9|2025-04-26 21:44:36|     1| 22.3|
|                     10|2025-04-26 21:49:36|     1| 22.2|
|                     11|2025-04-26 21:54:36|     1| 22.4|
|                     12|2025-04-26 21:59:36|     1| 22.5|
|                     13|2025-04-26 22:04:36|     1| 22.5|
|              

In [6]:
# Persist the treated data to the trusted bucket as CSV with a header row,
# collapsed to a single partition and overwriting any previous output.
(
    a.coalesce(1)
     .write
     .mode('overwrite')
     .option('header', 'true')
     .csv('s3a://bucket-trusted-upa-connect/temperatura_ambiente_tratado.csv')
)

# Release the Spark session once the write has finished
spark.stop()

25/04/28 20:35:36 WARN AbstractS3ACommitterFactory: Using standard FileOutputCommitter to commit work. This is slow and potentially unsafe.
25/04/28 20:35:37 WARN AbstractS3ACommitterFactory: Using standard FileOutputCommitter to commit work. This is slow and potentially unsafe.
                                                                                