In [1]:
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, avg, when
from pyspark.sql.types import DoubleType, IntegerType

# Spark configuration.
conf = SparkConf()
# hadoop-aws 3.3.4 depends on aws-java-sdk-bundle 1.12.262. Requesting 1.11.901
# only makes Ivy resolve it and then evict it in favour of 1.12.262, so pin the
# version that actually ends up on the classpath.
conf.set(
    'spark.jars.packages',
    'org.apache.hadoop:hadoop-aws:3.3.4,com.amazonaws:aws-java-sdk-bundle:1.12.262',
)
# Let S3A obtain credentials from the instance-profile provider, so no access
# keys have to be written in the notebook.
conf.set(
    'spark.hadoop.fs.s3a.aws.credentials.provider',
    'com.amazonaws.auth.InstanceProfileCredentialsProvider',
)

# Create (or reuse) the Spark session with the configuration above.
spark = SparkSession.builder.config(conf=conf).getOrCreate()

# Read the raw CSV (comma-delimited, with a header row), treating the literal
# string "NULL" as a missing value.
a = (
    spark.read
    .option('delimiter', ',')
    .option('header', 'true')
    .option('nullValue', 'NULL')
    .csv('s3a://bucket-raw-upa-connect/camera_visao_comp.csv')
)

# Cast 'qtd_pessoas' to DoubleType and 'fk_upa' to IntegerType. No schema
# inference is requested, so the remaining columns keep the type the CSV
# reader assigned to them.
a = (
    a.withColumn('qtd_pessoas', col('qtd_pessoas').cast(DoubleType()))
     .withColumn('fk_upa', col('fk_upa').cast(IntegerType()))
)

a.show()

:: loading settings :: url = jar:file:/usr/local/lib/python3.7/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /root/.ivy2/cache
The jars for the packages stored in: /root/.ivy2/jars
org.apache.hadoop#hadoop-aws added as a dependency
com.amazonaws#aws-java-sdk-bundle added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-05d894ff-e912-4281-9250-71d7100521b6;1.0
	confs: [default]
	found org.apache.hadoop#hadoop-aws;3.3.4 in central
	found com.amazonaws#aws-java-sdk-bundle;1.12.262 in central
	found org.wildfly.openssl#wildfly-openssl;1.0.7.Final in central
:: resolution report :: resolve 512ms :: artifacts dl 26ms
	:: modules in use:
	com.amazonaws#aws-java-sdk-bundle;1.12.262 from central in [default]
	org.apache.hadoop#hadoop-aws;3.3.4 from central in [default]
	org.wildfly.openssl#wildfly-openssl;1.0.7.Final from central in [default]
	:: evicted modules:
	com.amazonaws#aws-java-sdk-bundle;1.11.901 by [com.amazonaws#aws-java-sdk-bundle;1.12.262] in [default]
	---------------------------------------------------------------------
	|     

+---------+-------------------+-----------+------+
|id_camera|          data_hora|qtd_pessoas|fk_upa|
+---------+-------------------+-----------+------+
|        1|2025-04-26 21:04:36|      110.0|     1|
|        2|2025-04-26 21:09:36|      113.0|     1|
|        3|2025-04-26 21:14:36|      111.0|     1|
|        4|2025-04-26 21:19:36|      115.0|     1|
|        5|2025-04-26 21:24:36|      118.0|     1|
|        6|2025-04-26 21:29:36|      119.0|     1|
|        7|2025-04-26 21:34:36|      117.0|     1|
|        8|2025-04-26 21:39:36|      116.0|     1|
|        9|2025-04-26 21:44:36|      116.0|     1|
|       10|2025-04-26 21:49:36|      117.0|     1|
|       11|2025-04-26 21:54:36|      116.0|     1|
|       12|2025-04-26 21:59:36|       null|     1|
|       13|2025-04-26 22:04:36|      118.0|     1|
|       14|2025-04-26 22:09:36|      117.0|     1|
|       15|2025-04-26 22:14:36|      115.0|     1|
|       16|2025-04-26 22:19:36|      112.0|     1|
|       17|2025-04-26 22:24:36|

                                                                                

In [2]:
# Rows whose people count is missing, collected for inspection.
nulls = a.where(a['qtd_pessoas'].isNull())

nulls.show()

+---------+-------------------+-----------+------+
|id_camera|          data_hora|qtd_pessoas|fk_upa|
+---------+-------------------+-----------+------+
|       12|2025-04-26 21:59:36|       null|     1|
|       20|2025-04-26 22:39:36|       null|     1|
|       76|2025-04-27 03:19:36|       null|     1|
|       95|2025-04-27 04:54:36|       null|     1|
|      113|2025-04-27 06:24:36|       null|     1|
|      123|2025-04-27 07:14:36|       null|     1|
|      126|2025-04-27 07:29:36|       null|     1|
|      138|2025-04-27 08:29:36|       null|     1|
|      148|2025-04-27 09:19:36|       null|     1|
|      162|2025-04-27 10:29:36|       null|     1|
|      172|2025-04-27 11:19:36|       null|     1|
|      175|2025-04-27 11:34:36|       null|     1|
|      183|2025-04-27 12:14:36|       null|     1|
|      186|2025-04-27 12:29:36|       null|     1|
|      199|2025-04-27 13:34:36|       null|     1|
|      202|2025-04-27 13:49:36|       null|     1|
|      219|2025-04-27 15:14:36|

In [3]:
from pyspark.sql.window import Window
from pyspark.sql import functions as F

# Forward-fill window: within each 'fk_upa', every row from the start of the
# partition up to and including the current one, ordered by 'data_hora'.
# NOTE(review): 'data_hora' is not cast in the visible cells, so this ordering
# is presumably on the string value; that matches chronological order only for
# a zero-padded 'yyyy-MM-dd HH:mm:ss' layout -- confirm.
window_spec = Window.partitionBy('fk_upa').orderBy('data_hora') \
                    .rowsBetween(Window.unboundedPreceding, Window.currentRow)

# Backward-fill window: the current row through the end of the same partition.
janela_seguinte = Window.partitionBy('fk_upa').orderBy('data_hora') \
                        .rowsBetween(Window.currentRow, Window.unboundedFollowing)

# Fill each missing 'qtd_pessoas' with the most recent non-null reading.
# A plain forward fill leaves the leading rows of a partition null when the
# first reading(s) are missing, so fall back to the next non-null reading.
# Rows that already have a preceding value get the same result as before.
a = a.withColumn(
    'valor_preenchido',
    F.coalesce(
        F.last('qtd_pessoas', ignorenulls=True).over(window_spec),
        F.first('qtd_pessoas', ignorenulls=True).over(janela_seguinte),
    ),
)

# Replace the original column with the filled one (the filled column ends up
# last in the schema, as before).
a = a.drop('qtd_pessoas').withColumnRenamed('valor_preenchido', 'qtd_pessoas')

# Show the result.
a.show(30)

[Stage 3:>                                                          (0 + 1) / 1]

+---------+-------------------+------+-----------+
|id_camera|          data_hora|fk_upa|qtd_pessoas|
+---------+-------------------+------+-----------+
|        1|2025-04-26 21:04:36|     1|      110.0|
|        2|2025-04-26 21:09:36|     1|      113.0|
|        3|2025-04-26 21:14:36|     1|      111.0|
|        4|2025-04-26 21:19:36|     1|      115.0|
|        5|2025-04-26 21:24:36|     1|      118.0|
|        6|2025-04-26 21:29:36|     1|      119.0|
|        7|2025-04-26 21:34:36|     1|      117.0|
|        8|2025-04-26 21:39:36|     1|      116.0|
|        9|2025-04-26 21:44:36|     1|      116.0|
|       10|2025-04-26 21:49:36|     1|      117.0|
|       11|2025-04-26 21:54:36|     1|      116.0|
|       12|2025-04-26 21:59:36|     1|      116.0|
|       13|2025-04-26 22:04:36|     1|      118.0|
|       14|2025-04-26 22:09:36|     1|      117.0|
|       15|2025-04-26 22:14:36|     1|      115.0|
|       16|2025-04-26 22:19:36|     1|      112.0|
|       17|2025-04-26 22:24:36|

                                                                                

In [4]:
# A negative people count cannot be a real reading; list such rows as
# outlier candidates.
outliers = a.where(a['qtd_pessoas'] < 0)
print('Outliers encontrados:')
outliers.show()

Outliers encontrados:
+---------+---------+------+-----------+
|id_camera|data_hora|fk_upa|qtd_pessoas|
+---------+---------+------+-----------+
+---------+---------+------+-----------+



In [5]:
# Approximate median of 'qtd_pessoas' (quantile 0.5, relative error 0.01).
quantis = a.approxQuantile("qtd_pessoas", [0.5], 0.01)

# approxQuantile returns an empty list when the column has no non-null values
# (e.g. an empty input file). Fail with an explicit message instead of an
# opaque IndexError, and before anything downstream can run on empty data.
if not quantis:
    raise ValueError(
        "Não foi possível calcular a mediana: 'qtd_pessoas' não possui valores não nulos."
    )

mediana = quantis[0]
print(f'Mediana calculada: {mediana}')

# Replace outliers (negative counts) with the median; other values are kept.
a = a.withColumn('qtd_pessoas',
                when((col('qtd_pessoas') < 0), mediana)
                .otherwise(col('qtd_pessoas')))

# Show the final result after the null fill and outlier replacement.
print('DataFrame final:')
a.show(70)

Mediana calculada: 152.0
DataFrame final:
+---------+-------------------+------+-----------+
|id_camera|          data_hora|fk_upa|qtd_pessoas|
+---------+-------------------+------+-----------+
|        1|2025-04-26 21:04:36|     1|      110.0|
|        2|2025-04-26 21:09:36|     1|      113.0|
|        3|2025-04-26 21:14:36|     1|      111.0|
|        4|2025-04-26 21:19:36|     1|      115.0|
|        5|2025-04-26 21:24:36|     1|      118.0|
|        6|2025-04-26 21:29:36|     1|      119.0|
|        7|2025-04-26 21:34:36|     1|      117.0|
|        8|2025-04-26 21:39:36|     1|      116.0|
|        9|2025-04-26 21:44:36|     1|      116.0|
|       10|2025-04-26 21:49:36|     1|      117.0|
|       11|2025-04-26 21:54:36|     1|      116.0|
|       12|2025-04-26 21:59:36|     1|      116.0|
|       13|2025-04-26 22:04:36|     1|      118.0|
|       14|2025-04-26 22:09:36|     1|      117.0|
|       15|2025-04-26 22:14:36|     1|      115.0|
|       16|2025-04-26 22:19:36|     1|  

In [6]:
# Persist the treated data to the trusted bucket as CSV with a header row,
# replacing any previous output at that path. coalesce(1) reduces the
# DataFrame to a single partition before writing.
(
    a.coalesce(1)
    .write
    .mode('overwrite')
    .option('header', 'true')
    .csv('s3a://bucket-trusted-upa-connect/camera_visao_comp_tratado.csv')
)

# Release the Spark session now that the job is finished.
spark.stop()

25/04/28 20:58:10 WARN AbstractS3ACommitterFactory: Using standard FileOutputCommitter to commit work. This is slow and potentially unsafe.
25/04/28 20:58:11 WARN AbstractS3ACommitterFactory: Using standard FileOutputCommitter to commit work. This is slow and potentially unsafe.
                                                                                