In [12]:
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, avg, when, to_date, dayofweek
from pyspark.sql.types import DoubleType, IntegerType

# Spark settings: fetch the S3A connector jars and use the AWS
# InstanceProfileCredentialsProvider so no static keys live in the notebook.
conf = SparkConf().setAll([
    ('spark.jars.packages',
     'org.apache.hadoop:hadoop-aws:3.3.4,com.amazonaws:aws-java-sdk-bundle:1.11.901'),
    ('spark.hadoop.fs.s3a.aws.credentials.provider',
     'com.amazonaws.auth.InstanceProfileCredentialsProvider'),
])

# Create (or reuse) the Spark session with the settings above
spark = SparkSession.builder.config(conf=conf).getOrCreate()

# Read the raw CSV with a header row; the literal text "NULL" is parsed as null
a = (
    spark.read
    .options(delimiter=',', header='true', nullValue='NULL')
    .csv('s3a://bucket-raw-upa-connect-eduardo/visao_computacional.csv')
)

# Cast 'qtd_pessoas' to DoubleType and 'fk_upa' to IntegerType
# (no schema is inferred, so the remaining columns stay as read from the CSV)
a = (
    a.withColumn('qtd_pessoas', col('qtd_pessoas').cast(DoubleType()))
     .withColumn('fk_upa', col('fk_upa').cast(IntegerType()))
)

a.show()

+---------+-------------------+-----------+------+
|id_camera|          data_hora|qtd_pessoas|fk_upa|
+---------+-------------------+-----------+------+
|        1|2025-04-07 18:29:28|      193.0|     1|
|        2|2025-04-07 18:29:28|      234.0|     1|
|        3|2025-04-07 18:29:28|      238.0|     1|
|        4|2025-04-07 18:29:28|      241.0|     1|
|        5|2025-04-07 18:29:28|      235.0|     1|
|        6|2025-04-07 18:29:28|      245.0|     1|
|        7|2025-04-07 18:29:28|      241.0|     1|
|        8|2025-04-07 18:29:28|      239.0|     1|
|        9|2025-04-07 18:29:28|      239.0|     1|
|       10|2025-04-07 18:29:28|      240.0|     1|
|       11|2025-04-07 18:29:28|      240.0|     1|
|       12|2025-04-07 18:29:29|      240.0|     1|
|       13|2025-04-07 18:29:29|      239.0|     1|
|       14|2025-04-07 18:29:29|      240.0|     1|
|       15|2025-04-07 18:29:30|      240.0|     1|
|       16|2025-04-07 18:29:30|      200.0|     1|
|       17|2025-04-07 18:29:30|

In [3]:
# Sanity check: list every row whose 'qtd_pessoas' value is missing
nulls = a.where(col('qtd_pessoas').isNull())
nulls.show()

+---------+---------+-----------+------+
|id_camera|data_hora|qtd_pessoas|fk_upa|
+---------+---------+-----------+------+
+---------+---------+-----------+------+



In [7]:
from pyspark.sql.window import Window
from pyspark.sql import functions as F

# Forward-fill window: within each 'fk_upa', every row from the start of the
# partition up to (and including) the current row.
#
# Many readings share the same 'data_hora' (one-second resolution), so ordering
# by it alone leaves tied rows in arbitrary order and makes the fill
# non-deterministic. 'id_camera' is added as a tie-breaker, cast to int so that
# 2 sorts before 10 (the CSV is read as strings).
# NOTE(review): assumes 'id_camera' is numeric and distinct among rows with the
# same 'data_hora', as in the displayed sample — confirm.
window_spec = (
    Window.partitionBy('fk_upa')
          .orderBy(F.col('data_hora'), F.col('id_camera').cast('int'))
          .rowsBetween(Window.unboundedPreceding, Window.currentRow)
)

# Take the last non-null 'qtd_pessoas' seen so far in the window, then swap it
# in for the original column. Rows that precede the first non-null value of
# their partition have nothing to carry forward and remain null.
a = (
    a.withColumn('valor_preenchido',
                 F.last('qtd_pessoas', ignorenulls=True).over(window_spec))
     .drop('qtd_pessoas')
     .withColumnRenamed('valor_preenchido', 'qtd_pessoas')
)

# Show the result
a.show(30)

+---------+-------------------+------+-----------+
|id_camera|          data_hora|fk_upa|qtd_pessoas|
+---------+-------------------+------+-----------+
|        1|2025-04-07 18:29:28|     1|      193.0|
|        2|2025-04-07 18:29:28|     1|      234.0|
|        3|2025-04-07 18:29:28|     1|      238.0|
|        4|2025-04-07 18:29:28|     1|      241.0|
|        5|2025-04-07 18:29:28|     1|      235.0|
|        6|2025-04-07 18:29:28|     1|      245.0|
|        7|2025-04-07 18:29:28|     1|      241.0|
|        8|2025-04-07 18:29:28|     1|      239.0|
|        9|2025-04-07 18:29:28|     1|      239.0|
|       10|2025-04-07 18:29:28|     1|      240.0|
|       11|2025-04-07 18:29:28|     1|      240.0|
|       12|2025-04-07 18:29:29|     1|      240.0|
|       13|2025-04-07 18:29:29|     1|      239.0|
|       14|2025-04-07 18:29:29|     1|      240.0|
|       15|2025-04-07 18:29:30|     1|      240.0|
|       16|2025-04-07 18:29:30|     1|      200.0|
|       17|2025-04-07 18:29:30|

                                                                                

In [5]:
# Flag rows with a negative 'qtd_pessoas' as candidate outliers and list them
outliers = a.where(col('qtd_pessoas') < 0)
print('Outliers encontrados:')
outliers.show()

Outliers encontrados:
+---------+---------+-----------+------+
|id_camera|data_hora|qtd_pessoas|fk_upa|
+---------+---------+-----------+------+
+---------+---------+-----------+------+



In [16]:
# Approximate median (relative error 0.01) of the valid, non-negative readings
# only: the negative values are the ones being replaced, so they must not
# influence the replacement value.
validos = a.filter(col('qtd_pessoas') >= 0)
quantis = validos.approxQuantile('qtd_pessoas', [0.5], 0.01)

# approxQuantile returns an empty list when there is no non-null value to
# summarise; fail with a clear message instead of a bare IndexError.
if not quantis:
    raise ValueError('Nenhum valor válido de qtd_pessoas para calcular a mediana')

mediana = quantis[0]
print(f'Mediana calculada: {mediana}')

# Replace the negative readings with the median; keep every other value as is
a = a.withColumn(
    'qtd_pessoas',
    when(col('qtd_pessoas') < 0, mediana).otherwise(col('qtd_pessoas')),
)

# Derive the calendar date and the day of week (Spark: 1 = Sunday ... 7 = Saturday)
a = a.withColumn('dia', to_date('data_hora'))
a = a.withColumn('numero_dia_semana', dayofweek('data_hora'))

# Average people count per day, in chronological order
novo_df = a.groupBy('dia', 'numero_dia_semana').agg(
    avg('qtd_pessoas').alias('media_pessoas')
).orderBy('dia')

# Show the final aggregated DataFrame
print('DataFrame final:')
novo_df.show()

Mediana calculada: 47.0
DataFrame final:
+----------+-----------------+------------------+
|       dia|numero_dia_semana|     media_pessoas|
+----------+-----------------+------------------+
|2025-04-07|                2|236.86486486486487|
|2025-04-21|                2|45.166666666666664|
|2025-04-22|                3| 67.72222222222223|
|2025-04-23|                4|35.851851851851855|
|2025-04-24|                5|              34.0|
|2025-04-25|                6| 35.27777777777778|
|2025-04-26|                7|36.888888888888886|
|2025-04-27|                1|33.888888888888886|
+----------+-----------------+------------------+



In [17]:
# Collapse to a single partition so one part file is produced, then overwrite
# the trusted-zone output with a header row. Spark writes the target path as a
# directory, even though its name ends in ".csv".
(
    novo_df.coalesce(1)
    .write
    .mode('overwrite')
    .option('header', 'true')
    .csv('s3a://bucket-trusted-upa-connect-eduardo/camera_visao_comp_tratado.csv')
)

# Shut down the Spark session now that the output has been written
spark.stop()

25/04/29 01:13:34 WARN AbstractS3ACommitterFactory: Using standard FileOutputCommitter to commit work. This is slow and potentially unsafe.
25/04/29 01:13:35 WARN AbstractS3ACommitterFactory: Using standard FileOutputCommitter to commit work. This is slow and potentially unsafe.
                                                                                