In [None]:
# Parameters
# All three default to empty strings; real values are expected to be supplied
# before the notebook runs (presumably injected by the notebook runner — TODO confirm).
archive_name: str = ""   # object key of the CSV fetched from raw_bucket
raw_bucket: str = ""     # bucket the input CSV is read from
output_bucket: str = ""  # bucket the resulting CSV is uploaded to

In [None]:
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, avg, sum as sum_, first, round, min, when, lit, to_date, to_timestamp, date_format, hour
from pyspark.sql.types import DoubleType, IntegerType, StringType, DateType, TimestampType

In [None]:
%pip install boto3 pandas

import boto3
import pandas as pd
import io

# Inicializa o cliente S3
s3_client = boto3.client('s3')

response = s3_client.get_object(Bucket=raw_bucket, Key=archive_name)
    
df = pd.read_csv(io.BytesIO(response['Body'].read()))
  
# # Concatena todos os DataFrames em um só
# df_final = pd.concat(df, ignore_index=True)
df['data_hora'] = pd.to_datetime(df['data_hora'])

In [None]:
# Spark configuration: add the hadoop-aws / AWS SDK packages and make the s3a
# filesystem authenticate through the instance-profile credentials provider.
conf = SparkConf()
conf.set('spark.jars.packages', 'org.apache.hadoop:hadoop-aws:3.3.4,com.amazonaws:aws-java-sdk-bundle:1.11.901')
conf.set('spark.hadoop.fs.s3a.aws.credentials.provider', 'com.amazonaws.auth.InstanceProfileCredentialsProvider')

# Create (or reuse) the Spark session.
spark = SparkSession.builder.config(conf=conf).getOrCreate()

# Full table: the pandas DataFrame loaded earlier, converted to a Spark DataFrame.
TabelaCompleta = spark.createDataFrame(df)

# Keep only readings from sensor 1 (presumably the camera, going by the
# variable name — confirm) that actually carry a value.
# dropna() is used rather than isNotNull(): blank CSV cells come out of pandas
# as NaN, and a NaN double is not NULL in Spark, so isNotNull() would let those
# rows through and the integer cast below would turn them into bogus readings.
# dropna() removes both NULL and NaN.
df_camera = TabelaCompleta.filter(col("fk_sensor") == 1).dropna(subset=["valor"])

# Normalise types, expose the reading as qtde_pessoas and drop the columns
# that are no longer needed after the sensor filter.
df_camera_filtrado = (
    df_camera
    .withColumn("data_hora", to_timestamp(col("data_hora"), "yyyy-MM-dd HH:mm:ss"))
    .withColumn("qtde_pessoas", col("valor").cast(IntegerType()))
    .withColumn("fk_upa", col("fk_upa").cast(IntegerType()))
    .drop("valor", "fk_unid_medida", "fk_paciente", "fk_sensor")
)

In [None]:
# Keep only head counts inside the inclusive range 0..200, then sort by fk_upa.
qtde_valida = col("qtde_pessoas").between(0, 200)
df_camera_filtrado = df_camera_filtrado.filter(qtde_valida).orderBy("fk_upa")

In [None]:
# Average head count, cast to an integer, shared by both aggregations below.
media_pessoas_expr = avg('qtde_pessoas').cast(IntegerType()).alias('media_pessoas')

# Average per calendar day and UPA.
df_final_semanal = (
    df_camera_filtrado
    .withColumn("data", to_date(col("data_hora")))
    .drop("data_hora")
    .groupBy('data', 'fk_upa')
    .agg(media_pessoas_expr)
    .orderBy('fk_upa', 'data')
)

# Average per calendar day, hour of day and UPA.
df_final_diario = (
    df_camera_filtrado
    .withColumn("data", to_date(col("data_hora")))
    .withColumn("hora", hour(col("data_hora")))
    .drop("data_hora")
    .groupBy('data', 'hora', 'fk_upa')
    .agg(media_pessoas_expr)
    .orderBy('fk_upa', 'data', 'hora')
)


In [None]:
from datetime import datetime, timezone

# Serialise the pandas DataFrame to CSV text in memory (no index column).
# NOTE(review): this uploads `df` (the frame read from the raw bucket), not the
# Spark aggregates df_final_semanal / df_final_diario — confirm this is intended.
csv_buffer = io.StringIO()
df.to_csv(csv_buffer, index=False)

# Object key built from the current local time; the H/M/S letters after each
# directive are emitted literally.
key_trusted_data = f"{datetime.now():%Y-%m-%d-%HH%MM%SS}"

# Upload to S3.
upload_args = {
    "Bucket": output_bucket,
    "Key": key_trusted_data,
    "Body": csv_buffer.getvalue(),
}
s3_client.put_object(**upload_args)