In [1]:
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, avg, sum as sum_, first, round
from pyspark.sql.types import DoubleType, IntegerType

# Configurações do Spark
conf = SparkConf()
conf.set('spark.jars.packages', 'org.apache.hadoop:hadoop-aws:3.3.4,com.amazonaws:aws-java-sdk-bundle:1.11.901')
conf.set('spark.hadoop.fs.s3a.aws.credentials.provider', 'com.amazonaws.auth.InstanceProfileCredentialsProvider')

# Criar sessão Spark
spark = SparkSession.builder.config(conf=conf).getOrCreate()

# Ler o CSV, já tratando "NULL" como valor nulo
a = spark.read.option('delimiter', ',') \
              .option('header', 'true') \
              .option('nullValue', 'NULL') \
              .csv('s3a://bucket-raw-upa-connect/temperatura_paciente.csv')

# Converter 'valor' para DoubleType e 'fk_paciente' para IntegerType
a = a.withColumn('valor', col('valor').cast(DoubleType())) \
     .withColumn('fk_paciente', col('fk_paciente').cast(IntegerType()))

a.show()

:: loading settings :: url = jar:file:/usr/local/lib/python3.7/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /root/.ivy2/cache
The jars for the packages stored in: /root/.ivy2/jars
org.apache.hadoop#hadoop-aws added as a dependency
com.amazonaws#aws-java-sdk-bundle added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-c15fbd62-15c4-40ff-a413-0417472975ee;1.0
	confs: [default]
	found org.apache.hadoop#hadoop-aws;3.3.4 in central
	found com.amazonaws#aws-java-sdk-bundle;1.12.262 in central
	found org.wildfly.openssl#wildfly-openssl;1.0.7.Final in central
downloading https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/3.3.4/hadoop-aws-3.3.4.jar ...
	[SUCCESSFUL ] org.apache.hadoop#hadoop-aws;3.3.4!hadoop-aws.jar (55ms)
downloading https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/1.12.262/aws-java-sdk-bundle-1.12.262.jar ...
	[SUCCESSFUL ] com.amazonaws#aws-java-sdk-bundle;1.12.262!aws-java-sdk-bundle.jar (3265ms)
downloading https://repo1.maven.org/maven2/org/wildfly/openssl/wildfly-openssl/1.0.7.Final/wildfly-op

+-----------------------+-------------------+-----+-----------+
|id_temperatura_paciente|          data_hora|valor|fk_paciente|
+-----------------------+-------------------+-----+-----------+
|                      1|2025-04-26 21:31:43| 36.9|          7|
|                      2|2025-04-26 21:31:42| 36.9|          7|
|                      3|2025-04-26 21:31:41| 36.9|          7|
|                      4|2025-04-26 21:36:43| 36.5|         35|
|                      5|2025-04-26 21:36:42| 36.5|         35|
|                      6|2025-04-26 21:36:41| 36.6|         35|
|                      7|2025-04-26 21:41:43| 36.8|         11|
|                      8|2025-04-26 21:41:42| 36.7|         11|
|                      9|2025-04-26 21:41:41| 36.9|         11|
|                     10|2025-04-26 21:46:43| 36.4|         92|
|                     11|2025-04-26 21:46:42| 36.3|         92|
|                     12|2025-04-26 21:46:41| 36.3|         92|
|                     13|2025-04-26 21:5

In [2]:
nulls_valor = a.select(sum_(col("valor").isNull().cast("decimal")).alias("qtd_nulls"))

# Mostra o resultado
nulls_valor.show()

[Stage 2:>                                                          (0 + 1) / 1]

+---------+
|qtd_nulls|
+---------+
|       82|
+---------+



                                                                                

In [3]:
# Nulls
nulls = a.filter(col('valor').isNull())

nulls.show()

# Remover linhas onde a coluna 'valor' está nula
a = a.filter(col('valor').isNotNull())

a.show()

+-----------------------+-------------------+-----+-----------+
|id_temperatura_paciente|          data_hora|valor|fk_paciente|
+-----------------------+-------------------+-----+-----------+
|                     13|2025-04-26 21:51:43| null|         69|
|                     22|2025-04-26 22:06:43| null|         65|
|                     43|2025-04-26 22:41:43| null|         72|
|                     46|2025-04-26 22:46:43| null|         26|
|                     55|2025-04-26 23:01:43| null|         83|
|                     61|2025-04-26 23:11:43| null|         72|
|                     73|2025-04-26 23:31:43| null|         73|
|                     88|2025-04-26 23:56:43| null|         53|
|                     97|2025-04-27 00:11:43| null|         41|
|                    100|2025-04-27 00:16:43| null|         59|
|                    109|2025-04-27 00:31:43| null|         60|
|                    115|2025-04-27 00:41:43| null|        113|
|                    145|2025-04-27 01:3

In [4]:
# Mostrar possíveis outliers
outliers = a.filter((col('valor') <= 34) | (col('valor') > 42))
print('Outliers encontrados:')
outliers.show()

# Remover os outliers
a = a.filter((col('valor') >= 34) & (col('valor') <= 42))

# Mostrar resultado final (sem nulos e sem outliers)
print('DataFrame final:')
a.show()

Outliers encontrados:
+-----------------------+-------------------+-----+-----------+
|id_temperatura_paciente|          data_hora|valor|fk_paciente|
+-----------------------+-------------------+-----+-----------+
|                     19|2025-04-26 22:01:43| 54.3|          6|
|                     20|2025-04-26 22:01:42| 54.5|          6|
|                     21|2025-04-26 22:01:41| 54.4|          6|
|                    187|2025-04-27 02:41:43| 14.1|         70|
|                    188|2025-04-27 02:41:42| 14.1|         70|
|                    189|2025-04-27 02:41:41| 13.9|         70|
|                    427|2025-04-27 09:21:43| 80.8|         43|
|                    428|2025-04-27 09:21:42| 80.5|         43|
|                    429|2025-04-27 09:21:41| 81.2|         43|
|                    613|2025-04-27 14:31:43| 89.9|         52|
|                    614|2025-04-27 14:31:42| 90.2|         52|
|                    615|2025-04-27 14:31:41| 89.7|         52|
|                 

In [5]:
# Agrupa por paciente, pega a primeira data_hora e calcula a média do valor
a_agrupada = a.groupBy('fk_paciente').agg(
    first('data_hora').alias('data_hora'),
    round(avg('valor'), 1).alias('media_temperatura')
)

# Ordena pelo fk_paciente (se quiser)
a_agrupada = a_agrupada.orderBy('data_hora')

# Exibe o resultado
print('DataFrame final:')
a_agrupada.show()

DataFrame final:


[Stage 9:>                                                          (0 + 1) / 1]

+-----------+-------------------+-----------------+
|fk_paciente|          data_hora|media_temperatura|
+-----------+-------------------+-----------------+
|          7|2025-04-26 21:31:43|             36.9|
|         35|2025-04-26 21:36:43|             36.8|
|         11|2025-04-26 21:41:43|             36.8|
|         92|2025-04-26 21:46:43|             36.7|
|         69|2025-04-26 21:51:42|             37.1|
|         57|2025-04-26 21:56:43|             37.0|
|         65|2025-04-26 22:06:42|             37.1|
|         28|2025-04-26 22:11:43|             36.6|
|         44|2025-04-26 22:16:43|             36.9|
|        104|2025-04-26 22:21:43|             36.1|
|         16|2025-04-26 22:26:43|             37.0|
|        127|2025-04-26 22:31:43|             37.0|
|         72|2025-04-26 22:41:42|             36.6|
|         26|2025-04-26 22:46:42|             37.0|
|          5|2025-04-26 22:51:43|             36.7|
|        111|2025-04-26 22:56:43|             36.8|
|         83

                                                                                

In [7]:
a_agrupada.coalesce(1) \
    .write \
    .option('header', 'true') \
    .mode('overwrite') \
    .csv('s3a://bucket-trusted-upa-connect/temperatura_paciente_tratado.csv')

spark.stop()

Py4JJavaError: An error occurred while calling o107.csv.
: java.util.NoSuchElementException: None.get
	at scala.None$.get(Option.scala:529)
	at scala.None$.get(Option.scala:527)
	at org.apache.spark.sql.execution.datasources.BasicWriteJobStatsTracker$.metrics(BasicWriteStatsTracker.scala:239)
	at org.apache.spark.sql.execution.command.DataWritingCommand.metrics(DataWritingCommand.scala:55)
	at org.apache.spark.sql.execution.command.DataWritingCommand.metrics$(DataWritingCommand.scala:55)
	at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelationCommand.metrics$lzycompute(InsertIntoHadoopFsRelationCommand.scala:47)
	at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelationCommand.metrics(InsertIntoHadoopFsRelationCommand.scala:47)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.metrics$lzycompute(commands.scala:109)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.metrics(commands.scala:109)
	at org.apache.spark.sql.execution.SparkPlanInfo$.fromSparkPlan(SparkPlanInfo.scala:63)
	at org.apache.spark.sql.execution.SparkPlanInfo$.$anonfun$fromSparkPlan$3(SparkPlanInfo.scala:75)
	at scala.collection.immutable.List.map(List.scala:293)
	at org.apache.spark.sql.execution.SparkPlanInfo$.fromSparkPlan(SparkPlanInfo.scala:75)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$6(SQLExecution.scala:115)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:195)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:103)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:827)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:65)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:98)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:94)
	at org.apache.spark.sql.catalyst.trees.TreeNode.$anonfun$transformDownWithPruning$1(TreeNode.scala:512)
	at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(TreeNode.scala:104)
	at org.apache.spark.sql.catalyst.trees.TreeNode.transformDownWithPruning(TreeNode.scala:512)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.org$apache$spark$sql$catalyst$plans$logical$AnalysisHelper$$super$transformDownWithPruning(LogicalPlan.scala:31)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning(AnalysisHelper.scala:267)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning$(AnalysisHelper.scala:263)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:31)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:31)
	at org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:488)
	at org.apache.spark.sql.execution.QueryExecution.eagerlyExecuteCommands(QueryExecution.scala:94)
	at org.apache.spark.sql.execution.QueryExecution.commandExecuted$lzycompute(QueryExecution.scala:81)
	at org.apache.spark.sql.execution.QueryExecution.commandExecuted(QueryExecution.scala:79)
	at org.apache.spark.sql.execution.QueryExecution.assertCommandExecuted(QueryExecution.scala:133)
	at org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:856)
	at org.apache.spark.sql.DataFrameWriter.saveToV1Source(DataFrameWriter.scala:387)
	at org.apache.spark.sql.DataFrameWriter.saveInternal(DataFrameWriter.scala:360)
	at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:239)
	at org.apache.spark.sql.DataFrameWriter.csv(DataFrameWriter.scala:847)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:566)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:829)


ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/site-packages/py4j/clientserver.py", line 516, in send_command
    raise Py4JNetworkError("Answer from Java side is empty")
py4j.protocol.Py4JNetworkError: Answer from Java side is empty

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.7/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/usr/local/lib/python3.7/site-packages/py4j/clientserver.py", line 540, in send_command
    "Error while sending or receiving", e, proto.ERROR_ON_RECEIVE)
py4j.protocol.Py4JNetworkError: Error while sending or receiving
