# 📂 Parte 1 Análise Estática (Spark SQL + PostgreSQL + S3)

### 1 - Realize a leitura da tabela apostas do PostgreSQL e transforme a coluna timestamp corretamente.

In [4]:
from pyspark.sql.functions import to_timestamp
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("Leitura PostgreSQL") \
    .config("spark.jars.packages", "org.postgresql:postgresql:42.2.27,org.postgresql:postgresql:42.2.27,org.apache.hadoop:hadoop-aws:3.3.2,com.amazonaws:aws-java-sdk-bundle:1.11.1026") \
    .getOrCreate()

url = "jdbc:postgresql://localhost:5432/betalert"
properties = {
    "user": "admin",
    "password": "admin",
    "driver": "org.postgresql.Driver"
}

table = "apostas"

df = spark.read.jdbc(url=url, table=table, properties=properties)

# convertendo a data timestamp de string para timestamp
df = df.withColumn("timestamp", to_timestamp("timestamp", "yyyy-MM-dd HH:mm:ss"))

df.printSchema()
df.show(10)
spark.stop()


25/06/02 22:14:51 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


root
 |-- aposta_id: string (nullable = true)
 |-- apostador_id: string (nullable = true)
 |-- jogo_id: string (nullable = true)
 |-- valor: decimal(38,18) (nullable = true)
 |-- odd: decimal(38,18) (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- resultado: string (nullable = true)

+---------+------------+-------+--------------------+--------------------+-------------------+---------+
|aposta_id|apostador_id|jogo_id|               valor|                 odd|          timestamp|resultado|
+---------+------------+-------+--------------------+--------------------+-------------------+---------+
| b61f4f08|         u79| jogo31|1813.570000000000...|4.490000000000000000|2025-01-05 15:42:00|   perdeu|
| 88c44f52|         u23| jogo33|830.6000000000000...|2.870000000000000000|2025-01-05 09:54:00| pendente|
| 4c32051c|         u72| jogo72|1829.920000000000...|2.090000000000000000|2025-01-03 23:34:00| pendente|
| 1baa492c|         u70| jogo67|1614.200000000000...|4.210000000000

### 2 - Realize a leitura da tabela transacoes_financeiras e normalize o nome da coluna de valor.

In [2]:
from pyspark.sql.types import DecimalType
from pyspark.sql import SparkSession
from pyspark.sql.functions import round, col

spark = SparkSession.builder \
    .appName("Leitura PostgreSQL") \
    .config("spark.jars.packages", "org.postgresql:postgresql:42.2.27") \
    .getOrCreate()

url = "jdbc:postgresql://localhost:5432/betalert"
properties = {
    "user": "admin",
    "password": "admin",
    "driver": "org.postgresql.Driver"
}

table = "transacoes_financeiras"

df = spark.read.jdbc(url=url, table=table, properties=properties)

# convertendo a coluna para manter só dois valores decimais, mas sem arredondamento para manter o valor real
df = df.withColumn("valor", col("valor").cast(DecimalType(10, 2)))

df.printSchema()
df.show(10)
spark.stop()


root
 |-- id: integer (nullable = true)
 |-- apostador_id: string (nullable = true)
 |-- valor: decimal(10,2) (nullable = true)
 |-- tipo: string (nullable = true)
 |-- data: timestamp (nullable = true)

+---+------------+--------+--------+-------------------+
| id|apostador_id|   valor|    tipo|               data|
+---+------------+--------+--------+-------------------+
|  1|         u69|14890.81|deposito|2025-01-01 10:00:00|
|  2|         u94| 5616.48|deposito|2025-01-01 10:01:00|
|  3|         u95|16376.42|   saque|2025-01-01 10:02:00|
|  4|         u11|15335.80|deposito|2025-01-01 10:03:00|
|  5|         u88|19559.30|   saque|2025-01-01 10:04:00|
|  6|         u32| 4797.95|deposito|2025-01-01 10:05:00|
|  7|         u14| 1494.86|   saque|2025-01-01 10:06:00|
|  8|         u23|15609.39|deposito|2025-01-01 10:07:00|
|  9|          u8|14916.72|deposito|2025-01-01 10:08:00|
| 10|         u92|12104.17|   saque|2025-01-01 10:09:00|
+---+------------+--------+--------+-------------------

### 3 - Faça o join entre apostas e o arquivo apostadores.csv do S3 para incluir o país e dados extras.

In [3]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("Leitura PostgreSQL") \
    .config("spark.hadoop.fs.s3a.access.key", "admin") \
    .config("spark.hadoop.fs.s3a.secret.key", "admin123") \
    .config("spark.hadoop.fs.s3a.endpoint", "http://localhost:9000") \
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem") \
    .config("spark.hadoop.fs.s3a.path.style.access", "true") \
    .config("spark.hadoop.fs.s3a.connection.ssl.enabled", "false") \
    .getOrCreate()

url = "jdbc:postgresql://localhost:5432/betalert"
properties = {
    "user": "admin",
    "password": "admin",
    "driver": "org.postgresql.Driver"
}

# Em tese, o join faz uma combinação de cada saque por aposta, então seria necessário só comparar
# as diferenças entre saque e depósito.

transactions = spark.read.jdbc(url=url, table="transacoes_financeiras", properties=properties)
bets = spark.read.jdbc(url=url, table="apostas", properties=properties)

apostadores = spark.read.csv("s3a://betalogs/apostadores.csv", header=True, inferSchema=True)

bets = bets.withColumnRenamed("valor", "bet_valor")
transactions = transactions.withColumnRenamed("valor", "transaction_valor")

bets = bets.withColumnRenamed("pais", "pais_bet")
transactions = transactions.withColumnRenamed("pais", "pais_transacao")


bets_transactions = transactions.join(
    bets, transactions.apostador_id == bets.apostador_id, "inner"
).drop(bets["apostador_id"])

resultado_final = bets_transactions.join(
    apostadores, bets_transactions.apostador_id == apostadores.id, "inner"
).drop(apostadores["id"])

resultado_final.show(10)


25/06/02 22:13:14 WARN FileStreamSink: Assume no metadata directory. Error while looking for metadata directory in the path: s3a://betalogs/apostadores.csv.
java.lang.RuntimeException: java.lang.ClassNotFoundException: Class org.apache.hadoop.fs.s3a.S3AFileSystem not found
	at org.apache.hadoop.conf.Configuration.getClass(Configuration.java:2688)
	at org.apache.hadoop.fs.FileSystem.getFileSystemClass(FileSystem.java:3431)
	at org.apache.hadoop.fs.FileSystem.createFileSystem(FileSystem.java:3466)
	at org.apache.hadoop.fs.FileSystem.access$300(FileSystem.java:174)
	at org.apache.hadoop.fs.FileSystem$Cache.getInternal(FileSystem.java:3574)
	at org.apache.hadoop.fs.FileSystem$Cache.get(FileSystem.java:3521)
	at org.apache.hadoop.fs.FileSystem.get(FileSystem.java:540)
	at org.apache.hadoop.fs.Path.getFileSystem(Path.java:365)
	at org.apache.spark.sql.execution.streaming.FileStreamSink$.hasMetadata(FileStreamSink.scala:53)
	at org.apache.spark.sql.execution.datasources.DataSource.resolveRela

Py4JJavaError: An error occurred while calling o136.csv.
: java.lang.RuntimeException: java.lang.ClassNotFoundException: Class org.apache.hadoop.fs.s3a.S3AFileSystem not found
	at org.apache.hadoop.conf.Configuration.getClass(Configuration.java:2688)
	at org.apache.hadoop.fs.FileSystem.getFileSystemClass(FileSystem.java:3431)
	at org.apache.hadoop.fs.FileSystem.createFileSystem(FileSystem.java:3466)
	at org.apache.hadoop.fs.FileSystem.access$300(FileSystem.java:174)
	at org.apache.hadoop.fs.FileSystem$Cache.getInternal(FileSystem.java:3574)
	at org.apache.hadoop.fs.FileSystem$Cache.get(FileSystem.java:3521)
	at org.apache.hadoop.fs.FileSystem.get(FileSystem.java:540)
	at org.apache.hadoop.fs.Path.getFileSystem(Path.java:365)
	at org.apache.spark.sql.execution.datasources.DataSource$.$anonfun$checkAndGlobPathIfNecessary$1(DataSource.scala:724)
	at scala.collection.immutable.List.map(List.scala:293)
	at org.apache.spark.sql.execution.datasources.DataSource$.checkAndGlobPathIfNecessary(DataSource.scala:722)
	at org.apache.spark.sql.execution.datasources.DataSource.checkAndGlobPathIfNecessary(DataSource.scala:551)
	at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:404)
	at org.apache.spark.sql.DataFrameReader.loadV1Source(DataFrameReader.scala:229)
	at org.apache.spark.sql.DataFrameReader.$anonfun$load$2(DataFrameReader.scala:211)
	at scala.Option.getOrElse(Option.scala:189)
	at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:211)
	at org.apache.spark.sql.DataFrameReader.csv(DataFrameReader.scala:538)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:77)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:569)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:840)
Caused by: java.lang.ClassNotFoundException: Class org.apache.hadoop.fs.s3a.S3AFileSystem not found
	at org.apache.hadoop.conf.Configuration.getClassByName(Configuration.java:2592)
	at org.apache.hadoop.conf.Configuration.getClass(Configuration.java:2686)
	... 29 more


# 🔍 Parte 2 Detecção de Padrões

Observando se uma aposta é Flash por meio da subtração de duas time stamps em unix (segundos desde 1970)

In [None]:
from pyspark.sql.functions import unix_timestamp, abs

deposits = resultado_final.where(resultado_final.tipo == "deposito")

# depositos onde a diferença entre data e timestamp é menor que 10 segundos
flash_deposits = deposits.where(
    abs(unix_timestamp("data") - unix_timestamp("timestamp")) < 10
)

flash_deposits.show(20)

Exiba apostas-relâmpago com valor acima de R$500.

In [None]:
gt_500_flash_deposits = flash_deposits.where(flash_deposits.bet_valor > 500)
gt_500_flash_deposits.show(20)

Exiba apostas-relâmpago com valor acima de R$10.000.

In [None]:
gt_10000_flash_deposits = flash_deposits.where(flash_deposits.bet_valor > 10000)
gt_10000_flash_deposits.show()

Detecte jogadores que realizaram 10 ou mais apostas em um mesmo jogo.

In [None]:
from pyspark.sql.functions import count

heavy_bettors = bets.groupBy("apostador_id", "jogo_id") \
    .agg(count("*").alias("num_apostas")) \
    .where(col("num_apostas") >= 10)

heavy_bettors.show()

Exiba o total e a média de valores apostados por país.

In [None]:
from pyspark.sql.functions import sum, avg

bets_by_country = resultado_final.groupBy("pais").agg(
    sum("bet_valor").alias("total_apostado"),
    avg("bet_valor").alias("media_apostada")
)

bets_by_country.show()

spark.stop()


# 📡 Parte 3 Streaming em Tempo Real (Kafka + Spark Structured Streaming)


Modifiquei pesadamente o docker compose para conseguir usar o jupyter notebook no projeto. Extrai os scripts em python da imagem, e removi a network do docker compose, mapeando todas as portas na minha máquina, já que não foi possível consumir o kafka na rede virtual do docker

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, from_json, when
from pyspark.sql.types import StructType, StringType, TimestampType, FloatType

spark = SparkSession.builder \
    .appName("KafkaStreamingApp") \
    .config("spark.jars.packages", "org.postgresql:postgresql:42.2.27,org.apache.hadoop:hadoop-aws:3.3.2,com.amazonaws:aws-java-sdk-bundle:1.11.1026,org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.0") \
    .config("spark.hadoop.fs.s3a.access.key", "admin") \
    .config("spark.hadoop.fs.s3a.secret.key", "admin123") \
    .config("spark.hadoop.fs.s3a.endpoint", "http://localhost:9000") \
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem") \
    .config("spark.hadoop.fs.s3a.path.style.access", "true") \
    .config("spark.hadoop.fs.s3a.connection.ssl.enabled", "false") \
    .getOrCreate()

spark.sparkContext.setLogLevel("WARN")

# Kafka
df = spark.readStream \
    .format('kafka') \
    .option("kafka.bootstrap.servers", "localhost:9092,localhost:9093") \
    .option('subscribe', 'stream_apostas') \
    .load()

# Leitura do CSV com apostadores
apostadores = spark.read.csv("s3a://betalogs/apostadores.csv", header=True, inferSchema=True)

# Schema da mensagem Kafka
schema = (StructType()
          .add("aposta_id", StringType())
          .add("apostador_id", StringType())
          .add("jogo_id", StringType())
          .add("valor", FloatType())
          .add("odd", FloatType())
          .add("timestamp", TimestampType())
          )

# Parse do JSON + transformação
df_valores = df.selectExpr("CAST(value AS STRING) as json_str") \
    .select(from_json(col("json_str"), schema).alias("dados")) \
    .select("dados.*")

# Join com apostadores
df_valores = df_valores.join(apostadores, apostadores.id == df_valores.apostador_id, "inner")

# 🚨 Coluna 'suspeita' para apostas acima de R$12.000 e odd acima de 15
df_valores = df_valores.withColumn(
    "suspeita",
    when((col("valor") > 12000) & (col("odd") > 15), True).otherwise(False)
)

# Saída no console
query = df_valores.writeStream \
    .format("console") \
    .trigger(processingTime='1 minute') \
    .option("truncate", False) \
    .start()

query.awaitTermination()