In [1]:
import os, sys
from getpass import getpass

# Set before importing pyspark.pandas: pandas-on-Spark looks for this variable
# when it is imported and logs a warning if it is not already present.
os.environ["PYARROW_IGNORE_TIMEZONE"] = "1"

from pyspark.sql import SparkSession
from pyspark.sql import functions as F, Window
import pyspark.pandas as ps



In [2]:
# --- PostgreSQL connection -------------------------------------------------
PG_URL  = 'jdbc:postgresql://localhost:5432/graphs'
PG_USER = 'spark_ingest'
# The password is never stored in the notebook: it is taken from the PG_PASS
# environment variable, or asked for interactively when that variable is unset
# or empty. NOTE(review): the value that used to be committed here should be
# rotated, since it remains in the notebook history.
PG_PASS = os.environ.get("PG_PASS") or getpass(f"PostgreSQL password for {PG_USER}: ")

# --- Source and destination tables -----------------------------------------
PG_SCHEMA_IN = 'raw'
PG_SCHEMA_OUT = 'saml_d'
PG_TABLE_IN =  'saml_d'
PG_TABLE_OUT1 =  'accounts'
PG_TABLE_OUT2 =  'transferences'

# --- JDBC driver and tuning -------------------------------------------------
# Location of the PostgreSQL JDBC driver jar. Set the JDBC_JAR environment
# variable to use another path; the default is the original local install.
JDBC_JAR = os.environ.get(
    "JDBC_JAR",
    r"C:\spark\spark-4.0.1-bin-hadoop3\jars\postgresql-42.7.4.jar",
)
JDBC_BATCHSIZE = 10000  # passed as the JDBC "batchsize" option on writes
JDBC_FETCHSIZE = 10000  # passed as the JDBC "fetchsize" option on reads

In [3]:
# Session settings, registered on the builder in the order listed here.
_spark_settings = {
    # Make the PostgreSQL JDBC driver visible to the session, driver and executors.
    "spark.jars": JDBC_JAR,
    "spark.driver.extraClassPath": JDBC_JAR,
    "spark.executor.extraClassPath": JDBC_JAR,
    # ANSI mode is turned off so the pandas API can be used, as it does not support ANSI mode.
    "spark.sql.ansi.enabled": "false",
    # Run driver and worker Python with the interpreter that runs this kernel.
    "spark.pyspark.driver.python": sys.executable,
    "spark.pyspark.python": sys.executable,
    "spark.sql.execution.arrow.pyspark.enabled": "false",
    "spark.driver.bindAddress": "127.0.0.1",
}

_session_builder = SparkSession.builder.appName("ieee-fraud-jupyter")
for _setting, _setting_value in _spark_settings.items():
    _session_builder = _session_builder.config(_setting, _setting_value)

spark = _session_builder.getOrCreate()

In [4]:
# Options for the partitioned JDBC read of the raw table.
_jdbc_read_options = {
    "url": PG_URL,
    "dbtable": f"{PG_SCHEMA_IN}.{PG_TABLE_IN}",
    "user": PG_USER,
    "password": PG_PASS,
    "driver": "org.postgresql.Driver",
    # The read is split into 6 ranges of the `id` column. Per the Spark JDBC
    # documentation the bounds only set the partition stride; they do not
    # filter rows, so ids outside 1..10000000 are still read.
    "partitionColumn": "id",
    "lowerBound": "1",
    "upperBound": "10000000",
    "numPartitions": "6",
    "fetchsize": str(JDBC_FETCHSIZE),
}

df = spark.read.format("jdbc").options(**_jdbc_read_options).load()

In [5]:
# Sanity check of the load: print a five-row sample, then the inferred schema.
df.show(n=5)
df.printSchema()

+-------------------+----------+--------------+----------------+-------+----------------+-----------------+--------------------+----------------------+------------+-------------+-----------------+------+
|               time|      date|sender_account|receiver_account| amount|payment_currency|received_currency|sender_bank_location|receiver_bank_location|payment_type|is_laundering|  laundering_type|    id|
+-------------------+----------+--------------+----------------+-------+----------------+-----------------+--------------------+----------------------+------------+-------------+-----------------+------+
|2025-10-13 15:04:29|2023-02-03|    6974085996|       504652662|5569.77|     Swiss franc|        UK pounds|                  UK|                    UK|  Debit card|            0|   Normal_Fan_Out|352178|
|2025-10-13 22:43:06|2023-01-26|    1834030408|      8486490456|8540.22|       UK pounds|        UK pounds|                  UK|                    UK|      Cheque|            0|   Nor

In [6]:
def _account_locations(side):
    """Project one side of the transfers ("sender" or "receiver") to (account, location)."""
    return df.select(
        F.col(f"{side}_account").alias("account"),
        F.col(f"{side}_bank_location").alias("location"),
    )


# Every (account, location) observation from both ends of each transfer,
# keeping only rows where both values are present.
_has_account = F.col("account").isNotNull()
_has_location = F.col("location").isNotNull()
pairs = (
    _account_locations("sender")
    .unionByName(_account_locations("receiver"))
    .where(_has_account & _has_location)
)

# Number of times each account was seen with each location.
counts = pairs.groupBy("account", "location").count()

# Rank each account's locations: most frequent first, ties broken by location name.
w = Window.partitionBy("account").orderBy(F.desc("count"), F.asc("location"))

# Keep the top-ranked location, giving exactly one row per account.
_ranked = counts.withColumn("rn", F.row_number().over(w))
accounts = _ranked.where(F.col("rn") == 1).select("account", "location")


In [7]:
# Write the accounts table to PostgreSQL. With mode "overwrite" (or 'append')
# and truncate="true", an existing table is emptied and refilled rather than
# dropped and recreated.
_accounts_target = f"{PG_SCHEMA_OUT}.{PG_TABLE_OUT1}"
(
    accounts.write.format("jdbc")
    .mode("overwrite")
    .options(
        url=PG_URL,
        dbtable=_accounts_target,
        user=PG_USER,
        password=PG_PASS,
        driver="org.postgresql.Driver",
        batchsize=str(JDBC_BATCHSIZE),
        truncate="true",
    )
    .save()
)


In [10]:
# Build one timestamp per transfer from the calendar `date` plus the
# time-of-day part of `time`. In the sample output `time` carries a date that
# differs from `date`, so only its HH:mm:ss portion is used.
_time_of_day = F.date_format(F.col("time"), "HH:mm:ss")
_date_and_time = F.concat_ws(" ", F.col("date"), _time_of_day)
_datetime = F.to_timestamp(_date_and_time, "yyyy-MM-dd HH:mm:ss")

# Columns written under their own name, each forced to a simple JVM type
# (adjust this list to match the real columns).
_typed_columns = [
    ("sender_account", "long"),
    ("receiver_account", "long"),
    ("amount", "double"),
    ("payment_currency", "string"),
    ("received_currency", "string"),
    ("payment_type", "string"),
    ("is_laundering", "integer"),
    ("laundering_type", "string"),
]

# Output projection: id, the combined timestamp (renamed to date_time), then the typed columns.
cols_out = [
    F.col("id").cast("long").alias("id"),
    F.col("datetime").cast("timestamp").alias("date_time"),
] + [
    F.col(_name).cast(_dtype).alias(_name)
    for _name, _dtype in _typed_columns
]

# Cache the result and force it to be computed once in the JVM, so that
# nothing from pandas has to be recomputed afterwards.
transfers = df.withColumn("datetime", _datetime).select(*cols_out).persist()
_ = transfers.count()


In [11]:
# Write the transfers table to PostgreSQL. The "truncate" option is
# deliberately not set here, so per the Spark JDBC documentation mode
# "overwrite" drops and recreates the table.
# stringtype="unspecified" is passed through unchanged as a connection option.
_transfers_target = f"{PG_SCHEMA_OUT}.{PG_TABLE_OUT2}"
(
    transfers.write.format("jdbc")
    .mode("overwrite")
    .options(
        url=PG_URL,
        dbtable=_transfers_target,
        user=PG_USER,
        password=PG_PASS,
        driver="org.postgresql.Driver",
        stringtype="unspecified",
        batchsize=str(JDBC_BATCHSIZE),
    )
    .save()
)


In [17]:
from sqlalchemy import create_engine, text
from sqlalchemy.engine import URL

# Direct (non-Spark) connection used to run DDL against the output schema.
connection_url = URL.create(
    drivername='postgresql+psycopg2',
    username=PG_USER,
    password=PG_PASS,  
    host='localhost',
    port=5432,
    database='graphs',
    query={'sslmode': 'disable'},
)
engine = create_engine(connection_url)

# (table, primary-key column) for each output table.
_primary_keys = [
    (PG_TABLE_OUT1, "account"),
    (PG_TABLE_OUT2, "id"),
]

# Finds an existing primary-key constraint (contype = 'p') on a given table.
_has_primary_key = text(
    "SELECT 1 FROM pg_constraint "
    "WHERE conrelid = CAST(:tbl AS regclass) AND contype = 'p'"
)

# Add each primary key only when the table does not already have one, so this
# cell can be re-run. The accounts table is written with truncate="true" and
# therefore keeps its key between runs; an unconditional ADD CONSTRAINT would
# fail on the second run and roll back the whole transaction, leaving the
# other table without its key as well.
with engine.begin() as conn:
    for _table, _key_column in _primary_keys:
        _qualified = f"{PG_SCHEMA_OUT}.{_table}"
        _existing = conn.execute(_has_primary_key, {"tbl": _qualified}).first()
        if _existing is not None:
            continue
        conn.execute(text(
            f"ALTER TABLE {_qualified} "
            f"ADD CONSTRAINT {PG_SCHEMA_OUT}_{_table}_pkey PRIMARY KEY ({_key_column})"
        ))