In [1]:
import os
import warnings

from pyspark.sql import SparkSession
import pyspark.pandas as ps



In [2]:
# --- PostgreSQL connection ----------------------------------------------------
PG_URL  = 'jdbc:postgresql://localhost:5432/graphs'
PG_USER = 'spark_ingest'
# Security fix: the password used to be hard-coded in this cell. It is now read
# from the PG_PASS environment variable so that it never ends up in the
# notebook file or in version control.
# NOTE(review): the previously committed value should be treated as leaked and
# rotated on the database side.
PG_PASS = os.environ.get("PG_PASS", "")
if not PG_PASS:
    warnings.warn(
        "PG_PASS is not set; PostgreSQL connections made by this notebook are "
        "likely to fail authentication. Export PG_PASS before starting the kernel."
    )

# --- Source / target objects --------------------------------------------------
PG_SCHEMA_IN = 'raw'
PG_SCHEMA_OUT = 'saml_d'
PG_TABLE_IN =  'saml_d'
PG_TABLE_OUT1 =  'accounts'
PG_TABLE_OUT2 =  'transactions'

# --- JDBC driver and tuning ---------------------------------------------------
# Location of the PostgreSQL JDBC driver jar. The default is the original local
# Windows path; set the JDBC_JAR environment variable to run on another machine.
JDBC_JAR = os.environ.get(
    "JDBC_JAR",
    r"C:\spark\spark-4.0.1-bin-hadoop3\jars\postgresql-42.7.4.jar",
)
JDBC_BATCHSIZE = 10000  # passed as the JDBC "batchsize" option when writing
JDBC_FETCHSIZE = 10000  # passed as the JDBC "fetchsize" option when reading

In [3]:
# Spark session settings. The PostgreSQL JDBC jar is registered on the session
# and on both the driver and executor class paths. ANSI mode is switched off
# so that the pandas API can be used, since it does not support ANSI mode.
_spark_settings = {
    "spark.jars": JDBC_JAR,
    "spark.driver.extraClassPath": JDBC_JAR,
    "spark.executor.extraClassPath": JDBC_JAR,
    "spark.sql.ansi.enabled": "false",
}

_session_builder = SparkSession.builder.appName("ieee-fraud-jupyter")
for _setting, _setting_value in _spark_settings.items():
    _session_builder = _session_builder.config(_setting, _setting_value)

# Reuses an already running session if there is one, otherwise starts a new one.
spark = _session_builder.getOrCreate()

In [4]:
# Options for the partitioned JDBC read of the raw input table. The read is
# split into 6 partitions over the numeric `id` column; lowerBound/upperBound
# describe the id range used to compute the partition boundaries.
_jdbc_read_options = {
    "url": PG_URL,
    "dbtable": f"{PG_SCHEMA_IN}.{PG_TABLE_IN}",
    "user": PG_USER,
    "password": PG_PASS,
    "driver": "org.postgresql.Driver",
    "partitionColumn": "id",
    "lowerBound": "1",
    "upperBound": "10000000",
    "numPartitions": "6",
    "fetchsize": str(JDBC_FETCHSIZE),
}

# Spark DataFrame over the source table (evaluated lazily by Spark).
sdf = spark.read.format("jdbc").options(**_jdbc_read_options).load()

In [5]:
# Sanity check of the load: print the first 5 rows of the Spark DataFrame,
# then its column names and types.
sdf.show(5)
sdf.printSchema()

+-------------------+----------+--------------+----------------+---------+----------------+-----------------+--------------------+----------------------+------------+-------------+--------------------+------+
|               time|      date|sender_account|receiver_account|   amount|payment_currency|received_currency|sender_bank_location|receiver_bank_location|payment_type|is_laundering|     laundering_type|    id|
+-------------------+----------+--------------+----------------+---------+----------------+-----------------+--------------------+----------------------+------------+-------------+--------------------+------+
|2025-10-13 15:47:40|2022-12-29|     685305587|      1504939866|  8499.53|       UK pounds|        UK pounds|                  UK|                    UK|  Debit card|            0|       Normal_Fan_In|102350|
|2025-10-13 10:45:41|2023-01-27|    2169668400|      8270301079|   526.93|       UK pounds|     Mexican Peso|                  UK|                Mexico|Cross-borde

In [6]:
# Expose the Spark DataFrame through the pandas-on-Spark API, using the `id`
# column as the index.
df = sdf.pandas_api(index_col="id")

In [7]:
# Display the first rows of the pandas-on-Spark frame (indexed by `id`).
df.head()

Unnamed: 0_level_0,time,date,sender_account,receiver_account,amount,payment_currency,received_currency,sender_bank_location,receiver_bank_location,payment_type,is_laundering,laundering_type
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
112529,2025-10-13 10:46:32,2023-01-23,9958871364,2555658583,4912.0,UK pounds,Indian rupee,UK,India,Cross-border,0,Normal_Fan_Out
112530,2025-10-13 23:03:43,2023-01-31,1152776992,439915917,1225.17,UK pounds,UK pounds,UK,UK,Credit card,0,Normal_Small_Fan_Out
112531,2025-10-13 22:34:49,2023-01-04,33382572,4726164558,7005.31,UK pounds,UK pounds,UK,UK,ACH,0,Normal_Fan_Out
112532,2025-10-13 11:20:03,2023-01-15,2127996378,1083543454,3123.91,UK pounds,UK pounds,UK,UK,Credit card,0,Normal_Small_Fan_Out
112533,2025-10-13 17:58:45,2023-01-03,9949631050,4823784703,10552.03,UK pounds,UK pounds,UK,UK,ACH,0,Normal_Fan_In


In [8]:
# Columns describing the two accounts involved in a transfer and their banks' locations.
_account_columns = [
    'sender_account',
    'receiver_account',
    'sender_bank_location',
    'receiver_bank_location',
]

# Columns describing the transfer itself, including the laundering labels.
_transaction_columns = [
    'time',
    'date',
    'sender_account',
    'receiver_account',
    'amount',
    'payment_currency',
    'received_currency',
    'payment_type',
    'is_laundering',
    'laundering_type',
]

# Column projections of the pandas-on-Spark frame; the `id` index is kept.
accounts_df = df[_account_columns]
transact_df = df[_transaction_columns]

In [9]:
from pyspark.sql import functions as F, Window


def _account_locations(frame, role):
    """Project one side of each transfer as (account, location).

    `role` is the column prefix, "sender" or "receiver": the result holds
    `<role>_account` as `account` and `<role>_bank_location` as `location`.
    """
    return frame.select(
        F.col(f"{role}_account").alias("account"),
        F.col(f"{role}_bank_location").alias("location"),
    )


# Every (account, location) observation, from the sender side and from the
# receiver side, keeping only rows where both values are present.
pairs = (
    _account_locations(sdf, "sender")
    .unionByName(_account_locations(sdf, "receiver"))
    .where(F.col("account").isNotNull() & F.col("location").isNotNull())
)

# Number of times each account was observed with each location.
counts = pairs.groupBy("account", "location").count()

# Rank the locations of each account: most frequent first, ties broken by
# location name in ascending order so the choice is deterministic.
w = Window.partitionBy("account").orderBy(F.desc("count"), F.asc("location"))

# One row per account, carrying its top-ranked location.
_ranked_locations = counts.withColumn("rn", F.row_number().over(w))
accounts_dim = (
    _ranked_locations
    .where(F.col("rn") == 1)
    .select("account", "location")
)


In [13]:
# Options for writing the accounts dimension to PostgreSQL over JDBC.
# "truncate" is set so that overwrite mode empties the existing table rather
# than dropping and recreating it.
_jdbc_write_options = {
    "url": PG_URL,
    "dbtable": f"{PG_SCHEMA_OUT}.{PG_TABLE_OUT1}",
    "user": PG_USER,
    "password": PG_PASS,
    "driver": "org.postgresql.Driver",
    "batchsize": str(JDBC_BATCHSIZE),
    "truncate": "true",
}

(
    accounts_dim.write
    .format("jdbc")
    .options(**_jdbc_write_options)
    .mode("overwrite")  # or 'append'
    .save()
)


In [15]:
from urllib.parse import urlsplit

from sqlalchemy import create_engine, text
from sqlalchemy.engine import URL

# Derive host, port and database from PG_URL (without its "jdbc:" prefix) so
# this connection always targets the same server as the Spark JDBC cells.
# The previous literal values are kept as fallbacks.
_pg_location = urlsplit(PG_URL.split("jdbc:", 1)[-1])

connection_url = URL.create(
    drivername='postgresql+psycopg2',
    username=PG_USER,
    password=PG_PASS,
    host=_pg_location.hostname or 'localhost',
    port=_pg_location.port or 5432,
    database=_pg_location.path.lstrip('/') or 'graphs',
    query={'sslmode': 'disable'},
)
engine = create_engine(connection_url)

# Build the table and constraint names from the configuration constants so
# they cannot drift from the table the write cell targets.
_accounts_table = f"{PG_SCHEMA_OUT}.{PG_TABLE_OUT1}"
_accounts_pkey = f"{PG_SCHEMA_OUT}_{PG_TABLE_OUT1}_pkey"

# Add the primary key on `account` only when the table has none yet. The
# accounts table is written in overwrite mode with truncate enabled, which
# keeps the table and its constraints, so an unconditional ADD CONSTRAINT
# would fail whenever the notebook is run a second time.
with engine.begin() as conn:
    _has_primary_key = conn.execute(
        text(
            "SELECT 1 FROM pg_constraint "
            "WHERE contype = 'p' AND conrelid = to_regclass(:table_name)"
        ),
        {"table_name": _accounts_table},
    ).first() is not None

    if not _has_primary_key:
        # Identifiers cannot be bound as parameters; both names come from the
        # notebook's own configuration constants, not from user input.
        conn.execute(text(
            f"ALTER TABLE {_accounts_table} "
            f"ADD CONSTRAINT {_accounts_pkey} PRIMARY KEY (account)"
        ))