In [1]:
import sys, os
os.environ["HADOOP_HOME"] = r"C:\hadoop"
os.environ["PATH"] = r"C:\hadoop\bin;" + os.environ.get("PATH", "")
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.window import Window
import pyspark.pandas as ps
from pathlib import Path
from py2neo import Graph
import numpy as np
import matplotlib.pyplot as plt



# Eda "clásico" (tabular)

## Conversión a parquet

Este análisis es más fácil de realizar sobre PostgreSQL directamente. Así me puedo aprovechar de las ventajas de los DataFrames de Spark y su pacerido a Pandas.

In [2]:
PG_URL  = 'jdbc:postgresql://localhost:5432/graphs'
PG_USER = 'spark_ingest'
PG_PASS = 'GYleZAI2pTBKJYl9W1PL'
PG_SCHEMA = 'saml_d'
PG_TABLE1 =  'accounts'
PG_TABLE2 =  'transferences'
PG_TABLE3 =  'statements'
JDBC_JAR = r"C:\spark\spark-4.0.1-bin-hadoop3\jars\postgresql-42.7.4.jar"  
JDBC_BATCHSIZE = 10000
JDBC_FETCHSIZE = 10000
PYTHON = sys.executable  # python del kernel Jupyter
PARQUET_BASE_PATH = Path("./data/parquet")
PARQUET_ACCOUNTS = str(PARQUET_BASE_PATH / "accounts.parquet")
PARQUET_TRANSFERENCES = str(PARQUET_BASE_PATH / "transferences.parquet")
PARQUET_STATEMENTS = str(PARQUET_BASE_PATH / "statements.parquet")
PARQUET_BASE_PATH.mkdir(parents=True, exist_ok=True)

In [3]:
spark = (
    SparkSession.builder
    .appName("ieee-fraud-jupyter")
    .config("spark.jars", JDBC_JAR)
    .config("spark.driver.extraClassPath", JDBC_JAR)
    .config("spark.executor.extraClassPath", JDBC_JAR)
    .config("spark.sql.ansi.enabled", "false") #Para poder usar la API de pandas pues no soporta modo ansi
    .config("spark.pyspark.driver.python", PYTHON)
    .config("spark.pyspark.python", PYTHON)
    .config("spark.executorEnv.PYSPARK_PYTHON", PYTHON)
    .config("spark.sql.execution.arrow.pyspark.enabled", "false")
    .config("spark.driver.bindAddress", "127.0.0.1")
    .config("spark.driver.memory", "4g")
    .config("spark.executor.memory", "4g")
    .config("spark.sql.execution.arrow.pyspark.enabled", "true")  # Opcional: mejora performance
    .config("spark.sql.parquet.compression.codec", "snappy")  # Compresión balanceada
    .config("spark.sql.parquet.writeLegacyFormat", "false")   # Formato moderno
    .config("spark.sql.parquet.int96RebaseModeInWrite", "CORRECTED")  # Para timestamps
    .getOrCreate()
)

In [4]:
def postgres_to_parquet():
    """
    Lee datos desde PostgreSQL y los guarda como Parquet localmente.
    Solo ejecutar UNA VEZ para crear los archivos Parquet.
    """
    
    print("\n" + "="*70)
    print("PASO 1: LEYENDO ACCOUNTS DESDE POSTGRESQL")
    print("="*70)
    
    # ACCOUNTS: Corregido el dbtable duplicado
    accounts_df = (
        spark.read.format("jdbc")
        .option("url", PG_URL)
        .option("user", PG_USER)
        .option("password", PG_PASS)
        .option("driver", "org.postgresql.Driver")
        .option("dbtable", f"(SELECT account, location FROM {PG_SCHEMA}.{PG_TABLE1}) t")
        .option("partitionColumn", "account")
        .option("lowerBound", "1")
        .option("upperBound", "10000000")
        .option("numPartitions", "4")
        .option("fetchsize", str(JDBC_FETCHSIZE))
        .load()
    )
    
    count_accounts = accounts_df.count()
    print(f"✓ Leídas {count_accounts:,} cuentas")
    
    # Guardar accounts como Parquet
    print(f"\nGuardando en: {PARQUET_ACCOUNTS}")
    accounts_df.write.mode("overwrite").parquet(PARQUET_ACCOUNTS)
    print("✓ Accounts guardado exitosamente")
    
    print("\n" + "="*70)
    print("PASO 2: LEYENDO TRANSFERENCES DESDE POSTGRESQL")
    print("="*70)
    
    # TRANSFERENCES: Corregido según tu schema real
    tx_df = (
        spark.read.format("jdbc")
        .option("url", PG_URL)
        .option("user", PG_USER)
        .option("password", PG_PASS)
        .option("driver", "org.postgresql.Driver")
        .option("dbtable", f"""(
            SELECT 
                id,
                date_time as ts,
                sender_account,
                receiver_account,
                amount,
                payment_currency,
                received_currency,
                payment_type,
                is_laundering,
                laundering_type
            FROM {PG_SCHEMA}.{PG_TABLE2}
        ) t""")
        .option("partitionColumn", "id")
        .option("lowerBound", "1")
        .option("upperBound", "10000000")
        .option("numPartitions", "8")  # Más particiones para 9.5M registros
        .option("fetchsize", str(JDBC_FETCHSIZE))
        .load()
    )
    
    count_tx = tx_df.count()
    print(f"✓ Leídas {count_tx:,} transacciones")
    
    # Agregar columnas de fecha/hora útiles para análisis
    print("\nAgregando columnas derivadas de timestamp...")
    tx_enriched = (
        tx_df
        .withColumn("ts", F.col("ts").cast("timestamp"))
        .withColumn("event_date", F.to_date("ts"))
        .withColumn("event_hour", F.hour("ts").cast("int"))  # 0-23 como int
        .withColumn("event_week", F.date_trunc("week", "ts"))
        .withColumn("event_month", F.date_trunc("month", "ts"))
        .withColumn("event_year", F.year("ts"))
    )
    
    print("✓ Columnas agregadas: event_date, event_hour, event_week, event_month, event_year")
    
    # Guardar transferences con PARTICIONAMIENTO por mes
    # Esto acelera queries que filtran por fecha
    print(f"\nGuardando en: {PARQUET_TRANSFERENCES}")
    print("Particionando por event_year y event_month (esto puede tomar unos minutos)...")
    
    tx_enriched.write \
        .mode("overwrite") \
        .partitionBy("event_year", "event_month") \
        .parquet(PARQUET_TRANSFERENCES)
    
    print("✓ Transferences guardado exitosamente con particionamiento")
    
    print("\n" + "="*70)
    print("CONVERSIÓN COMPLETADA")
    print("="*70)
    print(f"\nArchivos Parquet creados:")
    print(f"  • Accounts:      {PARQUET_ACCOUNTS}")
    print(f"  • Transferences: {PARQUET_TRANSFERENCES}")
    print(f"\nEstadísticas:")
    print(f"  • Total cuentas:       {count_accounts:,}")
    print(f"  • Total transacciones: {count_tx:,}")

In [5]:
def load_from_parquet():
    """
    Carga datos desde Parquet. Usar esto en tus análisis diarios.
    MUY rápido comparado con leer desde PostgreSQL.
    """
    print("\nCargando datos desde Parquet...")
    
    accounts_df = spark.read.parquet(PARQUET_ACCOUNTS)
    tx_df = spark.read.parquet(PARQUET_TRANSFERENCES)
    
    print(f"✓ Accounts cargado: {accounts_df.count():,} registros")
    print(f"✓ Transferences cargado: {tx_df.count():,} registros")
    
    return accounts_df, tx_df

In [6]:
def load_transferences_filtered(start_date=None, end_date=None, 
                               columns=None, only_laundering=False):
    """
    Carga transacciones con filtros - aprovecha el particionamiento y 
    la lectura columnar de Parquet.
    
    Args:
        start_date (str): Fecha inicio formato 'YYYY-MM-DD'
        end_date (str): Fecha fin formato 'YYYY-MM-DD'
        columns (list): Lista de columnas a leer (None = todas)
        only_laundering (bool): Solo transacciones de lavado
    
    Returns:
        DataFrame filtrado
    """
    # Leer solo columnas necesarias (¡aquí está la magia de Parquet!)
    if columns:
        df = spark.read.parquet(PARQUET_TRANSFERENCES).select(columns)
    else:
        df = spark.read.parquet(PARQUET_TRANSFERENCES)
    
    # Aplicar filtros (Parquet hace predicate pushdown)
    if start_date:
        df = df.filter(F.col("event_date") >= start_date)
    if end_date:
        df = df.filter(F.col("event_date") <= end_date)
    if only_laundering:
        df = df.filter(F.col("is_laundering") == 1)
    
    return df

In [7]:
postgres_to_parquet()


PASO 1: LEYENDO ACCOUNTS DESDE POSTGRESQL
✓ Leídas 855,460 cuentas

Guardando en: data\parquet\accounts.parquet


Py4JJavaError: An error occurred while calling o71.parquet.
: java.lang.UnsatisfiedLinkError: 'boolean org.apache.hadoop.io.nativeio.NativeIO$Windows.access0(java.lang.String, int)'
	at org.apache.hadoop.io.nativeio.NativeIO$Windows.access0(Native Method)
	at org.apache.hadoop.io.nativeio.NativeIO$Windows.access(NativeIO.java:817)
	at org.apache.hadoop.fs.FileUtil.canRead(FileUtil.java:1415)
	at org.apache.hadoop.fs.FileUtil.list(FileUtil.java:1620)
	at org.apache.hadoop.fs.RawLocalFileSystem.listStatus(RawLocalFileSystem.java:739)
	at org.apache.hadoop.fs.FileSystem.listStatus(FileSystem.java:2078)
	at org.apache.hadoop.fs.FileSystem.listStatus(FileSystem.java:2122)
	at org.apache.hadoop.fs.ChecksumFileSystem.listStatus(ChecksumFileSystem.java:961)
	at org.apache.hadoop.fs.FileSystem.listStatus(FileSystem.java:2078)
	at org.apache.hadoop.fs.FileSystem.listStatus(FileSystem.java:2122)
	at org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter.getAllCommittedTaskPaths(FileOutputCommitter.java:334)
	at org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter.commitJobInternal(FileOutputCommitter.java:404)
	at org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter.commitJob(FileOutputCommitter.java:377)
	at org.apache.parquet.hadoop.ParquetOutputCommitter.commitJob(ParquetOutputCommitter.java:46)
	at org.apache.spark.internal.io.HadoopMapReduceCommitProtocol.commitJob(HadoopMapReduceCommitProtocol.scala:194)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.$anonfun$writeAndCommit$3(FileFormatWriter.scala:275)
	at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.scala:18)
	at org.apache.spark.util.Utils$.timeTakenMs(Utils.scala:481)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.writeAndCommit(FileFormatWriter.scala:275)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.executeWrite(FileFormatWriter.scala:306)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.write(FileFormatWriter.scala:189)
	at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelationCommand.run(InsertIntoHadoopFsRelationCommand.scala:195)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult$lzycompute(commands.scala:117)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult(commands.scala:115)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.executeCollect(commands.scala:129)
	at org.apache.spark.sql.execution.QueryExecution.$anonfun$eagerlyExecuteCommands$2(QueryExecution.scala:155)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId0$8(SQLExecution.scala:163)
	at org.apache.spark.sql.execution.SQLExecution$.withSessionTagsApplied(SQLExecution.scala:272)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId0$7(SQLExecution.scala:125)
	at org.apache.spark.JobArtifactSet$.withActiveJobArtifactState(JobArtifactSet.scala:94)
	at org.apache.spark.sql.artifact.ArtifactManager.$anonfun$withResources$1(ArtifactManager.scala:112)
	at org.apache.spark.sql.artifact.ArtifactManager.withClassLoaderIfNeeded(ArtifactManager.scala:106)
	at org.apache.spark.sql.artifact.ArtifactManager.withResources(ArtifactManager.scala:111)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId0$6(SQLExecution.scala:125)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:295)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId0$1(SQLExecution.scala:124)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:804)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId0(SQLExecution.scala:78)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:237)
	at org.apache.spark.sql.execution.QueryExecution.$anonfun$eagerlyExecuteCommands$1(QueryExecution.scala:155)
	at org.apache.spark.sql.execution.QueryExecution$.withInternalError(QueryExecution.scala:654)
	at org.apache.spark.sql.execution.QueryExecution.org$apache$spark$sql$execution$QueryExecution$$eagerlyExecute$1(QueryExecution.scala:154)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$3.applyOrElse(QueryExecution.scala:169)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$3.applyOrElse(QueryExecution.scala:164)
	at org.apache.spark.sql.catalyst.trees.TreeNode.$anonfun$transformDownWithPruning$1(TreeNode.scala:470)
	at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(origin.scala:86)
	at org.apache.spark.sql.catalyst.trees.TreeNode.transformDownWithPruning(TreeNode.scala:470)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.org$apache$spark$sql$catalyst$plans$logical$AnalysisHelper$$super$transformDownWithPruning(LogicalPlan.scala:37)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning(AnalysisHelper.scala:360)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning$(AnalysisHelper.scala:356)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:37)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:37)
	at org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:446)
	at org.apache.spark.sql.execution.QueryExecution.eagerlyExecuteCommands(QueryExecution.scala:164)
	at org.apache.spark.sql.execution.QueryExecution.$anonfun$lazyCommandExecuted$1(QueryExecution.scala:126)
	at scala.util.Try$.apply(Try.scala:217)
	at org.apache.spark.util.Utils$.doTryWithCallerStacktrace(Utils.scala:1378)
	at org.apache.spark.util.LazyTry.tryT$lzycompute(LazyTry.scala:46)
	at org.apache.spark.util.LazyTry.tryT(LazyTry.scala:46)
	at org.apache.spark.util.LazyTry.get(LazyTry.scala:58)
	at org.apache.spark.sql.execution.QueryExecution.commandExecuted(QueryExecution.scala:131)
	at org.apache.spark.sql.execution.QueryExecution.assertCommandExecuted(QueryExecution.scala:192)
	at org.apache.spark.sql.classic.DataFrameWriter.runCommand(DataFrameWriter.scala:622)
	at org.apache.spark.sql.classic.DataFrameWriter.saveToV1Source(DataFrameWriter.scala:273)
	at org.apache.spark.sql.classic.DataFrameWriter.saveInternal(DataFrameWriter.scala:241)
	at org.apache.spark.sql.classic.DataFrameWriter.save(DataFrameWriter.scala:118)
	at org.apache.spark.sql.DataFrameWriter.parquet(DataFrameWriter.scala:369)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:75)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:52)
	at java.base/java.lang.reflect.Method.invoke(Method.java:580)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:184)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:108)
	at java.base/java.lang.Thread.run(Thread.java:1583)


In [14]:
spark.version
spark._jsc.hadoopConfiguration().get("fs.azure", "")  # solo para forzar el JVM; ignora el valor
print(spark._jvm.org.apache.hadoop.util.VersionInfo.getVersion())


3.4.1


In [4]:
accounts_df = (
    spark.read.format("jdbc")
    .option("url", PG_URL)
    .option("dbtable", f"{PG_SCHEMA}.{PG_TABLE1}")  
    .option("user", PG_USER)
    .option("password", PG_PASS)
    .option("driver", "org.postgresql.Driver")
    .option("dbtable",f"(SELECT account FROM {PG_SCHEMA}.{PG_TABLE1}) t")
    .option("partitionColumn", "account")
    .option("lowerBound", "1")
    .option("upperBound", "10000000")
    .option("numPartitions", "2")
    .option("fetchsize", str(JDBC_FETCHSIZE))
    .load()
)

tx_df = (
    spark.read.format("jdbc")
    .option("url", PG_URL)
    .option("dbtable", f"{PG_SCHEMA}.{PG_TABLE2}")  
    .option("user", PG_USER)
    .option("password", PG_PASS)
    .option("driver", "org.postgresql.Driver")
    .option("partitionColumn", "id")
    .option(
      "dbtable",
      f"""
      (SELECT id, sender_account AS A, receiver_account AS B,
              amount, currency, ts, is_fraud
       FROM {PG_SCHEMA}.{PG_TABLE2}) t
      """
    )
    .option("lowerBound", "1")
    .option("upperBound", "10000000")
    .option("numPartitions", "6")
    .option("fetchsize", str(JDBC_FETCHSIZE))
    .load()
)

In [15]:
# Convierte a timestamp y crea particiones jerárquicas
tx = (
  tx_df
   .withColumn("ts", F.col("ts").cast("timestamp"))
   .withColumn("event_date",  F.to_date("ts"))                   # yyyy-mm-dd
   .withColumn("event_hour",  F.date_format("ts","HH"))          # 00..23 (string)
   .withColumn("event_week",  F.date_trunc("week","ts"))         # lunes 00:00 UTC
)

{"ts": "2025-10-20 09:22:02.147", "level": "ERROR", "logger": "DataFrameQueryContextLogger", "msg": "[UNRESOLVED_COLUMN.WITH_SUGGESTION] A column, variable, or function parameter with name `ts` cannot be resolved. Did you mean one of the following? [`id`, `amount`, `date_time`, `payment_type`, `is_laundering`]. SQLSTATE: 42703", "context": {"file": "line 4 in cell [15]", "line": "", "fragment": "col", "errorClass": "UNRESOLVED_COLUMN.WITH_SUGGESTION"}, "exception": {"class": "Py4JJavaError", "msg": "An error occurred while calling o72.withColumn.\n: org.apache.spark.sql.AnalysisException: [UNRESOLVED_COLUMN.WITH_SUGGESTION] A column, variable, or function parameter with name `ts` cannot be resolved. Did you mean one of the following? [`id`, `amount`, `date_time`, `payment_type`, `is_laundering`]. SQLSTATE: 42703;\n'Project [id#2L, date_time#3, sender_account#4L, receiver_account#5L, amount#6, payment_currency#7, received_currency#8, payment_type#9, is_laundering#10, laundering_type#11,

AnalysisException: [UNRESOLVED_COLUMN.WITH_SUGGESTION] A column, variable, or function parameter with name `ts` cannot be resolved. Did you mean one of the following? [`id`, `amount`, `date_time`, `payment_type`, `is_laundering`]. SQLSTATE: 42703;
'Project [id#2L, date_time#3, sender_account#4L, receiver_account#5L, amount#6, payment_currency#7, received_currency#8, payment_type#9, is_laundering#10, laundering_type#11, cast('ts as timestamp) AS ts#320]
+- Relation [id#2L,date_time#3,sender_account#4L,receiver_account#5L,amount#6,payment_currency#7,received_currency#8,payment_type#9,is_laundering#10,laundering_type#11] JDBCRelation(saml_d.transferences) [numPartitions=6]


In [None]:
(
  tx.repartition(200, "event_date")         # reparte por la partición de salida
    .sortWithinPartitions("A","B","ts")     # ayuda con lecturas por dyad
    .write.mode("overwrite")
    .partitionBy("event_date")              # ¡clave!
    .option("compression","snappy")
    .option("maxRecordsPerFile", 1_000_000) # archivos ~grandes (menos metadatos)
    .parquet("/data/tx_raw")
)

# (
#   tx.repartition(200, "event_hour")         # reparte por la partición de salida
#     .sortWithinPartitions("A","B","ts")     # ayuda con lecturas por dyad
#     .write.mode("overwrite")
#     .partitionBy("event_hour")              # ¡clave!
#     .option("compression","snappy")
#     .option("maxRecordsPerFile", 1_000_000) # archivos ~grandes (menos metadatos)
#     .parquet("/data/tx_agg_1h")
# )

# (
#   tx.repartition(200, "event_week")         # reparte por la partición de salida
#     .sortWithinPartitions("A","B","ts")     # ayuda con lecturas por dyad
#     .write.mode("overwrite")
#     .partitionBy("event_week")              # ¡clave!
#     .option("compression","snappy")
#     .option("maxRecordsPerFile", 1_000_000) # archivos ~grandes (menos metadatos)
#     .parquet("/data/tx_agg_w")
# )

# Cargas incrementales (p. ej. por día):
# tx.filter("event_date = '2023-07-01'").write.mode("overwrite").partitionBy("event_date").parquet(...)


In [None]:
# Definimos ventanas fijas con la función window()
def agg_by_window(df, window_spec, path_out):
    g = (df
         .groupBy(
             "A","B",
             F.window(F.col("ts"), window_spec).alias("w")
          )
         .agg(
             F.count("*").alias("cnt"),
             F.sum("amount").alias("sum_amt"),
             F.max("amount").alias("max_amt"),
             F.mean("amount").alias("mean_amt"),
             F.expr("percentile_approx(amount, 0.9)").alias("p90_amt"),
             F.max(F.col("is_fraud").cast("int")).alias("has_fraud")
          )
         .withColumn("win_start", F.col("w.start"))
         .withColumn("win_end",   F.col("w.end"))
         .drop("w")
         .withColumn("event_date", F.to_date("win_start"))
        )

    (g.repartition(200, "event_date")
       .write.mode("overwrite")
       .partitionBy("event_date")
       .option("compression","snappy")
       .parquet(path_out)
    )

# Ejecuta para 1h, 6h, 1d, 7d
agg_by_window(tx, "1 hour", "/data/tx_agg_1h")
agg_by_window(tx, "6 hours", "/data/tx_agg_6h")
agg_by_window(tx, "1 day", "/data/tx_agg_1d")
agg_by_window(tx, "7 days", "/data/tx_agg_7d")


## Descripción dataset

In [5]:
accounts_dfps = accounts_df.pandas_api()
accounts_dfps.head()

Unnamed: 0,account,location
0,153883,UK
1,155774,UK
2,285416,UK
3,348111,UK
4,438458,UK


In [6]:
accounts_dfps.dtypes

account      int64
location    object
dtype: object

In [7]:
len(accounts_dfps)

855460

In [8]:
tx_dfps = tx_df.pandas_api()
tx_dfps.head()

Unnamed: 0,id,date_time,sender_account,receiver_account,amount,payment_currency,received_currency,payment_type,is_laundering,laundering_type
0,1102611,2023-02-02 18:46:36,9555376618,634808998,5994.24,UK pounds,UK pounds,Cheque,0,Normal_Small_Fan_Out
1,1102612,2023-02-04 22:56:18,4061082122,9328386398,8170.33,UK pounds,UK pounds,Debit card,0,Normal_Fan_Out
2,1102613,2023-01-19 08:26:14,6783007914,5817209694,2191.82,UK pounds,UK pounds,ACH,0,Normal_Small_Fan_Out
3,1102614,2023-01-28 18:45:34,1441864356,1809883233,568.49,UK pounds,UK pounds,ACH,0,Normal_Small_Fan_Out
4,1102615,2023-01-07 15:54:41,2039169094,5711985048,8542.31,UK pounds,UK pounds,Credit card,0,Normal_Small_Fan_Out


In [9]:
len(tx_dfps)

9504852

In [10]:
tx_dfps.dtypes

id                            int64
date_time            datetime64[ns]
sender_account                int64
receiver_account              int64
amount                      float64
payment_currency             object
received_currency            object
payment_type                 object
is_laundering                 int32
laundering_type              object
dtype: object

## Análisis univariado

In [12]:
accounts_df.select(F.countDistinct("location").alias("countries")).show()

+---------+
|countries|
+---------+
|       18|
+---------+



In [13]:
accounts_df.groupBy("location").count().orderBy(F.desc("count")).show(50, truncate=False)

+-----------+------+
|location   |count |
+-----------+------+
|UK         |803407|
|Mexico     |3149  |
|Nigeria    |3111  |
|Pakistan   |3100  |
|Japan      |3095  |
|Albania    |3086  |
|Turkey     |3072  |
|Austria    |3072  |
|Spain      |3066  |
|India      |3057  |
|USA        |3056  |
|Switzerland|3053  |
|Germany    |3051  |
|UAE        |3042  |
|France     |3038  |
|Morocco    |3031  |
|Netherlands|3006  |
|Italy      |2968  |
+-----------+------+



In [14]:
tx_df.select("amount", "is_laundering").summary("count","mean","stddev","min","25%","50%","75%","max").show()

+-------+------------------+--------------------+
|summary|            amount|       is_laundering|
+-------+------------------+--------------------+
|  count|           9504852|             9504852|
|   mean| 8762.967600927219|0.001038732638866...|
| stddev|25614.952999598154| 0.03221263389639035|
|    min|              3.73|                   0|
|    25%|           2143.74|                   0|
|    50%|           6113.63|                   0|
|    75%|          10457.93|                   0|
|    max|      1.26184984E7|                   1|
+-------+------------------+--------------------+



In [12]:
from pyspark.mllib.stat import KernelDensity  

# ~5% aleatorio sin reemplazo
sample = tx_df.sample(withReplacement=False, fraction=0.01, seed=42)

sdf = sample.selectExpr("CAST(amount AS DOUBLE) amount").where("amount IS NOT NULL")

# banda (Silverman)
stats = sdf.agg(F.count("*").alias("n"), F.stddev("amount").alias("sd")).first()
n, sd = stats.n, float(stats.sd) if stats.sd is not None else 1.0
bw = 1.06 * sd * (n ** (-1/5)) if n and sd else 1.0

q_lo, q_hi = sdf.approxQuantile("amount", [0.001, 0.999], 1e-3)
xs = np.linspace(q_lo, q_hi, 256)

rdd = sdf.rdd.map(lambda r: float(r[0]))

kd = KernelDensity()
kd.setSample(rdd)
kd.setBandwidth(bw)
ys = kd.estimate(xs)

# Graficar en el driver
plt.plot(xs, ys)
plt.xlabel("amount")
plt.ylabel("density")
plt.title("KDE de amount en Spark")
plt.show()


Py4JJavaError: An error occurred while calling o354.estimateKernelDensity.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 5 in stage 29.0 failed 1 times, most recent failure: Lost task 5.0 in stage 29.0 (TID 60) (DESKTOP-453KJ4K executor driver): org.apache.spark.SparkException: Python worker failed to connect back.
	at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:252)
	at org.apache.spark.api.python.PythonWorkerFactory.create(PythonWorkerFactory.scala:143)
	at org.apache.spark.SparkEnv.createPythonWorker(SparkEnv.scala:158)
	at org.apache.spark.SparkEnv.createPythonWorker(SparkEnv.scala:178)
	at org.apache.spark.api.python.BasePythonRunner.compute(PythonRunner.scala:261)
	at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:70)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:374)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:338)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:374)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:338)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:171)
	at org.apache.spark.scheduler.Task.run(Task.scala:147)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$5(Executor.scala:647)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:80)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:77)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:99)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:650)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1144)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:642)
	at java.base/java.lang.Thread.run(Thread.java:1583)
Caused by: java.net.SocketTimeoutException: Timed out while waiting for the Python worker to connect back
	at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:234)
	... 21 more

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$3(DAGScheduler.scala:2935)
	at scala.Option.getOrElse(Option.scala:201)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2935)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2927)
	at scala.collection.immutable.List.foreach(List.scala:334)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2927)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1295)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1295)
	at scala.Option.foreach(Option.scala:437)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1295)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:3207)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:3141)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:3130)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:50)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:1009)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2484)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2579)
	at org.apache.spark.rdd.RDD.$anonfun$aggregate$1(RDD.scala:1236)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:417)
	at org.apache.spark.rdd.RDD.aggregate(RDD.scala:1230)
	at org.apache.spark.mllib.stat.KernelDensity.estimate(KernelDensity.scala:91)
	at org.apache.spark.mllib.api.python.PythonMLLibAPI.estimateKernelDensity(PythonMLLibAPI.scala:1078)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:75)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:52)
	at java.base/java.lang.reflect.Method.invoke(Method.java:580)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:184)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:108)
	at java.base/java.lang.Thread.run(Thread.java:1583)
Caused by: org.apache.spark.SparkException: Python worker failed to connect back.
	at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:252)
	at org.apache.spark.api.python.PythonWorkerFactory.create(PythonWorkerFactory.scala:143)
	at org.apache.spark.SparkEnv.createPythonWorker(SparkEnv.scala:158)
	at org.apache.spark.SparkEnv.createPythonWorker(SparkEnv.scala:178)
	at org.apache.spark.api.python.BasePythonRunner.compute(PythonRunner.scala:261)
	at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:70)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:374)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:338)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:374)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:338)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:171)
	at org.apache.spark.scheduler.Task.run(Task.scala:147)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$5(Executor.scala:647)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:80)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:77)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:99)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:650)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1144)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:642)
	... 1 more
Caused by: java.net.SocketTimeoutException: Timed out while waiting for the Python worker to connect back
	at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:234)
	... 21 more


In [12]:
import numpy as np
from pyspark.sql import functions as F

sdf = tx_df.selectExpr("CAST(amount AS DOUBLE) amount").where("amount IS NOT NULL")
q_lo, q_hi = sdf.approxQuantile("amount", [0.001, 0.999], 1e-3)

# Construye N bins uniformes entre quantiles
N = 200
edges = np.linspace(q_lo, q_hi, N + 1)

# Usa el histograma de Spark sobre el RDD (implementación en JVM)
counts, bins = sdf.select("amount").rdd.flatMap(lambda r: [r[0]]).histogram(list(edges))

# Densidad (normaliza por ancho de bin y total)
widths = np.diff(bins)
area = float(sum(counts) * np.mean(widths))  # aproximación
density = np.array(counts) / (np.sum(counts) * widths)

# Suaviza en el driver con una convolución gaussiana
def gaussian_smooth(y, sigma_bins=2.0, k=15):
    x = np.arange(-k, k+1)
    g = np.exp(-(x**2)/(2*sigma_bins**2))
    g /= g.sum()
    return np.convolve(y, g, mode="same")

y_smooth = gaussian_smooth(density, sigma_bins=2.5)

x_centers = 0.5 * (bins[:-1] + bins[1:])

# Graficar (matplotlib)
plt.plot(x_centers, y_smooth); plt.xlabel("amount"); plt.ylabel("density"); plt.show()




Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.collectAndServe.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 3 in stage 26.0 failed 1 times, most recent failure: Lost task 3.0 in stage 26.0 (TID 51) (DESKTOP-453KJ4K executor driver): org.apache.spark.SparkException: Python worker failed to connect back.
	at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:252)
	at org.apache.spark.api.python.PythonWorkerFactory.create(PythonWorkerFactory.scala:143)
	at org.apache.spark.SparkEnv.createPythonWorker(SparkEnv.scala:158)
	at org.apache.spark.SparkEnv.createPythonWorker(SparkEnv.scala:178)
	at org.apache.spark.api.python.BasePythonRunner.compute(PythonRunner.scala:261)
	at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:70)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:374)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:338)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:171)
	at org.apache.spark.scheduler.Task.run(Task.scala:147)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$5(Executor.scala:647)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:80)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:77)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:99)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:650)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1144)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:642)
	at java.base/java.lang.Thread.run(Thread.java:1583)
Caused by: java.net.SocketTimeoutException: Timed out while waiting for the Python worker to connect back
	at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:234)
	... 18 more

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$3(DAGScheduler.scala:2935)
	at scala.Option.getOrElse(Option.scala:201)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2935)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2927)
	at scala.collection.immutable.List.foreach(List.scala:334)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2927)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1295)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1295)
	at scala.Option.foreach(Option.scala:437)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1295)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:3207)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:3141)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:3130)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:50)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:1009)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2484)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2505)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2524)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2549)
	at org.apache.spark.rdd.RDD.$anonfun$collect$1(RDD.scala:1057)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:417)
	at org.apache.spark.rdd.RDD.collect(RDD.scala:1056)
	at org.apache.spark.api.python.PythonRDD$.collectAndServe(PythonRDD.scala:203)
	at org.apache.spark.api.python.PythonRDD.collectAndServe(PythonRDD.scala)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:75)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:52)
	at java.base/java.lang.reflect.Method.invoke(Method.java:580)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:184)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:108)
	at java.base/java.lang.Thread.run(Thread.java:1583)
Caused by: org.apache.spark.SparkException: Python worker failed to connect back.
	at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:252)
	at org.apache.spark.api.python.PythonWorkerFactory.create(PythonWorkerFactory.scala:143)
	at org.apache.spark.SparkEnv.createPythonWorker(SparkEnv.scala:158)
	at org.apache.spark.SparkEnv.createPythonWorker(SparkEnv.scala:178)
	at org.apache.spark.api.python.BasePythonRunner.compute(PythonRunner.scala:261)
	at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:70)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:374)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:338)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:171)
	at org.apache.spark.scheduler.Task.run(Task.scala:147)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$5(Executor.scala:647)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:80)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:77)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:99)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:650)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1144)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:642)
	... 1 more
Caused by: java.net.SocketTimeoutException: Timed out while waiting for the Python worker to connect back
	at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:234)
	... 18 more


In [None]:
probs = [i/1000 for i in range(1001)]
qs = sdf.approxQuantile("amount", probs, 1e-3)

plt.plot(qs, probs); plt.xlabel("amount"); plt.ylabel("ECDF"); plt.show()
sdf = tx_df.where("amount > 0").selectExpr("LOG(amount) AS amount")
# y reutilizas cualquiera de los métodos anteriores

In [None]:
tx_df.groupBy("sender_account").count().orderBy(F.desc("count")).show(50, truncate=False)

In [None]:
tx_df.groupBy("receiver_account").count().orderBy(F.desc("count")).show(50, truncate=False)

In [None]:
tx_df.groupBy("payment_currency").count().orderBy(F.desc("count")).show(50, truncate=False)

In [None]:
tx_df.groupBy("received_currency").count().orderBy(F.desc("count")).show(50, truncate=False)

In [None]:
tx_df.groupBy("payment_type").count().orderBy(F.desc("count")).show(50, truncate=False)

In [None]:
tx_df.groupBy("is_laundering").count().orderBy(F.desc("count")).show(50, truncate=False)

In [None]:
tx_df.groupBy("laundering_type").count().orderBy(F.desc("count")).show(50, truncate=False)