# Preparación

In [1]:
from ETL_pg2neo4j.diagnostics import get_machine_stats
from ETL_pg2neo4j.spark_session import get_spark
from ETL_pg2neo4j.load_config import (
    NEO4J_URI, NEO4J_USER, NEO4J_PASS, NEO4J_DDBB
)

# 1) Detect the resources of the host machine.
#    The captured cell output shows a dict with cpu_cores and mem_* byte counts.
stats = get_machine_stats()

# 2) Create the SparkSession with the Postgres + Neo4j jars already configured.
#    NOTE(review): jdbc_props is presumably the Postgres JDBC connection
#    properties; it is not used in the visible cells — confirm against get_spark.
spark, jdbc_props = get_spark(stats)

# (Optional) Quick check: the trailing tuple is rendered as the cell's output.
spark.version, stats


[SPARK] usando jars locales -> /opt/spark/extra-jars/postgresql-42.7.4.jar,/opt/spark/extra-jars/neo4j-connector-apache-spark_2.12-5.3.10_for_spark_3.jar
[SPARK] classpath reforzado (driver/executors) -> /opt/spark/extra-jars/postgresql-42.7.4.jar:/opt/spark/extra-jars/neo4j-connector-apache-spark_2.12-5.3.10_for_spark_3.jar


ERROR StatusLogger Reconfiguration failed: No configuration found for '5ffd2b27' at 'null' in 'null'
ERROR StatusLogger Reconfiguration failed: No configuration found for 'Default' at 'null' in 'null'
25/11/23 14:29:15 WARN Utils: Your hostname, AsusMare resolves to a loopback address: 127.0.1.1; using 192.168.100.4 instead (on interface wlp2s0)
25/11/23 14:29:15 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
25/11/23 14:29:16 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/11/23 14:29:16 WARN SparkConf: Note that spark.local.dir will be overridden by the value set by the cluster manager (via SPARK_LOCAL_DIRS in mesos/standalone/kubernetes and LOCAL_DIRS in YARN).


[CHECK] Postgres JDBC REALMENTE visible
[CHECK] Neo4j connector REALMENTE visible


('3.5.3',
 {'cpu_cores': 8,
  'mem_total_bytes': 16644382720,
  'mem_used_bytes': 12503138304,
  'mem_available_bytes': 4141244416})

In [2]:
# Cypher projection for Account nodes: return only the properties needed for now.
# NOTE(review): id() is deprecated as of Neo4j 5 in favour of elementId(), which
# returns a string rather than a long — kept as-is so neo4j_id stays numeric.
node_query = """
MATCH (a:Account)
RETURN
    id(a)                  AS neo4j_id,
    a.account_number       AS account_number,
    a.location             AS location,
    a.current_balance      AS current_balance,
    a.first_seen           AS first_seen,
    a.last_seen            AS last_seen
"""

# Read the query result through the Neo4j Spark connector. All reader settings
# are handed over in a single options(**...) call; the keys contain dots, so they
# are supplied via dict unpacking instead of keyword syntax.
nodes_df = spark.read.format("org.neo4j.spark.DataSource").options(
    **{
        "url": NEO4J_URI,
        "authentication.type": "basic",
        "authentication.basic.username": NEO4J_USER,
        "authentication.basic.password": NEO4J_PASS,
        "database": NEO4J_DDBB,
        "query": node_query,
    }
).load()

# Inspect the inferred schema and a small sample of rows.
nodes_df.printSchema()
nodes_df.show(5)


root
 |-- neo4j_id: long (nullable = true)
 |-- account_number: long (nullable = true)
 |-- location: string (nullable = true)
 |-- current_balance: double (nullable = true)
 |-- first_seen: timestamp (nullable = true)
 |-- last_seen: timestamp (nullable = true)

+--------+--------------+--------+------------------+-------------------+-------------------+
|neo4j_id|account_number|location|   current_balance|         first_seen|          last_seen|
+--------+--------------+--------+------------------+-------------------+-------------------+
|       0|    4651102136|      UK|51445.619999999995|2023-03-17 21:35:31|2023-03-18 18:45:28|
|       1|    3647155427|      UK|         -68020.94|2023-03-10 06:34:09|2023-03-10 20:02:05|
|       2|    8986636663|      UK|2804.2599999999998|2022-10-27 06:16:50|2023-05-20 08:41:51|
|       3|     196361780|      UK|         131638.48|2022-10-17 16:03:31|2023-07-07 13:12:55|
|       4|    3775525103|      UK|           15667.7|2022-11-06 08:09:43|2023-

In [3]:
# Cypher projection for TX relationships between Account nodes: one row per
# relationship, with the endpoints identified by their account_number.
# NOTE(review): id() is deprecated as of Neo4j 5 in favour of elementId(), which
# returns a string rather than a long — kept as-is so neo4j_rel_id stays numeric.
edge_query = """
MATCH (src:Account)-[r:TX]->(dst:Account)
RETURN
    id(r)                        AS neo4j_rel_id,
    r.id                         AS id,
    src.account_number           AS src,
    dst.account_number           AS dst,
    r.timestamp                  AS timestamp,
    r.amount                     AS amount,
    r.payment_currency           AS payment_currency,
    r.received_currency          AS received_currency,
    r.payment_type               AS payment_type,
    r.is_laundering              AS is_laundering,
    r.laundering_type            AS laundering_type,
    r.masked                     AS masked,
    r.src_delta                  AS src_delta,
    r.src_balance_before         AS src_balance_before,
    r.src_balance_after          AS src_balance_after,
    r.src_seq                    AS src_seq,
    r.src_currency               AS src_currency,
    r.dst_delta                  AS dst_delta,
    r.dst_balance_before         AS dst_balance_before,
    r.dst_balance_after          AS dst_balance_after,
    r.dst_seq                    AS dst_seq,
    r.dst_currency               AS dst_currency
"""

# Read the query result through the Neo4j Spark connector. All reader settings
# are handed over in a single options(**...) call; the keys contain dots, so they
# are supplied via dict unpacking instead of keyword syntax.
edges_df = spark.read.format("org.neo4j.spark.DataSource").options(
    **{
        "url": NEO4J_URI,
        "authentication.type": "basic",
        "authentication.basic.username": NEO4J_USER,
        "authentication.basic.password": NEO4J_PASS,
        "database": NEO4J_DDBB,
        "query": edge_query,
    }
).load()

# Inspect the inferred schema and a small sample of rows.
edges_df.printSchema()
edges_df.show(5)


root
 |-- neo4j_rel_id: long (nullable = true)
 |-- id: long (nullable = true)
 |-- src: long (nullable = true)
 |-- dst: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- amount: double (nullable = true)
 |-- payment_currency: string (nullable = true)
 |-- received_currency: string (nullable = true)
 |-- payment_type: string (nullable = true)
 |-- is_laundering: long (nullable = true)
 |-- laundering_type: string (nullable = true)
 |-- masked: long (nullable = true)
 |-- src_delta: double (nullable = true)
 |-- src_balance_before: double (nullable = true)
 |-- src_balance_after: double (nullable = true)
 |-- src_seq: long (nullable = true)
 |-- src_currency: string (nullable = true)
 |-- dst_delta: double (nullable = true)
 |-- dst_balance_before: double (nullable = true)
 |-- dst_balance_after: double (nullable = true)
 |-- dst_seq: long (nullable = true)
 |-- dst_currency: string (nullable = true)

+-------------------+-------+----------+----------+------------