### Init Context

In [1]:
from thetaray.api.context import init_context
import datetime
import yaml

import logging
# Configure root logging at DEBUG level, printing only the message text.
logging.basicConfig(format='%(message)s', level=logging.DEBUG)

# Location of the domain's Spark settings file.
SPARK_CONFIG_PATH = '/thetaray/git/solutions/domains/demo_nested_banking/config/spark_config.yaml'

# NOTE(review): yaml.load with FullLoader is kept as-is; if this file only
# holds plain mappings/scalars, yaml.safe_load would be the stricter choice.
with open(SPARK_CONFIG_PATH) as spark_config_file:
    all_spark_profiles = yaml.load(spark_config_file, Loader=yaml.FullLoader)
    # Only the 'spark_config_a' profile is used by this notebook.
    spark_config = all_spark_profiles['spark_config_a']

# Build the platform context on a local Spark master, using a fixed
# execution date (the Unix epoch).
context = init_context(
    execution_date=datetime.datetime(1970, 1, 1),
    spark_conf=spark_config,
    delete_unused_columns=True,
    spark_master='local[*]',
)

2025-08-11 18:44:26,603:INFO:thetaray.common.logging:start loading solution.....[ load_risks=True , solution_path=/thetaray/git/solutions/domains , settings_path=/thetaray/git/solutions/settings ]
2025-08-11 18:44:26,954:INFO:thetaray.common.logging:load_risks took: 0.1738901138305664
2025-08-11 18:44:27,530:INFO:thetaray.common.logging:=== Started updating schema ===
2025-08-11 18:44:27,603:INFO:thetaray.common.logging:=== Started updating schema on Postgres ===
2025-08-11 18:44:42,788:INFO:thetaray.common.logging:found 150 tables in solution public schema
2025-08-11 18:44:42,798:INFO:thetaray.common.logging:found 150 tables in solution public schema
2025-08-11 18:44:42,807:INFO:thetaray.common.logging:found 150 tables in solution public schema
2025-08-11 18:44:42,821:INFO:thetaray.common.logging:found 150 tables in solution public schema
2025-08-11 18:44:42,830:INFO:thetaray.common.logging:found 150 tables in solution public schema
2025-08-11 18:44:42,838:INFO:thetaray.common.logging

Added `alias` successfully.


2025-08-11 18:44:58,635:INFO:thetaray.common.logging:=== Finished updating schema for Evaluation Flows on Minio ===


Added `alias` successfully.


### Imports

In [2]:
from thetaray.api.context import init_context
from thetaray.api.dataset import dataset_functions
from thetaray.api.evaluation import load_evaluated_activities, read_alerted_activities
from thetaray.api.graph import publish_edges, publish_nodes

from domains.demo_nested_banking.datasets.customers import customers_dataset
from domains.demo_nested_banking.datasets.transactions import transactions_dataset
from domains.demo_nested_banking.evaluation_flows.ef import evaluation_flow
from thetaray.common.data_environment import DataEnvironment


from datetime import datetime

import pandas as pd
from pyspark.sql import functions as f
from pyspark.sql.types import *

### Nodes

In [3]:
# --- Customer nodes -------------------------------------------------------
customer_df = dataset_functions.read(
    context,
    customers_dataset().identifier,
    data_environment=DataEnvironment.PUBLIC,
)

# One projection producing the node columns in publish order:
# id, CT, NM, AD (always empty), AN (copy of the id), effective_date.
customer_nodes_df = customer_df.select(
    f.col("customer_id").alias("id"),
    f.col("customer_country_id").alias("CT"),
    f.col("customer_name").alias("NM"),
    f.lit("").alias("AD"),
    f.col("customer_id").alias("AN"),
    f.lit(context.execution_date).alias("effective_date"),
)

# --- Counterparty nodes ---------------------------------------------------
trx_df = dataset_functions.read(
    context,
    transactions_dataset().identifier,
    data_environment=DataEnvironment.PUBLIC,
)
# Transactions without a counterparty cannot contribute a node.
trx_df = trx_df.where(f.col("counterparty_id").isNotNull())

# Same column layout as the customer nodes so the positional union lines up.
cp_nodes_df = trx_df.select(
    f.col("counterparty_id").alias("id"),
    f.col("counterparty_country_code").alias("CT"),
    f.col("counterparty_name").alias("NM"),
    f.lit("").alias("AD"),
    f.col("counterparty_id").alias("AN"),
    f.lit(context.execution_date).alias("effective_date"),
)

# A counterparty appears once per transaction; keep a single row per id.
cp_nodes_df = cp_nodes_df.dropDuplicates(subset=["id"])

# NOTE(review): ids present in both the customer and counterparty sets are not
# de-duplicated across the two frames — confirm publish_nodes tolerates that.
nodes_df = customer_nodes_df.union(cp_nodes_df)

publish_nodes(
    context,
    nodes_df,
    "demo_nested_banking_graph",
    "AC",
    data_environment=DataEnvironment.PUBLIC,
)

2025-08-11 18:45:00,454:INFO:thetaray.common.logging:Truncating data by execution date: tr_job_ts = '1970-01-01 00:00:00' AND type IN ('AC')
                                                                                

{'node_count': 1081}

### Edges

In [4]:
def _build_edges(trx, source_col, target_col):
    """Project transactions onto the columns published for "TX" edges.

    Args:
        trx: Spark DataFrame of transactions (already filtered by direction).
        source_col: Name of the transaction column that becomes ``source_node``.
        target_col: Name of the transaction column that becomes ``target_node``.

    Returns:
        Spark DataFrame with the columns ``id``, ``effective_date``,
        ``source_node``, ``target_node``, ``AM``, ``CR`` and ``CT`` (in that
        order), where ``CT`` is the constant 1 typed as long.
    """
    return trx.select(
        f.col("transaction_id").alias("id"),
        f.col("transaction_timestamp").alias("effective_date"),
        f.col(source_col).alias("source_node"),
        f.col(target_col).alias("target_node"),
        f.col("amount_original_currency").alias("AM"),
        f.col("currency").alias("CR"),
        # Each transaction counts as one; cast directly instead of going
        # through a temporary "count" column.
        f.lit(1).cast("long").alias("CT"),
    )


trx_df = dataset_functions.read(context, transactions_dataset().identifier, data_environment=DataEnvironment.PUBLIC)
# Edges need both endpoints, so transactions without a counterparty are dropped.
trx_df = trx_df.where(f.col("counterparty_id").isNotNull())

incoming_df = trx_df.where(f.col('direction') == "IN")
outgoing_df = trx_df.where(f.col('direction') == "OUT")

# Incoming money flows counterparty -> customer.
incoming_edges_df = _build_edges(incoming_df, source_col="counterparty_id", target_col="customer_id")

print(incoming_edges_df.count())

# Outgoing money flows customer -> counterparty.
outgoing_edges_df = _build_edges(outgoing_df, source_col="customer_id", target_col="counterparty_id")

print(outgoing_edges_df.count())

# Both frames come from the same helper, so the positional union is aligned.
edges_df = incoming_edges_df.union(outgoing_edges_df)

publish_edges(context, edges_df, "demo_nested_banking_graph", "TX", "AC", "AC", data_environment=DataEnvironment.PUBLIC)

                                                                                

1060143


2025-08-11 18:45:08,755:INFO:thetaray.common.logging:Truncating data by execution date: tr_job_ts = '1970-01-01 00:00:00' AND type IN ('TX')


1059732


                                                                                

{'edges_count': 2119875, 'unknown_nodes_count': 89}

### Read alerted activities

In [5]:
# Resolve the evaluation flow identifier once; both reads target the same flow.
ef_identifier = evaluation_flow().identifier

act_df = read_alerted_activities(
    context,
    ef_identifier,
    data_environment=DataEnvironment.PUBLIC,
)
eval_act_df = load_evaluated_activities(
    context,
    ef_identifier,
    data_environment=DataEnvironment.PUBLIC,
)

# Inner join on the activity id keeps only evaluated activities that alerted.
joined_act_df = eval_act_df.join(act_df, "tr_id")

# Columns needed downstream to build the alert nodes and alert -> account edges.
ALERT_ACTIVITY_COLUMNS = ["tr_id", "risk_id", "year_month", "is_suppressed", "customer_id"]
selected_activity_fields = joined_act_df.select(*ALERT_ACTIVITY_COLUMNS)

# Cached because the frame is consumed twice (nodes, then edges).
selected_activity_fields.cache()

DataFrame[tr_id: string, risk_id: string, year_month: timestamp, is_suppressed: boolean, customer_id: string]

### Extract and publish alert nodes

In [6]:
# Alert nodes: one projection that renames the activity fields to the node
# attribute names and derives the node id as "<tr_id>_<risk_id>".
# customer_id is intentionally not carried over.
al_nodes_df = selected_activity_fields.select(
    f.col("tr_id").alias("AI"),
    f.col("risk_id").alias("RI"),
    f.col("year_month").alias("effective_date"),
    f.col("is_suppressed").alias("SP"),
    f.concat(f.col("tr_id"), f.lit("_"), f.col("risk_id")).alias("id"),
)

In [7]:
# Publish the alert nodes under node type "AL" in the demo graph.
publish_nodes(
    context,
    al_nodes_df,
    "demo_nested_banking_graph",
    "AL",
    data_environment=DataEnvironment.PUBLIC,
)

2025-08-11 18:46:27,868:INFO:thetaray.common.logging:Truncating data by execution date: tr_job_ts = '1970-01-01 00:00:00' AND type IN ('AL')
                                                                                

{'node_count': 0}

### Extract and publish alert - account edges

In [8]:
# Alert -> account edges. The edge id and its source node share the same
# "<tr_id>_<risk_id>" key; the target node is the alerted customer.
alert_key = f.concat(f.col("tr_id"), f.lit("_"), f.col("risk_id"))

al_edges_df = selected_activity_fields.select(
    f.col("year_month").alias("effective_date"),
    f.col("customer_id").alias("target_node"),
    alert_key.alias("id"),
    alert_key.alias("source_node"),
)

In [9]:
# Publish the alert edges: edge type "AL", from "AL" (alert) nodes to
# "AC" nodes, in the demo graph.
publish_edges(
    context,
    al_edges_df,
    "demo_nested_banking_graph",
    edge_type="AL",
    source_node_type="AL",
    target_node_type="AC",
    data_environment=DataEnvironment.PUBLIC,
)

2025-08-11 18:46:29,332:INFO:thetaray.common.logging:Truncating data by execution date: tr_job_ts = '1970-01-01 00:00:00' AND type IN ('AL')
                                                                                

{'edges_count': 0, 'unknown_nodes_count': 0}

In [10]:
# Drop the cached alert-activity frame; the alert nodes and edges that
# consumed it have already been published in the cells above.
selected_activity_fields.unpersist()

DataFrame[tr_id: string, risk_id: string, year_month: timestamp, is_suppressed: boolean, customer_id: string]

In [11]:
# Close the context created by init_context in the first cell.
# NOTE(review): presumably this also releases the Spark session — confirm.
context.close()