### Init Context

In [1]:
from thetaray.api.context import init_context
import datetime
import yaml

import logging
# Emit every log record (DEBUG and above) as the bare message text.
logging.basicConfig(format='%(message)s', level=logging.DEBUG)

# The Spark settings live in the solution's YAML file; only the
# 'spark_config_a' profile is used by this notebook.
# NOTE(review): yaml.load with FullLoader is used on a project-local file;
# prefer yaml.safe_load if the file only contains plain mappings/scalars.
with open('/thetaray/git/solutions/domains/demo_pay_proc/config/spark_config.yaml') as config_stream:
    spark_profiles = yaml.load(config_stream, Loader=yaml.FullLoader)
spark_config = spark_profiles['spark_config_a']

# Platform context bound to a fixed execution date and a local Spark master.
context = init_context(
    execution_date=datetime.datetime(1970, 1, 1),
    spark_conf=spark_config,
    spark_master='local[*]',
)

2025-08-11 18:48:03,576:INFO:thetaray.common.logging:start loading solution.....[ load_risks=True , solution_path=/thetaray/git/solutions/domains , settings_path=/thetaray/git/solutions/settings ]
2025-08-11 18:48:03,923:INFO:thetaray.common.logging:load_risks took: 0.1708083152770996
2025-08-11 18:48:04,473:INFO:thetaray.common.logging:=== Started updating schema ===


### Imports

In [2]:
from thetaray.api.dataset import dataset_functions

from domains.demo_pay_proc.datasets.customers import customers_dataset
from domains.demo_pay_proc.datasets.transactions import transactions_dataset
from domains.demo_pay_proc.datasets.customer_insights import customer_insights_dataset
from domains.demo_pay_proc.evaluation_flows.ef import evaluation_flow

import json
import psycopg2
import os
import random
import pandas as pd

from datetime import datetime
from faker import Faker
from pyspark.sql import functions as f
from pyspark.sql.types import StructType

from thetaray.api.dataset.schema import DatasetSchemaHandler
from thetaray.common import Constants, Settings
from thetaray.common.data_environment import DataEnvironment

# Spark session obtained from the platform context created in the first cell.
spark = context.get_spark_session()

# Shared namespace name with a leading 'shared-' stripped (left unchanged when
# the prefix is absent); used below to derive the `apps_<namespace>` schema name.
ns_suffix = Settings.SHARED_NAMESPACE.removeprefix('shared-')

# NOTE(review): `fake` is not referenced anywhere else in the visible notebook —
# confirm whether it is still needed.
fake = Faker()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/08/11 18:48:08 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties
Hive Session ID = 165b01a4-b9fd-45d3-9f5b-5297e897bf39
25/08/11 18:48:10 INFO SessionState: Hive Session ID = 165b01a4-b9fd-45d3-9f5b-5297e897bf39


### Creation

In [3]:
# '<host>:<port>' address of the PostgreSQL server, taken from platform settings.
DB_HOST = Settings.DB_HOST

# Credentials for the CDD database come from the environment.
DB_USER_CDD = os.environ['CDD_POSTGRES_USERNAME']
DB_PASS_CDD = os.environ['CDD_POSTGRES_PASSWORD']
# NOTE(review): these credentials are hardcoded in the notebook; they should be
# read from the environment / a secret store like the CDD ones above.
DB_USER_RP = 'postgres'
DB_PASS_RP = 'postgres'

# Split on the LAST ':' instead of slicing fixed widths, so ports that are not
# exactly four digits (e.g. 15432) are parsed correctly.
_db_hostname, _db_sep, _db_port = DB_HOST.rpartition(':')
if not _db_sep or not _db_hostname or not _db_port:
    raise ValueError(f'Settings.DB_HOST must look like "<host>:<port>", got {DB_HOST!r}')


def _build_dsn(user, password):
    """Return the libpq connection string for ``user``/``password``.

    Both DSNs used by this notebook target the same server and database
    (``Constants.CDD_DB_NAME``) and differ only in their credentials.
    """
    return (
        f'user={user} '
        f'password={password} '
        f'dbname={Constants.CDD_DB_NAME} '
        f'host={_db_hostname} '
        f'port={_db_port} '
        'sslmode=verify-ca '
        'sslrootcert=/certs/ca.crt'
    )


# Connection string used for the solution (customers / transactions) tables.
dsn_cdd = _build_dsn(DB_USER_CDD, DB_PASS_CDD)

# Connection string used for the alert (rp_*) tables.
dsn_rp = _build_dsn(DB_USER_RP, DB_PASS_RP)


def execute_query(query, dsn):
    """Run ``query`` on the database described by ``dsn`` and return all rows.

    A new connection is opened for every call and is always closed before
    returning, including when executing or fetching raises.

    Args:
        query: SQL text passed unchanged to ``cursor.execute``.
        dsn: Connection string handed to ``psycopg2.connect``.

    Returns:
        A list with one dict per fetched row, keyed by column name.
    """
    conn = psycopg2.connect(dsn=dsn)
    try:
        with conn.cursor() as cursor:
            cursor.execute(query)
            columns = [col.name for col in cursor.description]
            return [dict(zip(columns, row)) for row in cursor.fetchall()]
    finally:
        # The cursor context manager only closes the cursor; without this the
        # connection would stay open after every call.
        conn.close()

def get_alert_mapper(solution, ef_id):
    """Find the alert-mapper row bound to a solution / evaluation-flow pair.

    Every row of ``rp_mappers`` in the ``apps_<namespace>`` schema is read, and
    only the first entry of each row's JSON-encoded
    ``solution_evaluation_flow_unit`` list is inspected.

    Returns:
        The first matching row dict, or ``None`` when no row matches.
    """
    apps_schema = f'apps_{ns_suffix.replace("-", "_")}'
    mapper_rows = execute_query(f'SELECT * FROM {apps_schema}.rp_mappers', dsn_rp)
    for mapper_row in mapper_rows:
        flow_units = json.loads(mapper_row['solution_evaluation_flow_unit'])
        if flow_units:
            first_unit = flow_units[0]
            same_solution = first_unit['solutionId'] == solution
            if same_solution and first_unit['evaluationFlowId'] == ef_id:
                return mapper_row
    return None


def get_alerts(solution, ef_id):
    """Return the CURRENT alerts of an evaluation flow, each tagged with its customer.

    The alert mapper for ``solution``/``ef_id`` is resolved first; its
    identifier selects the rows of ``rp_alerts``. Each alert dict then gets a
    ``customer_id`` key copied from the ``rp_alert_fields`` row whose
    ``rp_alert_id`` equals the alert's ``alert_id``.

    Raises:
        Exception: when no alert mapper matches ``solution`` and ``ef_id``.
        KeyError: when an alert has no corresponding ``rp_alert_fields`` row.
    """
    apps_schema = f'apps_{ns_suffix.replace("-", "_")}'

    mapper_row = get_alert_mapper(solution, ef_id)
    if mapper_row is None:
        raise Exception(f'Alert mapper not found for {solution = } and {ef_id = }')
    mapper_id = mapper_row['identifier']

    # Index the alert-field rows by alert id for the lookup below.
    fields_by_alert_id = {}
    for field_row in execute_query(f'SELECT * FROM {apps_schema}.rp_alert_fields', dsn_rp):
        fields_by_alert_id[field_row['rp_alert_id']] = field_row

    current_alerts = execute_query(
        f"SELECT * FROM {apps_schema}.rp_alerts WHERE alert_mapper_identifier = '{mapper_id}' AND history_type = 'CURRENT'",
        dsn_rp,
    )
    for current_alert in current_alerts:
        matching_fields = fields_by_alert_id[current_alert['alert_id']]
        current_alert['customer_id'] = matching_fields['customer_id']
    return current_alerts


def get_accounts(solution):
    """Return every row of the solution's ``demo_pay_proc_customers`` table.

    The schema name is produced by formatting ``Constants.SOLUTION_SCHEMA_TPL``
    with ``solution``; the query runs over the CDD connection.
    """
    solution_schema = Constants.SOLUTION_SCHEMA_TPL.format(solution=solution)
    return execute_query(
        f"SELECT * FROM {solution_schema}.demo_pay_proc_customers",
        dsn_cdd,
    )


def get_account_records(solution, customer_id):
    """Return all ``demo_pay_proc_customers`` rows stored for one customer.

    ``customer_id`` is interpolated into the SQL text as a quoted literal, so
    it must come from a trusted source.
    """
    solution_schema = Constants.SOLUTION_SCHEMA_TPL.format(solution=solution)
    return execute_query(
        f"SELECT * FROM {solution_schema}.demo_pay_proc_customers WHERE customer_id = '{customer_id}'",
        dsn_cdd,
    )


def get_account_transactions(solution, customer_id):
    """Return all ``demo_pay_proc_transactions`` rows belonging to one customer.

    ``customer_id`` is interpolated into the SQL text as a quoted literal, so
    it must come from a trusted source.
    """
    solution_schema = Constants.SOLUTION_SCHEMA_TPL.format(solution=solution)
    return execute_query(
        f"SELECT * FROM {solution_schema}.demo_pay_proc_transactions WHERE customer_id = '{customer_id}'",
        dsn_cdd,
    )

In [4]:
solution = Settings.SOLUTION
ef_id = evaluation_flow().identifier
schema = Constants.SOLUTION_SCHEMA_TPL.format(solution=solution)

# Seeded generator so the synthetic "segment" figures are reproducible on re-run.
RANDOM_SEED = 42
rng = random.Random(RANDOM_SEED)

alerts = get_alerts(solution, ef_id)
accounts = get_accounts(solution)

# `accounts` already holds every row of demo_pay_proc_customers, so the country
# lookup is derived from it instead of querying the same table a second time.
account_country = {a['customer_id']: a['customer_country_id'] for a in accounts}

# Group the customer rows per customer id. A customer may have several records
# (that is what `kyc_is_new` relies on); grouping guarantees exactly one
# insights row per customer and avoids one extra query per record.
records_by_customer = {}
for customer_record in accounts:
    records_by_customer.setdefault(customer_record['customer_id'], []).append(customer_record)

# Pre-group alerts so each customer's alerts are found without rescanning the list.
alerts_by_customer = {}
for alert in alerts:
    alerts_by_customer.setdefault(alert['customer_id'], []).append(alert)


def _distinct_counterparty_countries(transactions, risk):
    """Return the distinct counterparty country codes of transactions with ``risk``.

    Works on an empty transaction list (returns ``[]``).
    """
    return list({
        trx['counterparty_country_code']
        for trx in transactions
        if trx['counterparty_country_risk'] == risk
    })


customer_insights_data = []

for account_id, account_records in records_by_customer.items():
    account = sorted(account_records, key=lambda rec: rec['tr_job_ts'])[-1]  # most recent account data
    account_transactions = get_account_transactions(solution, account_id)
    account_transactions_in = [trx for trx in account_transactions if trx['direction'] == 'IN']
    account_transactions_out = [trx for trx in account_transactions if trx['direction'] == 'OUT']
    account_alerts = alerts_by_customer.get(account_id, [])

    # --- KYC section -------------------------------------------------------
    kyc_classification = 'Small'
    kyc_name = account['customer_name']
    kyc_is_new = len(account_records) == 1
    kyc_recently_updated = not kyc_is_new
    kyc_newly_incorporation = False
    kyc_industry = account['industry']
    kyc_null_field = None

    # --- Counterparty countries by risk level -------------------------------
    hr_cc = _distinct_counterparty_countries(account_transactions, 'High')
    mr_cc = _distinct_counterparty_countries(account_transactions, 'Moderate')
    lr_cc = _distinct_counterparty_countries(account_transactions, 'Low')

    # --- Addresses ----------------------------------------------------------
    # Director and company share the same address payload. It is serialised
    # exactly once: dumping the already-serialised string again would produce
    # a JSON string literal instead of a JSON object.
    address_json = json.dumps({'CC': account['customer_country_id'], 'AD': '', 'CL': 'L'})
    director_ad = address_json
    company_ad = address_json

    # --- Transaction totals -------------------------------------------------
    tr_in = sum(trx['amount_domestic_currency'] for trx in account_transactions_in)
    tr_out = sum(trx['amount_domestic_currency'] for trx in account_transactions_out)
    tr_in_count = len(account_transactions_in)
    tr_out_count = len(account_transactions_out)
    # Segment figures are synthetic: a random 40-80% share of the customer's own totals.
    tr_in_seg = tr_in * rng.uniform(0.4, 0.8)
    tr_out_seg = tr_out * rng.uniform(0.4, 0.8)
    tr_in_seg_count = int(tr_in_count * rng.uniform(0.4, 0.8))
    tr_out_seg_count = int(tr_out_count * rng.uniform(0.4, 0.8))
    # A customer without transactions gets no date range instead of crashing.
    trx_timestamps = [trx['transaction_timestamp'] for trx in account_transactions]
    trx_from_date = min(trx_timestamps, default=None)
    trx_to_date = max(trx_timestamps, default=None)

    # --- Transaction-monitoring alert counters ------------------------------
    tm_open = len([alert for alert in account_alerts if alert['state_id'] != 'state_closed'])
    tm_closed = len([alert for alert in account_alerts if alert['state_id'] == 'state_closed'])
    tm_false_positives = len([alert for alert in account_alerts if alert['resolution_code'] == 'Non_Issue'])
    tm = json.dumps({'Open': tm_open, 'Closed': tm_closed, 'False_positives': tm_false_positives})

    # --- Screening counters (no screening data is read here; always zero) ---
    scrn_open = 0
    scrn_closed = 0
    scrn_false_positives = 0
    scrn = json.dumps({'Open': scrn_open, 'Closed': scrn_closed, 'False_positives': scrn_false_positives})

    account_insights_data = {
        'customer_id': account_id,
        'kyc_classification': kyc_classification,
        'kyc_name': kyc_name,
        'kyc_is_new': kyc_is_new,
        'kyc_recently_updated': kyc_recently_updated,
        'kyc_newly_incorporation': kyc_newly_incorporation,
        'country_of_incorporation': account_country[account_id],
        'kyc_industry': kyc_industry,
        'kyc_null_field': kyc_null_field,
        'hr_cc': hr_cc,
        'mr_cc': mr_cc,
        'lr_cc': lr_cc,
        'director_ad': director_ad,
        'company_ad': company_ad,
        'tr_in': tr_in,
        'tr_out': tr_out,
        'tr_in_count': tr_in_count,
        'tr_out_count': tr_out_count,
        'tr_in_seg': tr_in_seg,
        'tr_out_seg': tr_out_seg,
        'tr_in_seg_count': tr_in_seg_count,
        'tr_out_seg_count': tr_out_seg_count,
        'trx_from_date': trx_from_date,
        'trx_to_date': trx_to_date,
        'tm': tm,
        'scrn': scrn
    }

    customer_insights_data.append(account_insights_data)


# Build the Spark schema from the dataset definition, keeping only the fields
# the dataset itself declares, then stamp both timestamp columns with the
# context's execution date.
customer_insights_ds = next(ds for ds in context.solution.datasets if ds.identifier == 'demo_pay_proc_customer_insights')
customer_insights_schema = DatasetSchemaHandler(customer_insights_ds, context, data_environment=DataEnvironment.get_default())._build_dataset_schema()
# Computed once, and named `field` so it cannot be confused with the
# `pyspark.sql.functions` alias `f` used below.
declared_field_names = {field.identifier for field in customer_insights_ds.field_list}
customer_insights_schema = StructType([s for s in customer_insights_schema if s.name in declared_field_names])
customer_insights_df = spark.createDataFrame(customer_insights_data, schema=customer_insights_schema)
customer_insights_df = customer_insights_df.withColumn('tr_timestamp', f.lit(context.execution_date))
customer_insights_df = customer_insights_df.withColumn('effective_date', f.lit(context.execution_date))

In [5]:
# Persist the insights frame into the PUBLIC data environment ...
dataset_functions.write(
    context,
    customer_insights_df,
    customer_insights_dataset().identifier,
    data_environment=DataEnvironment.PUBLIC,
)
# ... then publish the dataset; the call's return value is the cell output.
dataset_functions.publish(
    context,
    customer_insights_dataset().identifier,
    data_environment=DataEnvironment.PUBLIC,
)

2025-08-11 18:48:16,757:INFO:thetaray.common.logging:### DataSet - writing started ###
25/08/11 18:48:19 INFO SQLStdHiveAccessController: Created SQLStdHiveAccessController for session context : HiveAuthzSessionContext [sessionString=165b01a4-b9fd-45d3-9f5b-5297e897bf39, clientType=HIVECLI]
25/08/11 18:48:20 ERROR FileUtils: Failed to delete s3a://thetaray-sonar/warehouse/public.db/demo_pay_proc_customer_insights/tr_year=1970/tr_month=1/tr_day=1/tr_date=1970_01_01_00_00_00
2025-08-11 18:48:23,135:INFO:thetaray.common.logging:### DataSet - writing done, 1 written, 0 corrupted, 0 rejected  ###
2025-08-11 18:48:24,128:INFO:thetaray.common.logging:finished publishing records for dataset demo_pay_proc_customer_insights 


True

In [6]:
# Close the platform context created by init_context in the first cell; run
# this last, since earlier cells use `context` and the Spark session from it.
# NOTE(review): presumably this also releases the Spark session — confirm.
context.close()