In [1]:
# Enable IPython's autoreload extension; mode 2 reloads all imported modules
# before each cell executes, so edits to project libraries are picked up live.
%load_ext autoreload
%autoreload 2 

In [2]:
from thetaray.api.tools.metadata import trigger_metadata_sync
# Run the platform metadata sync with type changes allowed; on success the call
# returns a confirmation message (shown as the cell output).
trigger_metadata_sync(allow_type_changes=True)



{'message': 'Syncing metadata finished successfully'}

In [2]:
import datetime
from dateutil.relativedelta import relativedelta
import json
import logging
from pyspark.sql import DataFrame, Window, functions as f
from pyspark.sql import SQLContext
from pyspark.sql.types import LongType
import yaml

from common.libs import dates as dates_lib
from common.libs import features_discovery
from common.libs.features_executor import FeaturesExecutor
from common.libs.feature_engineering import max_look_back_monthly_features, max_look_back_daily_weekly_features
from common.libs.zscore import enrich_with_z_score
from common.factory.wrangling_execution_strategy import get_wrangling_execution_strategy
from common.factory.eval_flow_definition import get_evaluation_flow_definition
from common.factory.domain_definition import get_domain_definition
from common.notebook_utils.wrangling.wrangling_execution_strategy import WranglingExecutionStrategy
from common.definitions.domain import DomainDefinition
from common.definitions.eval_flow import EvaluationFlowDefinition
from common.libs.context_utils import get_dataset

from thetaray.api.context import init_context
from thetaray.api.dataset import dataset_functions
from thetaray.api.solution import IngestionMode
from thetaray.common import Constants
from thetaray.common.data_environment import DataEnvironment

def configure_notebook_logging(
    fmt='%(levelname)s: %(asctime)s @ %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
    level=logging.INFO,
):
    """Apply the notebook's log format to the first root handler.

    ``logging.basicConfig`` is called first so that a root handler is guaranteed
    to exist before it is indexed. When the root logger already has handlers,
    ``basicConfig`` does nothing and the existing first handler is re-formatted.

    Args:
        fmt: Log record format string.
        datefmt: Timestamp format used for ``%(asctime)s``.
        level: Root level applied by ``basicConfig`` when it creates the handler.
    """
    logging.basicConfig(level=level)
    root_logger = logging.getLogger()
    root_logger.handlers[0].setFormatter(logging.Formatter(fmt=fmt, datefmt=datefmt))


configure_notebook_logging()

import pandas as pd
# Raise pandas' display limits so long and wide frames are rendered without truncation.
for option_name, option_value in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 1000),
):
    pd.set_option(option_name, option_value)


from thetaray.api.context import init_context
import datetime
from thetaray.common import Constants

from common.libs.config.loader import load_config
from common.libs.config.basic_execution_config_loader import BasicExecutionConfig, DevBasicExecutionConfig
from common.libs.context_utils import is_run_triggered_from_airflow



# Load the domain's Spark settings file and keep only the 'spark_config_a' profile.
with open('/thetaray/git/solutions/domains/demo_fuib/config/spark_config.yaml') as spark_config_file:
    spark_profiles = yaml.load(spark_config_file, Loader=yaml.FullLoader)
spark_config = spark_profiles['spark_config_a']

# The context is created with a fixed execution date (the Unix epoch).
execution_date = datetime.datetime(1970, 1, 1)

context = init_context(
    domain='demo_fuib',
    execution_date=execution_date,
    spark_conf=spark_config,
    spark_master='local[*]',
    allow_type_changes=True,
)

# Handles derived from the context and reused by the cells below.
spark = context.get_spark_session()
sc = SQLContext(spark)
params = context.parameters

print(f"Spark UI URL: {context.get_spark_ui_url()}")
print(json.dumps(params, indent=4))

2025-09-21 15:43:48,978:INFO:thetaray.common.logging:start loading solution.....[ load_risks=True , solution_path=/thetaray/git/solutions/domains , settings_path=/thetaray/git/solutions/settings ]
2025-09-21 15:43:49,620:INFO:thetaray.common.logging:load_risks took: 0.13105368614196777
INFO: 2025-09-21 15:43:50 @ === Started updating schema ===
INFO: 2025-09-21 15:43:50 @ === Started updating schema on Postgres ===
INFO: 2025-09-21 15:44:00 @ found 213 tables in solution public schema
INFO: 2025-09-21 15:44:00 @ found 213 tables in solution public schema
INFO: 2025-09-21 15:44:00 @ found 213 tables in solution public schema
INFO: 2025-09-21 15:44:00 @ found 213 tables in solution public schema
INFO: 2025-09-21 15:44:00 @ found 213 tables in solution public schema
INFO: 2025-09-21 15:44:00 @ found 213 tables in solution public schema
INFO: 2025-09-21 15:44:00 @ found 213 tables in solution public schema
INFO: 2025-09-21 15:44:00 @ found 213 tables in solution public schema
INFO: 2025-09

Added `alias` successfully.


INFO: 2025-09-21 15:44:26 @ === Finished updating schema for Evaluation Flows on Minio ===


Added `alias` successfully.
Spark UI URL: https://jupyterhub-platform-thetalab.sonar.thetaray.cloud/user/andre.gutnik/proxy/4040/jobs/
{}




In [3]:
# Entity/cadence pair that selects the wrangling configuration loaded in the next cell.
entity='customer'
cadence='monthly'

# Domain-wide settings read from global.yaml; the bare name displays the loaded mapping.
global_config = load_config('global.yaml', context=context)
global_config

{'bau': False,
 'monthly_features_look_back_in_months': 6,
 'monthly_data_horizon': None,
 'me_to_me_threshold': 1,
 'global_features_params': {'round_digits': 2},
 'tr_cfg_metadata': {'path': 'global.yaml', 'domain': 'demo_fuib'}}

In [4]:
# Wrangling configuration for the selected entity and cadence
# (resolves to customer/monthly/wrangling.yaml with the values set above).
config = load_config(f'{entity}/{cadence}/wrangling.yaml', context=context)
config

{'requested_features': {'z_score_sum_trx': {'v1': {'active': True,
    'train': True}},
  'z_score_cnt_trx': {'v1': {'active': True, 'train': True}},
  'z_score_sum_hghrsk_cntry': {'v1': {'active': True, 'train': True}},
  'cnt_trx_n_day': {'v1': {'active': True, 'train': True}},
  'sum_new_account': {'v1': {'active': True, 'train': True}},
  'max_trx': {'v1': {'active': True, 'train': True}},
  'sum_trx_cash': {'v1': {'active': True, 'train': True}},
  'sum_trx_cash_in': {'v1': {'active': True, 'train': True}},
  'sum_trx_cash_out': {'v1': {'active': True, 'train': True}},
  'cnt_trx_cash': {'v1': {'active': True, 'train': True}},
  'one_to_many': {'v1': {'active': True, 'train': False}},
  'many_to_one': {'v1': {'active': True, 'train': False}},
  'sum_pipe_customer': {'v1': {'active': True, 'train': False}},
  'sum_trx_fop': {'v1': {'active': True, 'train': False}},
  'sum_trx_name_mis': {'v1': {'active': True, 'train': False}},
  'cnt_trx_name_mis': {'v1': {'active': True, 'train':

In [5]:
# Resolve the features described by the wrangling config and report how many were found.
features = features_discovery.get_features(config)
print(f'Calculating {len(features)} features')

Calculating 17 features


In [6]:
# Top-level merge: start from a copy of the global settings and let the wrangling
# config override any key present in both (e.g. 'tr_cfg_metadata' ends up pointing
# at the wrangling file). Nested mappings are replaced, not merged.
effective_config = global_config.copy()
effective_config.update(config)
effective_config

{'bau': False,
 'monthly_features_look_back_in_months': 6,
 'monthly_data_horizon': None,
 'me_to_me_threshold': 1,
 'global_features_params': {'round_digits': 2},
 'tr_cfg_metadata': {'path': 'customer/monthly/wrangling.yaml',
  'domain': 'demo_fuib'},
 'requested_features': {'z_score_sum_trx': {'v1': {'active': True,
    'train': True}},
  'z_score_cnt_trx': {'v1': {'active': True, 'train': True}},
  'z_score_sum_hghrsk_cntry': {'v1': {'active': True, 'train': True}},
  'cnt_trx_n_day': {'v1': {'active': True, 'train': True}},
  'sum_new_account': {'v1': {'active': True, 'train': True}},
  'max_trx': {'v1': {'active': True, 'train': True}},
  'sum_trx_cash': {'v1': {'active': True, 'train': True}},
  'sum_trx_cash_in': {'v1': {'active': True, 'train': True}},
  'sum_trx_cash_out': {'v1': {'active': True, 'train': True}},
  'cnt_trx_cash': {'v1': {'active': True, 'train': True}},
  'one_to_many': {'v1': {'active': True, 'train': False}},
  'many_to_one': {'v1': {'active': True, 'train

In [7]:
# Read the 'trx_enriched' dataset from the PUBLIC data environment.
joined_trx_df: DataFrame = dataset_functions.read(context, 'trx_enriched', data_environment=DataEnvironment.PUBLIC)
# count() triggers a Spark job and displays the number of rows read.
joined_trx_df.count()

                                                                                

601506

In [8]:
from common.libs import dates as dates_lib

# Presumably derives a 'year_month' column from 'month_offset' on the transactions
# frame — confirm against dates_lib.month_offset_to_year_month_columns.
joined_trx_df = dates_lib.month_offset_to_year_month_columns(joined_trx_df, 'month_offset', 'year_month')

# Feature Executor

In [10]:
# Disabled pre-feature-engineering enrichment. NOTE(review): it references
# `basic_execution_config`, which is not defined in the visible cells.
# wrangling_execution_strategy: WranglingExecutionStrategy = get_wrangling_execution_strategy(basic_execution_config, effective_config ,features)
# joined_trx_df = wrangling_execution_strategy.enrich_df_pre_feature_engineering(joined_trx_df)

# Run the discovered features over the transactions using the merged configuration;
# the result is the aggregated frame that the widget cells below extend.
features_executor = FeaturesExecutor(joined_trx_df, features, effective_config)
aggs_df = features_executor.execute()
# count() triggers the computation and displays the number of aggregated rows.
aggs_df.count()

                                                                                

18000

# Widgets

In [11]:
# Column names shared by the widget cells below.
investigated_entity_col, cadence_col, counterparty_col = "customer_id", "month_offset", "cp_id"
trx_amount_col, trx_direction_col = "amount_usd", "direction"

# Values of the direction column for outgoing and incoming transactions.
direction_value_out, direction_value_in = "debit", "credit"

## Populational Widgets

In [12]:
def _population_mean(feature, alias):
    """Mean of `feature` over rows where it is strictly positive, rounded to 2 decimals."""
    positive_values = f.when(f.col(feature) > 0, f.col(feature))
    return f.round(f.mean(positive_values), 2).alias(alias)


def _population_distinct_entities(feature, alias):
    """Number of distinct investigated entities whose `feature` is strictly positive."""
    entities_with_activity = f.when(f.col(feature) > 0, f.col(investigated_entity_col))
    return f.countDistinct(entities_with_activity).alias(alias)


# Population-level statistics, computed once per cadence value. The order below
# is the column order of the resulting frame.
# Earlier revisions also carried (commented-out, never executed) aggregations for:
# in_cash_value, out_cash_value, total_cash_value, in_value, out_value, one_to_many,
# many_to_one, total_atm_value, total_value, total_pos_value, total_volume,
# total_rapid_movement_value, total_onprobation_value, total_risky_customer_value,
# total_hghrsk_cntry_val and total_hghrsk_cntry_vol.
population_aggregations = [
    # sum_trx
    _population_mean('sum_trx', 'pop_avg_sum_trx'),
    # cnt_trx
    _population_mean('cnt_trx', 'pop_avg_cnt_trx'),
    _population_distinct_entities('cnt_trx', 'pop_dstnct_cust_trx'),
    # sum_trx_cash
    _population_mean('sum_trx_cash', 'pop_avg_sum_trx_cash'),
    # cnt_trx_cash
    _population_mean('cnt_trx_cash', 'pop_avg_cnt_trx_cash'),
    _population_distinct_entities('cnt_trx_cash', 'pop_dstnct_cust_trx_cash'),
    # cnt_trx_n_day
    _population_mean('cnt_trx_n_day', 'pop_avg_cnt_trx_n_day'),
    # sum_new_account
    _population_mean('sum_new_account', 'pop_avg_new_account'),
    _population_distinct_entities('sum_new_account', 'pop_dstnct_cust_new_account'),
]

population = aggs_df.groupBy(cadence_col).agg(*population_aggregations)

# Attach the population statistics to every entity row of the same cadence value.
aggs_df = aggs_df.join(population, cadence_col, how='left')
aggs_df.count()

18000

## Categorical Widgets
### One to Many

In [13]:
feature_name = "one_to_many"
feature_name_explainability = feature_name + "_explainability"

# Step 1: per (entity, cadence, counterparty), count the outgoing transactions that
# have a counterparty and total their amount. The total is rounded once to 2 decimals.
ds_agg_one_to_many = (
    joined_trx_df
    .filter((f.col(trx_direction_col) == direction_value_out) & f.col(counterparty_col).isNotNull())
    .groupby(investigated_entity_col, cadence_col, counterparty_col)
    .agg(
        f.count('*').alias('count'),
        f.sum(trx_amount_col).alias('sum')
    )
    .withColumn('sum', f.round(f.col('sum'), 2))
)

# Step 2: collect the per-counterparty rows of each (entity, cadence) into a list and
# serialise it as a JSON string of the form {"data": [{"cn": ..., "c": ..., "s": ...}, ...]}.
one_to_many_explainability = (
    ds_agg_one_to_many
    .groupby(investigated_entity_col, cadence_col)
    .agg(
        f.collect_list(
            f.struct(
                f.col(counterparty_col).alias('cn'),  # counterparty id
                f.col('count').alias('c'),            # count from ds_agg_one_to_many
                f.col('sum').alias('s')               # rounded sum from ds_agg_one_to_many
            )
        ).alias(feature_name_explainability)
    )
    .select(
        investigated_entity_col,
        cadence_col,
        f.to_json(
            f.create_map(
                f.lit('data'),
                f.col(feature_name_explainability)
            )
        ).alias(feature_name_explainability)
    )
)

# Left join: entities without matching transactions keep a null explainability value.
aggs_df = aggs_df.join(one_to_many_explainability, [investigated_entity_col, cadence_col], 'left')
aggs_df.count()

18000

### One to Many Concentration

In [14]:
import pyspark.sql.functions as f
from pyspark.sql.types import ArrayType, StructType, StructField, StringType, DoubleType, IntegerType, LongType

# Schema of the JSON stored in one_to_many_explainability:
# {"data": [{"cn": <counterparty>, "c": <count>, "s": <sum>}, ...]}.
# 'c' is declared as a long to match the type produced by f.count() when the JSON is built.
schema = StructType([
    StructField("data", ArrayType(
        StructType([
            StructField("cn", StringType(), True),
            StructField("c", LongType(), True),
            StructField("s", DoubleType(), True)
        ])
    ))
])

# Parse the JSON string (null input stays null).
aggs_df = aggs_df.withColumn(
    "parsed_data",
    f.from_json(f.col("one_to_many_explainability"), schema)
)

# Sort the counterparty entries by 's' in descending order.
aggs_df = aggs_df.withColumn(
    "sorted_data",
    f.expr("""
        array_sort(parsed_data.data, 
            (left, right) -> case 
                when left.s < right.s then 1 
                when left.s > right.s then -1 
                else 0 
            end
        )
    """)
)

# Entry with the largest 's' after sorting.
top_counterparty = f.element_at(f.col("sorted_data"), 1)

# Top counterparty wrapped as {"data": [...]}. Rows without one-to-many data are left
# null (like the other explainability columns) instead of being serialised as a list
# that holds a single null element.
aggs_df = aggs_df.withColumn(
    "cp_concentration_explainability",
    f.when(
        f.col("sorted_data").isNotNull(),
        f.to_json(
            f.struct(
                f.array(top_counterparty).alias("data")
            )
        )
    )
)

# 's' value of the top counterparty.
aggs_df = aggs_df.withColumn(
    "cp_concentration_s",
    top_counterparty.getItem("s")
)

# Sum of all 's' values of the row.
aggs_df = aggs_df.withColumn(
    "total_s",
    f.expr("aggregate(parsed_data.data, double(0), (acc, x) -> acc + x.s)")
)

# Concentration ratio: share of the top counterparty in the total;
# null for rows without parsed data.
aggs_df = aggs_df.withColumn(
    "cp_concentration",
    f.col("cp_concentration_s") / f.col("total_s")
)

# Drop the helper columns; cp_concentration_s is kept.
aggs_df = aggs_df.drop('parsed_data', 'sorted_data', 'total_s')
aggs_df.count()

18000

### Many to One

In [15]:
feature_name = "many_to_one"
feature_name_explainability = f"{feature_name}_explainability"

# Step 1: incoming transactions that have a counterparty, aggregated per
# (entity, cadence, counterparty) into a transaction count and a 2-decimal total.
incoming_with_counterparty = joined_trx_df.filter(
    (f.col(trx_direction_col) == direction_value_in) & f.col(counterparty_col).isNotNull()
)
ds_agg_many_to_one = (
    incoming_with_counterparty
    .groupby(investigated_entity_col, cadence_col, counterparty_col)
    .agg(
        f.count('*').alias('count'),
        f.sum(trx_amount_col).alias('sum'),
    )
    .withColumn('sum', f.round(f.col('sum'), 2))
)

# Step 2: one JSON string per (entity, cadence) of the form
# {"data": [{"cn": ..., "c": ..., "s": ...}, ...]}.
counterparty_entry = f.struct(
    f.col(counterparty_col).alias('cn'),
    f.col('count').alias('c'),
    f.col('sum').alias('s'),
)
collected_entries = (
    ds_agg_many_to_one
    .groupby(investigated_entity_col, cadence_col)
    .agg(f.collect_list(counterparty_entry).alias(feature_name_explainability))
)
many_to_one_explainability = collected_entries.select(
    investigated_entity_col,
    cadence_col,
    f.to_json(
        f.create_map(f.lit('data'), f.col(feature_name_explainability))
    ).alias(feature_name_explainability),
)

# Left join: entities without matching transactions keep a null explainability value.
aggs_df = aggs_df.join(
    many_to_one_explainability.select(investigated_entity_col, cadence_col, feature_name_explainability),
    on=[investigated_entity_col, cadence_col],
    how='left',
)
aggs_df.count()

18000

### FOP

In [16]:
feature_name = "sum_trx_fop"
feature_name_explainability = feature_name + "_explainability"

# Step 1: transactions where the customer's ip_hash equals the counterparty's IP while
# customer and counterparty are different ids, aggregated per (entity, cadence,
# counterparty) into a transaction count and a 2-decimal total.
# Uses its own variable names so the one-to-many frames are not overwritten.
ds_agg_fop = (
    joined_trx_df
    .filter(
        (f.col('ip_hash') == f.col('counterparty_ip'))
        & (f.col(investigated_entity_col) != f.col(counterparty_col))
    )
    .groupby(investigated_entity_col, cadence_col, counterparty_col)
    .agg(
        f.count('*').alias('count'),
        f.sum(trx_amount_col).alias('sum')
    )
    .withColumn('sum', f.round(f.col('sum'), 2))
)

# Step 2: collect the per-counterparty rows of each (entity, cadence) into a list and
# serialise it as a JSON string of the form {"data": [{"cn": ..., "c": ..., "s": ...}, ...]}.
fop_explainability = (
    ds_agg_fop
    .groupby(investigated_entity_col, cadence_col)
    .agg(
        f.collect_list(
            f.struct(
                f.col(counterparty_col).alias('cn'),  # counterparty id
                f.col('count').alias('c'),            # count from ds_agg_fop
                f.col('sum').alias('s')               # rounded sum from ds_agg_fop
            )
        ).alias(feature_name_explainability)
    )
    .select(
        investigated_entity_col,
        cadence_col,
        f.to_json(
            f.create_map(
                f.lit('data'),
                f.col(feature_name_explainability)
            )
        ).alias(feature_name_explainability)
    )
)

# Left join: entities without matching transactions keep a null explainability value.
aggs_df = aggs_df.join(fop_explainability, [investigated_entity_col, cadence_col], 'left')
aggs_df.count()

18000

### High Risk Country

In [17]:
# Step 1: Aggregate the count and sum first
ds_agg_high_risk_country = (
    joined_trx_df.filter(f.col("cp_country_risk_level").isin(['High','Medium','Low']))
    .groupby(investigated_entity_col, cadence_col, 'cp_jurisdiction')
    .agg(
        f.count('*').alias('count'),
        f.sum(trx_amount_col).alias('sum'),
        f.max('cp_country_risk_level').alias("cp_country_risk_level")
    )
).withColumn('sum', f.round(f.col('sum'), 2))


# Step 2: Perform collect_list aggregation separately
high_risk_country_explainability = (
    ds_agg_high_risk_country
    .groupby(investigated_entity_col, cadence_col)
    .agg(
        f.collect_list(
            f.struct(
                f.col('cp_jurisdiction').alias('ct'),
                f.col('cp_country_risk_level').alias('cr'),
                f.col('count').alias('c'),  # Now referencing from ds_agg_cred
                f.col('sum').alias('s')
            )
        ).alias('high_risk_country_explainability')
    )
    .select(investigated_entity_col,
        cadence_col,
        f.to_json(
            f.create_map(
                f.lit('data'),
                f.col('high_risk_country_explainability')
            )
        ).alias('high_risk_country_explainability')
    )
)

aggs_df = aggs_df.join(high_risk_country_explainability.select(investigated_entity_col, cadence_col,'high_risk_country_explainability'), [investigated_entity_col, cadence_col], 'left')
aggs_df.count()

18000

In [18]:
from common.libs import dates as dates_lib

# Final shaping before the write, as one chain:
#   1. apply the month_offset -> year_month helper to the aggregated frame,
#   2. drop the 'sum_in_out_ratio' column,
#   3. replace nulls with 0 (a numeric fill value only applies to numeric columns,
#      so the JSON explainability strings keep their nulls).
aggs_df = (
    dates_lib.month_offset_to_year_month_columns(aggs_df, 'month_offset', 'year_month')
    .drop('sum_in_out_ratio')
    .fillna(0)
)

In [21]:
# Write the aggregated frame to the 'customer_monthly' dataset in the PUBLIC data
# environment; the call returns counts of new, corrupted and rejected records.
dataset_functions.write(context, aggs_df, 'customer_monthly', data_environment=DataEnvironment.PUBLIC)

INFO: 2025-09-21 15:44:44 @ ### DataSet - writing started ###
INFO: 2025-09-21 15:45:27 @ ### DataSet - writing done, 18000 written, 0 corrupted, 0 rejected  ###


{'total_new_records': 18000,
 'corrupted_new_records': 0,
 'rejected_new_records': 0}

In [22]:
from thetaray.common.data_environment import DataEnvironment
# Publish the records written to 'customer_monthly' in the PUBLIC data environment;
# the call returned True in the recorded run.
dataset_functions.publish(context, 'customer_monthly', data_environment=DataEnvironment.PUBLIC)

INFO: 2025-09-21 15:45:30 @ finished publishing records for dataset customer_monthly 


True

In [59]:
# Close the context created at the top of the notebook
# (presumably this also releases the Spark session — confirm).
context.close()