### Init Context

In [1]:
import datetime
from dateutil.relativedelta import relativedelta
import json
import logging
from pyspark.sql import DataFrame, Window, functions as f
from pyspark.sql import SQLContext
from pyspark.sql.types import LongType
import yaml

from common.libs import dates as dates_lib
from common.libs import features_discovery
from common.libs.features_executor import FeaturesExecutor
from common.libs.feature_engineering import max_look_back_monthly_features, max_look_back_daily_weekly_features
from common.libs.zscore import enrich_with_z_score
from common.factory.wrangling_execution_strategy import get_wrangling_execution_strategy
from common.factory.eval_flow_definition import get_evaluation_flow_definition
from common.factory.domain_definition import get_domain_definition
from common.notebook_utils.wrangling.wrangling_execution_strategy import WranglingExecutionStrategy
from common.definitions.domain import DomainDefinition
from common.definitions.eval_flow import EvaluationFlowDefinition
from common.libs.context_utils import get_dataset

from thetaray.api.context import init_context
from thetaray.api.dataset import dataset_functions
from thetaray.api.solution import IngestionMode
from thetaray.common import Constants
from thetaray.common.data_environment import DataEnvironment

logging.getLogger().handlers[0].setFormatter(logging.Formatter(fmt='%(levelname)s: %(asctime)s @ %(message)s',datefmt='%Y-%m-%d %H:%M:%S'))
logging.basicConfig(level=logging.INFO)

import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)


from thetaray.api.context import init_context
import datetime
from thetaray.common import Constants

from common.libs.config.loader import load_config
from common.libs.config.basic_execution_config_loader import BasicExecutionConfig, DevBasicExecutionConfig
from common.libs.context_utils import is_run_triggered_from_airflow



with open('/thetaray/git/solutions/domains/demo_fuib/config/spark_config.yaml') as spark_config_file:
    spark_config = yaml.load(spark_config_file, yaml.FullLoader)['spark_config_a']

execution_date=datetime.datetime(1970, 1, 1)

context = init_context(domain='demo_fuib',
                       execution_date=execution_date,
                       spark_conf=spark_config,
                       spark_master='local[*]',
                       allow_type_changes=True)

spark = context.get_spark_session()
sc = SQLContext(spark)
params = context.parameters
print(f"Spark UI URL: {context.get_spark_ui_url()}")

print(json.dumps(params, indent=4))

2025-09-04 13:00:57,255:INFO:thetaray.common.logging:start loading solution.....[ load_risks=True , solution_path=/thetaray/git/solutions/domains , settings_path=/thetaray/git/solutions/settings ]
2025-09-04 13:00:57,862:INFO:thetaray.common.logging:load_risks took: 0.12975215911865234
INFO: 2025-09-04 13:00:58 @ === Started updating schema ===
INFO: 2025-09-04 13:00:58 @ === Started updating schema on Postgres ===
INFO: 2025-09-04 13:01:21 @ found 195 tables in solution public schema
INFO: 2025-09-04 13:01:21 @ found 195 tables in solution public schema
INFO: 2025-09-04 13:01:21 @ found 195 tables in solution public schema
INFO: 2025-09-04 13:01:21 @ found 195 tables in solution public schema
INFO: 2025-09-04 13:01:21 @ found 195 tables in solution public schema
INFO: 2025-09-04 13:01:21 @ found 195 tables in solution public schema
INFO: 2025-09-04 13:01:21 @ found 195 tables in solution public schema
INFO: 2025-09-04 13:01:21 @ found 195 tables in solution public schema
INFO: 2025-09

Added `alias` successfully.


INFO: 2025-09-04 13:01:41 @ Creating metastore schema for evaluation flow: cust_month_ef
INFO: 2025-09-04 13:01:42 @ === Finished updating schema for Evaluation Flows on Minio ===


Added `alias` successfully.
Spark UI URL: https://jupyterhub-platform-thetalab.sonar.thetaray.cloud/user/manager/proxy/4040/jobs/
{}




### Imports

In [2]:
from thetaray.api.dataset import dataset_functions

import json
import psycopg2
import os
import random
import pandas as pd

import datetime
from faker import Faker
from pyspark.sql import functions as f
from pyspark.sql import Window
from pyspark.sql.types import StructType, LongType

from thetaray.api.dataset.schema import DatasetSchemaHandler
from thetaray.common import Constants, Settings
from thetaray.common.data_environment import DataEnvironment

spark = context.get_spark_session()

# from domains.cbk.datasets.kyc import kyc_dataset
# from domains.cbk.datasets.trx_enriched import trx_enriched_dataset
# from domains.cbk.datasets.insights import customer_insights_dataset

In [3]:
# from thetaray.api.tools.metadata import trigger_metadata_sync
# trigger_metadata_sync(allow_type_changes=True)

In [4]:
trx_df: DataFrame = dataset_functions.read(context, 'trx_enriched', from_job_ts=Constants.BEGINNING_OF_TIME)
trx_df.count()

                                                                                

228937

In [5]:
# trx_df = trx_df.withColumn('trx_direction', f.when(f.col('recvpay') == '1',
#                                                                  f.lit('in'))
#                                                                  .otherwise(f.lit('out')))

# trx_df = trx_df.withColumn('counterparty_country', f.when(f.col('recvpay') == '1',
#                                                                  f.col('byordercountry'))
#                                                                  .otherwise(f.col('benecountry')))

# trx_df = trx_df.withColumn('counterparty_bank_country', f.when(f.col('recvpay') == '1',
#                                                                  f.col('byorderbankcountry'))
#                                                                  .otherwise(f.col('benebankcountry')))

# trx_df = trx_df.withColumn('counterparty_country_risk_factor', f.when(f.col('recvpay') == '1',
#                                                                  f.col('byorder_country_risk_factor'))
#                                                                  .otherwise(f.col('bene_country_risk_factor')))

# trx_df = trx_df.withColumn('counterparty_bank_country_risk_factor', f.when(f.col('recvpay') == '1',
#                                                                  f.col('byorder_bank_country_risk_factor'))
#                                                                  .otherwise(f.col('bene_bank_country_risk_factor')))

In [6]:
trx_df = trx_df.withColumn('month', f.date_trunc('month', f.col('txn_ts')))

In [7]:
# trx_df = trx_df\
#     .withColumn('month_offset', f.datediff(f.col(trx_enriched_dataset().occurred_on_field), f.lit(Constants.BEGINNING_OF_TIME)).cast(LongType()))

# trx_df = trx_df\
#     .withColumn('month', f.date_trunc('month', f.col(trx_enriched_dataset().occurred_on_field)))

trx_df = trx_df.groupBy('customer_id','month','month_offset').agg(
    f.count(f.when(f.col('direction')=='credit', 1).otherwise(None)).alias('tr_in_count'),
    f.sum(f.when(f.col('direction')=='credit', f.col('amount_usd')).otherwise(None)).alias('tr_in'),
    f.count(f.when(f.col('direction')=='debit', 1).otherwise(None)).alias('tr_out_count'),
    f.sum(f.when(f.col('direction')=='debit', f.col('amount_usd')).otherwise(None)).alias('tr_out'),
    f.collect_set(f.when(f.col('cp_country_risk_level') == 'High', f.col('cp_jurisdiction')).otherwise(None)).alias('hr_cc_ct'),
    f.collect_set(f.when(f.col('cp_country_risk_level') == 'Medium', f.col('cp_jurisdiction')).otherwise(None)).alias('mr_cc_ct'),
    f.collect_set(f.when(f.col('cp_country_risk_level') == 'Low', f.col('cp_jurisdiction')).otherwise(None)).alias('lr_cc_ct'),
    f.max('customer_name').alias('customer_name'),
    f.max('customer_type').alias('customer_type'),
    f.max('date_of_birth').alias('date_of_birth'),
    f.max('address_full').alias('address_full'),
    f.max('occupation').alias('occupation'),
    f.max('pep_indicator').alias('pep_indicator')
)

# w_30d = Window.partitionBy('account_key').orderBy('day_offset').rangeBetween(-29, 0)
# list_30d_agg = ['tr_in_count', 'tr_in', 'tr_out_count', 'tr_out']
# for column_name in list_30d_agg:
#     trx_df = trx_df.withColumn(column_name, f.sum(column_name).over(w_30d))

# list_30d_collect_set = ['hr_cc_bene', 'hr_cc_orig', 'mr_cc_bene', 'mr_cc_orig', 'lr_cc_bene', 'lr_cc_orig']
# for column_name in list_30d_collect_set:
#     trx_df = trx_df.withColumn(column_name, f.array_distinct(f.flatten(f.collect_list(column_name).over(w_30d))))

trx_df = trx_df.withColumn('hr_cc', f.array_distinct(f.flatten(f.array('hr_cc_ct'))))
trx_df = trx_df.withColumn('mr_cc', f.array_distinct(f.flatten(f.array('mr_cc_ct'))))
trx_df = trx_df.withColumn('lr_cc', f.array_distinct(f.flatten(f.array('lr_cc_ct'))))

# w_pop = Window.partitionBy('day_offset')
# trx_df = trx_df.withColumn('tr_in_seg', f.approx_percentile(f.when(f.col('tr_in')!=0, f.col('tr_in')).otherwise(None), 0.5).over(w_pop))
# trx_df = trx_df.withColumn('tr_out_seg', f.approx_percentile(f.when(f.col('tr_out')!=0, f.col('tr_out')).otherwise(None), 0.5).over(w_pop))
# trx_df = trx_df.withColumn('tr_in_seg_count', (f.approx_percentile(f.when(f.col('tr_in_count')!=0, f.col('tr_in_count')).otherwise(None), 0.5).over(w_pop)).cast('long'))
# trx_df = trx_df.withColumn('tr_out_seg_count', (f.approx_percentile(f.when(f.col('tr_out_count')!=0, f.col('tr_out_count')).otherwise(None), 0.5).over(w_pop)).cast('long'))


trx_df = trx_df.withColumn('trx_from_date', f.add_months('month', -1))
trx_df = trx_df.withColumnRenamed('month', 'trx_to_date')
# trx_df = trx_df.withColumn('trx_from_date', f.date_sub('day', 29))
# trx_df = trx_df.withColumnRenamed('day', 'trx_to_date')

# trx_df = trx_df.drop('day_offset', *list_30d_collect_set).fillna(0)

In [8]:
# accounts_df = accounts_df.withColumn('kyc_classification', f.when(f.col('account_risk_level') == 'E', 'High').otherwise(f.when(f.col('account_risk_level') == 'M', 'Medium').otherwise('Low')))
# accounts_df = accounts_df.withColumn('kyc_name', f.lit('XXXXXXXXXX'))
# accounts_df = accounts_df.withColumn('kyc_is_new', f.when(f.date_diff(f.lit(context.execution_date), f.col('account_open_date')) < 90, True).otherwise(False))
# accounts_df = accounts_df.withColumn('kyc_new_customer', f.col('kyc_is_new'))
# accounts_df = accounts_df.withColumn('kyc_recently_updated', f.lit(False))
# accounts_df = accounts_df.withColumn('kyc_newly_incorporation', f.col('kyc_is_new'))
# accounts_df = accounts_df.withColumn('kyc_occupation', f.col('account_code_naf_description_en'))
# accounts_df = accounts_df.withColumn('kyc_null_field', f.lit(''))
# accounts_df = accounts_df.withColumn('account_country_risk_color_aux', f.when(f.col('account_country_risk_color') == 'BLACK', 'H').otherwise(f.when(f.col('account_country_risk_color') == 'GREY', 'M').otherwise('L')))
# accounts_df = accounts_df.withColumn('director_ad', f.format_string('{"CC": "%s", "AD": "%s", "CL": "%s"}', f.col('account_country_cd'), f.col('account_address_line_6'), f.col('account_country_risk_color_aux')))
# accounts_df = accounts_df.withColumn('company_ad', f.col('director_ad'))
# accounts_df = accounts_df.withColumn('tm', f.format_string('{"Open": "%d", "Closed": "%d", "False_positives": "%d"}', f.lit(0), f.lit(0), f.lit(0)))
# accounts_df = accounts_df.withColumn('scrn', f.format_string('{"Open": "%d", "Closed": "%d", "False_positives": "%d"}', f.lit(0), f.lit(0), f.lit(0)))
# accounts_df = accounts_df.select(
#     'account_key',
#     'kyc_classification',
#     'kyc_name',
#     'kyc_is_new',
#     'kyc_new_customer',
#     'kyc_recently_updated',
#     'kyc_newly_incorporation',
#     'kyc_occupation',
#     'kyc_null_field',
#     'director_ad',
#     'company_ad',
#     'tm',
#     'scrn'
# )

In [9]:
trx_df = trx_df.withColumn('customer_risk', f.when(f.col('occupation').isin(['marketer','consultant']),f.lit('High')).otherwise(f.lit('Low')))

In [10]:
trx_df = trx_df.withColumn('tr_timestamp', f.col('trx_to_date'))
trx_df = trx_df.withColumn('effective_date', f.col('trx_to_date'))
trx_df.count()

                                                                                

12000

In [11]:
from pyspark.sql.types import StringType
trx_df = trx_df.drop('lr_cc_ct', 'mr_cc_ct','hr_cc_ct')
trx_df = trx_df.withColumn('date_of_birth', trx_df['date_of_birth'].cast(StringType()))

In [12]:
# from thetaray.utils.schema_ds_ffc_generator import create_metadata_ds_file_from_df, create_metadata_ds_file_from_csv
# from thetaray.api.solution import DataSet, Field, DataType, IngestionMode, BusinessType

# create_metadata_ds_file_from_df(context=context,
#                                 df=trx_df,
#                                 ds_identifier='customer_insights',
#                                 ds_display_name='customer_insights',
#                                 ingestion_mode=IngestionMode.UPDATE,
#                                 publish=True,
#                                 primary_key=['customer_id'],
#                                 data_permission='dpv:demo_fuib',
#                                 num_of_partitions=4,
#                                 num_of_buckets=7)

In [13]:
dataset_functions.write(context, trx_df, 'customer_insights_fuib')

INFO: 2025-09-04 13:02:37 @ ### DataSet - writing started ###
25/09/04 13:02:40 ERROR FileUtils: Failed to delete s3a://thetaray-sonar/warehouse/manager.db/customer_insights_fuib/tr_year=1970/tr_month=1/tr_day=1/tr_date=1970_01_01_00_00_00
INFO: 2025-09-04 13:02:47 @ ### DataSet - writing done, 12000 written, 0 corrupted, 0 rejected  ###


{'total_new_records': 12000,
 'corrupted_new_records': 0,
 'rejected_new_records': 0}

In [21]:
dataset_functions.publish(context, 'customer_insights_bus_c2', data_environment=DataEnvironment.PUBLIC)

ValueError: Metadata for dataset: customer_insights_bus_c2 does not exist

In [18]:
context.close()