# Initialize Context

In [1]:
import datetime
from dateutil.relativedelta import relativedelta
import json
import logging
from pyspark.sql import DataFrame, Window, functions as f
from pyspark.sql import SQLContext
from pyspark.sql.types import LongType
import yaml

from common.libs import dates as dates_lib
from common.libs import features_discovery
from common.libs.features_executor import FeaturesExecutor
from common.libs.feature_engineering import max_look_back_monthly_features, max_look_back_daily_weekly_features
from common.libs.zscore import enrich_with_z_score
from common.factory.wrangling_execution_strategy import get_wrangling_execution_strategy
from common.factory.eval_flow_definition import get_evaluation_flow_definition
from common.factory.domain_definition import get_domain_definition
from common.notebook_utils.wrangling.wrangling_execution_strategy import WranglingExecutionStrategy
from common.definitions.domain import DomainDefinition
from common.definitions.eval_flow import EvaluationFlowDefinition
from common.libs.context_utils import get_dataset

from thetaray.api.context import init_context
from thetaray.api.dataset import dataset_functions
from thetaray.api.solution import IngestionMode
from thetaray.common import Constants
from thetaray.common.data_environment import DataEnvironment

logging.getLogger().handlers[0].setFormatter(logging.Formatter(fmt='%(levelname)s: %(asctime)s @ %(message)s',datefmt='%Y-%m-%d %H:%M:%S'))
logging.basicConfig(level=logging.INFO)

import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)


from thetaray.api.context import init_context
import datetime
from thetaray.common import Constants

from common.libs.config.loader import load_config
from common.libs.config.basic_execution_config_loader import BasicExecutionConfig, DevBasicExecutionConfig
from common.libs.context_utils import is_run_triggered_from_airflow



with open('/thetaray/git/solutions/domains/demo_fuib/config/spark_config.yaml') as spark_config_file:
    spark_config = yaml.load(spark_config_file, yaml.FullLoader)['spark_config_a']

execution_date=datetime.datetime(1970, 1, 1)

context = init_context(domain='demo_fuib',
                       execution_date=execution_date,
                       spark_conf=spark_config,
                       spark_master='local[*]',
                       allow_type_changes=True)

spark = context.get_spark_session()
sc = SQLContext(spark)
params = context.parameters
print(f"Spark UI URL: {context.get_spark_ui_url()}")

print(json.dumps(params, indent=4))

2025-09-11 13:00:46,773:INFO:thetaray.common.logging:start loading solution.....[ load_risks=True , solution_path=/thetaray/git/solutions/domains , settings_path=/thetaray/git/solutions/settings ]
2025-09-11 13:00:47,443:INFO:thetaray.common.logging:load_risks took: 0.1312410831451416
INFO: 2025-09-11 13:00:48 @ === Started updating schema ===
INFO: 2025-09-11 13:00:48 @ === Started updating schema on Postgres ===
INFO: 2025-09-11 13:00:57 @ found 213 tables in solution public schema
INFO: 2025-09-11 13:00:57 @ found 213 tables in solution public schema
INFO: 2025-09-11 13:00:57 @ found 213 tables in solution public schema
INFO: 2025-09-11 13:00:57 @ found 213 tables in solution public schema
INFO: 2025-09-11 13:00:57 @ found 213 tables in solution public schema
INFO: 2025-09-11 13:00:57 @ found 213 tables in solution public schema
INFO: 2025-09-11 13:00:57 @ found 213 tables in solution public schema
INFO: 2025-09-11 13:00:57 @ found 213 tables in solution public schema
INFO: 2025-09-

Added `alias` successfully.


INFO: 2025-09-11 13:01:19 @ === Finished updating schema for Evaluation Flows on Minio ===


Added `alias` successfully.
Spark UI URL: https://jupyterhub-platform-thetalab.sonar.thetaray.cloud/user/andre.gutnik/proxy/4040/jobs/
{}




In [2]:
# from common.libs.config.loader import load_config

# config = load_config(f'{entity}/{cadence}/train_run.yaml', context=context)
# config

# Read training dataset

In [3]:
from thetaray.common.data_environment import DataEnvironment
from thetaray.api.dataset import dataset_functions
import pyspark.sql.functions as f

In [4]:
ds_train = dataset_functions.read(context, 'customer_monthly', data_environment=DataEnvironment.PUBLIC).drop('tr_timestamp') 
#TODO: Add here any additional filtering/sampling on the customers dataset. 
# ds_train = ds_train.toPandas()
import pandas as pd
pd.set_option('display.max_colwidth', None)

In [5]:
ds_train.groupBy('year_month','month_offset').count().show()

                                                                                

+-------------------+------------+-----+
|         year_month|month_offset|count|
+-------------------+------------+-----+
|2025-01-01 00:00:00|         660| 1500|
|2024-09-01 00:00:00|         656| 1500|
|2025-04-01 00:00:00|         663| 1500|
|2024-08-01 00:00:00|         655| 1500|
|2025-03-01 00:00:00|         662| 1500|
|2024-10-01 00:00:00|         657| 1500|
|2024-07-01 00:00:00|         654| 1500|
|2025-02-01 00:00:00|         661| 1500|
|2025-06-01 00:00:00|         665| 1500|
|2025-05-01 00:00:00|         664| 1500|
|2024-11-01 00:00:00|         658| 1500|
|2024-12-01 00:00:00|         659| 1500|
+-------------------+------------+-----+



In [6]:
ds_train = ds_train.filter(f.col('year_month')<'2025-06-01')
ds_train.count()

16500

In [7]:
# train_features_cols = ['one_to_many', 'sum_out_trx', 'cnt_trx_cash', 
#                        'sum_trx_cash','cnt_trx_n_day', 'z_score_cnt_trx',
#                        'sum_trx', 'sum_hghrsk_cntry', 'many_to_one', 'max_trx', 
#                        'sum_pipe_customer', 'pop_dstnct_cust_trx', 'sum_in_trx','cp_concentration', 
#                        'z_score_sum_hghrsk_cntry', 'sum_new_account', 'z_score_sum_trx', 'cnt_trx']

In [8]:
train_features_cols = ['one_to_many', 
                       'sum_out_trx', 
                       'cnt_trx_cash', 
                       'sum_trx_cash',
                       'cnt_trx_n_day', 
                       'z_score_cnt_trx',
                       'sum_trx', 
                       'sum_hghrsk_cntry', 
                       'many_to_one', 
                       'max_trx', 
                       'sum_pipe_customer', 
                       # 'sum_in_trx',
                       'cp_concentration', 
                       'z_score_sum_hghrsk_cntry',
                       'sum_new_account', 
                       # 'z_score_sum_trx', 
                       'cnt_trx',
                       'sum_trx_fop',
                       'cnt_trx_fop',
                       'cnt_dstnct_fop',
                       'sum_trx_name_mis',
                       'cnt_trx_name_mis']

# Prepare Feature Extraction transformer & Anomaly Detection model 

In [9]:
from thetaray.api.evaluation.preprocess.numeric_features import NumericFeaturesTransformer
from thetaray.api.evaluation.preprocess.features_extractor import FeaturesExtractor
from thetaray.api.anomaly_detection import ThetaRayDetector
from common.libs.features_discovery import get_features_output_fields

# train_features_cols = [field.identifier for field in get_features_output_fields(domain, entity, cadence, train_only=True)]
# _config = config.get('numeric_features_transformer')
nft = NumericFeaturesTransformer(features=train_features_cols,
                                 strategy='constant',
                                fill_value = 0)
features_extractor = FeaturesExtractor([nft])
tr_detector = ThetaRayDetector(normalization_type=1, set_zero_rating=1)

2025-09-11 13:01:25.136386: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-09-11 13:01:25.138160: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-09-11 13:01:25.141086: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-09-11 13:01:25.149008: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1757595685.162126   10390 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1757595685.16

##################################################
## Migrating to ModularSystem (AnomalyDetector) ##
##################################################
User Parameters:
Algos_to_run: ['Ny', 'RL', 'NF', 'NK', 'Pg', 'HB', 'GC', 'GL']
normalization_type: 1
Fusion_threshold: 0.5
Rating_percentile: 1.0
thread_mode: 1
max_wrk: None
nucset_sampling: False


In [10]:
ds_train = ds_train.fillna(0)
ds_train=ds_train.toPandas()

                                                                                

# Fit models & publish models / visualizations to MLFlow

In [11]:
import mlflow

from thetaray.api.models import save_model
from thetaray.api.drift import save_reference_dataset
from thetaray.api.evaluation import evaluate_reference_dataset
from thetaray.api.sample import get_sample_percent_unlabeled

with mlflow.start_run(nested=True):
    features_extraction_model = features_extractor.fit(ds_train)
    save_model('customer_monthly_fe', features_extraction_model, {"version": "release"}, data_environment=DataEnvironment.PUBLIC) #TODO - complete
    features = features_extraction_model.transform(ds_train)
    detection_model = tr_detector.fit(X=features)
    save_model('customer_monthly_ad', detection_model, {"version": "release"}, data_environment=DataEnvironment.PUBLIC) #TODO - complete
    print("Training finished")

    # spark = context.get_spark_session()
    # ref_df = spark.createDataFrame(ds_train)
    # evaluated_sample = evaluate_reference_dataset(context, 
    #                                                     #TODO - complete
    #                                               'algo',
    #                                               ref_df, features_extraction_model, detection_model)

    # save_reference_dataset(context, evaluated_sample)

INFO: 2025-09-11 13:01:32 @ Found credentials in environment variables.
INFO: 2025-09-11 13:01:33 @ Waiting up to 300 seconds for model version to finish creation. Model name: customer_monthly_fe, version 8
INFO: 2025-09-11 13:01:33 @ [AlgoLog] [13:01:33.271] [USER] [fit:Start]
  multiarray.copyto(a, fill_value, casting='unsafe')



--- Training set Info ---
Dimensions: 16500 x 20
-------------------------

Normalizing training set with min-max...
 
-------------------------------
Start fit...

Multithreading mode activated with    16 CPUs
 
Ny: Model Fitting Start
13:01:33.271
[AlgoLog] [ 13:01:33.271 ] [ USER ] [ fit : Start ]
Number of Adaptive Bins:  512
the knn is:  59
the depth is  9


INFO: 2025-09-11 13:01:33 @ [AlgoLog] [13:01:33.926] [USER] [fit:End]
INFO: 2025-09-11 13:01:33 @ [AlgoLog] [13:01:33.929] [USER] [fit:Start]


13:01:33.926
[AlgoLog] [ 13:01:33.926 ] [ USER ] [ fit : End ]
Ny: Bins amount: 128
Ny: Unique kernel points amount: 
13:01:33.929
[AlgoLog] [ 13:01:33.929 ] [ USER ] [ fit : Start ]


INFO: 2025-09-11 13:01:34 @ [AlgoLog] [13:01:34.620] [USER] [fit:End]


13:01:34.620
[AlgoLog] [ 13:01:34.620 ] [ USER ] [ fit : End ]
Ny: Model Fitting End
Ny: Scoring Data...
Ny: Scoring Data End
_score_scaler: Ny threshold: 0.4718204901210403
RL: Model Fitting Start
Number of Adaptive Bins:  256
RL: Number of erased AdaptiveBins: 33
RL: Kernel size after erase: 223


2025-09-11 13:01:43.460923: E external/local_xla/xla/stream_executor/cuda/cuda_platform.cc:51] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method


2025-09-11 13:01:43.713437: W tensorflow/core/util/util.cc:163] Not handling type DT_DOUBLE
2025-09-11 13:01:43.713505: W tensorflow/core/util/util.cc:163] Not handling type DT_DOUBLE
2025-09-11 13:01:43.713611: W tensorflow/core/util/util.cc:163] Not handling type DT_DOUBLE
2025-09-11 13:01:43.713642: W tensorflow/core/util/util.cc:163] Not handling type DT_DOUBLE
2025-09-11 13:01:43.729458: W tensorflow/core/util/util.cc:163] Not handling type DT_DOUBLE
2025-09-11 13:01:43.729521: W tensorflow/core/util/util.cc:163] Not handling type DT_DOUBLE
2025-09-11 13:01:43.729676: W tensorflow/core/util/util.cc:163] Not handling type DT_DOUBLE
2025-09-11 13:01:43.729711: W tensorflow/core/util/util.cc:163] Not handling type DT_DOUBLE
2025-09-11 13:01:43.852325: W tensorflow/core/util/util.cc:163] Not handling type DT_DOUBLE
2025-09-11 13:01:43.852395: W tensorflow/core/util/util.cc:163] Not handling type DT_DOUBLE
2025-09-11 13:01:43.852504: W tensorflow/core/util/util.cc:163] Not handling typ

RL: epoch: 0, Loss: 0.811124
RL: epoch: 10, Loss: 0.863581
RL: epoch: 20, Loss: 1.350270
RL: epoch: 30, Loss: 0.723096
RL: epoch: 40, Loss: 0.985009
RL: epoch: 50, Loss: 0.759523
RL: epoch: 60, Loss: 0.724622
RL: epoch: 70, Loss: 0.790555
RL: epoch: 80, Loss: 0.695239
RL: epoch: 90, Loss: 0.366237
RL: epoch: 100, Loss: 0.500349
RL: epoch: 110, Loss: 0.529914
RL: epoch: 120, Loss: 0.347935
RL: epoch: 130, Loss: 0.393863
RL: epoch: 140, Loss: 0.366423
RL: epoch: 150, Loss: 0.168485
RL: epoch: 160, Loss: 0.234247
RL: epoch: 170, Loss: 0.340564
RL: epoch: 180, Loss: 0.473934
RL: epoch: 190, Loss: 0.352978
RL: Model Fitting End
RL: Scoring Data...
RL: Scoring Data End


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method


_score_scaler: RL threshold: 0.027914186538442715
NF: Model Fitting Start
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method


2025-09-11 13:01:45.947566: W tensorflow/core/util/util.cc:163] Not handling type DT_DOUBLE
2025-09-11 13:01:45.947970: W tensorflow/core/util/util.cc:163] Not handling type DT_DOUBLE
2025-09-11 13:01:45.948213: W tensorflow/core/util/util.cc:163] Not handling type DT_DOUBLE
2025-09-11 13:01:45.948521: W tensorflow/core/util/util.cc:163] Not handling type DT_DOUBLE
2025-09-11 13:01:45.948762: W tensorflow/core/util/util.cc:163] Not handling type DT_DOUBLE
2025-09-11 13:01:45.949062: W tensorflow/core/util/util.cc:163] Not handling type DT_DOUBLE
2025-09-11 13:01:45.949293: W tensorflow/core/util/util.cc:163] Not handling type DT_DOUBLE
2025-09-11 13:01:45.949581: W tensorflow/core/util/util.cc:163] Not handling type DT_DOUBLE
2025-09-11 13:01:46.022927: W tensorflow/core/util/util.cc:163] Not handling type DT_DOUBLE
2025-09-11 13:01:46.023006: W tensorflow/core/util/util.cc:163] Not handling type DT_DOUBLE
2025-09-11 13:01:46.023143: W tensorflow/core/util/util.cc:163] Not handling typ

epoch: 0  loss: 21.530411
epoch: 10  loss: -4.926717
epoch: 20  loss: -7.065931
epoch: 30  loss: -7.849668
epoch: 40  loss: -8.321351
epoch: 50  loss: -8.58638
epoch: 60  loss: -8.892613
NF: Model Fitting End
NF: Scoring Data...


2025-09-11 13:01:50.681259: W tensorflow/core/util/util.cc:163] Not handling type DT_DOUBLE
2025-09-11 13:01:50.681766: W tensorflow/core/util/util.cc:163] Not handling type DT_DOUBLE
2025-09-11 13:01:50.682077: W tensorflow/core/util/util.cc:163] Not handling type DT_DOUBLE
2025-09-11 13:01:50.682446: W tensorflow/core/util/util.cc:163] Not handling type DT_DOUBLE
2025-09-11 13:01:50.682757: W tensorflow/core/util/util.cc:163] Not handling type DT_DOUBLE
2025-09-11 13:01:50.683101: W tensorflow/core/util/util.cc:163] Not handling type DT_DOUBLE
2025-09-11 13:01:50.683388: W tensorflow/core/util/util.cc:163] Not handling type DT_DOUBLE
2025-09-11 13:01:50.683742: W tensorflow/core/util/util.cc:163] Not handling type DT_DOUBLE
2025-09-11 13:01:50.692626: W tensorflow/core/util/util.cc:163] Not handling type DT_DOUBLE
2025-09-11 13:01:50.693024: W tensorflow/core/util/util.cc:163] Not handling type DT_DOUBLE
2025-09-11 13:01:50.693293: W tensorflow/core/util/util.cc:163] Not handling typ

NF: Scoring Data End
_score_scaler: NF threshold: 0.5522657211128531
NK: Model Fitting Start
NK: Eigenvalues: 14
NK: Perform decomposition...
iter 1


  detection_model = tr_detector.fit(X=features)
2025-09-11 13:01:51.242182: W tensorflow/core/util/util.cc:163] Not handling type DT_DOUBLE
2025-09-11 13:01:51.242408: W tensorflow/core/util/util.cc:163] Not handling type DT_DOUBLE
2025-09-11 13:01:51.251274: W tensorflow/core/util/util.cc:163] Not handling type DT_DOUBLE
2025-09-11 13:01:51.251452: W tensorflow/core/util/util.cc:163] Not handling type DT_DOUBLE


epoch 0, Loss: 0.737113
epoch 10, Loss: 0.454169
epoch 20, Loss: 0.370095
epoch 30, Loss: 0.323259
epoch 40, Loss: 0.289413
epoch 50, Loss: 0.262994
epoch 60, Loss: 0.243136


2025-09-11 13:01:51.499374: W tensorflow/core/util/util.cc:163] Not handling type DT_DOUBLE
2025-09-11 13:01:51.500796: W tensorflow/core/util/util.cc:163] Not handling type DT_DOUBLE
INFO: 2025-09-11 13:01:51 @ [AlgoLog] [13:01:51.505] [USER] [fit:Start]
INFO: 2025-09-11 13:01:51 @ [AlgoLog] [13:01:51.508] [USER] [fit:End]


NK: Dimension of intermediate representation: 28 x 20
NK: Dimension of neural representation: 28 x 20
Sigmoid factor =  3.016747762315387
Final analytic epsilon =  65.55111175790817
13:01:51.505
[AlgoLog] [ 13:01:51.505 ] [ USER ] [ fit : Start ]
NK: Dimension reduction of the training set
NK: Building the Ny kernel...
NK: Normalizing the kernel...
NK: Eigenvalue decomposition...
NK: Number of relevant eigenvalues 20
NK: Second eigenvalue = 0.16215918
NK: Time step = 1
13:01:51.508
[AlgoLog] [ 13:01:51.508 ] [ USER ] [ fit : End ]
NK: Size of extension: 16500 x 28
NK: Model Fitting End
NK: Scoring Data...
NK: Dimension reduction of the testing set
NK: Size of extension: 16500 x 28
NK: Scoring Data End
_score_scaler: NK threshold: 0.3381640744370118
Pg: Model Fitting Start
Pg: Model Fitting End
Pg: Scoring Data...
Pg: Embedding 16500 datapoints of dimension 20 in a kernel feature space of dimension 30
Pg: Pg dimensionality reduction to 11 dimensions
Pg: Scoring Data End
_score_scaler: P

  multiarray.copyto(a, fill_value, casting='unsafe')


GC: 268 elements were sampled
GC: Model Fitting End
GC: Scoring Data...
GC: Scoring Data End
_score_scaler: GC threshold: 8.90856634545413
GL: Model Fitting Start
GL: GC Sampling...


  multiarray.copyto(a, fill_value, casting='unsafe')


GL: GC Sampling End - 268 elements were sampled
GL: Model Fitting End
GL: Scoring Data...
GL: Scoring Data End


INFO: 2025-09-11 13:02:35 @ [AlgoLog] [13:02:35.852] [USER] [fit:Start]


_score_scaler: GL threshold: 0.795507109789128
 
End fit...
-------------------------------
 
Fuser: Algos to fuse: ['Ny', 'RL', 'NF', 'NK', 'Pg', 'HB', 'GC', 'GL']
Fuser: Number of anomalies:
Fu  262
Ny  475
RL  300
NF  217
NK  242
Pg  219
HB  190
GC  290
GL  516

Fused Score:
 Min.: 0.047, Med.: 0.068, Avg.: 0.096, Prcntl 90: 0.145, Prcntl 99: 0.557, Max.: 1.000

13:02:35.852
[AlgoLog] [ 13:02:35.852 ] [ USER ] [ fit : Start ]
Score threshold for rating: 0.399
Analytic epsilon =  0.07802533892595935


  fit_res = self.detector.fit(X_prepared, y_prepared)
INFO: 2025-09-11 13:02:36 @ [AlgoLog] [13:02:36.327] [USER] [fit:End]


13:02:36.327
[AlgoLog] [ 13:02:36.327 ] [ USER ] [ fit : End ]


INFO: 2025-09-11 13:02:44 @ Waiting up to 300 seconds for model version to finish creation. Model name: customer_monthly_ad, version 7


Training finished
üèÉ View run fun-gnat-602 at: https://mlflow:5000/#/experiments/0/runs/26f00c856a67467e8f22d191186720ee
üß™ View experiment at: https://mlflow:5000/#/experiments/0


In [12]:
context.close()