### Init Context

In [2]:
from thetaray.api.context import init_context
import datetime
import yaml

import logging
# Configure root logging at DEBUG level with a message-only format.
logging.basicConfig(format='%(message)s', level=logging.DEBUG)

# Parse the domain's Spark configuration file and keep only the
# 'spark_config_a' entry; that entry is what gets handed to init_context.
spark_config_path = '/thetaray/git/solutions/domains/demo_ret_smb/config/spark_config.yaml'
with open(spark_config_path) as spark_config_file:
    spark_config_profiles = yaml.load(spark_config_file, Loader=yaml.FullLoader)
spark_config = spark_config_profiles['spark_config_a']

# Create the notebook's context on a local Spark master, using the Unix epoch
# (1970-01-01) as the execution date.
context = init_context(
    execution_date=datetime.datetime(1970, 1, 1),
    spark_master='local[*]',
    spark_conf=spark_config,
)

2025-05-29 12:32:49,124:INFO:thetaray.common.logging:start loading solution.....[ load_risks=True , solution_path=/thetaray/git/solutions/domains , settings_path=/thetaray/git/solutions/settings ]
2025-05-29 12:32:49,217:INFO:thetaray.common.logging:load_risks took: 0.04812264442443848
2025-05-29 12:32:49,219:INFO:thetaray.common.logging:Skip _connectors validation due to no connectors provided.
2025-05-29 12:32:49,557:INFO:thetaray.common.logging:=== Started updating schema ===
2025-05-29 12:32:49,628:INFO:thetaray.common.logging:=== Started updating schema on Postgres ===
2025-05-29 12:32:52,217:INFO:thetaray.common.logging:found 43 tables in solution public schema
2025-05-29 12:32:52,220:INFO:thetaray.common.logging:demo_ret_smb_ef
2025-05-29 12:32:52,228:INFO:thetaray.common.logging:found 43 tables in solution public schema
2025-05-29 12:32:52,229:INFO:thetaray.common.logging:demo_ret_indiv_ef
2025-05-29 12:32:52,233:INFO:thetaray.common.logging:=== Finished updating schema ===
202

### Imports

In [3]:
from thetaray.api.dataset import dataset_functions
from thetaray.api.evaluation import fit_on_worker
from thetaray.api.histograms import save_histograms
from thetaray.api.evaluation.preprocess.features_extractor import FeaturesExtractor
from thetaray.api.models import save_model
from thetaray.api.anomaly_detection import ThetaRayDetector
from thetaray.api.evaluation.preprocess.numeric_features import NumericFeaturesTransformer

import mlflow
from pyspark.sql import functions as f

from domains.demo_ret_smb.datasets.customer_monthly import customer_monthly_dataset as input_dataset
from domains.demo_ret_smb.evaluation_flows.ef import evaluation_flow as ef

### Load data

In [None]:
# Read the input dataset, order it by hash(customer_id) and then year_month,
# and keep at most the first 1,000,000 rows of that ordering.
data = (
    dataset_functions.read(context, input_dataset().identifier)
    .orderBy([f.hash('customer_id'), 'year_month'])
    .limit(1000000)
)

# Collect the capped sample into pandas; the training cell fits on data_pd.
data_pd = data.toPandas()

### Model Training

In [None]:
# Feature columns used for feature extraction; the same list is passed to
# save_histograms at the end of this cell.
requested_features = [
    'pipe_accnt_behv',
    'tax_heaven_jurisd',
    'spike_of_trx',
    'many_to_one',
    'one_to_many',
    'avg_tx_amount_monthly',
    'pct_domestic_transactions',
    'atm_withdrawal_ratio',
]

# Numeric transformer over the requested features, configured with
# strategy='constant' and fill_value=0.0.
# NOTE(review): presumably this fills missing values with 0.0 -- confirm
# against the NumericFeaturesTransformer documentation.
nft = NumericFeaturesTransformer(features=requested_features, strategy='constant', fill_value=0.0)
fu = FeaturesExtractor([nft])

# NOTE(review): the detector settings below are passed through unchanged; their
# meaning is defined by ThetaRayDetector -- confirm before tuning any of them.
trd = ThetaRayDetector(algo_type=['Ny', 'RL', 'NF'],
                       learning_method=1,
                       normalization_type=1,
                       Fusion_threshold=0.5,
                       Rating_percentile=5.0,
                       set_zero_rating=1)

# Resolve the first evaluation step once, before any fitting starts. Its
# feature-extraction and detection model entries provide the names and tags the
# fitted models are saved under, so a misconfigured evaluation flow fails here
# rather than after the fit calls have already run.
evaluation_step = ef().evaluation_steps[0]
feature_extraction_spec = evaluation_step.feature_extraction_model
detection_spec = evaluation_step.detection_model

with mlflow.start_run(nested=True):
    # Fit the feature extractor on the pandas sample and save it.
    feature_extraction_model = fit_on_worker(fu.fit, X=data_pd)
    save_model(feature_extraction_spec.name,
               feature_extraction_model,
               tags=feature_extraction_spec.tags)

    # Fit the detector on the transformed features and save it.
    detection_model = fit_on_worker(trd.fit, X=feature_extraction_model.transform(data_pd))
    save_model(detection_spec.name,
               detection_model,
               tags=detection_spec.tags)

    # Save histograms of the requested features computed from the same sample.
    save_histograms(context, data_pd, requested_features)

In [None]:
# Close the context that was created by init_context in the first cell.
context.close()