### Init Context

In [None]:
from thetaray.api.context import init_context
import datetime
import yaml

import logging
# Configure root logging at DEBUG level, emitting only the message text.
logging.basicConfig(format='%(message)s', level=logging.DEBUG)

# Load the Spark settings file and keep only the 'spark_config_a' profile.
# NOTE(review): yaml.FullLoader is kept as-is; presumably this file is trusted
# repository configuration -- confirm before pointing it at external input.
spark_config_path = '/thetaray/git/solutions/domains/demo_remittance/config/spark_config.yaml'
with open(spark_config_path) as spark_config_file:
    spark_config_profiles = yaml.load(spark_config_file, Loader=yaml.FullLoader)
spark_config = spark_config_profiles['spark_config_a']

# Build the context used by the rest of the notebook. The execution date is
# fixed to 1970-01-01 (presumably a placeholder -- confirm) and the Spark
# master is set to 'local[*]'.
context = init_context(
    execution_date=datetime.datetime(1970, 1, 1),
    spark_conf=spark_config,
    spark_master='local[*]',
)

### Imports

In [None]:
from thetaray.api.dataset import dataset_functions
from thetaray.api.evaluation import fit_on_worker
from thetaray.api.histograms import save_histograms
from thetaray.api.evaluation.preprocess.features_extractor import FeaturesExtractor
from thetaray.api.models import save_model
from thetaray.api.anomaly_detection import ThetaRayDetector
from thetaray.api.evaluation.preprocess.numeric_features import NumericFeaturesTransformer

import mlflow
from pyspark.sql import functions as f

from domains.demo_remittance.datasets.customer_monthly import customer_monthly_dataset as input_dataset
from domains.demo_remittance.evaluation_flows.ef import evaluation_flow as ef

### Load data

In [None]:
# Read the input dataset, order it by hash(customer_id) and then year_month,
# and keep at most 1,000,000 rows.
# NOTE(review): ordering by a hash presumably yields a stable pseudo-random
# subset of customers with each customer's months kept together -- confirm.
dataset_identifier = input_dataset().identifier
sort_keys = [f.hash('customer_id'), 'year_month']
data = (
    dataset_functions.read(context, dataset_identifier)
    .orderBy(sort_keys)
    .limit(1000000)
)

# Convert the capped result to pandas for the training cell below.
data_pd = data.toPandas()

### Model Training

In [None]:
# Feature names fed to the numeric transformer; the same list is passed to
# save_histograms at the end of this cell.
requested_features = [
    'multpl_tx_bl_lim',
    'vel_spike',
    'multi_party_actv',
    'hr_jurid_vol',
    'total_tx_amount',
    'avg_tx_amount',
]

# Feature extraction: a single numeric transformer using the 'constant'
# strategy with fill_value=0.0 (presumably applied to missing values --
# confirm against NumericFeaturesTransformer).
nft = NumericFeaturesTransformer(features=requested_features, strategy='constant', fill_value=0.0)
fu = FeaturesExtractor([nft])

# Detector configuration. The hyperparameter values are unchanged; their exact
# meaning is defined by ThetaRayDetector and is not visible in this notebook.
trd = ThetaRayDetector(algo_type=['Ny', 'RL', 'NF'],
                       learning_method=1,
                       normalization_type=1,
                       Fusion_threshold=0.3,
                       Rating_percentile=5.0,
                       set_zero_rating=1)

with mlflow.start_run(nested=True):
    # Resolve the first evaluation step once, before any training starts, so
    # that both save_model calls read their name/tags from the same flow object
    # and a broken flow definition fails before the fits are run.
    evaluation_step = ef().evaluation_steps[0]
    feature_model_spec = evaluation_step.feature_extraction_model
    detection_model_spec = evaluation_step.detection_model

    # Fit the feature extractor on the pandas data and save the fitted model
    # under the name/tags declared by the evaluation step.
    feature_extraction_model = fit_on_worker(fu.fit, X=data_pd)
    save_model(feature_model_spec.name, feature_extraction_model, tags=feature_model_spec.tags)

    # Fit the detector on the transformed features and save it the same way.
    detection_model = fit_on_worker(trd.fit, X=feature_extraction_model.transform(data_pd))
    save_model(detection_model_spec.name, detection_model, tags=detection_model_spec.tags)

    # Save histograms of the requested features computed from the same data.
    save_histograms(context, data_pd, requested_features)

In [None]:
# Close the context created in the first cell now that training is finished
# (presumably this releases the underlying Spark resources -- confirm).
context.close()