In [None]:
import datetime
import mlflow

from thetaray.api.evaluation.preprocess.numeric_features import NumericFeaturesTransformer
from thetaray.api.evaluation.preprocess.features_extractor import FeaturesExtractor
from thetaray.api.evaluation.preprocess.categorical_features import CategoricalFeaturesTransformer
from thetaray.api.context import init_context
from thetaray.api.sample import get_sample_percent_unlabeled
from thetaray.api.evaluation import fit_on_worker
from thetaray.api.models import save_model
from thetaray.api.dataset import dataset_functions
from thetaray.api.anomaly_detection import ThetaRayDetector
from thetaray.api.metric import CustomMetricPublisher
from thetaray.api.histograms import save_histograms

In [None]:
from thetaray.api.solution import EvaluationType

In [None]:
# Build the platform context and obtain its Spark session.
# The execution date is pinned to 1970-01-01.
# NOTE(review): presumably a placeholder for a scheduler-supplied date — confirm.
context = init_context(
    execution_date=datetime.datetime(year=1970, month=1, day=1),
)
spark = context.get_spark_session()

In [None]:
# Load the 'wrangling' dataset (with generate_pk=True), then replace NaN
# values with None and drop the 'tr_timestamp' column.
# NOTE(review): the result is presumably a Spark DataFrame (it is sampled and
# converted with toPandas() in a later cell) — confirm.
data = (
    dataset_functions.read(context, 'wrangling', generate_pk=True)
    .replace(float('nan'), None)
    .drop('tr_timestamp')
)

In [None]:
# Draw a sample from `data` for model fitting.
# NOTE(review): the second argument (20) is presumably a percentage, going by
# the helper's name — confirm against the thetaray API.
sample = get_sample_percent_unlabeled(data, 20)
# Materialize the sample as a pandas DataFrame; `sample` itself is kept because
# it is passed to save_histograms in the training cell.
sample_pd = sample.toPandas()

In [None]:
# Input frame for fitting: the pandas sample without the 'tr_pk' column.
# NOTE(review): 'tr_pk' is presumably the key added by generate_pk=True when
# the dataset was read — confirm.
X = sample_pd.drop('tr_pk', axis='columns')

In [None]:
# Column names handed to the numeric transformer and to save_histograms.
# Laid out as: base columns, six (min, max, mean) triples, then 'has_card'.
features_list = [
    'amount', 'duration', 'payments', 'birth_number',
    'min1', 'max1', 'mean1',
    'min2', 'max2', 'mean2',
    'min3', 'max3', 'mean3',
    'min4', 'max4', 'mean4',
    'min5', 'max5', 'mean5',
    'min6', 'max6', 'mean6',
    'has_card',
]
# Numeric transformer over features_list, configured with strategy='mean'
# and no explicit fill value.
# NOTE(review): 'mean' presumably selects mean imputation of missing values —
# confirm against the thetaray API.
nft = NumericFeaturesTransformer(
    features=features_list,
    strategy='mean',
    fill_value=None,
)

In [None]:
# Feature extractor combining the numeric transformer with a categorical
# transformer over 'frequency', 'type_disp' and 'type_card'; the categorical
# transformer is given no mapping, strategy or fill value.
fu = FeaturesExtractor(
    [
        nft,
        CategoricalFeaturesTransformer(
            features=['frequency', 'type_disp', 'type_card'],
            mapping=None,
            strategy=None,
            fill_value=None,
        ),
    ]
)

In [None]:
# Fit and save the feature-extraction and detection models inside a single
# MLflow run opened with nested=True.
with mlflow.start_run(nested=True):
    # Log the evaluation type as the 'model_type' param through a
    # 'tr_algo_train' metric publisher with both MLflow and ES publishing
    # enabled (ES is presumably Elasticsearch — confirm).
    CustomMetricPublisher(
        execution_date=context.execution_date, metric_type='tr_algo_train', publish_to_mlflow=True, publish_to_es=True
    ).log_param('model_type', EvaluationType.THETARAY_ANALYSIS)
    # Fit the feature extractor on the pandas sample via fit_on_worker and
    # save the fitted model under 'tr_feature_extraction_model'.
    feature_extraction_model = fit_on_worker(fu.fit, X=X)
    save_model('tr_feature_extraction_model', feature_extraction_model, tags={"version": "release"})
    # Fit the detector on the transformed sample. With capture_stdout=True the
    # result is unpacked into three values; captured_stdout / captured_stderr
    # are not used in the cells visible here.
    # NOTE(review): algo_type=['GC'] and Fusion_threshold=0.45 are unexplained
    # tuning values — document their meaning and source.
    detection_model, captured_stdout, captured_stderr = fit_on_worker(ThetaRayDetector(algo_type=['GC'], Fusion_threshold=0.45).fit, X=feature_extraction_model.transform(X), capture_stdout=True)
    save_model('tr_detection_model', detection_model, tags={"version": "release"})
    # Histograms receive `sample` (not the pandas copy) and only the numeric
    # feature names; the categorical columns are not passed.
    save_histograms(context, sample, features_list)

In [None]:
# Workaround: explicitly stop the Spark session so the Spark job finishes.
# spark-joblib relies on a buggy pyspark API (expected to be fixed in Spark 3).
spark.stop()