In [2]:
import datetime
from dateutil.relativedelta import relativedelta
import json
import logging
from pyspark.sql import DataFrame, Window, functions as f
from pyspark.sql import SQLContext
from pyspark.sql.types import LongType
import yaml

from common.libs import dates as dates_lib
from common.libs import features_discovery
from common.libs.features_executor import FeaturesExecutor
from common.libs.feature_engineering import max_look_back_monthly_features, max_look_back_daily_weekly_features
from common.libs.zscore import enrich_with_z_score
from common.factory.wrangling_execution_strategy import get_wrangling_execution_strategy
from common.factory.eval_flow_definition import get_evaluation_flow_definition
from common.factory.domain_definition import get_domain_definition
from common.notebook_utils.wrangling.wrangling_execution_strategy import WranglingExecutionStrategy
from common.definitions.domain import DomainDefinition
from common.definitions.eval_flow import EvaluationFlowDefinition
from common.libs.context_utils import get_dataset

from thetaray.api.context import init_context
from thetaray.api.dataset import dataset_functions
from thetaray.api.solution import IngestionMode
from thetaray.common import Constants
from thetaray.common.data_environment import DataEnvironment

logging.getLogger().handlers[0].setFormatter(logging.Formatter(fmt='%(levelname)s: %(asctime)s @ %(message)s',datefmt='%Y-%m-%d %H:%M:%S'))
logging.basicConfig(level=logging.INFO)

import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)


from thetaray.api.context import init_context
import datetime
from thetaray.common import Constants

from common.libs.config.loader import load_config
from common.libs.config.basic_execution_config_loader import BasicExecutionConfig, DevBasicExecutionConfig
from common.libs.context_utils import is_run_triggered_from_airflow



with open('/thetaray/git/solutions/domains/demo_fuib/config/spark_config.yaml') as spark_config_file:
    spark_config = yaml.load(spark_config_file, yaml.FullLoader)['spark_config_a']

execution_date=datetime.datetime(1970, 1, 1)

context = init_context(domain='demo_fuib',
                       execution_date=execution_date,
                       spark_conf=spark_config,
                       spark_master='local[*]',
                       allow_type_changes=True)

spark = context.get_spark_session()
sc = SQLContext(spark)
params = context.parameters
print(f"Spark UI URL: {context.get_spark_ui_url()}")

print(json.dumps(params, indent=4))

2025-09-09 08:52:17,320:INFO:thetaray.common.logging:start loading solution.....[ load_risks=True , solution_path=/thetaray/git/solutions/domains , settings_path=/thetaray/git/solutions/settings ]
2025-09-09 08:52:17,977:INFO:thetaray.common.logging:load_risks took: 0.13797879219055176
INFO: 2025-09-09 08:52:18 @ === Started updating schema ===
INFO: 2025-09-09 08:52:18 @ === Started updating schema on Postgres ===
INFO: 2025-09-09 08:52:29 @ found 209 tables in solution public schema
INFO: 2025-09-09 08:52:29 @ found 209 tables in solution public schema
INFO: 2025-09-09 08:52:29 @ found 209 tables in solution public schema
INFO: 2025-09-09 08:52:29 @ found 209 tables in solution public schema
INFO: 2025-09-09 08:52:29 @ found 209 tables in solution public schema
INFO: 2025-09-09 08:52:29 @ found 209 tables in solution public schema
INFO: 2025-09-09 08:52:29 @ found 209 tables in solution public schema
INFO: 2025-09-09 08:52:29 @ found 209 tables in solution public schema
INFO: 2025-09

Added `alias` successfully.


INFO: 2025-09-09 08:52:55 @ === Finished updating schema for Evaluation Flows on Minio ===


Added `alias` successfully.
Spark UI URL: https://jupyterhub-platform-thetalab.sonar.thetaray.cloud/user/andre.gutnik/proxy/4040/jobs/
{}




In [3]:
# logging.getLogger().handlers[0].setFormatter(logging.Formatter(fmt='%(levelname)s: %(asctime)s @ %(message)s',datefmt='%Y-%m-%d %H:%M:%S'))
# logging.basicConfig(level=logging.INFO)

# spark_conf = {'spark.authenticate': False,
#               'spark.network.crypto.enabled': False,
#               "spark.sql.repl.eagerEval.enabled": True,
#               "spark.sql.autoBroadcastJoinThreshold": -1,
#               "spark.executor.memory": "16g",
#               "spark.driver.memory": "4g",
#               "spark.executor.cores": "2",
#               "spark.executor.instances": "1",
#               "spark.sql.adaptive.enabled": "true",
#               "spark.dynamicAllocation.enabled": "true",
#               "spark.dynamicAllocation.initialExecutors": "1",
#               "spark.dynamicAllocation.maxExecutors": "10",
#               "spark.dynamicAllocation.minExecutors": "0",
#               "spark.dynamicAllocation.shuffleTracking.enabled": "true"
#               }
# execution_date = datetime.datetime(1970,1,1)
# context = init_context(execution_date=execution_date, spark_conf=spark_conf)
# spark = context.get_spark_session()
# sc = SQLContext(spark)

2025-09-09 08:14:21,627:INFO:thetaray.common.logging:start loading solution.....[ load_risks=True , solution_path=/thetaray/git/solutions/domains , settings_path=/thetaray/git/solutions/settings ]
2025-09-09 08:14:22,235:INFO:thetaray.common.logging:load_risks took: 0.12721943855285645
INFO: 2025-09-09 08:14:22 @ === Started updating schema ===


# 1. Config
## 1.1 Datasets and columns

In [3]:
EVALUATION_FLOW = 'cust_month_eval' # evaluation flow name
features_dataset_identifier = 'customer_monthly' # name of the aggregated dataset (output of wrangling)
transactions_dataset_identifier = 'trx_enriched' # name of the transaction dataset

date_col = 'year_month' # date column name in the aggregated datset (usually "year_month")

date_analysis_col = 'delivery_timestamp' # date column name in the transaction dataset 


investigated_entities = ['customer_id'] # invetigated entity column name

algo_step = 'algo'
# algo_step = 'tr_evaluation'

aggregation_level = 'month'
fusion_threshold = 0.5

data_env = DataEnvironment.PUBLIC

## 1.2 Features do be analyzed
There are two options for setting the analyzed features:

1- Manually creating a list:

In [4]:
analysis_features = ['one_to_many', 
                       'sum_out_trx', 
                       'cnt_trx_cash', 
                       'sum_trx_cash',
                       'cnt_trx_n_day', 
                       'z_score_cnt_trx',
                       'sum_trx', 
                       'sum_hghrsk_cntry', 
                       'many_to_one', 
                       'max_trx', 
                       'sum_pipe_customer', 
                       # 'sum_in_trx',
                       'cp_concentration', 
                       'z_score_sum_hghrsk_cntry',
                       'sum_new_account', 
                       # 'z_score_sum_trx', 
                       'cnt_trx',
                       'sum_trx_fop',
                       'cnt_trx_fop',
                       'cnt_dstnct_fop']

print("Number of analyzed features: ", len(analysis_features))

Number of analyzed features:  18


## 1.3 Forensic Fields
List of forensic fields to be added to the alert report. It is important to make sure these fields exist in the input dataset of the evaluation flow.

In [5]:
forensic_fields = analysis_features + ['sum_trx', 'cnt_trx']

# 2. Load Functions

In [6]:
def date_agg(df,date_col,agg_level):
    '''format date aggregaition column to dataframe from existing date column.
    input:  spark Dataframe,
            name of column to convert to date column
            agg_level - 'month', 'week' or 'day' for monthly, weekly or daily aggregation
    return: New spark DataFrame
    '''
    if agg_level == "month":
        if 'year_month' not in df.columns:
            df = df.withColumn('year_month', f.concat(f.year(f.col(date_col)), f.lit('_'), f.lpad(f.month(f.col(date_col)), 2, '0')))
        return df, 'year_month'
    elif agg_level == "week":
        if 'week_date' not in df.columns:
            df = df.withColumn("week", f.date_trunc('week', f.col(date_col))).withColumn("week_date",f.concat(f.year(f.col("week")), f.lit('-'), f.lpad(f.month(f.col("week")),2,'0'), f.lit('-'), f.lpad(f.dayofmonth(f.col("week")),2,'0')))
            return df, "week_date"
    elif agg_level == "day":
        return df, date_col
    return df, ""

def create_alert_report(evaluated_activities):
    alert_report = evaluated_activities.filter(f.col(algo_step+"_score")>0.5).toPandas()
    nums = [1,2,3,4,5]
    cols_tf = []
    len_eval_col = len(algo_step)+1

    for num in nums:
        alert_report["TF_"+str(num)] = None
        alert_report["TF_"+str(num)+"_value"] = None
        alert_report["TF_"+str(num)+"_rating"] = None
        cols_tf += ["TF_"+str(num),"TF_"+str(num)+"_value","TF_"+str(num)+"_rating"]

    for cols in analysis_features:
        cols = algo_step + '_'+ cols + "_rank" 
        for num in nums:
            alert_report.loc[alert_report[cols]==num,'TF_'+str(num)] = cols[len_eval_col:-5]
            alert_report.loc[alert_report[cols]==num,'TF_'+str(num)+"_value"] = alert_report[cols[len_eval_col:-5]]
            alert_report.loc[alert_report[cols]==num,"TF_"+str(num)+"_rating"] = alert_report[algo_step+"_"+cols[len_eval_col:-4]+"rating"]/10000
            
    alert_report = alert_report[investigated_entities + [date_agg_level,algo_step+"_score"] + cols_tf + forensic_fields]
    alert_report = alert_report.reset_index()
    return alert_report

def create_alert_overview(evaluated_activities):
    dates = evaluated_activities.select(f.min(date_col).alias('min_date'), f.max(date_col).alias('max_date')).withColumn('diff',f.round(f.months_between(f.col('max_date'),f.col('min_date')))).collect()[0]
    min_date = dates['min_date']
    max_date = dates['max_date']
    number_of_months = int(dates['diff'])
    number_of_anomalies = evaluated_activities.filter(f.col(algo_step+"_score") > fusion_threshold).count()
    number_of_investigated = evaluated_activities.count()
    anomaly_percentage = (number_of_anomalies/number_of_investigated)*100
    number_of_features = len(analysis_features)
    anomaly_overview = pd.DataFrame({'min_date': [min_date], 'max_date': [max_date],'number_of_months':[number_of_months], 'number of anomalies': [number_of_anomalies],
     'number of investigated entities': [number_of_investigated], 'anomaly percentage':[anomaly_percentage],
    'number of features': [number_of_features]})
    return anomaly_overview

def create_alert_overview_per_month(evaluated_activities):
    overview_per_month = evaluated_activities.withColumn('anomaly_ind', f.when((f.col(algo_step+"_score") > fusion_threshold), f.lit(1)).otherwise(f.lit(0)))
    overview_per_month = overview_per_month.groupBy(date_col).agg(f.sum('anomaly_ind').alias('number of anomalies'), f.count(investigated_entities[0]).alias('number of investigated entities')).orderBy(date_col, ascending=False)
    overview_per_month = overview_per_month.withColumn('anomaly percentage', f.round( (f.col('number of anomalies')/f.col('number of investigated entities'))*100, 2))
#    overview_per_month = overview_per_month.toPandas()
    return overview_per_month

import numpy as np
## Calculates bins according to col_name column, and counts TF_col for each bin
def bin_counts(alert_report,col_name=algo_step+'_score',bins=np.arange(0.5,1.05,0.05),TF_col='TF_1'):
    bin_col_name = f'{col_name}_binning'
    alert_report[bin_col_name] = pd.cut(alert_report[col_name],bins)
    start=True
    for name, group in alert_report.groupby(by=[bin_col_name]):
        if group.shape[0]>0:
            if start:
                res = group.groupby(TF_col).agg({TF_col:'count'}).T
                res[bin_col_name] = name
                start=False
            else:
                tmp_res = group.groupby(TF_col).agg({TF_col:'count'}).T
                tmp_res[bin_col_name] = name
                res = res.append(tmp_res)
    res = res.set_index(bin_col_name).fillna(0)
    res['total transactions']=res.sum(axis=1)
    res.loc['total transactions'] = res.sum(axis=0)
    res = res.T
    return res

# highlight cells with value>0
def highlight_cells(val, color):
    if val>0:
        return 'background-color: {}'.format(color)
def highlight_column(s):
    """
    Highlight the entire column with a color.
    """
    return ['background-color: #C6E2E9' for _ in s]


def tf_eval_score(data, tf_col):
    decimals = 0
    res = bin_counts(data,TF_col=tf_col)#.sort_values(by='total transactions', ascending=False)
    res = res.reset_index()
    res = res[res[tf_col] != 'total transactions'].set_index(tf_col)
    res = res.rename(columns={"total transactions": "total anomalies"})
    res = res.apply(lambda x: round(x, decimals)) # .reset_index().drop(9).set_index('TF_1')
    return res

def tf_rating(data, tf_col):
    decimals = 0
    res_rating = bin_counts(alert_report,col_name=f'{tf_col}_rating',bins=np.arange(0,1.05,0.05),TF_col=tf_col).sort_values(by='total transactions', ascending=False)
    res_rating.style.applymap(highlight_cells, color='#C6E2E9')
    # TODO: total anomalies
    res_rating = res_rating.reset_index()
    res_rating = res_rating[res_rating[tf_col] != 'total transactions'].set_index(tf_col)
    res_rating = res_rating.apply(lambda x: round(x, decimals))
    res_rating = res_rating.rename(columns={"total transactions": "total anomalies"})

    res_rating.style.apply(highlight_column, subset=['total anomalies'])
    return res_rating


# 3. Data preparation
## 3.1 Reading datasets

In [7]:
# evaluated_activities = load_evaluated_activities(context, EVALUATION_FLOW, data_environment=data_env)
# evaluated_activities, date_agg_level = date_agg(evaluated_activities, date_col, aggregation_level)

# trx_df = read(context, transactions_dataset_identifier, from_job_ts=Constants.BEGINNING_OF_TIME, data_environment=data_env)
agg_df = DataFrame = dataset_functions.read(context,'customer_monthly', from_job_ts=Constants.BEGINNING_OF_TIME, data_environment=data_env)
agg_df.count()
# evaluated_dataset_start = evaluated_activities.select(f.min(date_col)).collect()[0][0].strftime("%Y/%m/%d")
# evaluated_dataset_end = evaluated_activities.select(f.max(date_col)).collect()[0][0].strftime("%Y/%m/%d")

# print("Number of rows in evaluated dataset: ", evaluated_activities.count())
# print(f"Period in evaluated dataset - from {evaluated_dataset_start} until {evaluated_dataset_end}")

                                                                                

18000

## 4. Anomaly Overview
### 4.1 General Information - alerts

In [None]:
anomaly_overview = create_alert_overview(evaluated_activities)
anomaly_overview

### 4.2 General Information per date agg level - alerts

In [None]:
anomaly_overview_per_month = create_alert_overview_per_month(evaluated_activities).toPandas()
anomaly_overview_per_month

### 4.3 General Information - training set

In [None]:
train_start_date = "2023-04-01"
train_end_date = "2023-12-01"

In [None]:
training_trx_data = trx_df.filter(f.col(date_analysis_col)>=train_start_date).filter(f.col(date_analysis_col)<train_end_date)
training_agg_data = agg_df.filter(f.col(date_col)>=train_start_date).filter(f.col(date_col)<train_end_date)
number_of_trx = training_trx_data.count()
number_of_agg = training_agg_data .count()

train_overview = pd.DataFrame({'train_start_date': [train_start_date], 'train_end_date': [train_end_date],'number of transactions':[number_of_trx], 'number of rows in agg dataset': [number_of_agg ]})
train_overview

## 5. Alert Report

In [8]:
from thetaray.api.evaluation import evaluate
evaluated_df = evaluate(context,'cust_month_ef', agg_df.filter(f.col('year_month')=='2025-06-01'), data_environment=DataEnvironment.PUBLIC)
evaluated_df.count()

INFO: 2025-09-09 08:53:00 @ Applying backwards compatibility patches
2025-09-09 08:53:00.375841: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-09-09 08:53:00.377639: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-09-09 08:53:00.381010: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-09-09 08:53:00.389427: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1757407980.404372   15081 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

INFO: 2025-09-09 08:53:03 @ Rolling back backwards compatibility patches
INFO: 2025-09-09 08:53:03 @ Rolled back backwards compatibility patches
INFO: 2025-09-09 08:53:03 @ Applying backwards compatibility patches
INFO: 2025-09-09 08:53:03 @ Applied backwards compatibility patches


Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

INFO: 2025-09-09 08:53:03 @ Rolling back backwards compatibility patches
INFO: 2025-09-09 08:53:03 @ Rolled back backwards compatibility patches
2025-09-09 08:53:08,354:INFO:thetaray.common.logging:Applying backwards compatibility patches
2025-09-09 08:53:08,362:INFO:thetaray.common.logging:Applying backwards compatibility patches
2025-09-09 08:53:08,373:INFO:thetaray.common.logging:Applying backwards compatibility patches
2025-09-09 08:53:08,388:INFO:thetaray.common.logging:Applying backwards compatibility patches
2025-09-09 08:53:08.683282: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-09-09 08:53:08.685940: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors 

1500

In [9]:
evaluated_df, date_agg_level = date_agg(evaluated_df, date_col, aggregation_level)

In [10]:
alert_report = create_alert_report(evaluated_df)

2025-09-09 08:53:13,307:INFO:thetaray.common.logging:Applying backwards compatibility patches
2025-09-09 08:53:13,308:INFO:thetaray.common.logging:Applying backwards compatibility patches
2025-09-09 08:53:13,309:INFO:thetaray.common.logging:Applying backwards compatibility patches
2025-09-09 08:53:13,309:INFO:thetaray.common.logging:Applying backwards compatibility patches
2025-09-09 08:53:13,313:INFO:thetaray.common.logging:Applied backwards compatibility patches
2025-09-09 08:53:13,313:INFO:thetaray.common.logging:Rolling back backwards compatibility patches
2025-09-09 08:53:13,313:INFO:thetaray.common.logging:Rolled back backwards compatibility patches
2025-09-09 08:53:13,314:INFO:thetaray.common.logging:Applied backwards compatibility patches
2025-09-09 08:53:13,314:INFO:thetaray.common.logging:Applied backwards compatibility patches
2025-09-09 08:53:13,314:INFO:thetaray.common.logging:Rolling back backwards compatibility patches
2025-09-09 08:53:13,314:INFO:thetaray.common.logging

In [11]:
feature_mapping = {
                    'one_to_many': "One to Many", 
                    'sum_out_trx': "Outgoing Value",
                    'cnt_trx_cash': "Cash Volume", 
                    'sum_trx_cash': "Cash Value",
                    'cnt_trx_n_day': "Rapid Movement of Funds", 
                    'z_score_cnt_trx': "Total Volume History",
                    'sum_trx': "Total Value", 
                    'sum_hghrsk_cntry': "Risky Countries Value", 
                    'many_to_one': "Many to One", 
                    'max_trx': "Transaction Spike", 
                    'sum_pipe_customer': "Pipe Customer", 
                    # 'sum_in_trx': "Incoming Value",
                    'cp_concentration': "Counterparty Concentration Value", 
                    'z_score_sum_hghrsk_cntry': "Risky Countries Value Histoy",
                    'sum_new_account': "New Account Value", 
#                   'z_score_sum_trx' : "Total Volue History",
                    'cnt_trx': "Total Volume",
                    'sum_trx_fop': "FOP Value",
                    'cnt_trx_fop': "FOP Volume",
                    'cnt_dstnct_fop': "FOP Counterparties"
}

alert_report["TF_1"].replace(feature_mapping, inplace=True)
alert_report["TF_2"].replace(feature_mapping, inplace=True)
alert_report["TF_3"].replace(feature_mapping, inplace=True)
alert_report["TF_4"].replace(feature_mapping, inplace=True)
alert_report["TF_5"].replace(feature_mapping, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  alert_report["TF_1"].replace(feature_mapping, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  alert_report["TF_2"].replace(feature_mapping, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on w

In [None]:
alert_report

In [12]:
insights= dataset_functions.read(context,'customer_insights_fuib', data_environment=DataEnvironment.PUBLIC)
insights.count()

1500

In [13]:
insights_pd = insights.select('customer_id', 'customer_age', 'customer_country').toPandas()

In [14]:
final_report = pd.merge(
    alert_report,
    insights_pd,
    on="customer_id",
    how="left"
)


In [15]:
final_report.shape

(458, 41)

In [16]:
with pd.ExcelWriter('fuib_alert_report.xlsx') as writer:  
    final_report.to_excel(writer, sheet_name='fuib_alert_report')

  with pd.ExcelWriter('fuib_alert_report.xlsx') as writer:


## 6. Top 3 Trigger Features Analysis

- TF_1: how many times the features occupies the position of Trigger Feature 1
- TF_2: how many times the features occupies the position of Trigger Feature 2
- TF_3: how many times the features occupies the position of Trigger Feature 3
- Avg_algo_score: The average of the algo score for all alerts in which that specific feature occupied the Top 3 Trigger Features
- Avg_tf_value: The average of the Feature value for all alerts in which that specific feature occupied the Top 3 Trigger Features
- Avg_tf_rating The average of the Trigger Feature Rating for all alerts in which that specific feature occupied the Top 3 Trigger Features

In [None]:
evaluated_activities

In [None]:
(alert_report
 .groupby(by='TF_1')
 .agg(number_of_alerts=('index','count'),
      sum_algo_Score=(algo_step+'_score','sum'),
      sum_tf_rating=('TF_1_rating','sum'),
      avg_sum_trx=('sum_trx','mean'),
      min_sum_trx=('sum_trx','min'),
      max_sum_trx=('sum_trx','max'),
      avg_cnt_trx=('cnt_trx','mean'),
      min_cnt_trx=('cnt_trx','min'),
      max_cnt_trx=('cnt_trx','max'))
 .reset_index()
 .rename(columns={'TF_1':'trigger_feature'}))

In [15]:
tf1 = alert_report.groupby(by='TF_1').agg(number_of_alerts=('index','count'), sum_algo_Score=(algo_step+'_score','sum'), sum_tf_rating=('TF_1_rating','sum'), avg_sum_trx=('sum_trx','mean'), min_sum_trx=('sum_trx','min'), max_sum_trx=('sum_trx','max'), avg_cnt_trx=('cnt_trx','mean'), min_cnt_trx=('cnt_trx','min'), max_cnt_trx=('cnt_trx','max')).reset_index().rename(columns={'TF_1':'trigger_feature'})
tf2 = alert_report.groupby(by='TF_2').agg(number_of_alerts=('index','count'), sum_algo_Score=(algo_step+'_score','sum'), sum_tf_rating=('TF_2_rating','sum'), avg_sum_trx=('sum_trx','mean'), min_sum_trx=('sum_trx','min'), max_sum_trx=('sum_trx','max'), avg_cnt_trx=('cnt_trx','mean'), min_cnt_trx=('cnt_trx','min'), max_cnt_trx=('cnt_trx','max')).reset_index().rename(columns={'TF_2':'trigger_feature'})
tf3 = alert_report.groupby(by='TF_3').agg(number_of_alerts=('index','count'), sum_algo_Score=(algo_step+'_score','sum'), sum_tf_rating=('TF_3_rating','sum'), avg_sum_trx=('sum_trx','mean'), min_sum_trx=('sum_trx','min'), max_sum_trx=('sum_trx','max'), avg_cnt_trx=('cnt_trx','mean'), min_cnt_trx=('cnt_trx','min'), max_cnt_trx=('cnt_trx','max')).reset_index().rename(columns={'TF_3':'trigger_feature'})
all_tfs = pd.concat([tf1,tf2,tf3]).sort_values('number_of_alerts', axis=0, ascending=False, na_position='last', ignore_index=True, key=None)

all_tfs_hist = all_tfs.groupby(by='trigger_feature').agg(number_of_alerts=('number_of_alerts','sum'),algo_Score=('sum_algo_Score','sum'), sum_tf_rating=('sum_tf_rating','sum'), min_sum_trx=('min_sum_trx','min'),  max_sum_trx=('max_sum_trx','max'),  avg_sum_trx=('avg_sum_trx','mean'), min_cnt_trx=('min_cnt_trx','min'),  max_cnt_trx=('max_cnt_trx','max'),  avg_cnt_trx=('avg_cnt_trx','mean')).sort_values(by='number_of_alerts',ascending=False).reset_index()
all_tfs_hist['% out of all alerts'] = round((100*all_tfs_hist['number_of_alerts']/alert_report.shape[0]),2)
all_tfs_hist['avg_algo_score'] = round((all_tfs_hist['algo_Score']/all_tfs_hist['number_of_alerts']),2)
#all_tfs_hist['avg_tf_value'] = round((all_tfs_hist['sum_tf_value']/all_tfs_hist['number_of_alerts']),2)
all_tfs_hist['avg_tf_rating'] = round((all_tfs_hist['sum_tf_rating']/all_tfs_hist['number_of_alerts']),2)

all_tfs_hist = all_tfs_hist.merge(tf1[['trigger_feature','number_of_alerts']].rename(columns={'number_of_alerts':'#TF_1'}),on='trigger_feature', how='left')
all_tfs_hist = all_tfs_hist.merge(tf2[['trigger_feature','number_of_alerts']].rename(columns={'number_of_alerts':'#TF_2'}),on='trigger_feature', how='left')
all_tfs_hist = all_tfs_hist.merge(tf3[['trigger_feature','number_of_alerts']].rename(columns={'number_of_alerts':'#TF_3'}),on='trigger_feature', how='left')

all_tfs_hist = all_tfs_hist[['trigger_feature', 'number_of_alerts', '% out of all alerts', '#TF_1', '#TF_2', '#TF_3', 'avg_algo_score', 'avg_tf_rating', 'min_sum_trx', 'max_sum_trx', 'avg_sum_trx', 'min_cnt_trx', 'max_cnt_trx', 'avg_cnt_trx']] 
all_tfs_hist

KeyError: "Column(s) ['sum_trx'] do not exist"

In [16]:
all_tf_1 = alert_report['TF_1'].values.tolist()
all_tf_2 = alert_report['TF_2'].values.tolist()
all_tf_3 = alert_report['TF_3'].values.tolist()
all_top_3 = all_tfs_hist['trigger_feature'].values.tolist()

not_tf_1 = [x for x in analysis_features if x not in all_tf_1]
not_tf_2 = [x for x in analysis_features if x not in all_tf_2]
not_tf_3 = [x for x in analysis_features if x not in all_tf_3]
not_top_3 = [x for x in analysis_features if x not in all_top_3]

print('Number of features that never appeared as TF1: ', len(not_tf_1))
print(not_tf_1,'\n')
    
print('Number of features that never appeared as TF2: ', len(not_tf_2))
print(not_tf_2,'\n')
    
print('Number of features that never appeared as TF3: ', len(not_tf_3))
print(not_tf_3,'\n')

print('Number of features that never appeared as TF1, TF2 or TF3 (pattern): ', len(not_top_3))
print(not_top_3,'\n')

NameError: name 'all_tfs_hist' is not defined

In [None]:
# dictionary of lists 
d_not_pattern = {'Features that never appeared as TF1': pd.Series(not_tf_1), 'Features that never appeared as TF2': pd.Series(not_tf_2), 'Features that never appeared as TF3': pd.Series(not_tf_3), 'Features that never appeared as  TF1, TF2 or TF3 (pattern)': pd.Series(not_top_3)} 
    
df_not_pattern = pd.DataFrame.from_dict(d_not_pattern, orient='index').T
    
df_not_pattern

## 7. Pattern Analysis
Get the total number of patterns and the frequency of each pattern in a descending order

In [None]:
pattern_count = evaluated_activities.groupby(algo_step+'_pattern').count().count()
print('Total Number Of Patterns: ', pattern_count)

In [None]:
patterns = alert_report.groupby(['TF_1','TF_2','TF_3']).agg(count =('index','count'), avg_sum_trx=('sum_trx','mean'), min_sum_trx=('sum_trx','min'), max_sum_trx=('sum_trx','max'), avg_cnt_trx=('cnt_trx','mean'), min_cnt_trx=('cnt_trx','min'), max_cnt_trx=('cnt_trx','max')).sort_values(by='count', ascending=False)
patterns['count %'] = ((patterns['count']/pattern_count)*100).round(1)
print('Total Number Of Patterns: ', pattern_count)
patterns

## 8. Trigger Feature Analysis

### 8.1. Trigger Feature #1

In [None]:
pd.options.display.float_format = "{:,.0f}".format

The following table shows the number of times each feature occurred as TF1 for different bins of algo_score.

In [None]:
tf1_eval_score = tf_eval_score(alert_report, 'TF_1').sort_values('total anomalies', ascending=False, na_position='last', key=None)
tf1_eval_score

The following table shows the distribution of Trigger Feature Rating per Feature.

In [None]:
tf1_rating = tf_rating(alert_report, 'TF_1').sort_values('total anomalies', ascending=False, na_position='last', key=None)
tf1_rating

### 8.2. Trigger Feature #2

The following table shows the number of times each feature occurred as TF2 for different bins of algo_score.

In [None]:
tf2_eval_score = tf_eval_score(alert_report, 'TF_2').sort_values('total anomalies', ascending=False, na_position='last', key=None)
tf2_eval_score

The following table shows the distribution of Trigger Feature Rating per Feature.

In [None]:
tf2_rating = tf_rating(alert_report, 'TF_2').sort_values('total anomalies', ascending=False, na_position='last', key=None)
tf2_rating

### 8.3. Trigger Feature #3

The following table shows the number of times each feature occurred as TF3 for different bins of algo_score.

In [None]:
tf3_eval_score = tf_eval_score(alert_report, 'TF_3').sort_values('total anomalies', ascending=False, na_position='last', key=None)
tf3_eval_score

The following table shows the distribution of Trigger Feature Rating per Feature.

In [None]:
tf3_rating = tf_rating(alert_report, 'TF_3').sort_values('total anomalies', ascending=False, na_position='last', key=None)
tf3_rating

## 9. Saving tables to Excel

In [None]:
with pd.ExcelWriter('algo_report.xlsx') as writer:  
    train_overview.to_excel(writer, sheet_name='train_overview')
    anomaly_overview.to_excel(writer, sheet_name='anomaly_overview')
    anomaly_overview_per_month.to_excel(writer, sheet_name='anomaly_overview_per_month')
    alert_report.to_excel(writer, sheet_name='alert_report')
    all_tfs_hist.to_excel(writer, sheet_name='top_TF3_analysis')
    df_not_pattern.to_excel(writer, sheet_name='non_appearing_TFs')
    patterns.to_excel(writer, sheet_name='pattern_analysis')
    tf1_eval_score.to_excel(writer, sheet_name='tf1_eval_score')
    tf1_rating.to_excel(writer, sheet_name='tf1_rating')
    tf2_eval_score.to_excel(writer, sheet_name='tf2_eval_score')
    tf2_rating.to_excel(writer, sheet_name='tf2_rating')
    tf3_eval_score.to_excel(writer, sheet_name='tf3_eval_score')
    tf3_rating.to_excel(writer, sheet_name='tf3_rating')

In [None]:
context.close()