In [42]:
import sys
import pandas as pd
import numpy as np
import stats
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
import os
import re
from statistics import mean
from frozendict import frozendict
from utils import get_occ_dicts, get_hmmconf_dict

from IPython.utils import io

In [3]:
# Root of all experiment outputs, given as a relative path.
INPUT_DIR = os.path.join('..', 'output')

# One sub-directory per algorithm; the prefix-alignment results are further
# split by window size.
C_3PO_DIR = os.path.join(INPUT_DIR, 'C-3PO')
HMMCONF_DIR = os.path.join(INPUT_DIR, 'hmmconf')
PREFIX_ALIGN_DIR = os.path.join(INPUT_DIR, 'prefix-alignment')
PREFIX_W_2_DIR, PREFIX_W_INF_DIR = [
    os.path.join(PREFIX_ALIGN_DIR, window_dir)
    for window_dir in ('window-size-2', 'window-size-maximum')
]

# Log names and log variants; both are used as dictionary keys when looking up
# result frames (results[log name][log variant]).
LOGS = [
    "BPI_2017",
    "BPI_2012",
    "M1",
    "M2",
    "M4",
    "M8",
    "M9",
]
LOG_TYPES = ['completeness20', 'completeness50', 'sim']

## Datasets
note: Kristo can share his insights
- completeness logs
- regular logs
- model types

Additional:
1. Summary statistics
    - trace length distribution
    - unique activities
2. Behaviour (parallelism, XOR, etc.)

In [62]:
occ_output_dirs = [C_3PO_DIR, PREFIX_W_2_DIR, PREFIX_W_INF_DIR, HMMCONF_DIR]


def _print_overview(title, runs, count_cases=len, show_head=False):
    """Print a short summary of the BPI_2017 / completeness20 frame in ``runs``.

    title:       label printed before the summary.
    runs:        nested mapping indexed as runs[log name][log type].
    count_cases: callable that turns the frame into the reported number of
                 cases; the default reports the number of rows.
    show_head:   when true, the first rows of the frame are printed as well.
    """
    print(title)
    print("columns: {}".format(runs['BPI_2017']['completeness20'].columns.tolist()))
    print("cases: {}".format(count_cases(runs['BPI_2017']['completeness20'])))
    if show_head:
        print(runs['BPI_2017']['completeness20'].head())


# Each result set is loaded first and summarised afterwards, with a dashed
# rule printed between consecutive summaries.
# NOTE(review): the meaning of the extra ``True`` argument is defined in
# utils.get_occ_dicts and is not visible here — confirm before changing it.
runs_dict_C_3PO = get_occ_dicts(occ_output_dirs[0], True)
_print_overview("3PO", runs_dict_C_3PO, show_head=True)

print(110*'-')

runs_dict_pref_w_2 = get_occ_dicts(occ_output_dirs[1])
_print_overview("W-2", runs_dict_pref_w_2)

print(110*'-')

runs_dict_pref_w_inf = get_occ_dicts(occ_output_dirs[2])
_print_overview("W-inf", runs_dict_pref_w_inf)

print(110*'-')

# The HMMCONF frame is grouped by 'caseid' before counting, so the number of
# groups is reported instead of the number of rows.
runs_dict_hmmconf = get_hmmconf_dict(occ_output_dirs[3])
_print_overview(
    "hmmconf",
    runs_dict_hmmconf,
    count_cases=lambda frame: frame.groupby('caseid').mean().shape[0],
    show_head=True,
)

3PO
columns: ['TraceId', 'total cost', 'Completeness cost', 'Conformance cost', 'ExecutionTime']
cases: 1000
   TraceId  total cost Completeness cost Conformance cost ExecutionTime
0        0        32.0                 0               32           3.6
1        1        17.0                 0               17             2
2        2        22.0                 0               22           2.4
3        3        26.0                 0               26           2.6
4        4        27.0                 0               27           2.8
--------------------------------------------------------------------------------------------------------------
W-2
columns: ['TraceId', 'Conformance cost', 'ExecutionTime']
cases: 1000
--------------------------------------------------------------------------------------------------------------
W-inf
columns: ['TraceId', 'Conformance cost', 'ExecutionTime']
cases: 1000
---------------------------------------------------------------------------------------

  log_dfs[log_name][log_type] = by_row_index.mean()
  log_dfs[log_name][log_type] = by_row_index.mean()
  log_dfs[log_name][log_type] = by_row_index.mean()
  log_dfs[log_name][log_type] = by_row_index.mean()
  log_dfs[log_name][log_type] = by_row_index.mean()
  log_dfs[log_name][log_type] = by_row_index.mean()
  log_dfs[log_name][log_type] = by_row_index.mean()
  log_dfs[log_name][log_type] = by_row_index.mean()
  log_dfs[log_name][log_type] = by_row_index.mean()
  log_dfs[log_name][log_type] = by_row_index.mean()
  log_dfs[log_name][log_type] = by_row_index.mean()
  log_dfs[log_name][log_type] = by_row_index.mean()
  log_dfs[log_name][log_type] = by_row_index.mean()
  log_dfs[log_name][log_type] = by_row_index.mean()
  log_dfs[log_name][log_type] = by_row_index.mean()
  log_dfs[log_name][log_type] = by_row_index.mean()
  log_dfs[log_name][log_type] = by_row_index.mean()
  log_dfs[log_name][log_type] = by_row_index.mean()
  log_dfs[log_name][log_type] = by_row_index.mean()
  log_dfs[lo

## Stress test

- Operating system: Windows 11 Home 64-bit
- Processor: AMD Ryzen 7 5800 8-Core Processor (16 CPUs) ~3.4 GHz
- Memory: 16384MB RAM

The stress test was carried out by running each algorithm on the datasets five times and averaging the execution times, to even out interference from other system resource usage while the experiments were running.


### Average computation time per trace

In [39]:
# One inner list per log, holding a (log name, log type) pair for every entry
# of LOG_TYPES. Iterating LOG_TYPES directly (instead of zipping against a
# list repeated a fixed three times) keeps the pairs complete if LOG_TYPES
# ever changes length.
log_names = [[(log_n, log_t) for log_t in LOG_TYPES] for log_n in LOGS]

# Row labels of the form "<log name>_<log type>", in the same order as the pairs.
indexes = [log_n + '_' + log_t for pairs in log_names for (log_n, log_t) in pairs]

# Column labels, one per algorithm.
columns = ['C-3PO', 'W-2', 'W-inf', 'HMMCONF']

In [41]:
# HMMCONF: 'execution time' is averaged per caseid first, then across cases,
# giving one value per (log name, log type) pair.
data_hmmconf = [
    mean(runs_dict_hmmconf[l_n][l_t].groupby('caseid').mean()['execution time'])
    for ln_lt in log_names
    for (l_n, l_t) in ln_lt
]

# The other three algorithms: mean of the 'ExecutionTime' column for each pair.
occs = [runs_dict_C_3PO, runs_dict_pref_w_2, runs_dict_pref_w_inf]
times_by_algorithm = [
    [mean(d[l_n][l_t]['ExecutionTime']) for ln_lt in log_names for (l_n, l_t) in ln_lt]
    for d in occs
]
times_by_algorithm.append(data_hmmconf)

# Transpose to one row per (log, type) pair with one rounded value per
# algorithm. zip(*...) handles any number of algorithms, instead of indexing
# exactly four lists by hand.
data = [[round(value, 3) for value in row] for row in zip(*times_by_algorithm)]
results = pd.DataFrame(index=indexes, data=data, columns=columns)

def highlight_min(s):
    """Return one CSS string per element of ``s``, marking the minimum green.

    Every element equal to ``s.min()`` gets 'color: green' (ties are all
    marked); every other element gets an empty string.
    """
    smallest = s.min()
    styles = []
    for value in s:
        styles.append('color: green' if value == smallest else '')
    return styles

# Apply highlight_min row by row (axis=1) to build the styled table that is
# displayed as this cell's output.
results.style.apply(highlight_min, axis=1)

Unnamed: 0,C-3PO,W-2,W-inf,HMMCONF
BPI_2017_completeness20,2.875,33.315,45.151,0.088
BPI_2017_completeness50,2.0,10.668,13.617,0.103
BPI_2017_sim,2.744,31.41,41.727,0.086
BPI_2012_completeness20,2.414,31.956,50.342,0.071
BPI_2012_completeness50,1.388,10.011,16.01,0.08
BPI_2012_sim,2.477,32.432,53.474,0.086
M1_completeness20,0.656,8.23,9.323,0.48
M1_completeness50,0.294,1.997,2.372,0.434
M1_sim,0.454,4.871,5.944,0.383
M2_completeness20,1.41,37.772,45.82,0.976


## Correlation test

### Spearman correlation with non-conforming results

#### HMMCONF

In [94]:
# Log and log variant analysed in this section.
log_name, log_type = 'M1', 'completeness50'

# Average every HMMCONF column per case; the result is indexed by caseid.
hmmconf = runs_dict_hmmconf[log_name][log_type].groupby('caseid').mean()

# NOTE(review): .iloc is positional, so the caseid values are used as row
# positions here. This presumably relies on row i of the C-3PO frame holding
# trace i — confirm against get_occ_dicts.
cost_columns = ['Conformance cost', 'Completeness cost']
C_3PO_conf_compl = runs_dict_C_3PO[log_name][log_type][cost_columns].iloc[hmmconf.index]

# Index-aligned join: the HMMCONF columns followed by the two C-3PO cost columns.
joined_df = hmmconf.join(C_3PO_conf_compl)

print(joined_df.head())

def get_correlation_between_metrics(log_name:str=None, log_type:str=None, dim1:str=None, dim2:str=None) -> None:
    """Print Spearman correlations between C-3PO costs and HMMCONF metrics.

    Reads the module-level ``joined_df``, keeps only the rows where both
    'Conformance cost' and 'finalconf' are strictly positive, and prints rho
    and the p-value for four column pairs in a single ``print`` call.

    NOTE(review): none of the four parameters is referenced by the body, so
    the output depends solely on the current global ``joined_df``. Confirm
    whether they were meant to select the log / columns.
    """
    # (message template, first column, second column), in print order.
    report = (
        ('Conformance cost - mean final conformance: spearman rho: {:.3f}, p-value: {:.10f}',
         'Conformance cost', 'finalconf'),
        ('\nConformance cost - injected distance: spearman rho: {:.3f}, p-value: {:.10f}',
         'Conformance cost', 'injected_distance'),
        ('\nCompleteness - injected distance: spearman rho: {:.3f}, p-value: {:.10f}',
         'Completeness cost', 'injected_distance'),
        ('\nCompleteness: spearman rho: {:.3f}, p-value: {:.10f}',
         'Completeness cost', 'completeness'),
    )

    keep = (joined_df['Conformance cost'] > 0) & (joined_df['finalconf'] > 0)
    filtered_df = joined_df.loc[keep, :]

    # Compute every correlation before formatting anything, so nothing is
    # printed if one of the computations fails.
    outcomes = [
        stats.spearmanr(filtered_df[first], filtered_df[second])
        for (_, first, second) in report
    ]
    messages = [
        template.format(outcome[0], outcome[1])
        for (template, _, _), outcome in zip(report, outcomes)
    ]

    # Passed as separate arguments so print's default separator is kept
    # between the messages.
    print(*messages)

# Called without arguments: the function reads the joined_df built earlier in
# this cell.
get_correlation_between_metrics()

        execution time  emitconf  stateconf  finalconf  injected_distance  \
caseid                                                                      
401.0         0.200044  0.016877  -0.025387   0.179965           2.800000   
402.0         0.200034  0.192182   0.378777   0.525664           7.428571   
403.0         0.340280  0.005768  -0.191543   0.014518          10.200000   
404.0         0.640628  0.076018   0.388933   0.515178           8.600000   
405.0         0.100340  0.009698  -0.214307   0.046164           3.000000   

        completeness Conformance cost Completeness cost  
caseid                                                   
401.0       0.594286                4                 4  
402.0       0.424902                2                 4  
403.0       0.370476                4                 1  
404.0       0.408124                7                 5  
405.0       0.565476                3                 1  
Conformance cost - mean final conformance: spearman rh