In [2]:
import os

import pandas as pd
import pm4py

from definitions import LOGS_DIR, ROOT_DIR

mapping = {
    'BPI_Challenge_2013_closed_problems.xes.gz': 'BPIC 2013 closed problems',
    'BPI_Challenge_2012.xes.gz': 'BPIC 2012',
    'BPI_Challenge_2013_incidents.xes.gz': 'BPIC 2013 incidents',
    'BPI_Challenge_2013_open_problems.xes.gz': 'BPIC 2013 open problems',
    'BPI%20Challenge%202017.xes.gz': 'BPIC 2017',
    'BPIC15_1.xes': 'BPIC 2015',
    'Road_Traffic_Fine_Management_Process.xes.gz': 'RTFMP',
    'Sepsis%20Cases%20-%20Event%20Log.xes.gz': 'Sepsis',
    'helpdesk.csv': 'Helpdesk'
}

In [28]:
df = pd.read_csv('log_stats.csv')
df = df.drop(columns=['avg_event_per_trace'])
df['num_events_copy'] = df['num_traces'] * df['avg_trace_length']
df['amount_of_padding'] = df['num_events'] - df['num_events_copy']
df['relative_padding'] = df['amount_of_padding'] / df['num_events']
df['file_name'] = df['file_name'].map(mapping)

In [29]:
df

Unnamed: 0,file_name,vocab_size,num_traces,num_events,max_trace_length,min_trace_length,avg_trace_length,num_events_copy,amount_of_padding,relative_padding,log
0,BPI Challenge 2018.xes.gz,42,43809,130244157,2973,24,57.391541,2514266.0,127729891.0,0.980696,
1,BPI%20Challenge%202017.xes.gz,27,31509,5671620,180,10,38.156305,1202267.0,4469353.0,0.788021,BPIC 2017
2,BPIC15_1.xes,399,1199,121099,101,2,43.550459,52217.0,68882.0,0.568807,BPIC 2015
3,BPI_Challenge_2012.xes.gz,25,13087,2290225,175,3,20.035149,262200.0,2028025.0,0.885513,BPIC 2012
4,BPI_Challenge_2013_closed_problems.xes.gz,5,1487,52045,35,1,4.478816,6660.0,45385.0,0.872034,BPIC 2013 closed problems
5,BPI_Challenge_2013_incidents.xes.gz,5,7554,929142,123,1,8.675271,65533.0,863609.0,0.929469,BPIC 2013 incidents
6,BPI_Challenge_2013_open_problems.xes.gz,4,819,18018,22,1,2.870574,2351.0,15667.0,0.869519,BPIC 2013 open problems
7,BPI_Challenge_2019.xes,43,251734,249216660,990,1,6.33972,1595923.0,247620737.0,0.993596,
8,CoSeLoG WABO 3.xes.gz,370,1087,134788,124,3,41.215271,44801.0,89987.0,0.667619,
9,Hospital Billing - Event Log.xes.gz,19,100000,21700000,217,1,4.51359,451359.0,21248641.0,0.9792,


In [22]:
def to_latex_table(df, filename, caption=None, label=None):
    style = df.style.format_index("\\textbf{{{}}}", escape="latex", axis=1) \
        .format(lambda x: str(x).replace('set()', '{}').replace("{", '\\{').replace("}", '\\}').replace("'", ''))

    col_format = 'l' + 'c' * (len(df.columns) - 1)
    output = style.to_latex(hrules=False, caption=caption, label=label, column_format=col_format)
    output = output.replace(r'\begin{tabular}{' + col_format + '}', r"""
    \centering
    \begin{NiceTabular}{""" + col_format + r"""}
    \CodeBefore
    \rowcolor{gray!50}{1}
    \rowcolors{2}{gray!25}{white}
    \Body""")
    output = output.replace(r'\end{tabular}', r'\end{NiceTabular}')

    if filename:
        with open(os.path.join(ROOT_DIR, 'reports', filename), 'w') as f:
            f.write(output)

    return output

In [25]:
to_latex_table(df, 'log_stats.tex', caption='Log statistics', label='tab:log_stats')

'\\begin{table}\n\\caption{Log statistics}\n\\label{tab:log_stats}\n\n    \\centering\n    \\begin{NiceTabular}{lccccccccc}\n    \\CodeBefore\n    \\rowcolor{gray!50}{1}\n    \\rowcolors{2}{gray!25}{white}\n    \\Body\n & \\textbf{file\\_name} & \\textbf{vocab\\_size} & \\textbf{num\\_traces} & \\textbf{num\\_events} & \\textbf{max\\_trace\\_length} & \\textbf{min\\_trace\\_length} & \\textbf{avg\\_trace\\_length} & \\textbf{num\\_events\\_copy} & \\textbf{amount\\_of\\_padding} & \\textbf{relative\\_padding} \\\\\n0 & BPI Challenge 2018.xes.gz & 42 & 43809 & 130244157 & 2973 & 24 & 57.39154055102833 & 2514266.0 & 127729891.0 & 0.980695748216943 \\\\\n1 & BPI%20Challenge%202017.xes.gz & 27 & 31509 & 5671620 & 180 & 10 & 38.15630454790695 & 1202267.0 & 4469353.0 & 0.7880205302894059 \\\\\n2 & BPIC15_1.xes & 399 & 1199 & 121099 & 101 & 2 & 43.55045871559633 & 52217.0 & 68882.0 & 0.5688073394495413 \\\\\n3 & BPI_Challenge_2012.xes.gz & 25 & 13087 & 2290225 & 175 & 3 & 20.035149384885763 &

In [51]:
name = 'Road_Traffic_Fine_Management_Process.xes.gz'
log = pm4py.read_xes(file_path=os.path.join(LOGS_DIR, name))

parsing log, completed traces ::   0%|          | 0/150370 [00:00<?, ?it/s]

In [71]:
longest_trace = log.groupby('case:concept:name')['concept:name'].count().max()
longest_trace

AttributeError: 'EventLog' object has no attribute 'groupby'

In [53]:
variants = pm4py.get_variants(log)
variant_count = len(variants)
variant_count

231

In [63]:
variants_only_once = list(filter(lambda x: x[1] == 1, variants.items()))
variants_only_once = list(map(lambda x: x[0], variants_only_once))
variants_only_once

[('Create Fine',
  'Send Fine',
  'Insert Fine Notification',
  'Add penalty',
  'Insert Date Appeal to Prefecture',
  'Payment',
  'Receive Result Appeal from Prefecture',
  'Payment'),
 ('Create Fine',
  'Send Fine',
  'Insert Fine Notification',
  'Add penalty',
  'Notify Result Appeal to Offender',
  'Receive Result Appeal from Prefecture',
  'Send for Credit Collection'),
 ('Create Fine',
  'Send Fine',
  'Insert Fine Notification',
  'Insert Date Appeal to Prefecture',
  'Send Appeal to Prefecture',
  'Receive Result Appeal from Prefecture',
  'Add penalty',
  'Notify Result Appeal to Offender',
  'Appeal to Judge',
  'Send for Credit Collection'),
 ('Create Fine',
  'Send Fine',
  'Insert Date Appeal to Prefecture',
  'Insert Fine Notification',
  'Add penalty',
  'Send Appeal to Prefecture',
  'Receive Result Appeal from Prefecture',
  'Notify Result Appeal to Offender',
  'Send for Credit Collection'),
 ('Create Fine',
  'Send Fine',
  'Insert Fine Notification',
  'Receive Re

In [64]:
filtered2 = pm4py.filter_variants(log, variants_only_once, retain=False)
filtered2_variant_count = len(pm4py.get_variants(filtered2))
filtered2_variant_count

131

In [65]:
filtered2_longest_trace = filtered2.groupby('case:concept:name')['concept:name'].count().max()
filtered2_longest_trace

14

In [66]:
pm4py.write_xes(filtered2, os.path.join(LOGS_DIR, 'RTFMP_filtered.xes'))

exporting log, completed traces ::   0%|          | 0/150270 [00:00<?, ?it/s]

In [68]:
filtered2.attributes

AttributeError: 'DataFrame' object has no attribute 'attributes'

In [73]:
from pm4py.objects.log.importer.xes import importer as xes_importer
from pm4py.objects.log.exporter.xes import exporter as xes_exporter

log = xes_importer.apply(os.path.join(LOGS_DIR, name))

parsing log, completed traces ::   0%|          | 0/150370 [00:00<?, ?it/s]

In [74]:
log.attributes

{'meta_org:different_resources_standard_deviation': 0.061,
 'meta_org:different_resources_min': 2,
 'meta_3TU:language': 'eng',
 'meta_concept:named_events_total': {'value': 561470,
  'children': {'Payment': 77601,
   'Insert Date Appeal to Prefecture': 4188,
   'Receive Result Appeal from Prefecture': 999,
   'Send Appeal to Prefecture': 4141,
   'Notify Result Appeal to Offender': 896,
   'Appeal to Judge': 555,
   'Add penalty': 79860,
   'Insert Fine Notification': 79860,
   'Send for Credit Collection': 59013,
   'Create Fine': 150370,
   'Send Fine': 103987}},
 'meta_org:group_events_average': {'value': 3.734,
  'children': {'UNKNOWN': 3.734}},
 'meta_org:different_groups_max': 1,
 'meta_life:different_transitions_min': 1,
 'meta_concept:different_names_standard_deviation': 1.578,
 'meta_time:log_duration': 424825200.0,
 'meta_3TU:creation_place': 'Eindhoven',
 'meta_general:classifiers': {'value': 1,
  'children': {'Event Name': {'value': 'concept:name',
    'children': {'meta_g

In [75]:
variants = pm4py.get_variants(log)
variant_count = len(variants)
variant_count



231

In [80]:
variants_only_once = list(filter(lambda x: len(x[1]) == 1, variants.items()))
variants_only_once = list(map(lambda x: x[0], variants_only_once))
len(variants_only_once)

100

In [81]:
filtered2 = pm4py.filter_variants(log, variants_only_once, retain=False)
filtered2_variant_count = len(pm4py.get_variants(filtered2))
filtered2_variant_count



131

In [82]:
xes_exporter.apply(filtered2, os.path.join(LOGS_DIR, 'RTFMP_filtered.xes'))

exporting log, completed traces ::   0%|          | 0/150270 [00:00<?, ?it/s]

In [83]:
abc = xes_importer.apply(os.path.join(LOGS_DIR, 'RTFMP_filtered.xes'))
abc.attributes['concept:name']

parsing log, completed traces ::   0%|          | 0/150270 [00:00<?, ?it/s]

'Road Traffic Fine Management Process'