In [1]:
import os
from collections import Counter

import pandas as pd
import pm4py
from pm4py.objects.log.importer.xes import importer as xes_importer

from definitions import LOGS_DIR, ROOT_DIR

In [2]:
# Human-readable display names for the event logs, keyed by log file name.
# Insertion order is kept identical to the original literal.
mapping = dict([
    ('BPI_Challenge_2013_closed_problems.xes.gz', 'BPI13 Closed'),
    ('BPI_Challenge_2012.xes.gz', 'BPI12'),
    ('BPI_Challenge_2013_incidents.xes.gz', 'BPI13 Incidents'),
    ('BPI_Challenge_2013_open_problems.xes.gz', 'BPI13 Open'),
    ('BPI%20Challenge%202017.xes.gz', 'BPI17'),
    ('BPIC15_1.xes', 'BPI15'),
    ('Road_Traffic_Fine_Management_Process.xes.gz', 'RTFMP'),
    ('Sepsis%20Cases%20-%20Event%20Log.xes.gz', 'Sepsis'),
    ('helpdesk.csv', 'Helpdesk'),
    ('RTFMP_filtered.xes', 'RTFMP filtered'),
])

In [5]:
def extract_stats(log):
    """Summarise trace lengths, activity frequencies and padding of an event log.

    The log is traversed exactly once, so one-shot iterables (e.g. generators)
    produce the same result as re-iterable containers.

    Args:
        log: Iterable of traces. Every trace must support ``len()`` and yield
            events that can be indexed with ``'concept:name'``.

    Returns:
        A tuple ``(trace_length_distribution, event_frequency_distribution,
        amount_of_padding)`` where

        * ``trace_length_distribution`` is a dict mapping a trace length to the
          number of traces having that length,
        * ``event_frequency_distribution`` is a dict mapping
          ``str(event['concept:name'])`` to its number of occurrences,
        * ``amount_of_padding`` is the total number of positions that would
          have to be filled to extend every trace to the length of the longest
          trace (``0`` for an empty log).
    """
    trace_length_distribution = Counter()
    event_frequency_distribution = Counter()
    for trace in log:
        trace_length_distribution[len(trace)] += 1
        event_frequency_distribution.update(
            str(event['concept:name']) for event in trace
        )

    # Padding only depends on how many traces have each length, so it can be
    # derived from the distribution instead of walking the log a second time.
    longest_trace_length = max(trace_length_distribution, default=0)
    amount_of_padding = sum(
        (longest_trace_length - length) * count
        for length, count in trace_length_distribution.items()
    )

    # Return plain dicts, as before, so callers see the same types.
    return (
        dict(trace_length_distribution),
        dict(event_frequency_distribution),
        amount_of_padding,
    )

# Collect the statistics of every log in LOGS_DIR that has a display name.
log_stats = []
for file_name in sorted(os.listdir(LOGS_DIR)):
    # Only mapped logs end up in the report, so skip everything else *before*
    # paying for the (slow) import and the statistics extraction.
    if file_name not in mapping:
        continue

    log_path = os.path.join(LOGS_DIR, file_name)
    if file_name.endswith(('.xes', '.xes.gz')):
        log = xes_importer.apply(log_path)
    elif file_name.endswith('.csv'):
        # NOTE(review): these column names are presumably those of helpdesk.csv,
        # the only CSV in `mapping` -- confirm before adding other CSV logs.
        log_df = pm4py.format_dataframe(pd.read_csv(log_path, sep=','),
                                        case_id='CaseID',
                                        activity_key='ActivityID',
                                        timestamp_key='CompleteTimestamp')
        log = pm4py.convert_to_event_log(log_df)
    else:
        # Mapped, but not a supported log format.
        continue

    # Extract stats
    tld, efd, padding = extract_stats(log)
    # Save stats
    log_stats.append({
        'file_name': file_name,
        'trace_length_distribution': tld,
        'event_frequency_distribution': efd,
        'amount_of_padding': padding,
    })



parsing log, completed traces ::   0%|          | 0/31509 [00:00<?, ?it/s]

parsing log, completed traces ::   0%|          | 0/1199 [00:00<?, ?it/s]

parsing log, completed traces ::   0%|          | 0/13087 [00:00<?, ?it/s]

parsing log, completed traces ::   0%|          | 0/1487 [00:00<?, ?it/s]

parsing log, completed traces ::   0%|          | 0/7554 [00:00<?, ?it/s]

parsing log, completed traces ::   0%|          | 0/819 [00:00<?, ?it/s]

parsing log, completed traces ::   0%|          | 0/43809 [00:00<?, ?it/s]

parsing log, completed traces ::   0%|          | 0/1143 [00:00<?, ?it/s]

parsing log, completed traces ::   0%|          | 0/150270 [00:00<?, ?it/s]

parsing log, completed traces ::   0%|          | 0/150370 [00:00<?, ?it/s]

parsing log, completed traces ::   0%|          | 0/1050 [00:00<?, ?it/s]



In [19]:
# Flatten the collected statistics into one summary row per log and export
# the resulting table as CSV.
log_stats_rows = []
for row in log_stats:
    file_name = row['file_name']
    tld = row['trace_length_distribution']
    efd = row['event_frequency_distribution']
    padding = row['amount_of_padding']

    # Totals are reused by several columns below; compute them once.
    num_traces = sum(tld.values())
    num_events = sum(efd.values())

    log_stats_rows.append({
        'log': mapping[file_name],
        'num traces': num_traces,
        'num events': num_events,
        'num activities': len(efd),
        'longest trace': max(tld.keys()),
        'avg trace length': sum(k * v for k, v in tld.items()) / num_traces,
        'shortest trace length': min(tld.keys()),
        'amount of padding': padding,
        # Share of padded positions among all positions (events + padding).
        'relative padding': padding / (num_events + padding),
    })
df = pd.DataFrame(log_stats_rows)

# Make sure the output directory exists so the export also works on a fresh checkout.
reports_dir = os.path.join(ROOT_DIR, 'reports')
os.makedirs(reports_dir, exist_ok=True)
df.to_csv(os.path.join(reports_dir, 'log_stats.csv'), index=False)
df

Unnamed: 0,log,num traces,num events,num activities,longest trace,avg trace length,shortest trace length,amount of padding,relative padding
0,BPI17,31509,1202267,26,180,38.156305,10,4469353,0.788021
1,BPI15,1199,52217,398,101,43.550459,2,68882,0.568807
2,BPI12,13087,262200,24,175,20.035149,3,2028025,0.885513
3,BPI13 Closed,1487,6660,4,35,4.478816,1,45385,0.872034
4,BPI13 Incidents,7554,65533,4,123,8.675271,1,863609,0.929469
5,BPI13 Open,819,2351,3,22,2.870574,1,15667,0.869519
6,RTFMP filtered,150270,560551,11,14,3.730292,2,1543229,0.733551
7,RTFMP,150370,561470,11,20,3.733923,2,2445930,0.813304
8,Sepsis,1050,15214,16,185,14.489524,3,179036,0.921678
9,Helpdesk,3804,13710,9,14,3.604101,1,39546,0.742564


In [17]:
def to_latex_table(df, filename, caption=None, label=None):
    style = df.style.format_index("\\textbf{{{}}}", escape="latex", axis=1) \
        .format(lambda x: str(x).replace('set()', '{}').replace("{", '\\{').replace("}", '\\}').replace("'", ''))

    col_format = 'l' + 'c' * (len(df.columns) - 1)
    output = style.to_latex(hrules=False, caption=caption, label=label, column_format=col_format)
    output = output.replace(r'\begin{tabular}{' + col_format + '}', r"""
    \centering
    \begin{NiceTabular}{""" + col_format + r"""}
    \CodeBefore
    \rowcolor{gray!50}{1}
    \rowcolors{2}{gray!25}{white}
    \Body""")
    output = output.replace(r'\end{tabular}', r'\end{NiceTabular}')

    if filename:
        with open(os.path.join(ROOT_DIR, 'reports', filename), 'w') as f:
            f.write(output)

    return output

In [18]:
# Export the summary table as LaTeX to the reports folder. The returned string
# is bound to `_` so the cell itself displays nothing.
_ = to_latex_table(
    df,
    'log_stats.tex',
    caption='Log statistics',
    label='tab:log_stats',
)