# Benchmarking
Look at time vs length of trace.

In [None]:
from datetime import timedelta
import pm4py
import time
import pandas as pd
from pm4py.objects.log.obj import EventLog

In [10]:
sepsis = pm4py.read_xes('data/Sepsis Cases - Event Log.xes')
rtfmp = pm4py.read_xes('/home/vco/Datasets/12683249/Road_Traffic_Fine_Management_Process.xes')
bpic13i = pm4py.read_xes('/home/vco/Datasets/BPI_Challenge_2013_incidents.xes')
dreyers = pm4py.read_xes('/home/vco/Datasets/Dreyers Foundation.xes')
dfs = {'sepsis': sepsis, 'rtfmp': rtfmp, 'bpic13i': bpic13i, 'dreyers': dreyers}
# dreyers does not have column of type date, but the events are ordered so create one.
dreyers['time:timestamp'] = dreyers.groupby('case:concept:name').cumcount()
dreyers['time:timestamp'] = dreyers['time:timestamp'].apply(lambda x: pd.to_datetime('2024-01-01') + timedelta(x))

In [36]:
res = {}
for k, df in dfs.items():
    train_random_10_percent, _ = pm4py.split_train_test(df,train_percentage=0.1)
    conformance_random_50_percent, _ = pm4py.split_train_test(df,train_percentage=0.5)
    dcr_graph, _ = pm4py.discover_dcr(train_random_10_percent)
    declare_model = pm4py.discover_declare(train_random_10_percent)
    skeleton_model = pm4py.discover_log_skeleton(train_random_10_percent)
    to_plot = []
    for trace in pm4py.convert_to_event_log(conformance_random_50_percent):
        len_trace = len(trace)
        temp_log = EventLog()
        temp_log.append(trace)
        
        start_time = time.time()
        pm4py.conformance_dcr(temp_log, dcr_graph)
        elapsed_time = time.time() - start_time
        to_plot.append([len_trace, elapsed_time, 'dcr'])

        start_time = time.time()
        pm4py.conformance_declare(temp_log, declare_model)
        elapsed_time = time.time() - start_time
        to_plot.append([len_trace, elapsed_time, 'declare'])

        start_time = time.time()
        pm4py.conformance_log_skeleton(temp_log, skeleton_model)
        elapsed_time = time.time() - start_time
        to_plot.append([len_trace, elapsed_time, 'skeleton'])

    res[k] = pd.DataFrame(to_plot)

split test log with random length traces and the test log size varies between.
- a random 10% to discover the model.
- a random 50% of the log to do conformance.
these are fixed for the models.

In [46]:
align = {}
for k, df in dfs.items():
    train_random_10_percent, _ = pm4py.split_train_test(df,train_percentage=0.1)
    conformance_random_50_percent, _ = pm4py.split_train_test(df,train_percentage=0.5)
    dcr_graph, _ = pm4py.discover_dcr(train_random_10_percent)
    dfg_model = pm4py.discover_dfg(train_random_10_percent)
    petri_net_model = pm4py.discover_petri_net_inductive(train_random_10_percent)
    to_plot = []
    for trace in pm4py.convert_to_event_log(conformance_random_50_percent):
        len_trace = len(trace)
        if len_trace > 0:
            temp_log = EventLog()
            temp_log.append(trace)
            sa = trace[0]['concept:name']
            ea = trace[len_trace-1]['concept:name']
            start_time = time.time()
            pm4py.optimal_alignment_dcr(temp_log, dcr_graph)
            elapsed_time = time.time() - start_time
            to_plot.append([len_trace, elapsed_time, 'dcr'])
    
            start_time = time.time()
            pm4py.algo.conformance.alignments.dfg.algorithm.apply(temp_log, dfg_model, sa, ea)
            elapsed_time = time.time() - start_time
            to_plot.append([len_trace, elapsed_time, 'dfg'])
    
            start_time = time.time()
            pm4py.conformance_log_skeleton(temp_log, petri_net_model)
            elapsed_time = time.time() - start_time
            to_plot.append([len_trace, elapsed_time, 'petri'])

    align[k] = pd.DataFrame(to_plot)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[start_timestamp_key] = df[timestamp_key]


KeyError: 0

In [31]:
# filtering in pm4py gives this warning
import warnings
warnings.filterwarnings(
    action='ignore', category=UserWarning, message=r"Boolean Series.*"
)

def ranges(min_trace, max_trace):
    step = (max_trace - min_trace)/10
    res = [int(x*step) for x in range(10)]
    if min_trace < step:
        res[0] = min_trace
    if max_trace > 10*step:
        res.append(max_trace)
    return res

dis = {}
for k, df in dfs.items():
    min_trace = int(df[['case:concept:name','concept:name']].groupby('case:concept:name').count().min().iloc[0])
    max_trace = int(df[['case:concept:name','concept:name']].groupby('case:concept:name').count().max().iloc[0])
    splits = ranges(min_trace, max_trace)
    min_size = splits[0]
    to_plot = []
    i = 0
    for max_size in splits[1:]:
        subset_df = pm4py.filter_case_size(df, min_size=min_size, max_size=max_size)
        if len(subset_df)>0:
            start_time = time.time()
            dcr_graph, _ = pm4py.discover_dcr(subset_df)
            elapsed_time = time.time() - start_time
            to_plot.append([i, elapsed_time, 'dcr'])
        
            start_time = time.time()
            declare_model = pm4py.discover_declare(subset_df)
            elapsed_time = time.time() - start_time
            to_plot.append([i, elapsed_time, 'declare'])
        
            start_time = time.time()
            skeleton_model = pm4py.discover_log_skeleton(subset_df)
            elapsed_time = time.time() - start_time
            to_plot.append([i, elapsed_time, 'skeleton'])
        i+=10
        min_size = max_size
    dis[k] = pd.DataFrame(to_plot)