In [1]:
# Suppress warnings in this notebook
import warnings
warnings.filterwarnings('ignore')

# Make an experimental design table
Here the settings for the experiments can be modified in the dictionary called run_settings. Refer to the paper for more details on of each of the parameters.

In [2]:
run_settings = {# level of entropy: min, medium and/or max
                "process_entropy":["min_entropy","med_entropy","max_entropy"],
                
                # number of traces/cases in the event-log
                "number_of_traces":[1000],

                # number of activity types
                "statespace_size":[5,10], 

                # first or higher-order markov chain to represent the transitions
                "process_type":["memoryless","memory"], 
                
                # order of HOMC - only specify this when using process with memory
                "process_memory":[2,4],
                
                # number of transitions - only used for medium entropy (should be higher than 2 and < statespace size)
                "med_ent_n_transitions":[3, 5],
                                
                # lambda parameter of inter-arrival times
                "inter_arrival_time":[1.5],
                
                # lambda parameter of process noise
                "process_stability_scale":[0.1],
                
                # probability of agent being available
                "resource_availability_p":[0.5],

                # number of agents in the process
                "resource_availability_n":[3],

                # waiting time in full days, when no agent is available
                "resource_availability_m":[0.041],
                
                # variation between activity durations
                "activity_duration_lambda_range":[1, 5],
                
                # business hours definition: when can cases be processed? ()
                "Deterministic_offset_W":["weekdays", "all-week"],

                # time-unit for a full week: days = 7, hrs = 24*7, etc.
                "Deterministic_offset_u":[7],
                
                # training data format (See Verenich et al., 2019): 
                # True - use first event to predict total cycle-time. 
                # False - use Prefix-log format / each event to predict remaining cycle time.
                "first_state_model":[True],

                # offset for the timestamps used (1970 time after 1970)
                "datetime_offset":[35],
                
                # number of repetitions of the experiments: duplicates the experiment table (2 times here)
                "num_replications":list(range(0, 2))
               }


# import the make_design_table function to generate a full factorial experimental design table
from SynBPS.simulation.DoE import make_design_table
df = make_design_table(run_settings, file="data/design_table.csv")

# inspect the resulting design table
df

saved to data/design_table.csv


Unnamed: 0,process_entropy,number_of_traces,statespace_size,process_type,process_memory,med_ent_n_transitions,inter_arrival_time,process_stability_scale,resource_availability_p,resource_availability_n,resource_availability_m,activity_duration_lambda_range,Deterministic_offset_W,Deterministic_offset_u,first_state_model,datetime_offset,num_replications,RUN,Done,Failure
0,min_entropy,1000.0,5,memoryless,2.0,3.0,1.5,0.0,0.5,3.0,0.0,1.0,weekdays,7.0,True,35.0,0.0,1,0,0
1,med_entropy,1000.0,5,memoryless,2.0,3.0,1.5,0.0,0.5,3.0,0.0,1.0,weekdays,7.0,True,35.0,0.0,2,0,0
2,max_entropy,1000.0,5,memoryless,2.0,3.0,1.5,0.0,0.5,3.0,0.0,1.0,weekdays,7.0,True,35.0,0.0,3,0,0
3,min_entropy,1000.0,10,memoryless,2.0,3.0,1.5,0.0,0.5,3.0,0.0,1.0,weekdays,7.0,True,35.0,0.0,4,0,0
4,med_entropy,1000.0,10,memoryless,2.0,3.0,1.5,0.0,0.5,3.0,0.0,1.0,weekdays,7.0,True,35.0,0.0,5,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
379,med_entropy,1000.0,5,memory,4.0,5.0,1.5,0.0,0.5,3.0,0.0,5.0,all-week,7.0,True,35.0,1.0,380,0,0
380,max_entropy,1000.0,5,memory,4.0,5.0,1.5,0.0,0.5,3.0,0.0,5.0,all-week,7.0,True,35.0,1.0,381,0,0
381,min_entropy,1000.0,10,memory,4.0,5.0,1.5,0.0,0.5,3.0,0.0,5.0,all-week,7.0,True,35.0,1.0,382,0,0
382,med_entropy,1000.0,10,memory,4.0,5.0,1.5,0.0,0.5,3.0,0.0,5.0,all-week,7.0,True,35.0,1.0,383,0,0


## A less complicated example:

In [3]:
run_settings = {# level of entropy: min, medium and/or max
                "process_entropy":["min_entropy"], #,"med_entropy","max_entropy"
                
                # number of traces/cases in the event-log
                "number_of_traces":[1000],

                # number of activity types
                "statespace_size":[5], 

                # first or higher-order markov chain to represent the transitions
                "process_type":["memoryless"], 
                
                # order of HOMC - only specify this when using process with memory
                "process_memory":[2],
                
                # number of transitions - only used for medium entropy (should be higher than 2 and < statespace size)
                "med_ent_n_transitions":[5],
                                
                # lambda parameter of inter-arrival times
                "inter_arrival_time":[1.5],
                
                # lambda parameter of process noise
                "process_stability_scale":[0.1],
                
                # probability of agent being available
                "resource_availability_p":[0.5],

                # number of agents in the process
                "resource_availability_n":[3],

                # waiting time in full days, when no agent is available
                "resource_availability_m":[0.041],
                
                # variation between activity durations
                "activity_duration_lambda_range":[1],
                
                # business hours definition: when can cases be processed? ()
                "Deterministic_offset_W":["weekdays"],

                # time-unit for a full week: days = 7, hrs = 24*7, etc.
                "Deterministic_offset_u":[7],
                
                # training data format (See Verenich et al., 2019): 
                # True - use first event to predict total cycle-time. 
                # False - use Prefix-log format / each event to predict remaining cycle time.
                "first_state_model":[False],

                # offset for the timestamps used (years after 1970)
                "datetime_offset":[45],
                
                # number of repetitions of the experiments: duplicates the experiment table (2 times here)
                "num_replications":list(range(0, 3))
               }


# import the make_design_table function to generate a full factorial experimental design table
from SynBPS.simulation.DoE import make_design_table
df = make_design_table(run_settings, file="data/design_table.csv")

# inspect the resulting design table
df

saved to data/design_table.csv


Unnamed: 0,process_entropy,number_of_traces,statespace_size,process_type,process_memory,med_ent_n_transitions,inter_arrival_time,process_stability_scale,resource_availability_p,resource_availability_n,resource_availability_m,activity_duration_lambda_range,Deterministic_offset_W,Deterministic_offset_u,first_state_model,datetime_offset,num_replications,RUN,Done,Failure
0,min_entropy,1000.0,5,memoryless,2.0,5.0,1.5,0.0,0.5,3.0,0.0,1.0,weekdays,7.0,False,45.0,0.0,1,0,0
1,min_entropy,1000.0,5,memoryless,2.0,5.0,1.5,0.0,0.5,3.0,0.0,1.0,weekdays,7.0,False,45.0,1.0,2,0,0
2,min_entropy,1000.0,5,memoryless,2.0,5.0,1.5,0.0,0.5,3.0,0.0,1.0,weekdays,7.0,False,45.0,2.0,3,0,0


# Specify Train() and Test() functions
Before running the experiments, you need to define model training and evaluation functions.

In this example we train a first state model, which is a model using only the first observed event (state) to predict to total cycle-time. The default data preparation will result in a prefix-log, which can be used to predict remaining cycle-time from every observed event in the trace.

Input for the **training_function** is a dictionary named **input_data**, which contain the following:
- x_train
- x_test
- y_train
- y_test

Output is an **inference table** containing predictions and actual target values for the test data. This table is used for analysis of the results. The **eval_function** also uses this table to calculate aggregated metrics.

In [4]:
def training_function(input_data):
    print("training")
    
    """
    Example model: Lasso regression
    This is just an example of how to define your model in this framework.
    Using this model on this data format is not advised as we break i.i.d. assumptions.
    """

    #retrieve model class from sklearn
    from sklearn import linear_model
    reg = linear_model.Lasso(alpha=0.1)

    #reshape training data for this type of model 
    #(from: N x t x k, to: N x (t x k))
    #num_obs = input_data["x_train"].shape[0]
    from numpy import prod
    flattened_dim = prod(input_data["x_train"].shape[1:])

    #train the regression model
    reg.fit(input_data["x_train"].reshape((input_data["x_train"].shape[0], flattened_dim)), input_data["y_train"])

    #predict on the test data
    y_pred = reg.predict(input_data["x_test"].reshape((input_data["x_test"].shape[0], flattened_dim)))

    #get the inference table (used for analysis of the final results)
    inference = input_data["Inference_test"]
    
    #add predictions to the inference table
    inference["y_pred"] = y_pred
    return inference

#from sklearn.metrics 

def eval_function(inference):
    print("evaluation")
    """
    Example evaluation: Aggregated scores
    The inference table also enable the ability to make trace or prefix-level evaluations using its id variables
    """
    from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, explained_variance_score

    y = inference["y"]
    y_pred = inference["y_pred"]

    MSE = mean_squared_error(y, y_pred)
    MAE = mean_absolute_error(y, y_pred)
    R2 = r2_score(y, y_pred)
    EVAR = explained_variance_score(y, y_pred)

    # the resulting metrics should be stored in a dictionary and be scalars only
    # adding prefixes to column name (key) is suggested when logging many metrics
    metrics = {"TEST_MSE":MSE,
              "TEST_MAE":MAE,
              "TEST_R2":R2,
              "TEST_EVAR":EVAR}
    print(metrics)
    return metrics

# Perform the experiments
Now we perform the experiments using the defined training and evaluation functions.

In [5]:
from SynBPS.simulation.simulation_pipeline import run_experiments
results = run_experiments(training_function, 
                          eval_function, 
                          output_dir="data/",
                          out_file="results.csv", 
                          design_table="design_table.csv")

Run: 0
traces: 1000
eventlog saved to: data/0_Eventlog_min_entropy_memoryless.csv
events: 5000
ids: 1000
Cases before dropping len=1: 1000 cases 5000 rows
Cases after dropping len=1: 1000 cases 5000 rows
Sorting by id, date (chronological order)
Number of cases in log: 1000
longest trace is: 5
Time format: 2015-01-02 06:00:00
Std. format: %Y-%m-%d %H:%M:%S
   id event  activity_no                 time        end_datetime
0   1     f            1  2015-01-02 06:00:00 2015-01-02 17:33:52
1   1     b            2  2015-01-02 17:33:52 2015-01-02 18:41:56
2   1     c            3  2015-01-03 06:00:00 2015-01-06 00:23:11
3   1     e            4  2015-01-06 06:00:00 2015-01-06 15:42:47
4   1     d            5  2015-01-06 15:42:47 2015-01-06 21:03:28
mode: event
Log starts at: 2015-01-02 06:00:00
Last event starts at: 2018-11-18 06:00:00
Train-test split happens at: 2016-12-10 06:00:00
1000
1000
2380
Post-processing:

dropping last event from each case
before: 5000
after: 4000
data in X is t

# Analysis
Work in progress..

In [8]:
import pandas as pd
pd.read_csv("data/results.csv")

Unnamed: 0,process_entropy,number_of_traces,statespace_size,process_type,process_memory,med_ent_n_transitions,inter_arrival_time,process_stability_scale,resource_availability_p,resource_availability_n,...,num_events,num_variants,avg_tracelen,min_tracelen,max_tracelen,RESULT_num_events,TEST_MSE,TEST_MAE,TEST_R2,TEST_EVAR
0,min_entropy,1000.0,5,memoryless,2.0,5.0,1.5,0.0,0.5,3.0,...,5000,1,5.0,5,5,5000,14589990000.0,91633.268876,0.291956,0.293779
1,min_entropy,1000.0,5,memoryless,2.0,5.0,1.5,0.0,0.5,3.0,...,5000,1,5.0,5,5,5000,12611590000.0,85810.523605,0.350587,0.350592
2,min_entropy,1000.0,5,memoryless,2.0,5.0,1.5,0.0,0.5,3.0,...,5000,1,5.0,5,5,5000,9740177000.0,66280.415909,0.47976,0.480468
