In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import numpy as np
import time

In [3]:
from dataprep.helperfunctions import *
from dataprep.memory_helperfunctions import prepare_data_f_memory

In [4]:
from simulation.simulation_pipeline import *
from simulation.simulation_helpers import *

In [5]:
import pomegranate

In [6]:
from experiment.DoE import *

# Make a design table

In [7]:
# Experimental factors: every key is one factor, every list holds its levels.
# The key order is kept as-is because the factorial design is built from it.
run_settings = {
    "process_entropy": ["min_entropy", "med_entropy", "max_entropy"],
    "number_of_traces": [100],
    "statespace_size": [5, 10],
    "process_type": ["memoryless", "memory"],

    # order of HOMC
    "process_memory": [4],

    # number of transitions
    "med_ent_n_transitions": [3, 5],

    # lambda parameter of the inter-arrival times
    "inter_arrival_time": [1.5],

    # lambda parameter of the process noise
    "process_stability_scale": [0.1],

    # resource availability: probability of getting an agent (p),
    # number of agents (n), waiting time in days when no agent is available (m)
    "resource_availability_p": [0.5],
    "resource_availability_n": [3],
    "resource_availability_m": [0.041],

    # variation between activity durations
    "activity_duration_lambda_range": [1],

    # business hours: when can cases be processed? ("weekdays" or "all-week")
    "Deterministic_offset_W": ["weekdays"],
    # time-unit for a full week: days = 7, hrs = 24*7, etc.
    "Deterministic_offset_u": [7],

    # run the full model pipeline on the simulated data?
    "model_pipeline": [False],

    # replication ids (repeat the experiment once per id)
    "num_replications": list(range(1)),
}

# Build the full factorial design, recode the string-valued factors back to
# their labels, drop the helper column and add the bookkeeping columns.
df = (
    fix_label_values(
        build_full_fact(run_settings),
        run_settings,
        variables=[
            "process_entropy",
            "process_type",
            "Deterministic_offset_W",
            "model_pipeline",
        ],
    )
    .drop(columns="Name_fix")
    .assign(RUN=lambda table: table.index + 1, Done=0, Failure=0)
    .astype({"statespace_size": int})
)

# Persist the design table and display it.
df.to_csv("results/design_table.csv", index=False)
df

Unnamed: 0,process_entropy,number_of_traces,statespace_size,process_type,process_memory,med_ent_n_transitions,inter_arrival_time,process_stability_scale,resource_availability_p,resource_availability_n,resource_availability_m,activity_duration_lambda_range,Deterministic_offset_W,Deterministic_offset_u,model_pipeline,num_replications,RUN,Done,Failure
0,min_entropy,100.0,5,memoryless,4.0,3.0,1.5,0.1,0.5,3.0,0.041,1.0,weekdays,7.0,False,0.0,1,0,0
1,med_entropy,100.0,5,memoryless,4.0,3.0,1.5,0.1,0.5,3.0,0.041,1.0,weekdays,7.0,False,0.0,2,0,0
2,max_entropy,100.0,5,memoryless,4.0,3.0,1.5,0.1,0.5,3.0,0.041,1.0,weekdays,7.0,False,0.0,3,0,0
3,min_entropy,100.0,10,memoryless,4.0,3.0,1.5,0.1,0.5,3.0,0.041,1.0,weekdays,7.0,False,0.0,4,0,0
4,med_entropy,100.0,10,memoryless,4.0,3.0,1.5,0.1,0.5,3.0,0.041,1.0,weekdays,7.0,False,0.0,5,0,0
5,max_entropy,100.0,10,memoryless,4.0,3.0,1.5,0.1,0.5,3.0,0.041,1.0,weekdays,7.0,False,0.0,6,0,0
6,min_entropy,100.0,5,memory,4.0,3.0,1.5,0.1,0.5,3.0,0.041,1.0,weekdays,7.0,False,0.0,7,0,0
7,med_entropy,100.0,5,memory,4.0,3.0,1.5,0.1,0.5,3.0,0.041,1.0,weekdays,7.0,False,0.0,8,0,0
8,max_entropy,100.0,5,memory,4.0,3.0,1.5,0.1,0.5,3.0,0.041,1.0,weekdays,7.0,False,0.0,9,0,0
9,min_entropy,100.0,10,memory,4.0,3.0,1.5,0.1,0.5,3.0,0.041,1.0,weekdays,7.0,False,0.0,10,0,0


In [8]:
#df = df.loc[:1]
#df

# Perform the experiments

In [None]:
# Run every row of the design table through the simulator and collect one
# summary record (design settings + characteristics of the simulated log)
# per run. After the loop, `log` and `SIM_SETTINGS` hold the values of the
# last run.
results = []

for run in df.index:
    print(run)

    # perf_counter is monotonic, so the measured duration cannot be skewed
    # by adjustments of the system clock.
    start_time = time.perf_counter()

    # Settings of this experiment. Take an explicit copy: result fields are
    # added to this row below and must never write back into the design table.
    curr_settings = df.loc[run].copy()

    # Settings for the simulation
    SIM_SETTINGS = {
        "save_eventlog": 1,  # 0 = no, 1 = yes...

        "statespace_size": make_D(int(curr_settings["statespace_size"])),

        "number_of_traces": int(curr_settings["number_of_traces"]),

        "process_entropy": curr_settings["process_entropy"],

        "process_type": curr_settings["process_type"],

        "process_memory": int(curr_settings["process_memory"]),

        # desired max number of possible transitions in P.
        # NOTE: This can maximally be the number of states, and should be higher than 2
        "med_ent_n_transitions": int(curr_settings["med_ent_n_transitions"]),

        "time_settings": {
            # lambda parameter of inter-arrival times
            "inter_arrival_time": float(curr_settings["inter_arrival_time"]),

            # lambda parameter of process noise
            "process_stability_scale": float(curr_settings["process_stability_scale"]),

            # probability of getting an agent
            "resource_availability_p": float(curr_settings["resource_availability_p"]),
            # number of agents
            "resource_availability_n": int(curr_settings["resource_availability_n"]),
            # waiting time in days, when no agent is available
            "resource_availability_m": float(curr_settings["resource_availability_m"]),

            # variation between activity durations
            "activity_duration_lambda_range": float(curr_settings["activity_duration_lambda_range"]),

            # business hours definition: when can cases be processed?
            "Deterministic_offset_W": make_workweek(curr_settings["Deterministic_offset_W"]),
            # time-unit for a full week: days = 7, hrs = 24*7, etc.
            "Deterministic_offset_u": int(curr_settings["Deterministic_offset_u"]),
        },
        # offset for the timestamps in days from 1970
        "datetime_offset": 53,
        # run is the seed
        "run": run,
    }

    # generate the log
    log = Generate_eventlog(SIM_SETTINGS)
    #log.to_csv("results/"+str(run)+"log.csv",index=False)

    # Characteristics of the simulated log.
    # NOTE: the key "simuation_time_sec" is misspelled, but it is an output
    # column name of results/experiments.csv and is therefore kept unchanged.
    curr_settings["simuation_time_sec"] = time.perf_counter() - start_time
    curr_settings["num_traces"] = len(log["caseid"].unique())
    curr_settings["num_events"] = len(log)

    # Group the log once by case instead of re-filtering the whole log for
    # every case id (which is quadratic in the number of events).
    traces = log.groupby("caseid", sort=False)
    tracelengths = traces.size()
    # A variant is the concatenation of a trace's activity labels, in log order.
    variants = traces["activity"].agg("".join)

    curr_settings["num_variants"] = variants.nunique()

    curr_settings["avg_tracelen"] = tracelengths.mean()
    curr_settings["min_tracelen"] = tracelengths.min()
    curr_settings["max_tracelen"] = tracelengths.max()

    # Run a machine learning pipeline as well?
    # The flag is read from a pandas row and may be a numpy bool, so it is
    # compared by value rather than by identity.
    if curr_settings["model_pipeline"] == True:  # noqa: E712
        # Prepare data for modelling
        input_data = prepare_data_f_memory(log)

        # Train a model (placeholder: nothing is trained yet, the data
        # splits are only looked up).
        # X:
        input_data["x_train"]
        input_data["x_test"]

        # Y:
        # NOTE(review): the original looked up "y_test" twice; "y_train" is
        # assumed to be the intended key - confirm against prepare_data_f_memory.
        input_data["y_train"]
        input_data["y_test"]

        # Evaluate the model (placeholder)

        # Store the results
        curr_settings["RES_num_events"] = len(log)

    # append
    results.append(curr_settings)

# store results
results = pd.DataFrame(results)
results.to_csv("results/experiments.csv", index=False)

0
traces: 100
events: 500
ids: 100
1
traces: 100
events: 340
ids: 100
2
traces: 100
events: 531
ids: 100
3
traces: 100
events: 1000
ids: 100
4
traces: 100
events: 1155
ids: 100
5
traces: 100
events: 1098
ids: 100
6
traces: 100
events: 500
ids: 100
7
generated traces: 100
traces: 100
events: 721
ids: 100
8
generated traces: 100
traces: 100
events: 622
ids: 100
9
traces: 100
events: 1000
ids: 100
10
generated traces: 100
traces: 100
events: 832
ids: 100
11
generated traces: 100
traces: 100
events: 977
ids: 100
12
traces: 100
events: 500
ids: 100
13
traces: 100
events: 388
ids: 100
14
traces: 100
events: 568
ids: 100
15
traces: 100
events: 1000
ids: 100
16
traces: 100
events: 787
ids: 100
17
traces: 100
events: 1203
ids: 100
18
traces: 100
events: 500
ids: 100
19
generated traces: 100
traces: 100
events: 820
ids: 100
20
generated traces: 100
traces: 100
events: 582
ids: 100
21
traces: 100
events: 1000
ids: 100
22


In [None]:
# Display the per-run results table assembled by the experiment loop.
results

In [None]:
#log.to_csv("log.csv",index=False)
# Preview the first ten rows of `log`, i.e. the event log left over from the
# last run executed by the experiment loop.
log.head(10)

In [None]:
# Bar chart of the number of log rows (non-null case ids) per start day.
rows_per_start_day = log.groupby(log["start_day"])["caseid"].count()
rows_per_start_day.plot.bar()

In [None]:
# Show the start timestamps recorded in the log.
log["start_datetime"]

In [None]:
# NOTE(review): presumably 3.25 hours expressed in days (1/24 day per hour),
# used as a sanity check for the day-based time settings - confirm.
(1/24)*3.25

# Inspect example data

In [None]:
# Display the event log of the last run executed by the experiment loop.
log

In [None]:
# Summary statistics of the arrival timestamps.
arrival_times = log["arrival_datetime"]
arrival_times.describe()

In [None]:
# Bar chart of the number of log rows (non-null case ids) per day-of-month
# of the arrival timestamp.
arrival_day = log["arrival_datetime"].dt.day
log.groupby(arrival_day)["caseid"].count().plot.bar()

In [None]:
# Bar chart of the number of log rows (non-null case ids) per arrival year
# (swap `.dt.year` for `.dt.dayofyear` to look at the day of the year instead).
arrival_year = log["arrival_datetime"].dt.year
log.groupby(arrival_year)["caseid"].count().plot.bar()