# Prepare the labelled datasets

- We need the partitioned BPI2013 & BPI2015 labelled by partition
- In addition, we need to sample them down such that they are more balanced

In [21]:
import pm4py
import os
import pandas as pd
import numpy as np

In [22]:
def read_log_partitions(logdir):
    all_logs = []
    for ds in os.listdir(logdir):
        print(ds)
        fn = os.path.join(logdir, ds)
        log = pm4py.read_xes(fn)
        print(len(log))
        log["origin"] = ds
        all_logs.append(log)
    log = pd.concat(all_logs)
    log = log.reset_index(drop=True)
    return log


In [23]:
def _make_ccn_unique(log):
    # Problem: some ccns appear in several partitions --> relabel the ccn
    # Construct a unique ccn consisting of the original ccn + the origin of the trace
    new_ccn = log["case:concept:name"] + "-" + log["origin"]
    log["case:concept:name_from_partition"] = log["case:concept:name"]
    log["case:concept:name"] = new_ccn

    # Check that the new one does not contain duplicates
    tmp = log.groupby(["case:concept:name", "origin"]).count().reset_index()
    assert len(tmp[tmp.duplicated(["case:concept:name", "origin"])]) == 0
    return log

## BPI2013

In [24]:
logdir = "./datasets/bpi2013/"
target_ds_name = "./datasets/bpi2013-downsampled.xes"

In [25]:
log = read_log_partitions(logdir)
log = _make_ccn_unique(log)

# Ensure the index is alright, should be a range
print(log.index)
print(log.index[-100:])

closed.xes
6660
incidents.xes
65533
open.xes
2351
RangeIndex(start=0, stop=74544, step=1)
RangeIndex(start=74444, stop=74544, step=1)


In [26]:
# Downsample the log
print("N traces:")
n_traces_per_log = []
for o, df_ in log.groupby("origin"):
    n_traces = len(df_["case:concept:name"].unique())

    print(o, n_traces)
    n_traces_per_log.append((o, n_traces))

o, n_traces_max = max(n_traces_per_log, key=lambda x: x[1])
print(o, n_traces_max)

print("N traces relative to max n traces in partition:")
for o, df_ in log.groupby("origin"):
    n_traces_rel = len(df_["case:concept:name"].unique())/n_traces_max
    print(o, n_traces_rel)

# --> Downsample incidents.xes to approx 20 %

N traces:
closed.xes 1487
incidents.xes 7554
open.xes 819
incidents.xes 7554
N traces relative to max n traces in partition:
closed.xes 0.19684935133703998
incidents.xes 1.0
open.xes 0.10841938046068308


In [27]:
# reduce log by selecting 20 % of traces belonging to incidents.xes
# keep the rest the same
incident_ccn = log[log["origin"] == "incidents.xes"]["case:concept:name"].unique()

to_remove = np.random.choice(incident_ccn, round(len(incident_ccn) * 0.8), replace=False)
assert log[log["case:concept:name"].isin(to_remove)]["origin"].unique()[0] == "incidents.xes"
print(f"Removing {len(to_remove) / len(incident_ccn)} % of incident rows")
log = log[~log["case:concept:name"].isin(to_remove)]
log.reset_index(drop=True, inplace=True)

Removing 0.7999735239608154 % of incident rows


In [28]:
# Check one more time: number of traces per cluster label
for o, df_ in log.groupby("origin"):
    print(o)
    print(len(df_["case:concept:name"].unique()))

closed.xes
1487
incidents.xes
1511
open.xes
819


In [29]:
# Difference expected because the name_from_partition is the original and there we had duplications across the cases
len(log["case:concept:name"].unique()) - len(log["case:concept:name_from_partition"].unique())

465

In [30]:
# Again, ensure the index is alright, should be a range
print(log.index)
print(log.index[-100:])

RangeIndex(start=0, stop=21859, step=1)
RangeIndex(start=21759, stop=21859, step=1)


In [31]:
# Store downsampled log
pm4py.write_xes(log, target_ds_name)

In [35]:
# For checks
rl = pm4py.read_xes(target_ds_name)

In [36]:
# Check one more time: number of traces per cluster label
for o, df_ in rl.groupby("origin"):
    print(o)
    print(len(df_["case:concept:name"].unique()))

closed.xes
1487
incidents.xes
1511
open.xes
819


## BPI2015

For BPI 2015, we also reduce the data size significantly for faster results

In [60]:
logdir = "./datasets/bpi2015/"
target_ds_name = "./datasets/bpi2015-downsampled-reduced.xes"

In [61]:
log = read_log_partitions(logdir)
log = _make_ccn_unique(log)

# Ensure the index is alright, should be a range
print(log.index)
print(log.index[-100:])

mun_5.xes
59083
mun_4.xes
47293
mun_3.xes
59681
mun_2.xes
44354
mun_1.xes
52217
RangeIndex(start=0, stop=262628, step=1)
RangeIndex(start=262528, stop=262628, step=1)


In [62]:
# Downsample the log
print("N traces:")
n_traces_per_log = []
for o, df_ in log.groupby("origin"):
    n_traces = len(df_["case:concept:name"].unique())

    print(o, n_traces)
    n_traces_per_log.append((o, n_traces))

o, n_traces_max = max(n_traces_per_log, key=lambda x: x[1])
print(o, n_traces_max)

print("N traces relative to max n traces in partition:")
for o, df_ in log.groupby("origin"):
    n_traces_rel = len(df_["case:concept:name"].unique())/n_traces_max
    print(o, n_traces_rel)

# --> Downsample mun3.xes to approx 75 %

N traces:
mun_1.xes 1199
mun_2.xes 832
mun_3.xes 1409
mun_4.xes 1053
mun_5.xes 1156
mun_3.xes 1409
N traces relative to max n traces in partition:
mun_1.xes 0.850958126330731
mun_2.xes 0.5904897090134847
mun_3.xes 1.0
mun_4.xes 0.7473385379701917
mun_5.xes 0.8204400283889283


In [63]:
# reduce log by removing 25 % of traces belonging to mun_3.xes
# keep the rest the same
incident_ccn = log[log["origin"] == "mun_3.xes"]["case:concept:name"].unique()

to_remove = np.random.choice(incident_ccn, round(len(incident_ccn) * 0.25), replace=False)
assert log[log["case:concept:name"].isin(to_remove)]["origin"].unique()[0] == "mun_3.xes"
print(f"Removing {len(to_remove) / len(incident_ccn)} % of incident rows")
log = log[~log["case:concept:name"].isin(to_remove)]
log.reset_index(drop=True, inplace=True)

Removing 0.24982256919801277 % of incident rows


In [64]:
# Check one more time: number of traces per cluster label
for o, df_ in log.groupby("origin"):
    print(o)
    print(len(df_["case:concept:name"].unique()))

print("N traces relative to max n traces in partition:")
for o, df_ in log.groupby("origin"):
    n_traces_rel = len(df_["case:concept:name"].unique())/n_traces_max
    print(o, n_traces_rel)

mun_1.xes
1199
mun_2.xes
832
mun_3.xes
1057
mun_4.xes
1053
mun_5.xes
1156
N traces relative to max n traces in partition:
mun_1.xes 0.850958126330731
mun_2.xes 0.5904897090134847
mun_3.xes 0.7501774308019872
mun_4.xes 0.7473385379701917
mun_5.xes 0.8204400283889283


In [None]:
# IMPORTANT
# Additional step only for BPI2015: 
# reduce the overall log to 15 % and ensure the traces per cluster are still somewhat balanced
unique_ccns = log["case:concept:name"].unique()
to_keep = np.random.choice(unique_ccns, round(len(unique_ccns) * 0.20), replace=False)

log = log[log["case:concept:name"].isin(to_keep)]

print("N traces relative to max n traces in partition:")
for o, df_ in log.groupby("origin"):
    n_traces_rel = len(df_["case:concept:name"].unique())
    print(o, n_traces_rel)

log.reset_index(drop=True, inplace=True)

N traces relative to max n traces in partition:
mun_1.xes 231
mun_2.xes 156
mun_3.xes 233
mun_4.xes 210
mun_5.xes 229


In [70]:
# Again, ensure the index is alright, should be a range
print(log.index)
print(log.index[-100:])

RangeIndex(start=0, stop=49516, step=1)
RangeIndex(start=49416, stop=49516, step=1)


In [71]:
# Store downsampled log
pm4py.write_xes(log, target_ds_name)

In [72]:
# For checks
rl = pm4py.read_xes(target_ds_name)