# Dataset Creation

- [ ] Create the datasets for different values of k, different sizes, and different variant-to-trace ratios (1 h)
  - Complexity:
    - var_to_trace ratio (high, medium, small)
    - maximum trace length (10, 15, 20)
    - nb_of_sequences (small, medium, large)
    - model modification (small, medium, large) 
  - 4 values of k (3, 4, 5, 6)

In [1]:
import pm4py
import utils as utl
import os
import pandas as pd
import numpy as np
# from pm4py.objects.log.util import sampling

## Road traffic log

In [2]:
# Configuration for the road-traffic synthetic dataset.
log_name = "road_traffic"
target_k = 3  # number of clusters (one modified model per cluster)
n_per_model = [5, 5, 5]  # traces per model, in units of 1k
assert len(n_per_model) == target_k

# Convert the per-model counts from thousands to absolute numbers of traces.
size_per_model = [thousands * 1000 for thousands in n_per_model]

log = pm4py.read_xes(f"./datasets/datasets/{log_name}.xes")

# The models folder lives inside the dataset folder, so creating it also
# creates `data_dir`.
data_dir = f"./data-synthetic/{log_name}/"
model_dir = f"{data_dir}models/"
os.makedirs(model_dir, exist_ok=True)

print(f"Creating synthetic dataset from log: {log_name} with {target_k} clusters and {sum(size_per_model)} traces in total")



Creating synthetic dataset from log: road_traffic with 3 clusters and 15000 traces in total


In [3]:
# Build the base process tree from the log, then derive `target_k` modified
# variants of it (the helper reports which activities it swaps).
tree = utl.load_tree(log)
modified_trees = utl.create_modified_models(tree, k=target_k)

# Keep one rendering per modified tree so individual trees can be displayed later.
viz = [
    pm4py.visualization.process_tree.visualizer.apply(modified_tree)
    for modified_tree in modified_trees
]

Swapping activities: Send Fine <--> Receive Result Appeal from Prefecture
Swapping activities: Insert Date Appeal to Prefecture <--> Payment
Swapping activities: Insert Date Appeal to Prefecture <--> Notify Result Appeal to Offender


In [4]:
# Overlap should be small between models
to_compare = utl.create_string_traces_to_compare(modified_trees)
overlaps = utl._measure_overlaps(to_compare)
print(overlaps)

[0.0007757951900698216, 0.0029770211182435576, 0.0]


In [5]:
# Generate the labelled synthetic log from the modified trees.
# NOTE(review): `n_traces` is hard-coded to 250 although the configuration cell
# announces sum(size_per_model) traces in total — confirm which is intended.
synthetic_log = utl.create_synthetic_log(modified_trees, n_traces=250, var_to_trace_ratio=0.5, max_trace_length=50)

In [7]:
# Sanity check per cluster label: number of distinct cases and number of variants.
for cluster_label, cluster_df in synthetic_log.groupby("label"):
    print(f"Number of traces in the cluster: {cluster_label}")
    # One group per case id, so the group count is the number of traces.
    print(cluster_df.groupby("case:concept:name").ngroups)
    print("Number of variants in the cluster:")
    variants = pm4py.statistics.variants.log.get.get_variants(cluster_df)
    print(len(variants))


Number of traces in the cluster: 0
250
Number of variants in the cluster:
112
Number of traces in the cluster: 1
256
Number of variants in the cluster:
106
Number of traces in the cluster: 2
250
Number of variants in the cluster:
109


In [None]:

# Persist the labelled log next to the models it was generated from.
labelled_log_path = os.path.join(data_dir, f"{log_name}-labelled.xes")
pm4py.write_xes(synthetic_log, labelled_log_path)

# One PTML file per modified tree, numbered by its position in the list.
for k, mt in enumerate(modified_trees):
    model_path = os.path.join(model_dir, f"model-{k}.ptml")
    pm4py.write_ptml(mt, model_path)


In [9]:
import k_traceoids as ktr

## Hospital billing log

In [None]:
# Configuration for the hospital-billing synthetic dataset.
log_name = "hospital_billing"
target_k = 4  # number of clusters (one modified model per cluster)
# Traces per model, in units of 1k, because pm4py's simulation generates 1k traces at a time.
n_per_model = [20, 20, 20, 20]

assert len(n_per_model) == target_k
# Convert the per-model counts from thousands to absolute numbers of traces.
size_per_model = [thousands * 1000 for thousands in n_per_model]

log = pm4py.read_xes(f"./datasets/datasets/{log_name}.xes")

# The models folder lives inside the dataset folder, so creating it also
# creates `data_dir`.
data_dir = f"./data-synthetic/{log_name}/"
model_dir = f"{data_dir}models/"
os.makedirs(model_dir, exist_ok=True)

print(f"Creating synthetic dataset from log: {log_name} with {target_k} clusters and {sum(size_per_model)} traces in total")

In [None]:
# Build the base process tree from the log, then derive `target_k` modified
# variants of it.
tree = utl.load_tree(log)
# The helper lives in `utils` (imported as `utl`); the unqualified name is not
# defined in this notebook and raised a NameError on a fresh kernel.
modified_trees = utl.create_modified_models(tree, k=target_k)

# Keep one rendering per modified tree so individual trees can be displayed later.
viz = []
for mt in modified_trees:
    viz.append(pm4py.visualization.process_tree.visualizer.apply(mt))

In [None]:

# Render the unmodified base tree; as the last expression of the cell, the
# result is displayed.
pm4py.visualization.process_tree.visualizer.apply(tree)

In [None]:
# Display the rendering of the first modified tree.
viz[0]

In [None]:
# Overlap should be small between models
to_compare = create_string_traces_to_compare(modified_trees)
overlaps = utl._measure_overlaps(to_compare)
print(overlaps)

In [None]:
# Simulate the labelled log for the hospital-billing models, then persist the
# log and one PTML file per modified tree.
# NOTE(review): `simulate_log` is neither defined nor imported in the visible
# notebook, so this cell fails on a fresh kernel. The road-traffic section uses
# `utl.create_synthetic_log(...)` instead — confirm the intended helper and
# its arguments before running.
synthetic_log = simulate_log(n_per_model, modified_trees)

pm4py.write_xes(synthetic_log, os.path.join(data_dir, f"{log_name}-labelled.xes"))
for k, mt in enumerate(modified_trees):
    pm4py.write_ptml(mt, os.path.join(model_dir, f"model-{k}.ptml"))


In [None]:
# Read the labelled log back from disk to verify what was written.
l =  pm4py.read_xes(os.path.join(data_dir, f"{log_name}-labelled.xes"))

In [None]:
# Display the reloaded log.
l

In [None]:
# Non-null entries per column for each cluster label.
l.groupby("label").count()

In [None]:
## Scratch

In [None]:
# Reload the labelled road-traffic log written earlier in this notebook.
road_traffic_dir = "./data-synthetic/road_traffic"
l = pm4py.read_xes(os.path.join(road_traffic_dir, "road_traffic-labelled.xes"))

In [None]:
# Non-null entries per column for each case id.
l.groupby("case:concept:name").count()

In [None]:
# Display the case-id column of the reloaded log.
l["case:concept:name"]

In [None]:
# Read another log for inspection only; the result is not assigned, so it is
# just displayed.
pm4py.read_xes("./datasets/bpi2015-downsampled-reduced.xes")

## Scratch

In [12]:
# Manual walk-through of one k-traceoids iteration on the labelled log.
k = 5  # number of clusters passed to initialisation and reassignment
pm = "imf"  # passed to ktr.models.calculate_model — presumably the discovery algorithm id, TODO confirm
cc = "al"  # passed to ktr.conformance.check_conformance — presumably the conformance method id, TODO confirm
max_iterations = 5  # NOTE(review): not used in any visible cell
ds = "road_traffic-labelled"

# NOTE(review): this reads from ./datasets/, whereas the labelled log above is
# written under ./data-synthetic/<log_name>/ — confirm the file exists here.
log = ktr.data.prepare_log(f"./datasets/{ds}.xes")

iteration = 0
cluster_assignment = ktr.initialization.initialize_clusters(log, k)


In [14]:

# Compute models from the log for the current cluster assignment, using `pm`.
models = ktr.models.calculate_model(log, pm, cluster_assignment)

In [15]:

# Check the log against the computed models, using the method selected by `cc`.
fitness = ktr.conformance.check_conformance(log, models, cc)

In [16]:
# Name of the column that will hold the assignment produced in this iteration.
iteration = 1
ca_col = f"cluster_assignment_{iteration}"

In [17]:
# Store this iteration's reassignment as a new column next to the earlier ones.
new_assignment = ktr.reassign.reassign_clusters(fitness, cluster_assignment, k)
cluster_assignment[ca_col] = new_assignment

In [None]:
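# Take the last two columns and check what percentage of the assignments is equal. TODO: the original note breaks off at "if the percentage is" — state the threshold and the resulting action.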

In [19]:
import pandas as pd

In [None]:
# Compare the two most recent assignment columns element-wise.
col1 = cluster_assignment.iloc[:, -2]
col2 = cluster_assignment.iloc[:, -1]
# Boolean Series: True where a case kept the same cluster.
percent_equal = col1.eq(col2)

In [25]:
# Share of cases whose assignment is identical in the last two columns
# (number of True values divided by the number of non-null entries).
percent_equal.sum() / percent_equal.count()

np.float64(0.36507936507936506)

In [29]:
# Print a marker when at most half of the assignments stayed the same between
# the two most recent columns.
if percent_equal.mean() <= 0.5:
    print("nah")

nah


In [18]:
# Display the assignment table, including the column added for this iteration.
cluster_assignment

Unnamed: 0,case_index,cluster_assignment_init,cluster_assignment_1
0,0,0.0,1
1,1,1.0,1
2,2,2.0,1
3,3,3.0,1
4,4,4.0,1
...,...,...,...
751,751,2.0,1
752,752,2.0,4
753,753,4.0,4
754,754,0.0,4
