In [1]:
from pm4py.objects.bpmn.importer import importer as bpmn_importer
from pm4py.objects.conversion.bpmn import converter as bpmn_converter
from pm4py.algo.simulation.playout.petri_net import algorithm as petri_simulator
from pm4py.objects.log.obj import EventLog, Trace, Event
from pm4py.objects.log.exporter.xes import exporter as xes_exporter
from pm4py.objects.log.util import dataframe_utils
from pm4py.objects.conversion.log import converter as log_converter
from collections import defaultdict, Counter
from datetime import datetime, timedelta
import pandas as pd
import random

In [2]:
#!gdown --fuzzy "https://drive.google.com/file/d/1v-ltQdytbyEOqlmKj9b61rPvHtRHQB6E/view?usp=sharing"

In [3]:
#!pip install pm4py==2.2.27
#!pip install gdown

In [4]:
def create_timestamp(log, start_time=None, hours_between=1):
    """
    Flattens an event log into a list of dicts with keys: case, activity, timestamp.

    Every trace starts at ``start_time``; each following event in the same trace is
    stamped ``hours_between`` hours after the previous one. Case IDs are assigned
    sequentially, starting at 1, in the order the traces are iterated.

    Parameters:
        log (Iterable): The event log to flatten (e.g. a pm4py EventLog). Each trace must be
            iterable and each event must support ``event["concept:name"]``.
        start_time (datetime): Optional start time for each trace (default: Jan 1, 2024, 8:00am).
        hours_between (int): Hours of delay between events in a trace.

    Returns:
        List[Dict[str, Any]]: Flattened event log.
    """
    if start_time is None:
        start_time = datetime(2024, 1, 1, 8, 0, 0)

    step = timedelta(hours=hours_between)
    flat_log = []

    # Iterate the `log` argument itself; the previous version read the global
    # `simulated_log`, which silently ignored whatever the caller passed in.
    for case_id, trace in enumerate(log, start=1):
        for position, event in enumerate(trace):
            flat_log.append({
                "case": case_id,
                "activity": event["concept:name"],
                "timestamp": start_time + position * step
            })

    return flat_log

In [5]:
def count_trace_variants(df, case_col="case", activity_col="activity"):
    """
    Tally how often each trace variant occurs in a flat event log.

    A variant is the tuple of activity names belonging to one case, taken in the
    row order in which they appear in ``df``.

    Parameters:
        df (pd.DataFrame): Flat event log holding at least the case and activity columns.
        case_col (str): Column holding the case ID (default: 'case').
        activity_col (str): Column holding the activity name (default: 'activity').

    Returns:
        Counter: Maps each variant (tuple of activity names) to its number of cases.
    """
    # Only the header line is printed; the counts themselves are returned to the caller.
    print("ðŸ“Š Trace Variant Frequencies:")

    activities_per_case = df.groupby(case_col)[activity_col]
    variant_of_each_case = activities_per_case.apply(tuple)
    return Counter(variant_of_each_case)

In [6]:
# --- Step 1: Load BPMN file ---
# The path is relative, so the .bpmn file must sit in the notebook's working directory.
bpmn_path = "wastewater_process_model_wo_circle_opt.bpmn" 
bpmn_graph = bpmn_importer.apply(bpmn_path)

In [7]:
# --- Step 2: Convert BPMN to Petri net ---
# The converter result is unpacked into three names: the net and two markings
# (`im` / `fm` are presumably the initial and final marking — confirm against the pm4py docs).
net, im, fm = bpmn_converter.apply(bpmn_graph)

In [8]:
# --- Step 3: Simulate traces from Petri net ---
# Seed the stdlib RNG so a Restart & Run All can reproduce the same simulated log.
# NOTE(review): assumes the default playout draws from Python's `random` module — confirm
# for the pinned pm4py version.
random.seed(42)

# Address the trace-count setting through the playout variant's own Parameters enum rather
# than a hand-typed string key, so the requested number of traces cannot be silently ignored
# if the string does not match the key pm4py looks up.
playout_parameters = petri_simulator.Variants.BASIC_PLAYOUT.value.Parameters
simulated_log = petri_simulator.apply(
    net,
    im,
    variant=petri_simulator.Variants.BASIC_PLAYOUT,
    parameters={playout_parameters.NO_TRACES: 1200},
)


In [9]:
# Flatten the simulated log into case/activity/timestamp rows, report how often each
# variant occurs, and save the rows as the baseline CSV.
# (The former `flat_log = []` / `case_id = 1` assignments were dead leftovers:
# create_timestamp builds and returns its own list and numbering.)
log = create_timestamp(simulated_log, start_time=None, hours_between=1)

df_log = pd.DataFrame(log)
log_variant_counts = count_trace_variants(df_log)
print(log_variant_counts)
df_log.to_csv("original_log.csv", index=False)

ðŸ“Š Trace Variant Frequencies:
Counter({('Coarse Screens', 'Fine Screens', 'Compacted Screenings', 'Disposal to Landfill'): 498, ('Coarse Screens', 'Fine Screens', 'Primary Clarifiers', 'Sludge Grinder', 'Blend Tank', 'Anaerobic Digester', 'Storage Tank', 'Disposal to Landfill'): 128, ('Coarse Screens', 'Fine Screens', 'Primary Clarifiers', 'Sludge Grinder', 'Blend Tank', 'Anaerobic Digester'): 121, ('Coarse Screens', 'Fine Screens', 'Primary Clarifiers', 'Bio-reactor Tanks', 'Final Clarifier', 'Chlorine Contact Basin', 'Pond', 'De-chlorination Pond', 'Cascade Aerator'): 71, ('Coarse Screens', 'Fine Screens', 'Primary Clarifiers', 'Bio-reactor Tanks', 'Final Clarifier', 'Chlorine Contact Basin', 'Pond', 'De-chlorination Pond', 'Effluent Pumping Station'): 58, ('Coarse Screens', 'Fine Screens', 'Primary Clarifiers', 'Bio-reactor Tanks', 'Wastewater Centrifuge De-watering Thickening', 'Blend Tank', 'Anaerobic Digester', 'Storage Tank', 'Disposal to Landfill'): 26, ('Coarse Screens', 'Fi

In [10]:
# Bucket the simulated traces by variant, i.e. by their ordered tuple of activity names.
variant_dict = defaultdict(list)
for trace in simulated_log:
    activity_names = [event["concept:name"] for event in trace]
    variant = tuple(activity_names)
    variant_dict[variant].append(trace)

In [11]:
# --- Set fixed number of traces per variant ---
traces_per_variant = 91  # exact number per variant
final_log = []
case_id = 1
trace_start = datetime(2024, 1, 1, 8, 0, 0)  # every emitted case restarts at this instant

for variant, traces in variant_dict.items():
    # Pick exactly `traces_per_variant` traces by cycling through the variant's traces:
    # rare variants are repeated, frequent ones are cut to their first entries.
    # Modular indexing replaces the earlier while/for/break loop, which would spin
    # forever if a variant ever had an empty trace list.
    full_traces = [traces[i % len(traces)] for i in range(traces_per_variant)]

    for trace in full_traces:
        timestamp = trace_start
        for event in trace:
            final_log.append({
                "case": case_id,
                "activity": event["concept:name"],
                "timestamp": timestamp
            })
            timestamp += timedelta(hours=1)  # events within a case are spaced one hour apart
        case_id += 1  # each emitted copy becomes its own case
    

In [12]:
# Turn the balanced rows into a DataFrame, report the resulting variant frequencies,
# and export the log to CSV (without the DataFrame index).
df_final = pd.DataFrame(final_log)
final_log_variant_counts = count_trace_variants(df_final)
print(final_log_variant_counts)
df_final.to_csv("even_variant_log.csv", index=False)

ðŸ“Š Trace Variant Frequencies:
Counter({('Coarse Screens', 'Fine Screens', 'Compacted Screenings', 'Disposal to Landfill'): 100, ('Coarse Screens', 'Fine Screens', 'Primary Clarifiers', 'Sludge Grinder', 'Blend Tank', 'Anaerobic Digester', 'Storage Tank', 'Disposal to Landfill'): 100, ('Coarse Screens', 'Fine Screens', 'Primary Clarifiers', 'Bio-reactor Tanks', 'Final Clarifier', 'Chlorine Contact Basin', 'Pond', 'De-chlorination Pond', 'Cascade Aerator'): 100, ('Coarse Screens', 'Fine Screens', 'Primary Clarifiers', 'Bio-reactor Tanks', 'Wastewater Centrifuge De-watering Thickening', 'Blend Tank', 'Anaerobic Digester'): 100, ('Coarse Screens', 'Fine Screens', 'Primary Clarifiers', 'Bio-reactor Tanks', 'Final Clarifier', 'Chlorine Contact Basin', 'Pond', 'De-chlorination Pond', 'Effluent Pumping Station'): 100, ('Coarse Screens', 'Fine Screens', 'Primary Clarifiers', 'Sludge Grinder', 'Blend Tank', 'Anaerobic Digester'): 100, ('Coarse Screens', 'Fine Screens', 'Primary Clarifiers', 'B

## Resample the noisy event log generated by PLG to an even number of traces per variant

In [1]:
import pandas as pd
from collections import defaultdict, Counter
from datetime import datetime, timedelta

df = pd.read_csv("../event_log/wastewater/log_5000_all_noise_30.csv")

# Order all events by timestamp once, with a stable sort so events sharing a timestamp keep
# their file order. Grouping afterwards keeps that row order inside every case, so the
# variant key and the events written out below always describe the same sequence.
# NOTE(review): timestamps are compared as read from the CSV (no date parsing is requested) —
# confirm the stored format sorts chronologically.
events_by_time = df.sort_values("timestamp", kind="mergesort")

# variant (tuple of activity names) -> list of traces, each trace a list of event records.
# A single groupby pass replaces the earlier per-case `df[df["case"] == case_id]` filter,
# which rescanned the whole frame for every case.
variant_dict = defaultdict(list)
for case_id, trace_df in events_by_time.groupby("case"):
    trace_events = trace_df.to_dict("records")
    variant = tuple(event["activity"] for event in trace_events)
    variant_dict[variant].append(trace_events)

traces_per_variant = 5
final_log = []
new_case_id = 1
trace_start = datetime(2024, 1, 1, 8, 0, 0)  # every emitted case restarts at this instant

for variant, traces in variant_dict.items():
    # Cycle through the variant's traces until exactly `traces_per_variant` are selected:
    # rare variants are repeated, frequent ones are cut to their first entries.
    full_traces = [traces[i % len(traces)] for i in range(traces_per_variant)]

    for trace in full_traces:
        timestamp = trace_start
        for event in trace:
            final_log.append({
                "case": new_case_id,
                "activity": event["activity"],
                "timestamp": timestamp
            })
            timestamp += timedelta(hours=1)  # original timestamps are replaced by hourly steps
        new_case_id += 1

final_df = pd.DataFrame(final_log)
final_df.to_csv("replicated_even_variant_log.csv", index=False)

print(f"âœ… Saved {len(final_df)} events from {new_case_id - 1} evenly replicated traces.")


âœ… Saved 35545 events from 5370 evenly replicated traces.
