In [1]:
import pandas as pd
from pm4py.objects.log.importer.xes import importer as xes_importer
from pm4py.objects.conversion.log import converter as log_converter

# === Step 1: Load and prepare XES log ===
print("Loading XES log...")
log = xes_importer.apply("/Users/6706363/Downloads/BPI_Challenge_2019.xes")

# Convert to DataFrame
df = log_converter.apply(log, variant=log_converter.Variants.TO_DATA_FRAME)

# Drop duplicate columns, if any
df = df.loc[:, ~df.columns.duplicated()]

# Rename columns to standard names
df.rename(columns={
    "case:concept:name": "case",
    "concept:name": "activity",
    "time:timestamp": "timestamp"
}, inplace=True)


Loading XES log...


  from .autonotebook import tqdm as notebook_tqdm
parsing log, completed traces :: 100%|██████████| 251734/251734 [00:47<00:00, 5270.87it/s]


In [15]:
import pandas as pd

df = df.sort_values(by=["case", "timestamp"]).reset_index(drop=True)
df["next activity"] = df.groupby("case")["activity"].shift(-1)

for i in range(1, 7):
    df[f"split_{i}"] = df["case"].map(lambda x: "train" if hash((x, i)) % 5 < 3 else "test")

def build_prefixes(log_df, min_length):
    samples = []
    for _, case_ in log_df.groupby("case"):
        while (len(case_) - 1) >= min_length:
            pref = case_.iloc[:-1, :]
            samples.append(list(pref["activity"].values))
            case_ = pref
    return samples

leakage_list = []

print("Calculating example leakage only...")
for split in range(1, 7):
    train = df[df[f"split_{split}"] == "train"]
    test = df[df[f"split_{split}"] == "test"]

    train_prefixes = build_prefixes(train, 1)
    test_prefixes = build_prefixes(test, 1)

    train_prefix_set = set(tuple(p) for p in train_prefixes)
    test_prefix_set = set(tuple(p) for p in test_prefixes)

    # Count how many test prefixes appear in training set
    leaked = sum(1 for p in test_prefix_set if p in train_prefix_set)
    leakage_percent = leaked / len(test_prefix_set) if test_prefix_set else 0

    print(f"Split {split}: Leakage = {leakage_percent:.3f}")
    leakage_list.append(leakage_percent)

results_df = pd.DataFrame({
    "split": list(range(1, 7)),
    "leakage": [round(val, 3) for val in leakage_list]
})

results_df.to_csv("2018_example_leakage.csv", index=False)
print("Results saved to 2018_example_leakage.csv")

Calculating example leakage only...
Split 1: Leakage = 0.122
Split 2: Leakage = 0.119
Split 3: Leakage = 0.120
Split 4: Leakage = 0.119
Split 5: Leakage = 0.120
Split 6: Leakage = 0.118
Results saved to example_leakage_only.csv


In [None]:
# Resource-centric example leakage
import pandas as pd

df = df.sort_values(by=["org:resource", "timestamp"]).reset_index(drop=True)
df["next activity"] = df.groupby("org:resource")["activity"].shift(-1)

# Generate deterministic splits based on resource
for i in range(1, 7):
    df[f"split_{i}"] = df["org:resource"].map(lambda x: "train" if hash((x, i)) % 5 < 3 else "test")

def build_prefixes(log_df, min_length):
    samples = []
    for _, resource_df in log_df.groupby("org:resource"):
        while (len(resource_df) - 1) >= min_length:
            pref = resource_df.iloc[:-1, :]
            samples.append(list(pref["activity"].values))
            resource_df = pref
    return samples

leakage_list = []

print("Calculating example leakage from resource perspective...")
for split in range(1, 7):
    train = df[df[f"split_{split}"] == "train"]
    test = df[df[f"split_{split}"] == "test"]

    train_prefixes = build_prefixes(train, 1)
    test_prefixes = build_prefixes(test, 1)

    train_prefix_set = set(tuple(p) for p in train_prefixes)
    test_prefix_set = set(tuple(p) for p in test_prefixes)

    # Count how many test prefixes appear in training set
    leaked = sum(1 for p in test_prefix_set if p in train_prefix_set)
    leakage_percent = leaked / len(test_prefix_set) if test_prefix_set else 0

    print(f"Split {split}: Leakage = {leakage_percent:.3f}")
    leakage_list.append(leakage_percent)

results_df = pd.DataFrame({
    "split": list(range(1, 7)),
    "leakage": [round(val, 3) for val in leakage_list]
})

results_df.to_csv("2019_example_leakage_resource.csv", index=False)
print("Results saved to 2019_example_leakage_resource.csv")


Calculating example leakage from resource perspective...
