In [2]:
### Display
from IPython.display import display
## Data Handling
import pandas as pd
import pm4py
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import random
import numpy as np
from cvxopt.misc_solvers import scale


In [3]:
# please do not change or delete this cell (probably not needed, but just in case ;))
random.seed(42)
np.random.seed(42)

In [4]:
# Try fast rustxes importer first (if installed), otherwise fall back to default importer.
# NOTE: the handler below catches every Exception, so the fallback also runs for
# failures unrelated to rustxes; the second read is not guarded, so a genuine
# problem with the file surfaces there instead of being swallowed.
try:
    log = pm4py.read_xes("fines_event_log.xes", variant="rustxes")
except Exception as e:
    print("rustxes importer not available, falling back to default XES importer:", repr(e))
    log = pm4py.read_xes("fines_event_log.xes")

# Ensure the dataframe is in the format expected by pm4py algorithms
log = pm4py.format_dataframe(
    log,
    case_id="case:concept:name",
    activity_key="concept:name",
    timestamp_key="time:timestamp",
)
# Order events by case and, within a case, by timestamp; then renumber the rows 0..n-1.
log = log.sort_values(["case:concept:name", "time:timestamp"]).reset_index(drop=True)

# Last expression of the cell: rendered as the cell output.
log



Unnamed: 0,concept:name,penalty,time:timestamp,dismissal,expense,case:fine,case:concept:name,paymentAmount
0,Create Fine,,2006-07-23 22:00:00+00:00,,,35.0,A1,
1,Send Fine,,2006-12-04 23:00:00+00:00,,11.00,35.0,A1,
2,Create Fine,,2007-03-19 23:00:00+00:00,,,36.0,A10008,
3,Send Fine,,2007-07-16 22:00:00+00:00,,13.00,36.0,A10008,
4,Insert Fine Notification,,2007-08-01 22:00:00+00:00,,,36.0,A10008,
...,...,...,...,...,...,...,...,...
267247,Create Fine,,2002-09-06 22:00:00+00:00,,,131.0,V9999,
267248,Send Fine,,2002-10-24 22:00:00+00:00,,15.16,131.0,V9999,
267249,Insert Fine Notification,,2002-11-03 23:00:00+00:00,,,131.0,V9999,
267250,Add penalty,131.0,2003-01-02 23:00:00+00:00,,,131.0,V9999,


In [5]:
# Number of cases = number of distinct case identifiers in the event log.
# `number_cases` is reused later as the denominator for the variant shares.
number_cases = log["case:concept:name"].nunique()
number_cases

71522

In [6]:
# Number of distinct activity labels that occur in the event log.
number_activities = log["concept:name"].nunique()
number_activities

11

# a) Full Model Discovery

In [None]:
from pm4py.objects.process_tree.obj import Operator

# Mine a process tree from the complete event log with the inductive miner's
# default settings, then derive the Petri net plus its initial/final marking.
pt_full = pm4py.discover_process_tree_inductive(log)
net_full, im_full, fm_full = pm4py.convert_to_petri_net(pt_full)

# Headline and tree are emitted on two consecutive lines.
print(
    "Discovered process tree (Inductive Miner, default parameters):",
    pt_full,
    sep="\n",
)

# --- Helper functions to reason about the process tree (model-based answers) ---
def can_be_empty(node):
    """Tell whether the subtree rooted at ``node`` can produce the empty trace.

    A leaf is empty exactly when it is silent (``label is None``). A sequence
    or parallel node needs every child to be skippable, a loop needs its first
    child (the body) to be skippable, and every other operator (XOR, OR, and
    anything unrecognised) needs at least one skippable child.
    """
    operator = node.operator
    if operator is None:
        return node.label is None

    children = node.children

    if operator in (Operator.SEQUENCE, Operator.PARALLEL):
        for child in children:
            if not can_be_empty(child):
                return False
        return True

    if operator == Operator.LOOP:
        # The body runs at least once; a loop without children counts as empty.
        if not children:
            return True
        return can_be_empty(children[0])

    # XOR / OR / any other operator: one skippable branch is enough.
    for child in children:
        if can_be_empty(child):
            return True
    return False


def start_activities(node):
    """Return the set of visible activities that can start a trace of this subtree.

    Rules per operator:

    * leaf: its own label (nothing for a silent leaf);
    * SEQUENCE: the start activities of the children up to and including the
      first child that cannot be skipped;
    * LOOP: the start activities of the body (first child); if the body can be
      skipped, the remaining children can begin the trace as well
      (e.g. ``*(tau, A)`` can start with ``A``);
    * XOR / OR / PARALLEL and any other operator: the union over all children.
    """
    if node.operator is None:
        return set() if node.label is None else {node.label}

    op = node.operator
    children = node.children

    if op == Operator.SEQUENCE:
        res = set()
        for c in children:
            res |= start_activities(c)
            if not can_be_empty(c):
                # A mandatory child blocks everything behind it from starting.
                break
        return res

    if op == Operator.LOOP:
        if not children:
            return set()
        body = children[0]
        res = set(start_activities(body))
        if can_be_empty(body):
            # With an empty body the first visible event may come from the
            # redo part (or any further loop child).
            for c in children[1:]:
                res |= start_activities(c)
        return res

    # XOR, OR, PARALLEL and unknown operators: any child may go first.
    res = set()
    for c in children:
        res |= start_activities(c)
    return res


def labels_in_subtree(node):
    """Collect the labels of all visible leaves below (and including) ``node``.

    Silent leaves (``label is None``) contribute nothing; labels on inner
    (operator) nodes are not considered.
    """
    found = set()
    pending = [node]
    # Explicit stack instead of recursion; visiting order is irrelevant for a set.
    while pending:
        current = pending.pop()
        if current.operator is None:
            if current.label is not None:
                found.add(current.label)
        else:
            pending.extend(current.children)
    return found


def mandatory_all(node):
    """Return the labels that occur in every trace this subtree can generate.

    * leaf: its label (nothing for a silent leaf);
    * LOOP: whatever is mandatory in the first child (the body);
    * XOR / OR: only labels mandatory in *every* child;
    * SEQUENCE / PARALLEL and any other operator: labels mandatory in any child.
    """
    operator = node.operator
    if operator is None:
        return {node.label} if node.label is not None else set()

    if operator == Operator.LOOP:
        return mandatory_all(node.children[0]) if node.children else set()

    per_child = [mandatory_all(child) for child in node.children]

    if operator in (Operator.XOR, Operator.OR):
        # A choice only guarantees what all of its alternatives guarantee.
        return set.intersection(*per_child) if per_child else set()

    return set().union(*per_child)


def find_paths(node, target_label, path=None):
    """List every root-to-leaf node path ending in a leaf labelled ``target_label``.

    ``path`` holds the ancestors visited so far (it is never mutated); each
    returned path is a list of nodes starting with those ancestors, followed
    by ``node`` and its descendants down to the matching leaf. Paths appear in
    depth-first, left-to-right order.
    """
    trail = (path or []) + [node]

    if node.operator is None:
        if node.label == target_label:
            return [trail]
        return []

    matches = []
    for child in node.children:
        matches += find_paths(child, target_label, trail)
    return matches


def forced_labels_if_activity_occurs(tree, target_label):
    """Return labels that are guaranteed to occur if ``target_label`` occurs (model-based).

    For every leaf carrying ``target_label`` the path to the root is walked
    upwards, collecting what the surrounding operators make unavoidable:

    * SEQUENCE / PARALLEL parent: everything mandatory in the sibling subtrees;
    * LOOP parent, when the path runs through a child other than the loop body:
      everything mandatory in the body, because the body must execute before
      any other loop part can;
    * other parents (e.g. XOR / OR): nothing.

    If the label has several leaves, only labels forced on every path are
    returned. An empty set is returned when the label is not in the tree.
    """
    paths = find_paths(tree, target_label)
    if not paths:
        return set()

    forced_sets = []
    for path in paths:
        forced = {target_label}
        # Traverse upwards from leaf to root.
        for i in range(len(path) - 1, 0, -1):
            child = path[i]
            parent = path[i - 1]
            if parent.operator in (Operator.SEQUENCE, Operator.PARALLEL):
                for sib in parent.children:
                    if sib is not child:
                        forced |= mandatory_all(sib)
            elif parent.operator == Operator.LOOP:
                body = parent.children[0]
                if child is not body:
                    # Reaching the redo part implies the body has already run.
                    forced |= mandatory_all(body)
        forced_sets.append(forced)

    return set.intersection(*forced_sets)


def repeatable_activities(tree):
    """Return the visible activities that can occur more than once in a trace.

    An activity is repeatable when it sits below a LOOP node, either in the
    loop body (first child) or in the redo part (second child): each further
    iteration executes the redo part and then the body again, so both can
    repeat (e.g. ``A`` in ``*(tau, A)``).
    """
    reps = set()

    def visit(node):
        children = getattr(node, "children", []) or []
        if node.operator == Operator.LOOP:
            # Only body and redo repeat; any further loop child is left out.
            for part in children[:2]:
                reps.update(labels_in_subtree(part))
        for c in children:
            visit(c)

    visit(tree)
    return reps


# Derive all model-based facts first, then report them in question order.
starts = sorted(start_activities(pt_full))
forced_for_judge = forced_labels_if_activity_occurs(pt_full, "Appeal to Judge")
repeatable = sorted(repeatable_activities(pt_full))
forced_for_cc = forced_labels_if_activity_occurs(pt_full, "Send for Credit Collection")
# Credit collection is possible without 'Send Fine' unless the model forces 'Send Fine'.
cc_without_send_fine_possible = "Send Fine" not in forced_for_cc

# a1) Start activities
print("\n(a1) Start activities (from the model):", starts)

# a2) Mandatory activities/order for traces containing an appeal to a judge
print(
    "\n(a2) If 'Appeal to Judge' occurs, the model forces these activities to occur:",
    sorted(forced_for_judge),
    sep="\n",
)
print(
    "Order (from the process tree): the trace starts with 'Create Fine'. "
    "The model also requires 'Send Fine' if 'Appeal to Judge' occurs, but allows 'Send Fine' and "
    "'Appeal to Judge' in any order after 'Create Fine' (parallel part in the model)."
)

# a3) Activities that can be executed more than once
print("\n(a3) Activities that can occur more than once:", repeatable)

# a4) Credit collection without sending the fine
print(
    "\n(a4) Can 'Send for Credit Collection' happen without 'Send Fine'?",
    cc_without_send_fine_possible,
)

# Optional visualization (process tree); the graph object is the cell's last
# expression and is therefore rendered as the cell output.
from pm4py.visualization.process_tree import visualizer as pt_visualizer
gviz_tree_full = pt_visualizer.apply(pt_full)
gviz_tree_full

# b) Behavior Case Frequency

For this question, we recommend using the following event log filtering methods of PM4PY:
- filter_eventually_follows_relation (https://processintelligence.solutions/static/api/2.7.17/generated/pm4py.filtering.filter_eventually_follows_relation.html)
- filter_directly_follows_relation (https://processintelligence.solutions/static/api/2.7.17/generated/pm4py.filtering.filter_directly_follows_relation.html)

In [None]:
# b) Count cases exhibiting specific (allowed) but suspicious behavior

def first_event_time(activity_name: str, event_log: "pd.DataFrame | None" = None) -> pd.Series:
    """Earliest timestamp of an activity per case (only for cases where the activity occurs).

    Parameters
    ----------
    activity_name:
        Value of ``concept:name`` to look for.
    event_log:
        Event log to search; it must provide the columns ``concept:name``,
        ``case:concept:name`` and ``time:timestamp``. When omitted, the
        notebook-level ``log`` is used, which keeps existing calls unchanged.

    Returns
    -------
    pd.Series
        Indexed by case id, holding the minimum ``time:timestamp`` of the
        matching events. Cases without the activity are absent, so the result
        is empty if the activity never occurs.
    """
    source = log if event_log is None else event_log
    matching = source.loc[source["concept:name"] == activity_name]
    return matching.groupby("case:concept:name")["time:timestamp"].min()


# First occurrence of each relevant activity per case. (b4) only needs to know
# in which cases an activity occurs at all, i.e. the index of these series.
min_payment = first_event_time("Payment")
min_cc = first_event_time("Send for Credit Collection")
min_penalty = first_event_time("Add penalty")
min_send_fine = first_event_time("Send Fine")
min_send_appeal_pref = first_event_time("Send Appeal to Prefecture")
min_appeal_judge = first_event_time("Appeal to Judge")
min_notify_result = first_event_time("Notify Result Appeal to Offender")


def _cases_with_eventually_follows(first_activity, second_activity):
    """Case ids in which some `first_activity` event is eventually followed by a `second_activity` event.

    Uses pm4py's eventually-follows filter (the method recommended for this
    question) instead of comparing only the first timestamps with a strict
    `<`: that comparison ignores later occurrences of the second activity and
    cannot order two events that share a timestamp, which matters here because
    the timestamps of this log are day-granular.
    NOTE(review): how pm4py orders events with identical timestamps is not
    visible from this notebook; confirm against the pm4py documentation.
    """
    matching = pm4py.filter_eventually_follows_relation(
        log, [(first_activity, second_activity)], retain=True
    )
    return set(matching["case:concept:name"].unique())


# (b1) A payment is made but the case is still sent for credit collection
b1_cases = sorted(_cases_with_eventually_follows("Payment", "Send for Credit Collection"))

# (b2) A penalty is added before the fine is sent via post
b2_cases = sorted(_cases_with_eventually_follows("Add penalty", "Send Fine"))

# (b3) An appeal to prefecture or judge is made after a payment
b3_pref = _cases_with_eventually_follows("Payment", "Send Appeal to Prefecture")
b3_judge = _cases_with_eventually_follows("Payment", "Appeal to Judge")
b3_cases = b3_pref | b3_judge

# (b4) Notified about appeal result without any appeal
notify_cases = set(min_notify_result.index)
appeal_cases = set(min_send_appeal_pref.index) | set(min_appeal_judge.index)
b4_cases = notify_cases - appeal_cases

summary_b = pd.DataFrame(
    {
        "behavior": [
            "payment -> credit collection",
            "penalty -> send fine",
            "payment -> (appeal prefecture or judge)",
            "notify result without any appeal",
        ],
        "#cases": [len(b1_cases), len(b2_cases), len(b3_cases), len(b4_cases)],
    }
)

summary_b

# c) Variants

In [None]:
# c) Variants: cumulative variant coverage + top-5 variants

variants_counts = pm4py.get_variants_as_tuples(log)  # dict: variant(tuple) -> count
# Most frequent variant first.
sorted_variants = sorted(variants_counts.items(), key=lambda kv: kv[1], reverse=True)

variant_counts = np.array([cnt for _, cnt in sorted_variants], dtype=int)
variant_rel = variant_counts / number_cases  # share of all cases per variant
variant_cum = np.cumsum(variant_rel)

# Add 0th variant to start the plot at (0,0)
x = np.arange(0, len(sorted_variants) + 1)
y = np.concatenate([[0.0], variant_cum])

plt.figure(figsize=(10, 4))
plt.plot(x, y, marker="o", linewidth=1)
plt.ylim(0, 1.01)
plt.xlim(0, len(sorted_variants))
plt.grid(True, alpha=0.3)
plt.xlabel("#variants (ranked by frequency, most frequent first)")
plt.ylabel("cumulative share of cases")
plt.title("Cumulative variant frequency")
plt.tight_layout()
plt.show()

# Interpretation (2 sentences)
top5_share = float(variant_rel[:5].sum())
top10_share = float(variant_rel[:10].sum())
print(
    f"Interpretation: The curve rises very steeply—already the top 5 variants cover about {top5_share:.1%} of all cases. "
    f"After that, the curve flattens (top 10 variants cover about {top10_share:.1%}), indicating a long tail of rare behavior."
)

# 5 most frequent variants and how many cases they cover
top5 = sorted_variants[:5]
top5_df = pd.DataFrame(
    {
        # Rank column follows the number of variants actually available, so a
        # log with fewer than five variants does not raise a length mismatch.
        "rank": range(1, len(top5) + 1),
        "cases": [cnt for _, cnt in top5],
        "variant": [" -> ".join(v) for v, _ in top5],
    }
)
top5_df

# d) Case overview

In [None]:
# d) Case overview: open vs closed (paid full / dismissed / credit collection)

# Case-level amounts (expense/penalty at most once per case, but multiple payments can exist)
case_fine = log.groupby("case:concept:name")["case:fine"].first().fillna(0)
case_expense = log.groupby("case:concept:name")["expense"].max().fillna(0)
case_penalty = log.groupby("case:concept:name")["penalty"].max().fillna(0)
due_amount = case_fine + case_expense + case_penalty

paid_amount = log.groupby("case:concept:name")["paymentAmount"].sum().fillna(0)
# A case counts as dismissed when any of its events carries a dismissal value.
is_dismissed = log["dismissal"].notna().groupby(log["case:concept:name"]).any()

cc_case_ids = set(log.loc[log["concept:name"] == "Send for Credit Collection", "case:concept:name"].unique())
dismissed_case_ids = set(is_dismissed[is_dismissed].index)
# Amounts are sums of floats; compare them rounded to cents so that a fully
# paid case is not missed because of binary floating-point representation error.
paid_in_full = paid_amount.round(2) >= due_amount.round(2)
paid_full_case_ids = set(paid_in_full.index[paid_in_full])

# Classification (mutually exclusive in the provided data). Should categories
# overlap, the precedence is: credit collection, then dismissed, then paid in full.
case_status = pd.Series("open", index=due_amount.index)
case_status.loc[list(cc_case_ids)] = "credit_collection"
case_status.loc[list(dismissed_case_ids - cc_case_ids)] = "dismissed"
case_status.loc[list(paid_full_case_ids - cc_case_ids - dismissed_case_ids)] = "paid_full"

status_counts = case_status.value_counts()

labels = [
    f"Open ({status_counts.get('open', 0)})",
    f"Closed: paid full ({status_counts.get('paid_full', 0)})",
    f"Closed: dismissed ({status_counts.get('dismissed', 0)})",
    f"Closed: credit collection ({status_counts.get('credit_collection', 0)})",
]
sizes = [
    int(status_counts.get("open", 0)),
    int(status_counts.get("paid_full", 0)),
    int(status_counts.get("dismissed", 0)),
    int(status_counts.get("credit_collection", 0)),
]

plt.figure(figsize=(7, 5))
plt.pie(sizes, labels=labels, autopct="%1.1f%%", startangle=90)
plt.axis("equal")
plt.title("Case status (open vs closed categories)")
plt.tight_layout()
plt.show()

# Every case that is not open is closed; reused by the filtering step in (e).
closed_case_ids = set(case_status.index[case_status != "open"])
print("\nClosed cases:", len(closed_case_ids))
print("Open cases:", int(status_counts.get('open', 0)))


# e) Filtered log

In [None]:
from IPython.display import display

# e) Sublog with closed cases only + keep the 5 most frequent closed-case variants
if "closed_case_ids" not in globals():
    closed_case_ids = set(case_status.index[case_status != "open"])

closed_log = log[log["case:concept:name"].isin(closed_case_ids)].copy()
print("Closed cases in sublog:", closed_log["case:concept:name"].nunique())

# Variant frequencies (closed cases only), most frequent first
closed_variants_counts = pm4py.get_variants_as_tuples(closed_log)
sorted_closed_variants = sorted(closed_variants_counts.items(), key=lambda kv: kv[1], reverse=True)
top5_closed_variants = sorted_closed_variants[:5]

top5_closed_df = pd.DataFrame(
    {
        # Rank column follows the number of variants actually available, so
        # fewer than five variants do not raise a length mismatch.
        "rank": range(1, len(top5_closed_variants) + 1),
        "cases": [cnt for _, cnt in top5_closed_variants],
        "variant": [" -> ".join(v) for v, _ in top5_closed_variants],
    }
)
display(top5_closed_df)

# Filter the closed log to keep only cases belonging to these 5 variants:
# rebuild each case's activity sequence and keep the cases whose sequence is
# one of the selected variants.
case_variant_tuple = (
    closed_log.sort_values(["case:concept:name", "time:timestamp"])
    .groupby("case:concept:name")["concept:name"]
    .apply(tuple)
)
top5_variant_set = {v for v, _ in top5_closed_variants}
selected_case_ids = case_variant_tuple[case_variant_tuple.isin(top5_variant_set)].index

filtered_log = closed_log[closed_log["case:concept:name"].isin(selected_case_ids)].copy()
print("\nCases in filtered log (top-5 closed variants):", filtered_log["case:concept:name"].nunique())

# Discover Petri net on the filtered log (inductive miner, default parameters)
pt_filtered = pm4py.discover_process_tree_inductive(filtered_log)
net_filtered, im_filtered, fm_filtered = pm4py.convert_to_petri_net(pt_filtered)

print("\nFiltered process tree:")
print(pt_filtered)

from pm4py.visualization.petri_net import visualizer as pn_visualizer
gviz_net_filtered = pn_visualizer.apply(net_filtered, im_filtered, fm_filtered)
# Render explicitly: a bare expression is only shown when it is the last
# statement of a cell, and more statements follow here.
display(gviz_net_filtered)

# Commentary (3-4 sentences)
full_activities = set(log["concept:name"].unique())
filtered_activities = set(filtered_log["concept:name"].unique())
print(
    "\nComment: Compared to the full-log model, the filtered model contains only the core activities "
    f"{sorted(filtered_activities)} and removes all appeal-related behavior ({sorted(full_activities - filtered_activities)}). "
    "In this filtered model, payments occur in a loop and can happen directly after 'Create Fine' (as in the frequent variant 'Create Fine -> Payment'), "
    "so the payment timing is simplified compared to the full-log model. "
    "Also, payments and 'Send for Credit Collection' are mainly alternative outcomes (XOR) here, "
    "whereas the full-log model is more permissive and allows co-occurrence patterns (e.g., payment followed by credit collection)."
)


# f) Fitness

In [None]:
# f) Replay the complete log on the model mined from the filtered log
#    (token-based replay) and report both fitness figures.
fitness = pm4py.fitness_token_based_replay(log, net_filtered, im_filtered, fm_filtered)

# The share of perfectly fitting traces is stored under one of two key names
# depending on the pm4py version; the longer name takes precedence.
if "percentage_of_fitting_traces" in fitness:
    perfect_pct = fitness["percentage_of_fitting_traces"]
else:
    perfect_pct = fitness.get("perc_fit_traces")
log_fitness = fitness.get("log_fitness")

print("Percentage of perfectly fitting traces:", perfect_pct)
print("Log fitness:", log_fitness)

explanation_parts = (
    "\nExplanation: 'percentage of perfectly fitting traces' counts only traces with fitness = 1.0 (no missing/remaining tokens). ",
    "The 'log fitness' is an average over all traces, so traces that are not perfect can still contribute a high fitness value if they deviate only slightly from the model. ",
    "Therefore, log fitness can be much higher than the percentage of perfectly fitting traces.",
)
print("".join(explanation_parts))
