In [2]:
import pm4py

In [3]:
def import_xes(file_path):
    log = pm4py.read_xes(file_path)
    event_log = pm4py.convert_to_dataframe(log)

    return event_log

event_log = import_xes("/Users/6706363/Downloads/BPI_Challenge_2013_incidents.xes")



parsing log, completed traces ::   0%|          | 0/7554 [00:00<?, ?it/s]

In [4]:
df = event_log[['case:concept:name', 'concept:name', 'org:resource', 'time:timestamp']]

df = df.sort_values(by=['org:resource', 'time:timestamp'])

df.head(10)

Unnamed: 0,case:concept:name,concept:name,org:resource,time:timestamp
4897,1-701284355,Accepted,-,2012-03-26 15:54:02+00:00
4898,1-701284355,Accepted,-,2012-03-26 15:54:07+00:00
8796,1-716650593,Accepted,-,2012-04-12 11:15:26+00:00
13687,1-726974974,Accepted,-,2012-04-20 07:10:11+00:00
8807,1-716650593,Accepted,-,2012-04-26 16:10:57+00:00
21610,1-731896824,Accepted,-,2012-04-28 07:02:38+00:00
33079,1-736602136,Queued,-,2012-05-02 13:30:22+00:00
41404,1-738331603,Accepted,-,2012-05-02 13:32:32+00:00
33083,1-736602136,Accepted,-,2012-05-02 14:51:19+00:00
33084,1-736602136,Queued,-,2012-05-02 14:56:29+00:00


In [5]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

def create_activity_sequences(df, prefix_length):
    sequences = []
    next_activities = []
    resources = []

    for case_id, case_df in df.groupby('concept:name'):
        activities = case_df['concept:name'].values
        if len(activities) >= prefix_length + 1:
            prefix = activities[:prefix_length]
            next_activity = activities[prefix_length]
            sequences.append(prefix)
            next_activities.append(next_activity)
            resources.append(case_id)

    sequences_df = pd.DataFrame(sequences, columns=[f"activity_{i + 1}" for i in range(prefix_length)])
    sequences_df['next_activity'] = next_activities
    sequences_df['org:resource'] = resources
    return sequences_df


max_sequence_length = df.groupby('concept:name').size().max()
print(f"Maximum sequence length: {max_sequence_length}")

# prefix_lengths = [10, 20, 30, 40, 50, 75, 100, 125, 150] #for BPIC2013 resources
prefix_lengths = [10, 15, 20, 25, 30, 35, 40, 45, 50] #for BPIC2013 cases

# prefix_lengths = [100, 150, 200, 400, 600, 800, 1000, 1200, 1400, 1500, 2000] #for BPIC2017 resources
# prefix_lengths = [10, 15, 20, 25, 30, 35, 40, 50, 60, 70, 80] #for BPIC2017 cases
# prefix_lengths = [100, 150, 200, 400, 600, 800, 1000, 1200, 1400, 1500, 2000, 2500] #for BPIC2018 resources
# prefix_lengths = [100, 125, 150, 175, 200, 225, 250, 300, 350, 400, 450, 500, 600] #for BPIC2018 cases 
# prefix_lengths = [100, 150, 200, 300, 400, 500, 600, 700, 800] #for BPIC2019 resources
# prefix_lengths = [100, 125, 150, 175, 200, 225, 250, 300, 400] #for BPIC2019 cases


results = []

for prefix_length in prefix_lengths:
    sequences_df = create_activity_sequences(df, prefix_length)

    if sequences_df.empty:
        print(f"No valid sequences for prefix length {prefix_length}")
        results.append({'Prefix Length': prefix_length, 'Accuracy': None, 'Num Samples': 0})
        continue

    # Label encode activities
    label_encoder = LabelEncoder()
    all_activities = sequences_df[[f"activity_{i+1}" for i in range(prefix_length)] + ['next_activity']].values.flatten()
    label_encoder.fit(all_activities)

    for col in [f"activity_{i+1}" for i in range(prefix_length)] + ['next_activity']:
        sequences_df[col] = label_encoder.transform(sequences_df[col])

    # Define features and target
    X = sequences_df[[f"activity_{i+1}" for i in range(prefix_length)]]
    y = sequences_df['next_activity']

    # Handle rare classes
    rare_classes = y.value_counts()[y.value_counts() == 1].index.tolist()
    if len(rare_classes) > 0:
        y = y.replace(rare_classes, -1)

    # Train/test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Majority class predictor
    dummy_clf = DummyClassifier(strategy="most_frequent")
    dummy_clf.fit(X_train, y_train)
    y_pred = dummy_clf.predict(X_test)

    # Accuracy
    accuracy = accuracy_score(y_test, y_pred)

    # Save results
    results.append({
        'Prefix Length': prefix_length,
        'Accuracy': round(accuracy, 4),
        'Num Samples': len(sequences_df)
    })

results_df = pd.DataFrame(results)
results_df.to_excel("majority_class_accuracies_BPIC2013_resource_new.xlsx", index=False)

print("Results saved to majority_class_accuracies.xlsx")

Maximum sequence length: 40117
Results saved to majority_class_accuracies.xlsx


In [6]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

def create_resource_sequences(df, prefix_length):
    sequences = []
    next_activities = []
    resources = []

    # group by resource instead of case
    for resource, res_df in df.groupby('org:resource'):
        # sort by timestamp
        res_df = res_df.sort_values('time:timestamp')
        activities = res_df['concept:name'].values

        # same logic as before, but applied to resource traces
        if len(activities) >= prefix_length + 1:
            prefix = activities[:prefix_length]
            next_activity = activities[prefix_length]

            sequences.append(prefix)
            next_activities.append(next_activity)
            resources.append(resource)

    # build DataFrame
    sequences_df = pd.DataFrame(
        sequences,
        columns=[f"activity_{i+1}" for i in range(prefix_length)]
    )
    sequences_df["next_activity"] = next_activities
    sequences_df["org:resource"] = resources

    return sequences_df


# --- Compute maximum prefix length per resource ---
max_sequence_length = df.groupby("org:resource").size().max()
print(f"Maximum resource-trace length: {max_sequence_length}")


# Example prefix lengths for resource-centric BPIC2013
prefix_lengths = [10, 20, 30, 40, 50, 75, 100]


results = []

for prefix_length in prefix_lengths:
    sequences_df = create_resource_sequences(df, prefix_length)

    if sequences_df.empty:
        print(f"No valid sequences for prefix length {prefix_length}")
        results.append({
            "Prefix Length": prefix_length,
            "Accuracy": None,
            "Num Samples": 0
        })
        continue

    # label encode
    label_encoder = LabelEncoder()
    all_acts = sequences_df[
        [f"activity_{i+1}" for i in range(prefix_length)] + ["next_activity"]
    ].values.flatten()

    label_encoder.fit(all_acts)

    for col in [f"activity_{i+1}" for i in range(prefix_length)] + ["next_activity"]:
        sequences_df[col] = label_encoder.transform(sequences_df[col])

    # features and target
    X = sequences_df[[f"activity_{i+1}" for i in range(prefix_length)]]
    y = sequences_df["next_activity"]

    # replace rare classes with -1
    rare_classes = y.value_counts()[y.value_counts() == 1].index.tolist()
    if rare_classes:
        y = y.replace(rare_classes, -1)

    # split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # majority baseline
    dummy = DummyClassifier(strategy="most_frequent")
    dummy.fit(X_train, y_train)
    y_pred = dummy.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)

    # store results
    results.append({
        "Prefix Length": prefix_length,
        "Accuracy": round(accuracy, 4),
        "Num Samples": len(sequences_df)
    })

# save results
results_df = pd.DataFrame(results)
results_df.to_excel("majority_class_accuracies_resource_centric.xlsx", index=False)

print("Saved: majority_class_accuracies_resource_centric.xlsx")

Maximum resource-trace length: 6162
Saved: majority_class_accuracies_resource_centric.xlsx
