In [1]:
import numpy as np
import pandas as pd
import pm4py
import json
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import TensorDataset, DataLoader

In [2]:
# Load the event log from the XES file and convert it to a pandas DataFrame.
# The path is relative, so it resolves against the notebook's working directory.
file_path = '../data/BPI Challenge 2012_1_all/BPI_Challenge_2012.xes'
log = pm4py.read_xes(file_path)
# `df` is the frame every later cell filters, sorts and extends.
df = pm4py.convert_to_dataframe(log)



parsing log, completed traces ::   0%|          | 0/13087 [00:00<?, ?it/s]

In [3]:
# Order the events by case id and, within each case, by timestamp; then discard
# the columns that are not used for building the activity sequences.
df = (
    df
    .sort_values(by=["case:concept:name", "time:timestamp"], ascending=True)
    .drop(columns=["lifecycle:transition", "case:REG_DATE", 'org:resource', 'case:AMOUNT_REQ'])
)

In [4]:
# Give every distinct activity label an integer id, counting from 1 in order of
# first appearance (0 stays unused here; create_sequences pads with it).
activity = df['concept:name'].unique()
act_id = np.arange(1, len(activity) + 1)

# id -> label, with plain Python ints as keys so the mapping is JSON-friendly
int_to_act = dict(zip(act_id.tolist(), activity))
# label -> id (the inverse lookup)
act_to_int = {label: idx for idx, label in int_to_act.items()}
print(act_to_int)

{'A_SUBMITTED': 1, 'A_PARTLYSUBMITTED': 2, 'A_PREACCEPTED': 3, 'W_Completeren aanvraag': 4, 'A_ACCEPTED': 5, 'O_SELECTED': 6, 'A_FINALIZED': 7, 'O_CREATED': 8, 'O_SENT': 9, 'W_Nabellen offertes': 10, 'O_SENT_BACK': 11, 'W_Valideren aanvraag': 12, 'A_REGISTERED': 13, 'A_APPROVED': 14, 'O_ACCEPTED': 15, 'A_ACTIVATED': 16, 'O_CANCELLED': 17, 'W_Wijzigen contractgegevens': 18, 'A_DECLINED': 19, 'A_CANCELLED': 20, 'W_Afhandelen leads': 21, 'O_DECLINED': 22, 'W_Nabellen incomplete dossiers': 23, 'W_Beoordelen fraude': 24}


In [5]:
# Add a new column to the dataframe which describes the activity id.
# Series.map looks up each row's 'concept:name' in the label -> id Series.
# Assigning the result by name keeps the cell idempotent: re-running it
# overwrites 'activity_id', whereas merging and then renaming the auto-named
# column 0 would append a second column with the same name.
activity_id = pd.Series(act_to_int)
df = df.assign(activity_id=df['concept:name'].map(activity_id))

In [6]:
# Write the label -> id dictionary to disk as indented JSON.
with open("../data/activity_map.json", "w") as map_file:
    map_file.write(json.dumps(act_to_int, indent=4))

In [7]:
# Preview the first five rows (head() returns 5 rows by default).
df.head()

Unnamed: 0,concept:name,time:timestamp,case:concept:name,activity_id
0,A_SUBMITTED,2011-10-01 00:38:44.546000+00:00,173688,1
1,A_PARTLYSUBMITTED,2011-10-01 00:38:44.880000+00:00,173688,2
2,A_PREACCEPTED,2011-10-01 00:39:37.906000+00:00,173688,3
3,W_Completeren aanvraag,2011-10-01 00:39:38.875000+00:00,173688,4
4,W_Completeren aanvraag,2011-10-01 11:36:46.437000+00:00,173688,4


In [8]:
# Create a sequence of activities (of a certain size, here 5) for every case and take the next activity as target
# if the sequence is less than 5, pre-pad it with zeros
# This part was done by Gemini

def create_sequences(trace, window_size=5):
    """
    Build (history window, next activity) training pairs from one trace.

    For every position i >= 1 the target is trace[i] and the input is the
    window_size most recent events before it, left-padded with 0 when fewer
    than window_size events are available.

    Input:
        trace: A list or array of activity indices (e.g., [1, 5, 8, 2])
        window_size: The fixed length for the model input (X); must be >= 1
    Output:
        X: List of padded sequences, each of length window_size
        y: List of next-step targets (same length as X)
    Raises:
        ValueError: if window_size is smaller than 1
    """
    # A non-positive window cannot give fixed-length rows: slicing with
    # history[-0:] returns the whole history, so rows would come out ragged.
    if window_size < 1:
        raise ValueError(f"window_size must be a positive integer, got {window_size}")

    X = []
    y = []

    # Start at 1: the first event has no history to predict it from.
    for i in range(1, len(trace)):
        # Keep only the most recent events, at most window_size of them.
        history = list(trace[max(0, i - window_size):i])

        # Pre-padding: fill the missing leading positions with 0.
        padding = [0] * (window_size - len(history))

        X.append(padding + history)
        y.append(trace[i])

    return X, y

# To prevent data leakage, split on case ids first and only then build the
# sequences, so all events of one case end up in exactly one of the two sets.
WINDOW_SIZE = 5


def build_windows(traces, window_size):
    """Stack the windows of many traces into model inputs and targets.

    Input:
        traces: iterable of per-case activity-id lists
        window_size: fixed input length, forwarded to create_sequences
    Output:
        X: int64 array of shape (n_samples, window_size)
        y: int64 array of shape (n_samples,)
    """
    X, y = [], []
    for trace in traces:
        X_trace, y_trace = create_sequences(trace, window_size=window_size)
        # Extend the flat sample lists with the results from this trace
        X.extend(X_trace)
        y.extend(y_trace)

    # An explicit dtype avoids NumPy's platform-dependent default integer
    # width; the reshape keeps X two-dimensional even with zero samples.
    return (
        np.array(X, dtype=np.int64).reshape(-1, window_size),
        np.array(y, dtype=np.int64),
    )


# 1. Get unique Case IDs
case_ids = df['case:concept:name'].unique()

# 2. Split Case IDs (80% Train, 20% Test)
train_cases, test_cases = train_test_split(case_ids, test_size=0.2, random_state=42)

# 3. Filter the original DF with a boolean mask per split
train_df = df[df['case:concept:name'].isin(train_cases)]
test_df = df[df['case:concept:name'].isin(test_cases)]

# Group by Case ID: one list of activity ids per case, in row order
train_grouped_activity = train_df.groupby('case:concept:name')['activity_id'].apply(list)
test_grouped_activity = test_df.groupby('case:concept:name')['activity_id'].apply(list)

# Build the padded windows as numpy arrays (ready for PyTorch)
train_X, train_y = build_windows(train_grouped_activity, WINDOW_SIZE)
test_X, test_y = build_windows(test_grouped_activity, WINDOW_SIZE)

print(f"Final X shape: {train_X.shape}")
print(f"Final y shape: {train_y.shape}")

Final X shape: (199755, 5)
Final y shape: (199755,)


In [9]:
# Turn the numpy arrays into Torch tensors and wrap them in DataLoaders.
# This part was also done with Gemini
BATCH_SIZE = 64

# Long (integer) tensors: X needs integer indices for Embedding layers.
tensor_X_train, tensor_y_train, tensor_X_test, tensor_y_test = (
    torch.LongTensor(array) for array in (train_X, train_y, test_X, test_y)
)

# Pair every input window with its target.
train_dataset = TensorDataset(tensor_X_train, tensor_y_train)
test_dataset = TensorDataset(tensor_X_test, tensor_y_test)

# Only the training loader shuffles; the test loader keeps the stored order.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

print("DataLoaders ready!")

DataLoaders ready!


In [11]:
# Persist the four tensors, one file each.
tensors_to_save = (
    ('../data/X_train.pt', tensor_X_train),
    ('../data/y_train.pt', tensor_y_train),
    ('../data/X_test.pt', tensor_X_test),
    ('../data/y_test.pt', tensor_y_test),
)
for out_path, tensor in tensors_to_save:
    torch.save(tensor, out_path)