In [4]:
import pm4py
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from keras.api.models import Sequential
from keras.api.layers import Dense, LSTM
from keras.api.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split



In [5]:
pd.set_option('display.max_columns', None)  # Display all columns
pd.set_option('display.max_colwidth', None)  # Set no limit for column width

In [6]:
def import_xes(file_path):
    log = pm4py.read_xes(file_path)
    event_log = pm4py.convert_to_dataframe(log)

    return event_log

event_log = import_xes("/Users/6706363/Downloads/BPI Challenge 2017.xes")


parsing log, completed traces :: 100%|██████████| 31509/31509 [00:40<00:00, 784.27it/s] 


In [7]:
# Assuming event_log is your DataFrame
df = event_log[['case:concept:name', 'concept:name', 'org:resource', 'time:timestamp']]

# Sort by 'time:timestamp' and 'case:concept:name'
df = df.sort_values(by=['case:concept:name', 'time:timestamp'])

df.head(n=10)


Unnamed: 0,case:concept:name,concept:name,org:resource,time:timestamp
686058,Application_1000086665,A_Create Application,User_1,2016-08-03 15:57:21.673000+00:00
686059,Application_1000086665,A_Submitted,User_1,2016-08-03 15:57:21.734000+00:00
686060,Application_1000086665,W_Handle leads,User_1,2016-08-03 15:57:21.963000+00:00
686061,Application_1000086665,W_Handle leads,User_1,2016-08-03 15:58:28.286000+00:00
686062,Application_1000086665,W_Complete application,User_1,2016-08-03 15:58:28.293000+00:00
686063,Application_1000086665,A_Concept,User_1,2016-08-03 15:58:28.299000+00:00
686064,Application_1000086665,W_Complete application,User_14,2016-08-04 13:39:29.557000+00:00
686065,Application_1000086665,W_Complete application,User_14,2016-08-04 13:50:12.281000+00:00
686066,Application_1000086665,A_Accepted,User_5,2016-08-05 13:57:07.419000+00:00
686067,Application_1000086665,O_Create Offer,User_5,2016-08-05 13:59:57.320000+00:00


In [8]:
def create_activity_resource_sequence(df, prefix_length):
    sequences = []
    grouped = df.groupby('case:concept:name')
    
    for _, group in grouped:
        activities = group['concept:name'].tolist()
        resources = group['org:resource'].tolist()
        
        # Only include sequences with length >= prefix_length
        if len(activities) < prefix_length:
            # Remove the sequence (skip appending it to the list)
            continue
        
        # Truncate to the desired prefix length
        current_activities = activities[:prefix_length]
        current_resources = resources[:prefix_length]  # Include all resources
        
        # Combine activities and resources into tuples (no changes for the last activity)
        sequence = []
        for i in range(len(current_activities)):
            # For all activities, include both activity and resource
            sequence.append((current_activities[i], current_resources[i]))
        
        # Add the valid sequence to the list
        sequences.append(sequence)
    
    return sequences

# Example usage
sequences = create_activity_resource_sequence(df, 4)

# # Initialize a set to store unique 'R' values
# unique_R = set()
# 
# # Loop through the list of sequences and extract the 'R' values
# for sequence in sequences:
#     for item in sequence:
#         # item[1] is the second element (the part with 'R')
#         unique_R.add(item[1])
# 
# # The length of the set will give the number of unique occurrences of 'R'
# print(len(unique_R))

In [9]:
#Prepare the list of activities and resources
activities = []
resources = []

# Loop through sequences to gather activities and resources
for seq in sequences:
    for i, item in enumerate(seq):
        activity, resource = item  # Each item is guaranteed to be (activity, resource)
        activities.append(activity)
        resources.append(resource)

#Fit the OneHotEncoder to the unique activities and resources
activity_encoder = OneHotEncoder() 
resource_encoder = OneHotEncoder()

# Fit the encoder on unique activities and resources
activity_encoder.fit([[activity] for activity in set(activities)])
resource_encoder.fit([[resource] for resource in set(resources)])

#Encode activities and resources
encoded_sequences = []
y_encoded = []  # List to store the one-hot encoded target resource for the last activity

for seq in sequences:
    activity_onehots = []
    
    # For each activity-resource pair, apply one-hot encoding
    for i, item in enumerate(seq):
        activity, resource = item
        activity_onehot = activity_encoder.transform([[activity]]).toarray()
        
        # If it's the last item, we only encode the activity and store the resource for y
        if i == len(seq) - 1:
            # Add only the activity one-hot encoding
            activity_onehots.append(activity_onehot)
            # One-hot encode the resource and store it for prediction (y)
            resource_onehot = resource_encoder.transform([[resource]]).toarray()
            y_encoded.append(resource_onehot)  # Store the one-hot encoded resource
        else:
            # For all other activities, include both activity and resource one-hot encoding
            resource_onehot = resource_encoder.transform([[resource]]).toarray()
            encoded_sequence = np.hstack([activity_onehot, resource_onehot])
            activity_onehots.append(encoded_sequence)
    
    # If there is more than one activity in the sequence, add the zero vector for the last resource
    if len(seq) > 1:
        last_activity_onehot = activity_onehots[-1]
        last_resource_onehot = np.zeros(resource_onehot.shape)  # Zero vector for the last resource
        activity_onehots[-1] = np.hstack([last_activity_onehot, last_resource_onehot])
    
    # Concatenate the encoded activities and resources for the full sequence
    encoded_sequences.append(np.vstack(activity_onehots))

X = np.array(encoded_sequences)
y = np.array(y_encoded)

print(X.shape)

print(y.shape)


(31509, 4, 121)
(31509, 1, 116)


In [10]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

y_train = y_train.squeeze(axis=1)
y_test = y_test.squeeze(axis=1)

print(X_test.shape)
print(y_test.shape)


# Define the LSTM model
model = Sequential()
model.add(LSTM(50, activation='relu', return_sequences=True, input_shape=(X_train.shape[1], X_train.shape[2])))
model.add(LSTM(50, return_sequences=True, activation='relu'))
model.add(LSTM(50, activation='relu'))
model.add(Dense(116, activation='softmax')) 

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy', 'precision', 'recall'])

early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
# Train the model
model.fit(X_train, y_train, epochs=50, batch_size=32, validation_data=(X_test, y_test), callbacks=[early_stopping])

# Evaluate the model on the test set
test_loss, test_accuracy, test_precision, test_recall = model.evaluate(X_test, y_test)
print(f"Test loss: {test_loss}")
print(f"Test accuracy: {test_accuracy}")
print(f"Test precision: {test_precision}")
print(f"Test recall: {test_recall}")




(6302, 4, 121)
(6302, 116)
Epoch 1/50


  super().__init__(**kwargs)


[1m788/788[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - accuracy: 0.6295 - loss: 2.3238 - precision: 0.7969 - recall: 0.5059 - val_accuracy: 0.8651 - val_loss: 0.8540 - val_precision: 0.8759 - val_recall: 0.8634
Epoch 2/50
[1m788/788[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.8635 - loss: 0.8377 - precision: 0.8719 - recall: 0.8590 - val_accuracy: 0.8748 - val_loss: 0.7970 - val_precision: 0.8772 - val_recall: 0.8742
Epoch 3/50
[1m788/788[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.8782 - loss: 0.7678 - precision: 0.8801 - recall: 0.8771 - val_accuracy: 0.8764 - val_loss: 0.8014 - val_precision: 0.8775 - val_recall: 0.8762
Epoch 4/50
[1m788/788[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.8816 - loss: 0.7542 - precision: 0.8819 - recall: 0.8813 - val_accuracy: 0.8767 - val_loss: 0.7890 - val_precision: 0.8775 - val_recall: 0.8765
Epoch 5/50
[1m788/788[0m [32m━━━