In [1]:
import pm4py
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from keras.api.models import Sequential
from keras.api.layers import Dense, LSTM
from keras.api.callbacks import EarlyStopping
from sklearn.model_selection import KFold
from sklearn.metrics import precision_score, recall_score, f1_score





In [2]:
pd.set_option('display.max_columns', None)  # Display all columns
pd.set_option('display.max_colwidth', None)  # Set no limit for column width

In [57]:
def import_xes(file_path):
    log = pm4py.read_xes(file_path)
    event_log = pm4py.convert_to_dataframe(log)

    return event_log

event_log = import_xes("/Users/6706363/Downloads/BPI_Challenge_2019.xes")


parsing log, completed traces :: 100%|██████████| 251734/251734 [01:00<00:00, 4159.88it/s]


In [58]:
# Assuming event_log is your DataFrame
df = event_log[['case:concept:name', 'concept:name', 'org:resource', 'time:timestamp']]

# Sort by 'time:timestamp' and 'case:concept:name'
df = df.sort_values(by=['case:concept:name', 'time:timestamp'])

df.head(n=10)


Unnamed: 0,case:concept:name,concept:name,org:resource,time:timestamp
0,2000000000_00001,SRM: Created,batch_00,2018-01-02 12:53:00+00:00
1,2000000000_00001,SRM: Complete,batch_00,2018-01-02 13:53:00+00:00
2,2000000000_00001,SRM: Awaiting Approval,batch_00,2018-01-02 13:53:00+00:00
3,2000000000_00001,SRM: Document Completed,batch_00,2018-01-02 13:53:00+00:00
4,2000000000_00001,SRM: In Transfer to Execution Syst.,batch_00,2018-01-02 13:53:00+00:00
5,2000000000_00001,SRM: Ordered,batch_00,2018-01-02 13:53:00+00:00
6,2000000000_00001,SRM: Change was Transmitted,batch_00,2018-01-02 13:53:00+00:00
7,2000000000_00001,Create Purchase Order Item,user_000,2018-01-02 13:53:00+00:00
8,2000000000_00001,Vendor creates invoice,NONE,2018-01-02 22:59:00+00:00
9,2000000000_00001,Record Goods Receipt,user_000,2018-03-06 06:44:00+00:00


In [78]:
def create_activity_resource_sequence(df, prefix_length):
    sequences = []
    grouped = df.groupby('case:concept:name')
    
    for _, group in grouped:
        activities = group['concept:name'].tolist()
        resources = group['org:resource'].tolist()
        
        # Only include sequences with length >= prefix_length
        if len(activities) < prefix_length:
            # Remove the sequence (skip appending it to the list)
            continue
        
        # Truncate to the desired prefix length
        current_activities = activities[:prefix_length]
        current_resources = resources[:prefix_length]  # Include all resources
        
        # Combine activities and resources into tuples (no changes for the last activity)
        sequence = []
        for i in range(len(current_activities)):
            # For all activities, include both activity and resource
            sequence.append((current_activities[i], current_resources[i]))
        
        # Add the valid sequence to the list
        sequences.append(sequence)
    
    return sequences

# Example usage
sequences = create_activity_resource_sequence(df,35)

# Initialize a set to store unique 'R' values
unique_R = set()

# Loop through the list of sequences and extract the 'R' values
for sequence in sequences:
    for item in sequence:
        # item[1] is the second element (the part with 'R')
        unique_R.add(item[1])

# The length of the set will give the number of unique occurrences of 'R'
print(len(unique_R))

143


In [79]:
# Prepare the list of activities and resources
activities = []
resources = []

# Loop through sequences to gather activities and resources
for seq in sequences:
    for i, item in enumerate(seq):
        activity, resource = item  # Each item is (activity, resource)
        # Replace NaN resource with 'none'
        if pd.isna(resource):  # Check if the resource is NaN
            resource = 'none'
        activities.append(activity)
        resources.append(resource)

# Fit the OneHotEncoder to the unique activities and resources
activity_encoder = OneHotEncoder() 
resource_encoder = OneHotEncoder()

# Fit the encoder on unique activities and resources
activity_encoder.fit([[activity] for activity in set(activities)])
resource_encoder.fit([[resource] for resource in set(resources)])

# Encode activities and resources
encoded_sequences = []
y_encoded = []  # List to store the one-hot encoded target resource for the last activity

for seq in sequences:
    activity_onehots = []
    
    # For each activity-resource pair, apply one-hot encoding
    for i, item in enumerate(seq):
        activity, resource = item
        # Replace NaN resource with 'none' during encoding
        if pd.isna(resource):  # Check if the resource is NaN
            resource = 'none'
        activity_onehot = activity_encoder.transform([[activity]]).toarray()
        
        # If it's the last item, we only encode the activity and store the resource for y
        if i == len(seq) - 1:
            # Add only the activity one-hot encoding
            activity_onehots.append(activity_onehot)
            # One-hot encode the resource and store it for prediction (y)
            resource_onehot = resource_encoder.transform([[resource]]).toarray()
            y_encoded.append(resource_onehot)  # Store the one-hot encoded resource
        else:
            # For all other activities, include both activity and resource one-hot encoding
            resource_onehot = resource_encoder.transform([[resource]]).toarray()
            encoded_sequence = np.hstack([activity_onehot, resource_onehot])
            activity_onehots.append(encoded_sequence)
    
    # If there is more than one activity in the sequence, add the zero vector for the last resource
    if len(seq) > 1:
        last_activity_onehot = activity_onehots[-1]
        last_resource_onehot = np.zeros(resource_onehot.shape)  # Zero vector for the last resource
        activity_onehots[-1] = np.hstack([last_activity_onehot, last_resource_onehot])
    
    # Concatenate the encoded activities and resources for the full sequence
    encoded_sequences.append(np.vstack(activity_onehots))

X = np.array(encoded_sequences)
y = np.array(y_encoded)

print(X.shape)
print(y.shape)



(1842, 35, 171)
(1842, 1, 143)


In [80]:
# Initialize KFold with 5 splits
kf = KFold(n_splits=5)

# Initialize the model 
def create_model():
    model = Sequential()
    # First LSTM layer with return_sequences=True
    model.add(LSTM(50, input_shape=(X.shape[1], X.shape[2]), return_sequences=True))
    # Second LSTM layer
    model.add(LSTM(50))
    # Output Dense layer
    model.add(Dense(143, activation='softmax'))
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

# Store metrics from each fold
accuracies = []
precisions = []
recalls = []
f1_scores = []

# Initialize EarlyStopping
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Loop through the KFold splits
for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    # Squeeze the target arrays to remove the extra dimension
    y_train = y_train.squeeze(axis=1)
    y_test = y_test.squeeze(axis=1)
    
    # Create the model for each fold
    model = create_model()
    
    # Train the model with early stopping
    history = model.fit(X_train, y_train, epochs=50, batch_size=32, verbose=0, 
                        validation_data=(X_test, y_test), callbacks=[early_stopping])
    
    # Make predictions
    y_pred = model.predict(X_test)
    y_pred_classes = np.argmax(y_pred, axis=1)  # Convert probabilities to class labels
    y_test_classes = np.argmax(y_test, axis=1)  # Ensure test labels are in class label format
    
    # Calculate metrics
    accuracy = np.mean(y_pred_classes == y_test_classes)
    precision = precision_score(y_test_classes, y_pred_classes, average='weighted', zero_division=0)
    recall = recall_score(y_test_classes, y_pred_classes, average='weighted', zero_division=0)
    f1 = f1_score(y_test_classes, y_pred_classes, average='weighted', zero_division=0)
    
    # Store metrics
    accuracies.append(accuracy)
    precisions.append(precision)
    recalls.append(recall)
    f1_scores.append(f1)

# Output average metrics
print(f'Average Accuracy: {np.mean(accuracies):.4f} ± {np.std(accuracies):.4f}')
print(f'Average Precision: {np.mean(precisions):.4f} ± {np.std(precisions):.4f}')
print(f'Average Recall: {np.mean(recalls):.4f} ± {np.std(recalls):.4f}')
print(f'Average F1-Score: {np.mean(f1_scores):.4f} ± {np.std(f1_scores):.4f}')

  super().__init__(**kwargs)


[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step


  super().__init__(**kwargs)


[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step


  super().__init__(**kwargs)


[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step


  super().__init__(**kwargs)


[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step


  super().__init__(**kwargs)


[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step
Average Accuracy: 0.7602 ± 0.1699
Average Precision: 0.6642 ± 0.2469
Average Recall: 0.7602 ± 0.1699
Average F1-Score: 0.6968 ± 0.2252
