In [1]:
import numpy as np
import pandas as pd
import pm4py
from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, make_scorer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.model_selection import GridSearchCV, train_test_split, RandomizedSearchCV
from sklearn.model_selection import KFold
from sklearn.preprocessing import OneHotEncoder

In [2]:
def import_xes(file_path):
    """Load an XES event log and return it as a DataFrame.

    Args:
        file_path: Location of the XES file, passed to ``pm4py.read_xes``.

    Returns:
        The result of ``pm4py.convert_to_dataframe`` applied to the log
        that was read.
    """
    return pm4py.convert_to_dataframe(pm4py.read_xes(file_path))

# NOTE(review): absolute, machine-specific path -- this cell only runs where
# the XES file exists at exactly this location.
event_log = import_xes("/Users/6706363/Downloads/BPI_Challenge_2019.xes")

  from .autonotebook import tqdm as notebook_tqdm
parsing log, completed traces :: 100%|██████████| 251734/251734 [00:31<00:00, 7989.86it/s] 


In [3]:
# Keep only the case id, activity, resource and timestamp columns, ordered
# by case first and then by timestamp within each case.
event_columns = ['case:concept:name', 'concept:name', 'org:resource', 'time:timestamp']
df = event_log[event_columns].sort_values(by=['case:concept:name', 'time:timestamp'])

df.head(10)

Unnamed: 0,case:concept:name,concept:name,org:resource,time:timestamp
118143,4507004931_00010,Vendor creates invoice,NONE,1948-01-26 22:59:00+00:00
118144,4507004931_00010,Vendor creates debit memo,NONE,1948-01-26 22:59:00+00:00
118153,4507004931_00020,Vendor creates invoice,NONE,1948-01-26 22:59:00+00:00
118154,4507004931_00020,Vendor creates debit memo,NONE,1948-01-26 22:59:00+00:00
118163,4507004931_00030,Vendor creates invoice,NONE,1948-01-26 22:59:00+00:00
118164,4507004931_00030,Vendor creates debit memo,NONE,1948-01-26 22:59:00+00:00
118173,4507004931_00040,Vendor creates invoice,NONE,1948-01-26 22:59:00+00:00
118174,4507004931_00040,Vendor creates debit memo,NONE,1948-01-26 22:59:00+00:00
118183,4507004931_00050,Vendor creates invoice,NONE,1948-01-26 22:59:00+00:00
118184,4507004931_00050,Vendor creates debit memo,NONE,1948-01-26 22:59:00+00:00


In [26]:
def create_activity_resource_sequence(df, prefix_length):
    """Build one fixed-length (activity, resource) prefix per case.

    Events are grouped by 'case:concept:name' and taken in the row order they
    have in ``df``. This function does not sort, so ``df`` must already be
    ordered chronologically within each case.

    Args:
        df: Event log DataFrame with the columns 'case:concept:name',
            'concept:name' and 'org:resource'.
        prefix_length: Number of leading events to keep per case. Must be at
            least 1.

    Returns:
        A list with one entry per retained case, in groupby key order. Each
        entry is a list of ``prefix_length`` ``(activity, resource)`` tuples.
        Cases with fewer than ``prefix_length`` events are left out.

    Raises:
        ValueError: If ``prefix_length`` is smaller than 1.
    """
    # A zero length would produce empty sequences and a negative length would
    # slice from the end, silently yielding prefixes of varying size.
    if prefix_length < 1:
        raise ValueError(
            f"prefix_length must be at least 1, got {prefix_length}"
        )

    sequences = []
    for _, group in df.groupby('case:concept:name'):
        # Cases that are too short cannot provide a full prefix; skip them.
        if len(group) < prefix_length:
            continue

        activities = group['concept:name'].tolist()[:prefix_length]
        resources = group['org:resource'].tolist()[:prefix_length]
        sequences.append(list(zip(activities, resources)))

    return sequences

# Keep only cases with at least 35 events and truncate each of them to its
# first 35 (activity, resource) pairs.
sequences = create_activity_resource_sequence(df,35)

In [27]:
# Gather every activity and resource label that occurs in the prefixes so the
# encoders see the full vocabulary. A missing resource (NaN) is represented by
# the placeholder label 'none'.
activities = [activity for seq in sequences for activity, _ in seq]
resources = [
    'none' if pd.isna(resource) else resource
    for seq in sequences
    for _, resource in seq
]

# One encoder per vocabulary, each fitted on the distinct labels only.
activity_encoder = OneHotEncoder().fit([[label] for label in set(activities)])
resource_encoder = OneHotEncoder().fit([[label] for label in set(resources)])

# encoded_sequences: one 2-D array per prefix (one row per event).
# y_encoded: the one-hot resource of the final event of each prefix (target).
encoded_sequences = []
y_encoded = []

for seq in sequences:
    final_position = len(seq) - 1
    event_rows = []

    for position, (activity, resource) in enumerate(seq):
        # Same NaN placeholder as used when fitting the encoder.
        if pd.isna(resource):
            resource = 'none'

        activity_onehot = activity_encoder.transform([[activity]]).toarray()
        resource_onehot = resource_encoder.transform([[resource]]).toarray()

        if position != final_position:
            # Earlier events contribute both their activity and resource.
            event_rows.append(np.hstack([activity_onehot, resource_onehot]))
            continue

        # Final event: its resource is the prediction target, so it must not
        # appear in the features.
        y_encoded.append(resource_onehot)
        if len(seq) > 1:
            # Zero-fill the resource slot so the row has the same width as
            # the rows of the earlier events.
            activity_onehot = np.hstack(
                [activity_onehot, np.zeros(resource_onehot.shape)]
            )
        event_rows.append(activity_onehot)

    # Stack the per-event rows into one matrix for this prefix.
    encoded_sequences.append(np.vstack(event_rows))

X = np.array(encoded_sequences)
y = np.array(y_encoded)

In [28]:
# Flatten every encoded prefix into a single feature vector and turn the
# one-hot targets into integer class indices.
X_flattened = np.array([sequence.flatten() for sequence in X])
y_single_label = np.array([np.argmax(label) for label in y])

# Search space for RandomizedSearchCV. Wider grids (several values for
# n_estimators, max_depth, min_samples_split, min_samples_leaf and bootstrap)
# were narrowed down to this single combination.
param_dist = {
    'n_estimators': [50],
    'max_depth': [None],
    'min_samples_split': [5],
    'min_samples_leaf': [1],
    'bootstrap': [False],
}

# Every entry is a finite list, so the number of distinct candidates is the
# product of the list lengths. Requesting more iterations than candidates
# only makes scikit-learn warn and fall back to the candidate count.
n_candidates = int(np.prod([len(values) for values in param_dist.values()]))

# Create the RandomForestClassifier
rf = RandomForestClassifier(random_state=42)

# Use RandomizedSearchCV for hyperparameter tuning
random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist,
    n_iter=min(10, n_candidates),  # Never ask for more draws than candidates
    cv=5,  # 5-fold cross-validation
    scoring='accuracy',
    verbose=2,
    n_jobs=-1,  # Use all CPU cores
    random_state=42
)

# Perform RandomizedSearchCV
random_search.fit(X_flattened, y_single_label)

# Print the best parameters found by RandomizedSearchCV
print("Best parameters found by RandomizedSearchCV:")
print(random_search.best_params_)

# Best model, refitted by the search on the full data set
best_rf = random_search.best_estimator_

# KFold cross-validation setup
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Per-fold performance metrics
accuracies = []
precisions = []
recalls = []
f1_scores = []

for train_idx, test_idx in kf.split(X_flattened):
    X_train, X_test = X_flattened[train_idx], X_flattened[test_idx]
    y_train, y_test = y_single_label[train_idx], y_single_label[test_idx]

    # Fit an unfitted copy with the same hyperparameters so that best_rf
    # keeps the model trained on the full data instead of ending up trained
    # on the last fold only.
    fold_model = clone(best_rf)
    fold_model.fit(X_train, y_train)

    y_pred = fold_model.predict(X_test)

    # zero_division=0 keeps the value the default would produce for
    # undefined per-class scores, but without emitting a warning per fold;
    # it is now set consistently for precision, recall and F1.
    accuracies.append(accuracy_score(y_test, y_pred))
    precisions.append(precision_score(y_test, y_pred, average='weighted', zero_division=0))
    recalls.append(recall_score(y_test, y_pred, average='weighted', zero_division=0))
    f1_scores.append(f1_score(y_test, y_pred, average='weighted', zero_division=0))

# Mean and standard deviation across folds for each metric
metrics = {
    'Accuracy': (np.mean(accuracies), np.std(accuracies)),
    'Precision': (np.mean(precisions), np.std(precisions)),
    'Recall': (np.mean(recalls), np.std(recalls)),
    'F1-Score': (np.mean(f1_scores), np.std(f1_scores)),
}

# Print the results as "mean ± std"
for metric, (mean, std) in metrics.items():
    print(f"{metric}: {mean:.4f} ± {std:.4f}")

Fitting 5 folds for each of 1 candidates, totalling 5 fits




Best parameters found by RandomizedSearchCV:
{'n_estimators': 50, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_depth': None, 'bootstrap': False}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Accuracy: 0.8197 ± 0.0175
Precision: 0.7569 ± 0.0264
Recall: 0.8197 ± 0.0175
F1-Score: 0.7713 ± 0.0221


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
