In [27]:
import pm4py
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import KFold, cross_val_score, cross_validate, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, make_scorer


In [131]:
def import_xes(file_path):
    log = pm4py.read_xes(file_path)
    event_log = pm4py.convert_to_dataframe(log)

    return event_log

event_log = import_xes("/Users/6706363/Downloads/BPI_Challenge_2013_closed_problems.xes")

parsing log, completed traces :: 100%|██████████| 1487/1487 [00:00<00:00, 3178.42it/s]


In [132]:
df = event_log[['case:concept:name', 'concept:name', 'org:resource', 'time:timestamp']]

# Sort by 'time:timestamp' and 'case:concept:name'
df = df.sort_values(by=['case:concept:name', 'time:timestamp'])

df.head(n=10)

Unnamed: 0,case:concept:name,concept:name,org:resource,time:timestamp
0,1-109135791,Queued,Minnie,2006-01-11 15:49:42+00:00
1,1-109135791,Accepted,Minnie,2012-03-15 11:53:52+00:00
2,1-109135791,Accepted,Minnie,2012-03-15 11:56:17+00:00
3,1-109135791,Accepted,Minnie,2012-03-15 12:09:05+00:00
4,1-109135791,Completed,Minnie,2012-03-15 12:11:33+00:00
5,1-147898401,Accepted,Tomas,2006-11-07 10:00:36+00:00
6,1-147898401,Accepted,Tomas,2006-11-07 13:05:44+00:00
7,1-147898401,Accepted,Tomas,2009-12-02 14:24:32+00:00
8,1-147898401,Accepted,Tomas,2011-09-03 07:09:09+00:00
9,1-147898401,Accepted,Carrie,2012-01-20 10:23:24+00:00


In [184]:
def create_activity_resource_sequence(df, prefix_length):
    sequences = []
    grouped = df.groupby('case:concept:name')
    
    for _, group in grouped:
        activities = group['concept:name'].tolist()
        resources = group['org:resource'].tolist()
        
        # Only include sequences with length >= prefix_length
        if len(activities) < prefix_length:
            # Remove the sequence (skip appending it to the list)
            continue
        
        # Truncate to the desired prefix length
        current_activities = activities[:prefix_length]
        current_resources = resources[:prefix_length]  # Include all resources
        
        # Combine activities and resources into tuples (no changes for the last activity)
        sequence = []
        for i in range(len(current_activities)):
            # For all activities, include both activity and resource
            sequence.append((current_activities[i], current_resources[i]))
        
        # Add the valid sequence to the list
        sequences.append(sequence)
    
    return sequences

sequences = create_activity_resource_sequence(df,30)

# Initialize a set to store unique 'R' values
unique_R = set()

# Loop through the list of sequences and extract the 'R' values
for sequence in sequences:
    for item in sequence:
        # item[1] is the second element (the part with 'R')
        unique_R.add(item[1])

# The length of the set will give the number of unique occurrences of 'R'
print(len(unique_R))

7


In [185]:
# Prepare the list of activities and resources
activities = []
resources = []

# Loop through sequences to gather activities and resources
for seq in sequences:
    for i, item in enumerate(seq):
        activity, resource = item  # Each item is (activity, resource)
        # Replace NaN resource with 'none'
        if pd.isna(resource):  # Check if the resource is NaN
            resource = 'none'
        activities.append(activity)
        resources.append(resource)

# Fit the OneHotEncoder to the unique activities and resources
activity_encoder = OneHotEncoder() 
resource_encoder = OneHotEncoder()

# Fit the encoder on unique activities and resources
activity_encoder.fit([[activity] for activity in set(activities)])
resource_encoder.fit([[resource] for resource in set(resources)])

# Encode activities and resources
encoded_sequences = []
y_encoded = []  # List to store the one-hot encoded target resource for the last activity

for seq in sequences:
    activity_onehots = []
    
    # For each activity-resource pair, apply one-hot encoding
    for i, item in enumerate(seq):
        activity, resource = item
        # Replace NaN resource with 'none' during encoding
        if pd.isna(resource):  # Check if the resource is NaN
            resource = 'none'
        activity_onehot = activity_encoder.transform([[activity]]).toarray()
        
        # If it's the last item, we only encode the activity and store the resource for y
        if i == len(seq) - 1:
            # Add only the activity one-hot encoding
            activity_onehots.append(activity_onehot)
            # One-hot encode the resource and store it for prediction (y)
            resource_onehot = resource_encoder.transform([[resource]]).toarray()
            y_encoded.append(resource_onehot)  # Store the one-hot encoded resource
        else:
            # For all other activities, include both activity and resource one-hot encoding
            resource_onehot = resource_encoder.transform([[resource]]).toarray()
            encoded_sequence = np.hstack([activity_onehot, resource_onehot])
            activity_onehots.append(encoded_sequence)
    
    # If there is more than one activity in the sequence, add the zero vector for the last resource
    if len(seq) > 1:
        last_activity_onehot = activity_onehots[-1]
        last_resource_onehot = np.zeros(resource_onehot.shape)  # Zero vector for the last resource
        activity_onehots[-1] = np.hstack([last_activity_onehot, last_resource_onehot])
    
    # Concatenate the encoded activities and resources for the full sequence
    encoded_sequences.append(np.vstack(activity_onehots))

X = np.array(encoded_sequences)
y = np.array(y_encoded)

print(X.shape)
print(y.shape)

(1, 30, 9)
(1, 1, 7)


In [186]:
label_encoder = LabelEncoder()
y_labels = label_encoder.fit_transform([np.argmax(y_i) for y_i in y])


In [188]:
# Flatten the sequences as you did before
X_flattened = np.array([seq.sum(axis=0) for seq in X])

kf = KFold(n_splits=2, shuffle=True, random_state=42)

# Define the hyperparameter grid
param_grid = {
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random'],
    'max_depth': [None, 5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': [None, 'sqrt', 'log2']
}

# Initialize the decision tree classifier
clf = DecisionTreeClassifier(random_state=42)

# Create the grid search object
grid_search = GridSearchCV(
    estimator=clf,
    param_grid=param_grid,
    scoring={'f1_weighted': make_scorer(f1_score, average='weighted')},  # F1 weighted as scoring metric
    refit='f1_weighted',  # Refit using the best scoring parameter combination
    cv=kf,  # Cross-validation strategy
    verbose=1,
    n_jobs=-1
)

# Perform the grid search
grid_search.fit(X_flattened, y_labels)

# Print the best parameters and corresponding score
print("Best Parameters:", grid_search.best_params_)
print("Best F1-Weighted Score:", grid_search.best_score_)

# Use the best classifier for evaluation
best_clf = grid_search.best_estimator_

# Perform cross-validation with the best estimator
cv_results = cross_validate(
    best_clf, X_flattened, y_labels,
    cv=kf,
    scoring=['accuracy', 'precision_weighted', 'recall_weighted', 'f1_weighted'],
    return_train_score=False
)

# Print cross-validation metrics with four decimal places
print("Cross-Validation Results:")
print("Accuracy (mean ± std): {:.4f} ± {:.4f}".format(np.mean(cv_results['test_accuracy']), np.std(cv_results['test_accuracy'])))
print("Precision (mean ± std): {:.4f} ± {:.4f}".format(np.mean(cv_results['test_precision_weighted']), np.std(cv_results['test_precision_weighted'])))
print("Recall (mean ± std): {:.4f} ± {:.4f}".format(np.mean(cv_results['test_recall_weighted']), np.std(cv_results['test_recall_weighted'])))
print("F1 (mean ± std): {:.4f} ± {:.4f}".format(np.mean(cv_results['test_f1_weighted']), np.std(cv_results['test_f1_weighted'])))


Fitting 2 folds for each of 432 candidates, totalling 864 fits


ValueError: Cannot have number of splits n_splits=2 greater than the number of samples: n_samples=1.