<h2> Imports and loading event-log function </h2>

In [None]:
import numpy as np
import pandas as pd
import pm4py
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

def import_xes(file_path):
    """Read the XES event log at ``file_path`` and return it as a DataFrame.

    The file is parsed with ``pm4py.read_xes`` and the parsed log is then
    passed through ``pm4py.convert_to_dataframe``.
    """
    return pm4py.convert_to_dataframe(pm4py.read_xes(file_path))


<h1> Loading event-logs and transforming</h1>

<h4> Load 2013 incidents dataset </h4>

In [None]:
# Read the BPIC 2013 incidents log, keep only the case id, activity, resource
# and timestamp columns, and order the events by resource, then by timestamp.
event_log_2013 = import_xes("datasets/BPI_Challenge_2013_incidents.xes")
df_2013 = (
    event_log_2013
    .loc[:, ['case:concept:name', 'concept:name', 'org:resource', 'time:timestamp']]
    .sort_values(by=['org:resource', 'time:timestamp'])
)

Index(['case:concept:name', 'concept:name', 'org:resource', 'time:timestamp'], dtype='object')
['Accepted' 'Queued' 'Completed' 'Unmatched']


<h4> Loading 2017 dataset </h4>

In [17]:
# Read the BPIC 2017 log, keep only the case id, activity, resource and
# timestamp columns, and order the events by resource, then by timestamp.
event_log_2017 = import_xes("datasets/BPI_Challenge_2017.xes")
df_2017 = (
    event_log_2017
    .loc[:, ['case:concept:name', 'concept:name', 'org:resource', 'time:timestamp']]
    .sort_values(by=['org:resource', 'time:timestamp'])
)
# Summarise row count, dtypes and memory usage of the reduced frame.
df_2017.info()

parsing log, completed traces :: 100%|██████████| 31509/31509 [00:21<00:00, 1446.38it/s]


<class 'pandas.core.frame.DataFrame'>
Index: 1202267 entries, 0 to 1068834
Data columns (total 4 columns):
 #   Column             Non-Null Count    Dtype              
---  ------             --------------    -----              
 0   case:concept:name  1202267 non-null  object             
 1   concept:name       1202267 non-null  object             
 2   org:resource       1202267 non-null  object             
 3   time:timestamp     1202267 non-null  datetime64[ns, UTC]
dtypes: datetime64[ns, UTC](1), object(3)
memory usage: 45.9+ MB


<h4> Loading the BPIC 2018 event-log </h4>

In [26]:
# Parse the BPIC 2018 log and print the schema of the complete frame.
# Only .info() is used for inspection here: a bare .head() in the middle of a
# cell is evaluated but never rendered, because only a cell's last expression
# is displayed.
event_log_2018 = import_xes("datasets/BPI_Challenge_2018.xes")
event_log_2018.info()

# Keep only the case id, activity, resource and timestamp columns, ordered by
# resource and then by timestamp.
df_2018 = event_log_2018[['case:concept:name', 'concept:name', 'org:resource', 'time:timestamp']]
df_2018 = df_2018.sort_values(by=['org:resource', 'time:timestamp'])
df_2018.info()

parsing log, completed traces :: 100%|██████████| 43809/43809 [01:31<00:00, 480.39it/s]


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2514266 entries, 0 to 2514265
Data columns (total 75 columns):
 #   Column                  Dtype              
---  ------                  -----              
 0   success                 bool               
 1   org:resource            object             
 2   docid_uuid              object             
 3   doctype                 object             
 4   subprocess              object             
 5   docid                   object             
 6   activity                object             
 7   note                    object             
 8   eventid                 object             
 9   identity:id             object             
 10  concept:name            object             
 11  lifecycle:transition    object             
 12  time:timestamp          datetime64[ns, UTC]
 13  case:young farmer       bool               
 14  case:selected_random    bool               
 15  case:penalty_AJLP       bool               
 16  

<h4> Loading the BPIC 2019 event-log </h4>

In [27]:
# Read the BPIC 2019 log, keep only the case id, activity, resource and
# timestamp columns, and order the events by resource, then by timestamp.
event_log_2019 = import_xes("datasets/BPI_Challenge_2019.xes")
df_2019 = (
    event_log_2019
    .loc[:, ['case:concept:name', 'concept:name', 'org:resource', 'time:timestamp']]
    .sort_values(by=['org:resource', 'time:timestamp'])
)
# Summarise row count, dtypes and memory usage of the reduced frame.
df_2019.info()

parsing log, completed traces :: 100%|██████████| 251734/251734 [00:35<00:00, 7154.29it/s] 


<class 'pandas.core.frame.DataFrame'>
Index: 1595923 entries, 118143 to 1592235
Data columns (total 4 columns):
 #   Column             Non-Null Count    Dtype              
---  ------             --------------    -----              
 0   case:concept:name  1595923 non-null  object             
 1   concept:name       1595923 non-null  object             
 2   org:resource       1595923 non-null  object             
 3   time:timestamp     1595923 non-null  datetime64[ns, UTC]
dtypes: datetime64[ns, UTC](1), object(3)
memory usage: 60.9+ MB


<h1>One-Hot Encoding the event-logs</h1>	


<h4> OHE the BPIC 2013 event-log </h4>

In [28]:
# Label every event with the activity the same resource performs next.
# df_2013 is expected to be ordered by resource and timestamp so that
# shift(-1) within each resource picks the chronologically next event.
# The guard keeps this cell idempotent: without it, a re-run would recompute
# the shift on the already-trimmed frame and drop one more row per resource.
if 'next_activity' not in df_2013.columns:
    df_2013 = df_2013.assign(
        next_activity=df_2013.groupby('org:resource')['concept:name'].shift(-1)
    ).dropna(subset=['next_activity'])  # last event of a resource has no successor

# Encoding the 'concept:name' column, as these are the activities
column_to_encode = 'concept:name'

# One-hot encode the current activity and the resource in a single call.
# The untouched columns come first, followed by activity_* and resource_*.
df_2013_encoded = pd.get_dummies(
    df_2013,
    columns=[column_to_encode, 'org:resource'],
    prefix=['activity', 'resource'],
)

print("Columns in final dataframe 2013:")
print(df_2013_encoded.columns.tolist())

Columns in final dataframe 2013:
['case:concept:name', 'time:timestamp', 'next_activity', 'activity_Accepted', 'activity_Completed', 'activity_Queued', 'activity_Unmatched', 'resource_-', 'resource_Aaron', 'resource_Abby', 'resource_Abdul', 'resource_Abhinav', 'resource_Abhiseka', 'resource_Abhishek', 'resource_Achuthanandan', 'resource_Adam', 'resource_Adeline', 'resource_Adenilson', 'resource_Aditi', 'resource_Adriano', 'resource_Agata', 'resource_Agnieszka', 'resource_Agnivesh', 'resource_Agoritsa', 'resource_Agron', 'resource_Ajay', 'resource_Ajitkumar', 'resource_Akanksha', 'resource_Akiko', 'resource_Akim', 'resource_Akshata', 'resource_Ala', 'resource_Alain', 'resource_Alan', 'resource_Alejandro', 'resource_Aleksander', 'resource_Aleksandra', 'resource_Alex', 'resource_Alexander', 'resource_Alexandre', 'resource_Alexsandra', 'resource_Alf', 'resource_Aline', 'resource_Allak', 'resource_Alok', 'resource_Alvin', 'resource_Amanda', 'resource_Amar', 'resource_Amer', 'resource_Amir',

<h4> OHE the BPIC 2017 event-log </h4>

In [29]:
# Label every event with the activity the same resource performs next.
# df_2017 is expected to be ordered by resource and timestamp so that
# shift(-1) within each resource picks the chronologically next event.
# The guard keeps this cell idempotent: without it, a re-run would recompute
# the shift on the already-trimmed frame and drop one more row per resource.
if 'next_activity' not in df_2017.columns:
    df_2017 = df_2017.assign(
        next_activity=df_2017.groupby('org:resource')['concept:name'].shift(-1)
    ).dropna(subset=['next_activity'])  # last event of a resource has no successor

# Encoding the 'concept:name' column, as these are the activities
column_to_encode = 'concept:name'

# One-hot encode the current activity and the resource in a single call.
# The untouched columns come first, followed by activity_* and resource_*.
df_2017_encoded = pd.get_dummies(
    df_2017,
    columns=[column_to_encode, 'org:resource'],
    prefix=['activity', 'resource'],
)

print("Columns in final dataframe 2017:")
print(df_2017_encoded.columns.tolist())

Columns in final dataframe 2017:
['case:concept:name', 'time:timestamp', 'next_activity', 'activity_A_Accepted', 'activity_A_Cancelled', 'activity_A_Complete', 'activity_A_Concept', 'activity_A_Create Application', 'activity_A_Denied', 'activity_A_Incomplete', 'activity_A_Pending', 'activity_A_Submitted', 'activity_A_Validating', 'activity_O_Accepted', 'activity_O_Cancelled', 'activity_O_Create Offer', 'activity_O_Created', 'activity_O_Refused', 'activity_O_Returned', 'activity_O_Sent (mail and online)', 'activity_O_Sent (online only)', 'activity_W_Assess potential fraud', 'activity_W_Call after offers', 'activity_W_Call incomplete files', 'activity_W_Complete application', 'activity_W_Handle leads', 'activity_W_Personal Loan collection', 'activity_W_Shortened completion ', 'activity_W_Validate application', 'resource_User_1', 'resource_User_10', 'resource_User_100', 'resource_User_101', 'resource_User_102', 'resource_User_103', 'resource_User_104', 'resource_User_105', 'resource_User_

<h4> OHE the BPIC 2018 event-log </h4>

In [30]:
# Label every event with the activity the same resource performs next.
# df_2018 is expected to be ordered by resource and timestamp so that
# shift(-1) within each resource picks the chronologically next event.
# The guard keeps this cell idempotent: without it, a re-run would recompute
# the shift on the already-trimmed frame and drop one more row per resource.
if 'next_activity' not in df_2018.columns:
    df_2018 = df_2018.assign(
        next_activity=df_2018.groupby('org:resource')['concept:name'].shift(-1)
    ).dropna(subset=['next_activity'])  # last event of a resource has no successor

# Encoding the 'concept:name' column, as these are the activities
column_to_encode = 'concept:name'

# One-hot encode the current activity and the resource in a single call.
# The untouched columns come first, followed by activity_* and resource_*.
df_2018_encoded = pd.get_dummies(
    df_2018,
    columns=[column_to_encode, 'org:resource'],
    prefix=['activity', 'resource'],
)

print("Columns in final dataframe 2018:")
print(df_2018_encoded.columns.tolist())

Columns in final dataframe 2018:
['case:concept:name', 'time:timestamp', 'next_activity', 'activity_abort external', 'activity_abort payment', 'activity_approve', 'activity_begin admissibility check', 'activity_begin editing', 'activity_begin editing from refused', 'activity_begin payment', 'activity_begin preparations', 'activity_calculate', 'activity_calculate protocol', 'activity_cancel offline', 'activity_change department', 'activity_check', 'activity_check admissibility', 'activity_clear', 'activity_correction GFM17', 'activity_create', 'activity_decide', 'activity_discard', 'activity_finish editing', 'activity_finish payment', 'activity_finish pre-check', 'activity_finish preparations', 'activity_initialize', 'activity_insert document', 'activity_mail income', 'activity_mail valid', 'activity_performed', 'activity_performed offline', 'activity_plan', 'activity_prepare external', 'activity_prepare offline', 'activity_refuse', 'activity_remove document', 'activity_restart editing'

<h4> OHE the BPIC 2019 event-log </h4>

In [31]:
# Label every event with the activity the same resource performs next.
# df_2019 is expected to be ordered by resource and timestamp so that
# shift(-1) within each resource picks the chronologically next event.
# The guard keeps this cell idempotent: without it, a re-run would recompute
# the shift on the already-trimmed frame and drop one more row per resource.
if 'next_activity' not in df_2019.columns:
    df_2019 = df_2019.assign(
        next_activity=df_2019.groupby('org:resource')['concept:name'].shift(-1)
    ).dropna(subset=['next_activity'])  # last event of a resource has no successor

# Encoding the 'concept:name' column, as these are the activities
column_to_encode = 'concept:name'

# One-hot encode the BPIC 2019 activity and resource columns in a single call.
# The untouched columns come first, followed by activity_* and resource_*.
df_2019_encoded = pd.get_dummies(
    df_2019,
    columns=[column_to_encode, 'org:resource'],
    prefix=['activity', 'resource'],
)

print("Columns in final dataframe 2019:")
print(df_2019_encoded.columns.tolist())

Columns in final dataframe 2019:
['case:concept:name', 'time:timestamp', 'next_activity', 'activity_Block Purchase Order Item', 'activity_Cancel Goods Receipt', 'activity_Cancel Invoice Receipt', 'activity_Cancel Subsequent Invoice', 'activity_Change Approval for Purchase Order', 'activity_Change Currency', 'activity_Change Delivery Indicator', 'activity_Change Final Invoice Indicator', 'activity_Change Price', 'activity_Change Quantity', 'activity_Change Rejection Indicator', 'activity_Change Storage Location', 'activity_Change payment term', 'activity_Clear Invoice', 'activity_Create Purchase Order Item', 'activity_Create Purchase Requisition Item', 'activity_Delete Purchase Order Item', 'activity_Reactivate Purchase Order Item', 'activity_Receive Order Confirmation', 'activity_Record Goods Receipt', 'activity_Record Invoice Receipt', 'activity_Record Service Entry Sheet', 'activity_Record Subsequent Invoice', 'activity_Release Purchase Order', 'activity_Release Purchase Requisition'

<h1> Training Random Forest model on OHE event-logs </h1>

<h4> Training Random Forest on 2013 incidents log </h4>


In [32]:
# BPIC 2013 incidents
# Features: every one-hot column produced for the current activity and the
# resource. Target: the 'next_activity' label column.
feature_columns_2013 = [
    col for col in df_2013_encoded.columns
    if col.startswith(('activity_', 'resource_'))
]
X_2013 = df_2013_encoded[feature_columns_2013]
y_2013 = df_2013_encoded['next_activity']

print(f"Input Shape: {X_2013.shape}")
print(f"Target Shape: {y_2013.shape}")

# 80/20 split, stratified on the target so both parts share its class mix.
X_train_2013, X_test_2013, y_train_2013, y_test_2013 = train_test_split(
    X_2013, y_2013,
    test_size=0.2,
    random_state=1,
    stratify=y_2013
)

rf_model_2013 = RandomForestClassifier(n_estimators=100, random_state=1)

print("Training model...")
rf_model_2013.fit(X_train_2013, y_train_2013)
print("Model trained!")

y_pred_2013 = rf_model_2013.predict(X_test_2013)

print(f"Accuracy: {accuracy_score(y_test_2013, y_pred_2013):.2f}")
print("\n--- Detailed Classification Report ---")
# zero_division=0 reports 0.0 for any class that is never predicted instead of
# emitting an UndefinedMetricWarning for it.
print(classification_report(y_test_2013, y_pred_2013, zero_division=0))


Input Shape: (61679, 1010)
Target Shape: (61679,)
Training model...
Model trained!
Accuracy: 0.72

--- Detailed Classification Report ---
              precision    recall  f1-score   support

    Accepted       0.71      0.95      0.81      7605
   Completed       0.87      0.45      0.59      2590
      Queued       0.64      0.22      0.32      2141

    accuracy                           0.72     12336
   macro avg       0.74      0.54      0.58     12336
weighted avg       0.73      0.72      0.68     12336



<h4> Training Random Forest on 2017 log </h4>

In [33]:
# BPIC 2017
# Features: every one-hot column produced for the current activity and the
# resource. Target: the 'next_activity' label column.
feature_columns_2017 = [
    col for col in df_2017_encoded.columns
    if col.startswith(('activity_', 'resource_'))
]
X_2017 = df_2017_encoded[feature_columns_2017]
y_2017 = df_2017_encoded['next_activity']

print(f"Input Shape: {X_2017.shape}")
print(f"Target Shape: {y_2017.shape}")

# 80/20 split, stratified on the target so both parts share its class mix.
X_train_2017, X_test_2017, y_train_2017, y_test_2017 = train_test_split(
    X_2017, y_2017,
    test_size=0.2,
    random_state=1,
    stratify=y_2017
)

rf_model_2017 = RandomForestClassifier(n_estimators=100, random_state=1)

print("Training model...")
rf_model_2017.fit(X_train_2017, y_train_2017)
print("Model trained!")

y_pred_2017 = rf_model_2017.predict(X_test_2017)

print(f"Accuracy: {accuracy_score(y_test_2017, y_pred_2017):.2f}")
print("\n--- Detailed Classification Report ---")
# zero_division=0 reports 0.0 for any class that is never predicted instead of
# emitting an UndefinedMetricWarning for it.
print(classification_report(y_test_2017, y_pred_2017, zero_division=0))

Input Shape: (1201969, 173)
Target Shape: (1201969,)
Training model...
Model trained!
Accuracy: 0.67

--- Detailed Classification Report ---


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


                            precision    recall  f1-score   support

                A_Accepted       0.67      0.15      0.25      6300
               A_Cancelled       0.48      0.74      0.58      2086
                A_Complete       0.30      0.06      0.09      6271
                 A_Concept       0.93      0.59      0.72      6302
      A_Create Application       0.92      0.49      0.64      6298
                  A_Denied       0.00      0.00      0.00       750
              A_Incomplete       0.31      0.09      0.14      4611
                 A_Pending       1.00      1.00      1.00      3446
               A_Submitted       1.00      1.00      1.00      4085
              A_Validating       0.31      0.23      0.26      7763
                O_Accepted       0.48      0.15      0.23      3445
               O_Cancelled       0.95      0.48      0.64      4178
            O_Create Offer       0.94      0.70      0.80      8598
                 O_Created       1.00      1.00

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


<h4>Training Random Forest on 2018 event-log </h4>

In [34]:
# BPIC 2018
# Features: every one-hot column produced for the current activity and the
# resource. Target: the 'next_activity' label column.
feature_columns_2018 = [
    col for col in df_2018_encoded.columns
    if col.startswith(('activity_', 'resource_'))
]
X_2018 = df_2018_encoded[feature_columns_2018]
y_2018 = df_2018_encoded['next_activity']

print(f"Input Shape: {X_2018.shape}")
print(f"Target Shape: {y_2018.shape}")

# 80/20 split, stratified on the target so both parts share its class mix.
X_train_2018, X_test_2018, y_train_2018, y_test_2018 = train_test_split(
    X_2018, y_2018,
    test_size=0.2,
    random_state=1,
    stratify=y_2018
)

rf_model_2018 = RandomForestClassifier(n_estimators=100, random_state=1)

print("Training model...")
rf_model_2018.fit(X_train_2018, y_train_2018)
print("Model trained!")

y_pred_2018 = rf_model_2018.predict(X_test_2018)

print(f"Accuracy: {accuracy_score(y_test_2018, y_pred_2018):.2f}")
print("\n--- Detailed Classification Report ---")
# zero_division=0 reports 0.0 for any class that is never predicted instead of
# emitting an UndefinedMetricWarning for it.
print(classification_report(y_test_2018, y_pred_2018, zero_division=0))

Input Shape: (2514101, 205)
Target Shape: (2514101,)
Training model...
Model trained!
Accuracy: 0.63

--- Detailed Classification Report ---


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


                            precision    recall  f1-score   support

            abort external       0.43      0.56      0.48      1506
             abort payment       1.00      1.00      1.00      6859
                   approve       0.54      0.08      0.14       180
 begin admissibility check       0.37      0.09      0.15       386
             begin editing       0.52      0.58      0.55     79416
begin editing from refused       0.50      0.12      0.20        41
             begin payment       1.00      1.00      1.00     16040
        begin preparations       0.58      0.44      0.50      2642
                 calculate       0.58      0.45      0.51     93228
        calculate protocol       0.84      0.87      0.86       276
            cancel offline       0.46      0.15      0.23        39
         change department       1.00      0.70      0.82        10
                     check       0.57      0.03      0.06      2717
       check admissibility       0.58      0.60

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


<h4> Training Random Forest on 2019 event-log </h4>

In [37]:
# BPIC 2019
# Features: every one-hot column produced for the current activity and the
# resource. Target: the 'next_activity' label column.
feature_columns_2019 = [
    col for col in df_2019_encoded.columns
    if col.startswith(('activity_', 'resource_'))
]
X_2019 = df_2019_encoded[feature_columns_2019]
y_2019 = df_2019_encoded['next_activity']

print(f"Input Shape: {X_2019.shape}")
print(f"Target Shape: {y_2019.shape}")

# 80/20 split, stratified on the target so both parts share its class mix.
X_train_2019, X_test_2019, y_train_2019, y_test_2019 = train_test_split(
    X_2019, y_2019,
    test_size=0.2,
    random_state=1,
    stratify=y_2019
)

rf_model_2019 = RandomForestClassifier(n_estimators=100, random_state=1)

print("Training model...")
rf_model_2019.fit(X_train_2019, y_train_2019)
print("Model trained!")

y_pred_2019 = rf_model_2019.predict(X_test_2019)

print(f"Accuracy: {accuracy_score(y_test_2019, y_pred_2019):.2f}")
print("\n--- Detailed Classification Report ---")
# zero_division=0 reports 0.0 for any class that is never predicted instead of
# emitting an UndefinedMetricWarning for it.
print(classification_report(y_test_2019, y_pred_2019, zero_division=0))

Input Shape: (1595295, 629)
Target Shape: (1595295,)
Training model...
Model trained!
Accuracy: 0.96

--- Detailed Classification Report ---


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


                                     precision    recall  f1-score   support

          Block Purchase Order Item       0.77      0.61      0.68       103
               Cancel Goods Receipt       0.76      0.37      0.50       617
             Cancel Invoice Receipt       0.59      0.54      0.56      1419
          Cancel Subsequent Invoice       0.49      0.27      0.34        98
 Change Approval for Purchase Order       0.91      0.89      0.90      1506
                    Change Currency       0.00      0.00      0.00         7
          Change Delivery Indicator       0.40      0.18      0.25       657
     Change Final Invoice Indicator       0.00      0.00      0.00         2
                       Change Price       0.86      0.81      0.83      2482
                    Change Quantity       0.60      0.49      0.54      4285
            Change Storage Location       0.56      0.43      0.49        83
                Change payment term       0.00      0.00      0.00         

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
