<h2> Imports, loading event-log function and cleaning pipeline </h2>

In [4]:
import numpy as np
import pandas as pd
import pm4py
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, f1_score
import statistics
from collections import Counter

In [22]:
# Importing dataset from file path
def import_xes(file_path):
    log = pm4py.read_xes(file_path)
    return pm4py.convert_to_dataframe(log)

# Cleaning dataset: removing unnecessary columns, shifting to resource focus
def clean_dataset(df):
    df_final = df[['case:concept:name', 'concept:name', 'org:resource', 'time:timestamp']]
    return df_final.sort_values(by=['org:resource', 'time:timestamp'])

def prefix_extraction(df, min_len=1, max_len=None):
    """
    Extract prefixes per resource.
    """
    resource_traces = df.groupby('org:resource')['concept:name'].apply(list)
    all_rows = []

    for resource, seq in resource_traces.items():
        max_prefix_len = len(seq) if max_len is None else min(max_len, len(seq))

        for k in range(min_len, max_prefix_len):
            prefix = seq[:k]
            next_act = seq[k] if k < len(seq) else None

            all_rows.append({
                'resource': resource,
                'prefix': prefix,
                'prefix_length': k,
                'last_activity': prefix[-1] if len(prefix) else None,
                'next_activity': next_act
            })

    return pd.DataFrame(all_rows)

def apply_bucketing(df):
    df = df.copy()
    df['bucket_key'] = df.apply(
        lambda row: (row['prefix_length'], row['last_activity']), axis=1
    )
    df['bucket_id'] = df['bucket_key'].apply(lambda k: abs(hash(k)) % 10_000_000)
    return df


def process_dataset(file_path):
    df = import_xes(file_path)
    df_clean = clean_dataset(df)
    df_prefix = prefix_extraction(df_clean)
    df_final = apply_bucketing(df_prefix)
    return df_final 


<h1> Loading event-logs and transforming</h1>

<h4> Loading datasets </h4>

In [23]:
df_2013 = process_dataset("datasets/BPI_Challenge_2013_incidents.xes")
print("Sucessfully loaded 2013 dataset")


parsing log, completed traces :: 100%|██████████| 7554/7554 [00:02<00:00, 3214.74it/s]


Sucessfully loaded 2013 dataset


<h1>One-Hot Encoding the event-logs</h1>	


<h4> Apply One-Hot encoding function </h4>

In [24]:
def apply_one_hot_encoding(df):
    df_encoded = df.copy()

    for col in ['last_activity', 'resource', 'next_activity']:
        dummies = pd.get_dummies(df_encoded[col], prefix=col)
        df_encoded = pd.concat([df_encoded.drop(col, axis=1), dummies], axis=1)

    df_encoded = df_encoded.drop(columns=['prefix'], errors='ignore')
    df_encoded = df_encoded.drop(columns=['bucket_key'], errors='ignore')

    return df_encoded




<h4> OHE the BPIC 2013 event-log </h4>

In [25]:
df_2013_onehot = apply_one_hot_encoding(df_2013)
print(df_2013_onehot.head())

   prefix_length  bucket_id  last_activity_Accepted  last_activity_Completed  \
0              1    9460431                    True                    False   
1              2     559493                    True                    False   
2              3    4681928                    True                    False   
3              4    4701852                    True                    False   
4              5    2904627                    True                    False   

   last_activity_Queued  last_activity_Unmatched  resource_-  resource_Aaron  \
0                 False                    False        True           False   
1                 False                    False        True           False   
2                 False                    False        True           False   
3                 False                    False        True           False   
4                 False                    False        True           False   

   resource_Abby  resource_Abdul  ... 

<h1> Training Random Forest model on OHE event-logs </h1>

<h4> Random Forest Training Pipeline </h4>

In [29]:
def random_forest_pipeline(df):
    # Remove rows with no next activity (end of resource trace)
    target_columns = [c for c in df.columns if c.startswith("next_activity_")]
    df_model = df[df[target_columns].sum(axis=1) > 0]

    feature_columns = [
        col for col in df_model.columns 
        if col.startswith("last_activity_")
        or col.startswith("resource_")
        or col == "prefix_length"
        or col == "bucket_id"
    ]

    target_columns = [c for c in df_model.columns if c.startswith("next_activity_")]

    X = df_model[feature_columns]
    y = df_model[target_columns]

    print("Feature matrix shape:", X.shape)
    print("Target matrix shape:", y.shape)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    rf = RandomForestClassifier(
        n_estimators=200,
        max_depth=None,
        n_jobs=-1,
        random_state=42
    )

    rf.fit(X_train, y_train)
    print("Random Forest training completed.")

    y_pred = rf.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    f1score = f1_score(y_test, y_pred, average='weighted')

    print("\nAccuracy:", accuracy_score(y_test, y_pred))
    print("\nClassification Report:\n")
    print(classification_report(y_test, y_pred))

    return accuracy, f1score


In [30]:
rf_2013ohe_accuracy, rf_2013ohe_f1 = random_forest_pipeline(df_2013_onehot)

Feature matrix shape: (64093, 1288)
Target matrix shape: (64093, 4)
Random Forest training completed.

Accuracy: 0.6556673687495125

Classification Report:

              precision    recall  f1-score   support

           0       0.70      0.85      0.77      7721
           1       0.76      0.47      0.58      2862
           2       0.57      0.22      0.32      2236
           3       0.00      0.00      0.00         0

   micro avg       0.70      0.66      0.68     12819
   macro avg       0.51      0.39      0.42     12819
weighted avg       0.69      0.66      0.65     12819
 samples avg       0.66      0.66      0.66     12819



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
