<h2> Imports, loading event-log function and cleaning pipeline </h2>

<h4> TODO List </h4>
<ul>
    <li> Refactor some code, mainly abstrahize some training and testing logic </li>
    <li> Grid search to find best parameters for each feature and model </li>
</ul>

In [2]:
import numpy as np
import pandas as pd
import pm4py
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, f1_score
import statistics
from collections import Counter

In [3]:
# Importing dataset from file path
def import_xes(file_path):
    log = pm4py.read_xes(file_path)
    return pm4py.convert_to_dataframe(log)

# Cleaning dataset: removing unnecessary columns, shifting to resource focus
def clean_dataset(df):
    df_final = df[['case:concept:name', 'concept:name', 'org:resource', 'time:timestamp']]
    return df_final.sort_values(by=['org:resource', 'time:timestamp'])

def prefix_extraction(df, min_len=1, max_len=None):
    """
    Extract prefizes of activity sequences for each resource.
    Each resourece has a timeline, this creates prefixes of what 
    that resource has done up until each point in time.
    """
    resource_traces = df.groupby('org:resource')['concept:name'].apply(list)
    all_prefix_rows = []

    for resource, seq in resource_traces.items():
        if max_len is None:
            max_prefix_len = len(seq)
        else:
            max_prefix_len = min(max_len, len(seq))

        for k in range(min_len, max_prefix_len + 1):
            prefix = seq[:k]
            all_prefix_rows.append({
                'resource' : resource,
                'prefix_length' : k,
                'prefix' : prefix
            })
    return pd.DataFrame(all_prefix_rows)

def apply_bucketing(prefix_df):
    """
    Takes the prefix datafram with columns:
        1. resource that performs
        2. prefix_length
        3. prefix (list of activities)
    
    To this we add:
        1. last_activity
        2. bucket_key
        3. bucket_id

    In essence this bucket buckets prefixes that have the 
    same lenght AND share the same final activity. 

    The tuples are then also converted to hashes to make 
    sure any model can work fine with our buckets, even
    if they require numerical bucket IDs
    """

    # extract last activity for each prefix
    prefix_df['last_activity'] = prefix_df['prefix'].apply(
        lambda p: p[-1] if isinstance(p,list) and len(p) > 0 else None
    )

    # creating tuple keys used for bucketing --> defines semantic bucket
    prefix_df['bucket_key'] = prefix_df.apply(
        lambda row: (row['prefix_length'], row['last_activity']),
        axis = 1
    )

    # creating stable bucket ID based on the key
    prefix_df['bucket_id'] = prefix_df['bucket_key'].apply(
        lambda k: abs(hash(k)) % 10_000_000
    )

    return prefix_df


def last_state_encoding(df):
    # Applying last-state encoding to prevent data leakage
    df_encoded = df.copy()

    df_encoded['last_activity'] = df_encoded['prefix'].apply(
        lambda p: p[-1] if p else None
    )
    return df_encoded


def process_dataset(file_path, min_len=1, max_len=None):
    df = import_xes(file_path)
    df_clean = clean_dataset(df)
    df_prefixes = prefix_extraction(df_clean, min_len=min_len, max_len=max_len)
    df_bucketed = apply_bucketing(df_prefixes)
    df_encoded = last_state_encoding(df_bucketed)

    return df_encoded


<h1> Loading event-logs and transforming</h1>

<h4> Loading datasets </h4>

In [4]:
df_2013 = process_dataset("datasets/BPI_Challenge_2013_incidents.xes")
print("Sucessfully loaded 2013 dataset")


  from .autonotebook import tqdm as notebook_tqdm
parsing log, completed traces :: 100%|██████████| 7554/7554 [00:01<00:00, 3874.05it/s]


Sucessfully loaded 2013 dataset


In [5]:
df_2017 = process_dataset("datasets/BPI_Challenge_2017.xes")
print("Sucessfully loaded 2017 dataset")


parsing log, completed traces :: 100%|██████████| 31509/31509 [00:22<00:00, 1387.65it/s]


: 

In [None]:
df_2018 = process_dataset("datasets/BPI_Challenge_2018.xes")
print("Sucessfully loaded 2018 dataset")


In [None]:
df_2019 = process_dataset("datasets/BPI_Challenge_2019.xes")
print("Succesfully loaded, bucket, and encoded all datasets")

<h1>One-Hot Encoding the event-logs</h1>	


<h4> Apply One-Hot encoding function </h4>

In [6]:
def apply_one_hot_encoding(df, columns=['last_activity', 'resource']):
    df_encoded = df.copy()

    for col in columns:
        if col in df_encoded.columns:
            dummies = pd.get_dummies(df_encoded[col], prefix=col)
            df_encoded = pd.concat([df_encoded.drop(col, axis=1), dummies], axis=1)
    return df_encoded

<h4> OHE the BPIC 2013 event-log </h4>

In [7]:
df_2013_onehot = apply_one_hot_encoding(df_2013)

print(df_2013_onehot.head())

   prefix_length                                             prefix  \
0              1                                         [Accepted]   
1              2                               [Accepted, Accepted]   
2              3                     [Accepted, Accepted, Accepted]   
3              4           [Accepted, Accepted, Accepted, Accepted]   
4              5  [Accepted, Accepted, Accepted, Accepted, Accep...   

      bucket_key  bucket_id  last_activity_Accepted  last_activity_Completed  \
0  (1, Accepted)    3013051                    True                    False   
1  (2, Accepted)    1375001                    True                    False   
2  (3, Accepted)    8870692                    True                    False   
3  (4, Accepted)     660535                    True                    False   
4  (5, Accepted)    6457247                    True                    False   

   last_activity_Queued  last_activity_Unmatched  resource_-  resource_Aaron  \
0           

<h4> OHE the BPIC 2017 event-log </h4>

In [31]:
df_2017['next_activity'] = df_2017.groupby('org:resource')['concept:name'].shift(-1)
df_2017 = df_2017.dropna(subset=['next_activity'])

# Encoding the 'concept:name' column, as these are the activities
column_to_encode = 'concept:name'

# OHE BPIC 2017 using pandas' get_dummies
df_2017_OHE = pd.get_dummies(df_2017, columns=[column_to_encode], prefix='activity')
df_2017_OHE = pd.get_dummies(df_2017_OHE, columns=['org:resource'], prefix='resource')

print("Columns in final dataframe 2017:")
print(df_2017_OHE.columns.tolist())

Columns in final dataframe 2017:
['case:concept:name', 'time:timestamp', 'next_activity', 'activity_A_Accepted', 'activity_A_Cancelled', 'activity_A_Complete', 'activity_A_Concept', 'activity_A_Create Application', 'activity_A_Denied', 'activity_A_Incomplete', 'activity_A_Pending', 'activity_A_Submitted', 'activity_A_Validating', 'activity_O_Accepted', 'activity_O_Cancelled', 'activity_O_Create Offer', 'activity_O_Created', 'activity_O_Refused', 'activity_O_Returned', 'activity_O_Sent (mail and online)', 'activity_O_Sent (online only)', 'activity_W_Assess potential fraud', 'activity_W_Call after offers', 'activity_W_Call incomplete files', 'activity_W_Complete application', 'activity_W_Handle leads', 'activity_W_Personal Loan collection', 'activity_W_Shortened completion ', 'activity_W_Validate application', 'resource_User_1', 'resource_User_10', 'resource_User_100', 'resource_User_101', 'resource_User_102', 'resource_User_103', 'resource_User_104', 'resource_User_105', 'resource_User_

<h4> OHE the BPIC 2018 event-log </h4>

In [32]:
df_2018['next_activity'] = df_2018.groupby('org:resource')['concept:name'].shift(-1)
df_2018 = df_2018.dropna(subset=['next_activity'])

# Encoding the 'concept:name' column, as these are the activities
column_to_encode = 'concept:name'

# OHE BPIC 2018 using pandas' get_dummies
df_2018_OHE = pd.get_dummies(df_2018, columns=[column_to_encode], prefix='activity')
df_2018_OHE = pd.get_dummies(df_2018_OHE, columns=['org:resource'], prefix='resource')

print("Columns in final dataframe 2018:")
print(df_2018_OHE.columns.tolist())

Columns in final dataframe 2018:
['case:concept:name', 'time:timestamp', 'next_activity', 'activity_abort external', 'activity_abort payment', 'activity_approve', 'activity_begin admissibility check', 'activity_begin editing', 'activity_begin editing from refused', 'activity_begin payment', 'activity_begin preparations', 'activity_calculate', 'activity_calculate protocol', 'activity_cancel offline', 'activity_change department', 'activity_check', 'activity_check admissibility', 'activity_clear', 'activity_correction GFM17', 'activity_create', 'activity_decide', 'activity_discard', 'activity_finish editing', 'activity_finish payment', 'activity_finish pre-check', 'activity_finish preparations', 'activity_initialize', 'activity_insert document', 'activity_mail income', 'activity_mail valid', 'activity_performed', 'activity_performed offline', 'activity_plan', 'activity_prepare external', 'activity_prepare offline', 'activity_refuse', 'activity_remove document', 'activity_restart editing'

<h4> OHE the BPIC 2019 event-log </h4>

In [33]:
df_2019['next_activity'] = df_2019.groupby('org:resource')['concept:name'].shift(-1)
df_2019 = df_2019.dropna(subset=['next_activity'])

# Encoding the 'concept:name' column, as these are the activities
column_to_encode = 'concept:name'

# OHE BPIC 2017 using pandas' get_dummies
df_2019_OHE = pd.get_dummies(df_2019, columns=[column_to_encode], prefix='activity')
df_2019_OHE = pd.get_dummies(df_2019_OHE, columns=['org:resource'], prefix='resource')

print("Columns in final dataframe 2019:")
print(df_2019_OHE.columns.tolist())

Columns in final dataframe 2019:
['case:concept:name', 'time:timestamp', 'next_activity', 'activity_Block Purchase Order Item', 'activity_Cancel Goods Receipt', 'activity_Cancel Invoice Receipt', 'activity_Cancel Subsequent Invoice', 'activity_Change Approval for Purchase Order', 'activity_Change Currency', 'activity_Change Delivery Indicator', 'activity_Change Final Invoice Indicator', 'activity_Change Price', 'activity_Change Quantity', 'activity_Change Rejection Indicator', 'activity_Change Storage Location', 'activity_Change payment term', 'activity_Clear Invoice', 'activity_Create Purchase Order Item', 'activity_Create Purchase Requisition Item', 'activity_Delete Purchase Order Item', 'activity_Reactivate Purchase Order Item', 'activity_Receive Order Confirmation', 'activity_Record Goods Receipt', 'activity_Record Invoice Receipt', 'activity_Record Service Entry Sheet', 'activity_Record Subsequent Invoice', 'activity_Release Purchase Order', 'activity_Release Purchase Requisition'

<h1> 2-Gram encoding the event-logs </h1>

<h4> Creating bigram features function </h4>

In [9]:
def generate_bigram_features(df, window_size=5):
    """
    Splits resource streams into sliding windows and computes
    traditional 2-gram (Bigram) counts for each window.
    """
    print(f"Generating 2-gram features with window size: {window_size})")
    
    feature_dicts = []
    targets = []    
    
    for resource, group in df.groupby('org:resource'):
        activities = group['activity'].values
        
        # Sliding window over the resource's timeline
        for i in range(len(activities) - window_size):
            # The context (past N events)
            window = activities[i : i + window_size]
            # The target (next event)
            next_act = activities[i + window_size]
            
            bigrams = [
                f"{window[j]}->{window[j+1]}" 
                for j in range(len(window) - 1)
            ]

            bigram_counts = dict(Counter(bigrams))
            
            feature_dicts.append(bigram_counts)
            targets.append(next_act)

    return feature_dicts, targets

<h4> 2-Gram encoding the 2013 log </h4>

In [22]:
df_2013_2gram = event_log_2013[[
    'case:concept:name', 
    'concept:name', 
    'org:resource', 
    'time:timestamp', 
]].copy()

df_2013_2gram['activity'] = df_2013_2gram['concept:name']

df_2013_2gram = df_2013_2gram.sort_values(by=['org:resource', 'time:timestamp'])

df_2013_2gram = df_2013_2gram.dropna(subset=['org:resource'])

feature_dicts_2013, y_2gram_2013 = generate_bigram_features(df_2013_2gram)

vectorizer = DictVectorizer(sparse=True)
X_2gram_2013 = vectorizer.fit_transform(feature_dicts_2013)

Generating 2-gram features with window size: 5)


<h4> 2-Gram encoding the 2017 log </h4>

In [11]:
df_2017_2gram = event_log_2017[[
    'case:concept:name', 
    'concept:name', 
    'org:resource', 
    'time:timestamp', 
]].copy()

df_2017_2gram['activity'] = df_2017_2gram['concept:name']

df_2017_2gram = df_2017_2gram.sort_values(by=['org:resource', 'time:timestamp'])

df_2017_2gram = df_2017_2gram.dropna(subset=['org:resource'])

# Run it
feature_dicts_2017, y_2gram_2017 = generate_bigram_features(df_2017_2gram)

# Vectorize
vectorizer = DictVectorizer(sparse=True)
X_2gram_2017 = vectorizer.fit_transform(feature_dicts_2017)

Generating 2-gram features with window size: 5)


<h4> 2-Gram encoding the 2018 log </h4>

In [None]:
df_2018_2gram = event_log_2018[[
    'case:concept:name', 
    'concept:name', 
    'org:resource', 
    'time:timestamp', 
]].copy()

df_2018_2gram['activity'] = df_2018_2gram['concept:name']

df_2018_2gram = df_2018_2gram.sort_values(by=['org:resource', 'time:timestamp'])

df_2018_2gram = df_2018_2gram.dropna(subset=['org:resource'])

# Run it
feature_dicts_2018, y_2gram_2018 = generate_bigram_features(df_2018_2gram)

# Vectorize
vectorizer = DictVectorizer(sparse=True)
X_2gram_2018 = vectorizer.fit_transform(feature_dicts_2018)


⚙️ Generating 2-gram features (Window Size: 5)...


<h4> 2-Gram encoding the 2019 log </h4>

In [12]:
df_2019_2gram = event_log_2019[[
    'case:concept:name', 
    'concept:name', 
    'org:resource', 
    'time:timestamp', 
]].copy()

df_2019_2gram['activity'] = df_2019_2gram['concept:name']

df_2019_2gram = df_2019_2gram.sort_values(by=['org:resource', 'time:timestamp'])

df_2019_2gram = df_2019_2gram.dropna(subset=['org:resource'])

# Run it
feature_dicts_2019, y_2gram_2019 = generate_bigram_features(df_2019_2gram)

# Vectorize
vectorizer = DictVectorizer(sparse=True)
X_2gram_2019 = vectorizer.fit_transform(feature_dicts_2019)

Generating 2-gram features with window size: 5)


<h1> Training Random Forest model on OHE event-logs </h1>

<h4> Random Forest Training Pipeline </h4>

In [10]:
def train_evaluate_rf(X, y, random_state=1):
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=random_state, stratify=y
    )
    
    rf = RandomForestClassifier(n_estimators=100, random_state=random_state, n_jobs=-1)
    rf.fit(X_train, y_train)
    
    y_pred = rf.predict(X_test)
    
    return accuracy_score(y_test, y_pred), f1_score(y_test, y_pred, average='weighted')


def add_next_activity(df):

    resource_sequences = df.groupby('resource')['prefix'].apply(lambda x: x.iloc[-1]).to_dict()
    next_activities = []

    for idx, row in df.iterrows():
        prefix = row['prefix']
        resource = row['resource']
        full_seq = resource_sequences[resource]

        if len(prefix) < len(full_seq):
            next_activities.append(full_seq[len(prefix)])  # next activity
        else:
            next_activities.append(None)  # last prefix has no next activity
    
    df['next_activity'] = next_activities
    df = df.dropna(subset=['next_activity'])

    return df
    

<h4> Training Random forest on 2013 incidents log </h4>


In [None]:
# BPIC 2013 incidents
feature_columns_2013 = [col for col in df_2013_OHE.columns if col.startswith('activity_') or col.startswith('resource')]
X_2013_OHE = df_2013_OHE[feature_columns_2013]
y_2013_OHE = df_2013_OHE['next_activity']

accuracy_2013_OHE, f1_2013_OHE = train_evaluate_rf(X_2013_OHE, y_2013_OHE)

Class distribution:
Accepted     36157
Completed    13490
Queued       10084
Unmatched        1
Name: count, dtype: int64
Got to here


<h4> Training Random Forest on 2017 log </h4>

In [None]:
# BPIC 2017
feature_columns_2017 = [col for col in df_2017_OHE.columns if col.startswith('activity_') or col.startswith('resource')]
X_2017_OHE = df_2017_OHE[feature_columns_2017]
y_2017_OHE = df_2017_OHE['next_activity']

accuracy_2017_OHE_OHE, f1_2017_OHE = train_evaluate_rf(X_2017_OHE, y_2017_OHE)


<h4>Training Random Forest on 2018 event-log </h4>

In [None]:
# BPIC 2018
feature_columns_2018 = [col for col in df_2018_OHE.columns if col.startswith('activity_') or col.startswith('resource')]
X_2018_OHE = df_2018_OHE[feature_columns_2018]
y_2018_OHE = df_2018_OHE['next_activity']

accuracy_2018_OHE_OHE, f1_2018_OHE = train_evaluate_rf(X_2018_OHE, y_2018_OHE)

<h4> Training Random Forest on 2019 event-log </h4>

In [None]:
# BPIC 2019
feature_columns_2019 = [col for col in df_2019_OHE.columns if col.startswith('activity_') or col.startswith('resource')]
X_2019_OHE = df_2019_OHE[feature_columns_2019]
y_2019_OHE = df_2019_OHE['next_activity']

accuracy_2019_OHE_OHE, f1_2019_OHE = train_evaluate_rf(X_2019_OHE, y_2019_OHE)

<h1> Random Forest Classifier Results </h1>

<h4> One-Hot Encoding results </h4>

In [None]:
# Results OHE 2013 log
print(f"Accuracy 2013: {accuracy_2013_OHE_OHE}")
print(f"F1-score 2013: {f1_2013_OHE}")

# Results OHE 2017 log
print(f"Accuracy 2017: {accuracy_2017_OHE_OHE}")
print(f"F1-score 2017: {f1_2017_OHE}")


# Results OHE 2018 log
print(f"Accuracy 2018: {accuracy_2018_OHE_OHE}")
print(f"F1-score 2018: {f1_2018_OHE}")

# Results OHE 2019 log
print(f"Accuracy 2019: {accuracy_2019_OHE_OHE}")
print(f"F1-score 2019: {f1_2019_OHE}")


# Aggregated results 
aggregated_accuracy = statistics.mean([accuracy_2013_OHE_OHE,accuracy_2017_OHE_OHE, accuracy_2018_OHE_OHE, accuracy_2019_OHE_OHE])
print(f"Average accuracy is: {aggregated_accuracy}")
aggregated_f1score = statistics.mean([f1_2013_OHE,f1_2017_OHE,f1_2018_OHE,f1_2019_OHE])
print(f"Average f1-score is: {aggregated_f1score}")

Accuracy 2013: 0.7071534441064046
F1-score 2013: 0.6681529055831646
Accuracy 2017: 0.6663145110305128
F1-score 2017: 0.6437193163447493
Accuracy 2018: 0.6308069869794619
F1-score 2018: 0.6211972047571935
Accuracy 2019: 0.9579513506906246
F1-score 2019: 0.9543415517430779
Average accuracy is: 0.740556573201751
Average f1-score is: 0.7218527446070464


<h1> Training Random Forest on 2-gram encoding </h1>

<h4> Training Random Forest on 2-gram encoding on 2013 incidents log </h4>

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_2gram_2013, y_2gram_2013, test_size=0.2, random_state=1)

rf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

y_pred = rf.predict(X_test)

accuracy_2013_2gram, f1_2013_gram = train_evaluate_rf(X_2gram_2013, y_2gram_2013)



Class distribution:
Accepted     36157
Completed    13490
Queued       10084
Unmatched        1
Name: count, dtype: int64
Got to here


ValueError: The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.

<h4> Training Random Forest on 2-gram encoding on 2017 log </h4>

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X_2gram_2017, y_2gram_2017, test_size=0.2, random_state=1)

rf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

accuracy_2gram_2017 = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy_2gram_2017:.4f}")
f1score_2gram_2017 = f1_score(y_test, y_pred, average='weighted')
print(f"F1 score: {f1score_2gram_2017}")


Accuracy: 0.8605
F1 score: 0.8501345416297915


<h4> Training Random Forest on 2-gram encoding on 2018 log </h4>

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_2gram_2018, y_2gram_2018, test_size=0.2, random_state=1)

rf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

accuracy_2gram_2018 = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy_2gram_2018:.4f}")
f1score_2gram_2018 = f1_score(y_test, y_pred, average='weighted')
print(f"F1 score: {f1score_2gram_2018}")

<h4> Training Random Forest on 2-gram encoding on 2019 log </h4>

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X_2gram_2019, y_2gram_2019, test_size=0.2, random_state=1)

rf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

accuracy_2gram_2019 = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy_2gram_2019:.4f}")
f1score_2gram_2019 = f1_score(y_test, y_pred, average='weighted')
print(f"F1 score: {f1score_2gram_2019}")

Accuracy: 0.9568
F1 score: 0.9526134041550297


<h4> Aggregating Random Forest with 2-gram results </h4>

In [16]:
# Aggregated results 
aggregated_accuracy = statistics.mean([accuracy_2gram_2013, accuracy_2gram_2017, accuracy_2gram_2019])
print(f"Average accuracy is: {aggregated_accuracy}")
aggregated_f1score = statistics.mean([f1score_2gram_2013, f1score_2gram_2017, f1score_2gram_2019])
print(f"Average f1-score is: {aggregated_f1score}")

Average accuracy is: 0.8384522867796331
Average f1-score is: 0.8114549707865334
