In [1]:
import os

import numpy as np
import pandas as pd
import pm4py
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder

In [8]:
def import_xes(file_path):
    """Load an XES event log and return it as a DataFrame.

    Args:
        file_path: Location of the XES file, handed to ``pm4py.read_xes``.

    Returns:
        The result of ``pm4py.convert_to_dataframe`` applied to the parsed log.
    """
    return pm4py.convert_to_dataframe(pm4py.read_xes(file_path))

# Location of the BPI Challenge 2019 XES log. Set the BPI_2019_XES_PATH
# environment variable to point at a local copy of the file; when it is unset
# the original hard-coded location is used, so existing runs are unaffected.
XES_LOG_PATH = os.environ.get(
    "BPI_2019_XES_PATH",
    "/Users/6706363/Downloads/BPI_Challenge_2019.xes",
)

event_log = import_xes(XES_LOG_PATH)


parsing log, completed traces :: 100%|██████████| 251734/251734 [00:53<00:00, 4730.67it/s]


In [9]:
# Keep only the four columns used by the analysis below and order the rows by
# resource, then by timestamp within each resource.
df = (
    event_log
    .loc[:, ['case:concept:name', 'concept:name', 'org:resource', 'time:timestamp']]
    .sort_values(by=['org:resource', 'time:timestamp'])
)

df.head(20)

Unnamed: 0,case:concept:name,concept:name,org:resource,time:timestamp
118143,4507004931_00010,Vendor creates invoice,NONE,1948-01-26 22:59:00+00:00
118144,4507004931_00010,Vendor creates debit memo,NONE,1948-01-26 22:59:00+00:00
118153,4507004931_00020,Vendor creates invoice,NONE,1948-01-26 22:59:00+00:00
118154,4507004931_00020,Vendor creates debit memo,NONE,1948-01-26 22:59:00+00:00
118163,4507004931_00030,Vendor creates invoice,NONE,1948-01-26 22:59:00+00:00
118164,4507004931_00030,Vendor creates debit memo,NONE,1948-01-26 22:59:00+00:00
118173,4507004931_00040,Vendor creates invoice,NONE,1948-01-26 22:59:00+00:00
118174,4507004931_00040,Vendor creates debit memo,NONE,1948-01-26 22:59:00+00:00
118183,4507004931_00050,Vendor creates invoice,NONE,1948-01-26 22:59:00+00:00
118184,4507004931_00050,Vendor creates debit memo,NONE,1948-01-26 22:59:00+00:00


In [10]:
# Number of distinct values in the resource column; missing values are not
# counted (nunique drops them by default).
resource_column = df['org:resource']
unique_resources = resource_column.nunique(dropna=True)

print(f"Number of unique resources: {unique_resources}")

Number of unique resources: 628


In [11]:
# Per-resource event totals: the number of non-null 'concept:name' entries in
# each 'org:resource' group.
activity_counts = df['concept:name'].groupby(df['org:resource']).count()

# Mean of those totals across all resources.
average_activity_length = activity_counts.mean()

print(f"Average activity sequence length per resource: {average_activity_length}")

Average activity sequence length per resource: 2541.2786624203823


In [12]:
# Per-resource list of activities in chronological order.
# A stable sort is requested explicitly: the default algorithm for a
# single-column sort is not stable, so events that share a timestamp could
# otherwise be reordered relative to their order in `df`.
resource_activity_sequences = (
    df.sort_values('time:timestamp', kind='stable')
    .groupby('org:resource')['concept:name']
    .apply(list)
)

def count_repetitions(activity_list):
    """Count how many entries repeat an activity already present in the sequence.

    Every occurrence of a value beyond its first one counts as one repetition,
    regardless of position, so the result equals the sequence length minus the
    number of distinct values.

    Args:
        activity_list: Iterable of hashable activity labels.

    Returns:
        The number of repeated entries as an int (0 for an empty sequence).
    """
    activities = list(activity_list)
    return len(activities) - len(set(activities))

# Repetition total for every resource, computed element-wise over the
# per-resource activity lists.
repetition_counts = resource_activity_sequences.map(count_repetitions)

# Mean repetition total across resources.
average_repetitions = repetition_counts.mean()

print(f"Average activity repetition per resource: {average_repetitions}")

Average activity repetition per resource: 2538.457006369427


In [13]:
# Specialization per resource, derived from the Shannon entropy of the
# resource's activity distribution (pandas/numpy come from the import cell at
# the top of the notebook).
#
# NOTE: `activity_counts` is rebound here to per-(resource, activity) counts;
# it no longer holds the per-resource totals computed in an earlier cell.
activity_counts = df.groupby(['org:resource', 'concept:name']).size()
total_counts = activity_counts.groupby(level='org:resource').transform('sum')
probabilities = activity_counts / total_counts

# Shannon entropy per resource: H = -sum(p * log2(p)), computed as one
# vectorized product followed by a grouped sum instead of a Python-level
# lambda per group.
weighted_log_probs = probabilities * np.log2(probabilities)
entropy = -weighted_log_probs.groupby(level='org:resource').sum()
# A resource with a single activity gives -(1 * log2(1)) = -0.0; adding 0.0
# turns the negative zero into +0.0 so it is not displayed as "-0.000000".
entropy = entropy + 0.0

# Normalize by the maximum possible entropy (log2 of number of unique activities)
num_unique_acts = df.groupby('org:resource')['concept:name'].nunique()
max_entropy = np.log2(num_unique_acts)

# Avoid division by zero: log2(1) == 0 when a resource has only 1 unique activity
max_entropy_replaced = max_entropy.replace(0, np.nan)
normalized_entropy = entropy / max_entropy_replaced

# Invert to get the specialization score (1 = always the same activity)
specialization_score = 1 - normalized_entropy

# Resources with only 1 activity produced NaN above; treat them as fully
# specialized (score 1.0, normalized entropy 0.0)
specialization_score = specialization_score.fillna(1.0)
normalized_entropy = normalized_entropy.fillna(0.0)

# Combine into a DataFrame indexed by resource
specialization_df = pd.DataFrame({
    'entropy': entropy,
    'normalized_entropy': normalized_entropy,
    'specialization_score': specialization_score
})

print(specialization_df)

# Calculate and print average specialization
average_specialization = specialization_score.mean()
print(f"Average specialization across all resources: {average_specialization:.4f}")


               entropy  normalized_entropy  specialization_score
org:resource                                                    
NONE          1.207015            0.603507              0.396493
batch_00      3.223846            0.773118              0.226882
batch_01      0.867767            0.867767              0.132233
batch_02      1.616886            0.625497              0.374503
batch_03      1.020293            0.643733              0.356267
...                ...                 ...                   ...
user_602      1.077512            0.538756              0.461244
user_603     -0.000000            0.000000              1.000000
user_604     -0.000000            0.000000              1.000000
user_605      1.419556            0.895640              0.104360
user_606     -0.000000            0.000000              1.000000

[628 rows x 3 columns]
Average specialization across all resources: 0.7830
