In [None]:
import pm4py
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score

import warnings
# Suppress every warning for the rest of the session. This keeps cell output
# short but also hides deprecation notices from the plotting/ML libraries.
warnings.filterwarnings(action='ignore')

# Plot defaults shared by all figures (individual cells override the size).
sns.set_style("whitegrid")
plt.rcParams.update({
    'figure.figsize': (12, 8),
    'font.size': 10,
})

# All figures produced by this notebook are written below this directory;
# it is created (with parents) on first run and reused afterwards.
output_dir = Path("../../Results/Advanced_Analysis/activity_lifecycle_kmeans")
output_dir.mkdir(parents=True, exist_ok=True)

print("Libraries ready")


In [None]:
# Read the XES event log and flatten it into an event-level DataFrame
# (one row per event).
log_path = "../../Dataset/BPI Challenge 2017.xes"
log = pm4py.read_xes(log_path)
df = pm4py.convert_to_dataframe(log)

# Parse timestamps as timezone-aware UTC, then order events chronologically
# inside each case. Later cells build per-case activity lists from this frame,
# so the row order here determines the order of those lists.
df['time:timestamp'] = pd.to_datetime(df['time:timestamp'], utc=True)
df = df.sort_values(
    by=['case:concept:name', 'time:timestamp'],
    ignore_index=True,  # fresh 0..n-1 index, same as reset_index(drop=True)
)

# Quick size check of what was loaded.
print(f"Events: {len(df):,}")
print(f"Cases: {df['case:concept:name'].nunique():,}")
print(f"Activities: {df['concept:name'].nunique()}, lifecycle tags: {df['lifecycle:transition'].nunique()}")


In [None]:
def classify_outcome(seq):
    """Map a case's activity names to a single outcome label.

    Parameters
    ----------
    seq : sized iterable of hashable values, or None
        The activity names observed in one case.

    Returns
    -------
    str
        'Unknown' when ``seq`` is None or empty; otherwise the label of the
        first marker activity found, checking 'A_Denied', then 'A_Cancelled',
        then 'A_Pending'; 'Other' when none of them occurs.
    """
    # Explicit length check (rather than truthiness) so array-like inputs work.
    if seq is None or len(seq) == 0:
        return 'Unknown'

    observed = set(seq)
    # Order matters: an earlier marker wins when several are present.
    priority = (
        ('A_Denied', 'Denied'),
        ('A_Cancelled', 'Cancelled'),
        ('A_Pending', 'Pending'),
    )
    return next((label for marker, label in priority if marker in observed), 'Other')


In [None]:
# Collapse the event log to one row per case: first/last timestamp and the
# list of activity names (in the current row order of `df`), then derive the
# case duration in days, the event count and the outcome label.
case_data = (
    df.groupby('case:concept:name')
      .agg(
          start_time=('time:timestamp', 'min'),
          end_time=('time:timestamp', 'max'),
          activity_sequence=('concept:name', list),
      )
      .reset_index()
      .rename(columns={'case:concept:name': 'case_id'})
      .assign(
          # 86 400 seconds per day
          duration_days=lambda cases: (cases['end_time'] - cases['start_time']).dt.total_seconds() / 86_400,
          num_events=lambda cases: cases['activity_sequence'].map(len),
          outcome=lambda cases: cases['activity_sequence'].map(classify_outcome),
      )
)

# Headline numbers for a quick sanity check.
summary = dict(
    cases=len(case_data),
    median_duration_days=case_data['duration_days'].median(),
    median_events=case_data['num_events'].median(),
)

print(summary)
print("Outcome counts:\n", case_data['outcome'].value_counts())



In [None]:
# Tag every event with "<activity>/<lifecycle>"; events without a lifecycle
# value get the placeholder 'unknown' so the concatenation never yields NaN
# because of a missing lifecycle.
df['lifecycle:transition'] = df['lifecycle:transition'].fillna('unknown')
df['act_lifecycle'] = df['concept:name'] + '/' + df['lifecycle:transition']

# Keep only the most frequent pairs as clustering features.
combo_counts = df['act_lifecycle'].value_counts()
top_n = 25
selected_combos = list(combo_counts.index[:top_n])
print(f"Using top {len(selected_combos)} activity-lifecycle pairs out of {combo_counts.shape[0]}")

# Case x pair count matrix, restricted to the selected pairs and with columns
# in frequency order, then row-normalised so each case's features sum to 1.
# Cases with no selected pair at all never appear in this matrix.
case_combo = (
    df.loc[df['act_lifecycle'].isin(selected_combos)]
      .groupby(['case:concept:name', 'act_lifecycle'])
      .size()
      .unstack(fill_value=0)
      .reindex(columns=selected_combos, fill_value=0)
      # replace(0, 1) guards against division by zero for an all-zero row
      .pipe(lambda counts: counts.div(counts.sum(axis=1).replace(0, 1), axis=0))
)

# Attach per-case duration and event count, and index the result by case id.
feature_df = (
    case_combo
    .merge(
        case_data[['case_id', 'duration_days', 'num_events']],
        left_index=True,
        right_on='case_id',
    )
    .set_index('case_id')
)

print(feature_df.head())



In [None]:
# Silhouette score is computed right after fitting k-means (see next cell)


In [None]:
# Cluster cases on their normalised activity/lifecycle frequencies only
# (duration_days and num_events are deliberately not part of X here).
X = feature_df[selected_combos]
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)  # zero mean / unit variance per feature

k = 3
# Fixed random_state makes the clustering reproducible across runs.
kmeans = KMeans(n_clusters=k, random_state=42, n_init=20)
cluster_labels = kmeans.fit_predict(X_scaled)

feature_df['cluster'] = cluster_labels

# Attach the labels to the per-case table. Any 'cluster' column left over from
# a previous execution of this cell is dropped first: without that, re-running
# the cell makes merge() emit 'cluster_x'/'cluster_y' instead of 'cluster' and
# every later cell that reads case_data['cluster'] fails with a KeyError.
# The merge is an inner join (pandas default), so cases that are absent from
# feature_df are removed from case_data.
case_data = (
    case_data
    .drop(columns='cluster', errors='ignore')
    .merge(feature_df[['cluster']], left_on='case_id', right_index=True)
)

sil_score = silhouette_score(X_scaled, cluster_labels)
print(feature_df['cluster'].value_counts().sort_index())
print(f"Silhouette score (k={k}): {sil_score:.3f}")



In [None]:
# Project the standardised feature matrix onto its first two principal
# components purely for visualisation of the k-means assignment.
pca = PCA(n_components=2, random_state=42)
pca_coords = pca.fit_transform(X_scaled)

# One row per clustered case (same order as feature_df): 2-D coordinates,
# cluster label and the case duration looked up by case id.
plot_df = pd.DataFrame(
    pca_coords,
    columns=['pc1', 'pc2'],
    index=feature_df.index,
).assign(
    cluster=cluster_labels,
    duration_days=case_data.set_index('case_id').loc[feature_df.index, 'duration_days'],
)

fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(data=plot_df, x='pc1', y='pc2', hue='cluster', palette='tab10', s=25, ax=ax)
ax.set_title('PCA projection of case vectors (k-means clusters)')
fig.tight_layout()
fig.savefig(output_dir / 'pca_clusters.png', dpi=150)
plt.show()



In [None]:
# Recompute the outcome label only if an earlier cell has not already added it.
if 'outcome' not in case_data:
    case_data['outcome'] = case_data['activity_sequence'].map(classify_outcome)

# Share of each outcome within every cluster (each row sums to 1).
cluster_outcome = pd.crosstab(
    index=case_data['cluster'],
    columns=case_data['outcome'],
    normalize='index',
)
print(cluster_outcome)

fig, ax = plt.subplots(figsize=(8, 4))
sns.heatmap(cluster_outcome, annot=True, fmt='.2f', cmap='Blues', ax=ax)
ax.set_title('Outcome mix per cluster')
ax.set_ylabel('Cluster')
fig.tight_layout()
fig.savefig(output_dir / 'cluster_outcome_heatmap.png', dpi=150)
plt.show()



In [None]:
# Number of cases assigned to each cluster, ordered by cluster id.
cluster_sizes = feature_df['cluster'].value_counts().sort_index()
print("Cluster sizes:\n", cluster_sizes)

# Figure 1: how many cases fall into each cluster.
fig, ax = plt.subplots(figsize=(6, 4))
sns.barplot(x=cluster_sizes.index, y=cluster_sizes.to_numpy(), palette='tab10', ax=ax)
ax.set(xlabel='Cluster', ylabel='Cases', title='Case count per cluster')
fig.tight_layout()
fig.savefig(output_dir / 'cluster_sizes.png', dpi=150)
plt.show()

# Figure 2: spread of case durations inside each cluster.
fig, ax = plt.subplots(figsize=(6, 4))
sns.boxplot(data=case_data, x='cluster', y='duration_days', palette='tab10', ax=ax)
ax.set_ylabel('Duration (days)')
ax.set_title('Duration distribution per cluster')
fig.tight_layout()
fig.savefig(output_dir / 'cluster_duration_boxplot.png', dpi=150)
plt.show()



In [None]:
# Mean normalised frequency of every selected activity/lifecycle pair within
# each cluster: rows are clusters, columns follow the order of selected_combos.
cluster_combo_mean = feature_df.groupby('cluster')[selected_combos].mean()

fig, ax = plt.subplots(figsize=(12, 5))
sns.heatmap(cluster_combo_mean, cmap='YlOrRd', annot=False, ax=ax)
ax.set(
    title='Average normalized activity-lifecycle usage by cluster',
    xlabel='Activity / Lifecycle',
    ylabel='Cluster',
)
fig.tight_layout()
fig.savefig(output_dir / 'cluster_combo_heatmap.png', dpi=150)
plt.show()

