In [None]:
# Reload imported source modules automatically before each cell runs, so edits to
# the project code are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

from event_clustering.preprocessing import *
from event_clustering.clustering import *
from event_clustering.feature_engineering import *
from event_clustering.postprocessing import *
from event_clustering.process_mining import *

from sklearn.cluster import *
from sklearn.feature_extraction.text import TfidfVectorizer
import sklearn.metrics as metrics
import scipy.cluster.hierarchy as sch

# Loading, Preprocessing, Analyzing

In [None]:
# Load the event log stored at file_path and run it through the project's
# preprocessing step.
file_path = 'data/BPIC15_1.xes'
raw_log = load(file_path)
df = preprocess(raw_log)

# Keep the column index as it is right after preprocessing; it is handed to
# replace_with_representative further down.
original_df_columns = df.columns

In [None]:
# Run the project's analyze helper on the frame, then preview the first 10 rows.
analyze(df)
df.head(10)

In [None]:
# Work on the first 1000 rows only (presumably to keep run time manageable).
# .copy() gives the subset its own data: the feature helpers called afterwards
# take df without returning it, so they presumably modify it in place, and doing
# that on a plain slice of the full frame triggers pandas' SettingWithCopyWarning.
df = df.iloc[:1000].copy()

# Feature Generation

In [None]:
# Add the event-type representative feature. The return value is not used, so the
# helper presumably modifies df in place — TODO confirm.
add_event_type_representative(df)
# NOTE(review): the positional arguments (1, 1, True) are not self-explanatory;
# presumably a window before/after the event plus a flag — confirm against the
# helper's definition and consider passing them by keyword.
add_event_reference(df, 1, 1, True)

In [None]:
# Add timestamp-derived features to df (return value unused; presumably in place).
add_timestamp_features(df)

In [None]:
# Display the frame after feature generation (Jupyter truncates the rendering).
df

In [None]:
# List all columns now present on df.
df.columns

# Encoding

In [None]:
# Disabled alternative encoding, kept for reference: uses 'activityNameEN' as the
# text column and 'org:resource' as the categorical column instead of the columns
# encoded in the next cell.
#df_cn = tfidf_encode(df, 'activityNameEN', TfidfVectorizer(stop_words = 'english'))
#df_role = one_hot_encode(df, 'org:resource')

In [None]:
# TF-IDF encode the 'concept:name' text column, ignoring English stop words.
# TfidfVectorizer must be imported explicitly from sklearn.feature_extraction.text;
# relying on a project star import to re-export it is fragile.
activity_vectorizer = TfidfVectorizer(stop_words='english')
df_cn = tfidf_encode(df, 'concept:name', activity_vectorizer)

# One-hot encode the 'org:role' column.
df_role = one_hot_encode(df, 'org:role')
#df_time_previous = one_hot_encode(df, 'feature_time_since_-1')

In [None]:
# Feature matrix used for clustering: the TF-IDF columns joined with the
# one-hot role columns.
df_vectorized = df_cn.join(df_role)
#df_vectorized = df_vectorized.join(df['feature_day_nr'])

In [None]:
# Inspect which encoded columns ended up in the feature matrix.
df_vectorized.columns

# Clustering
* KMeans
* Hierarchical (Agglomerative)

### KMeans

In [None]:
# Helper used to choose the KMeans cluster count for the next cell.
# NOTE(review): the second argument is presumably the maximum number of clusters
# to try — confirm against the helper's definition.
find_optimal_clusters(df_vectorized, 10)

In [None]:
# Number of KMeans clusters, chosen from the analysis in the previous cell.
cluster_nr = 6

# Hyper-parameters collected in one place; random_state pins the seed so the
# assignment is repeatable.
km_settings = dict(
    n_clusters=cluster_nr,
    init_size=1024,
    batch_size=2048,
    random_state=20,
)
km = MiniBatchKMeans(**km_settings)

# Fit the model and keep the cluster index assigned to every row.
clusters_km = km.fit_predict(df_vectorized)

### Hierarchical Clustering

In [None]:
# distance_threshold=0 together with n_clusters=None makes the estimator compute
# the full merge tree instead of stopping at a fixed number of clusters.
hier_dendogram = AgglomerativeClustering(distance_threshold=0, n_clusters=None, linkage = "single").fit(df_vectorized)

# Plot the corresponding dendrogram. truncate_mode='level' with p=40 is passed to
# the project's plot_dendrogram helper (presumably forwarded to scipy's dendrogram,
# which then limits the drawing to p levels); use a smaller p such as 5 for a
# coarser overview.
# Only one figure is created: a figure without a plot_dendrogram call would render
# nothing but an empty set of axes.
plt.figure(figsize=(25, 10))
plt.title('Hierarchical Clustering Dendrogram')
plot_dendrogram(hier_dendogram, truncate_mode='level', p=40)
plt.xlabel("Sample Index or (Cluster Size)")
plt.ylabel("Distance")
plt.show()

In [None]:
# Number of clusters to cut the hierarchy into (read off the dendrogram).
agg_clusters = 3

# Single-linkage agglomerative clustering, fitted on the feature matrix.
single_linkage = AgglomerativeClustering(n_clusters=agg_clusters, linkage="single")
hier = single_linkage.fit(df_vectorized)

## Evaluation

In [None]:
# Toggle which clustering result is visualised.
kmeans = True
hierarchical = False

if kmeans:
    plot_pca(df_vectorized, clusters_km, cluster_nr)

if hierarchical:
    # The hierarchical model was fitted with agg_clusters clusters, so pass that
    # count rather than the KMeans one (cluster_nr).
    plot_pca(df_vectorized, hier.labels_, agg_clusters)

In [None]:
# Silhouette coefficient of each clustering on the same feature matrix.
silhouette_km = metrics.silhouette_score(df_vectorized, clusters_km)
print("Silhouette Coefficient KMeans: ", silhouette_km)

silhouette_hier = metrics.silhouette_score(df_vectorized, hier.labels_)
print("Silhouette Coefficient Hierarchical Clustering: ", silhouette_hier)

In [None]:
# TODO: consider the cophenetic correlation as an additional quality measure for
# the hierarchical clustering: https://en.wikipedia.org/wiki/Cophenetic_correlation

In [None]:
# NOTE: df has to be reloaded before re-running this cell; otherwise the frame
# only contains the cluster representatives left over from an earlier run.
# df = preprocess(load(file_path))

# Attach the cluster label of every event as a column of the original df.

kmeans = True
hierarchical = False

if kmeans:
    df_km = add_cluster_label_km(df, df_vectorized, km)
if hierarchical:
    df_hier = add_cluster_label_hier(df, df_vectorized, hier)
df.head(5)

## Check cluster labels "manually"

In [None]:
# Disabled variant: same grouping on the KMeans-labelled frame, using the
# 'activityNameEN' column.
#eval_manually = df_km.groupby('cluster_label')['activityNameEN'].apply(set)

In [None]:
# Group by cluster label and collect the distinct concept:name values per cluster.
# Use the labelled frame that the kmeans / hierarchical flags actually produced:
# with kmeans=True and hierarchical=False only df_km exists, so referring to
# df_hier unconditionally raises a NameError on a fresh kernel.
df_labeled = df_km if kmeans else df_hier
eval_manually = df_labeled.groupby('cluster_label')['concept:name'].apply(set)

In [None]:
# Show cell contents in full. None is the documented "no limit" value; the old
# -1 sentinel is deprecated and rejected by current pandas versions.
pd.set_option('display.max_colwidth', None)
# Display the per-cluster sets for manual inspection.
eval_manually

## Replace events with cluster representative

In [None]:
# Disabled variant: overwrite activityNameEN with the mapped cluster
# representative, based on the KMeans-labelled frame.
#abstracted_df = replace_with_representative(df_km, "activityNameEN", "cluster_label", original_df_columns)
#abstracted_df.head(5)

In [None]:
# Overwrite concept:name with the mapped cluster representative.
# Pick the labelled frame matching the active clustering flag: df_hier is only
# defined when hierarchical=True, so using it unconditionally fails with a
# NameError when only KMeans labels were attached.
df_labeled = df_km if kmeans else df_hier
abstracted_df = replace_with_representative(df_labeled, "concept:name", "cluster_label", original_df_columns)
abstracted_df.head(5)

In [None]:
# Write the abstracted dataframe to a CSV file (without the index column).
# NOTE(review): the file name hard-codes "km"; adjust it when exporting the result
# of the hierarchical clustering so the two outputs are not confused.
csv_filename = "data/abs_km_bpic15.csv"
write_to_csv(abstracted_df, csv_filename, index=False)

# Compare Process Models from Original Dataset to Abstracted Dataset

In [None]:
# Locations of the two logs that are compared below.
# NOTE(review): the original log is read from the complete XES file, while the
# abstracted CSV was produced from the first 1000 rows only — confirm that this
# comparison is intended.
filepath_original = "data/BPIC15_1.xes"
filepath_abstracted = csv_filename

# Read both files as event logs with the project's readers.
original_log = read_as_log_xes(filepath_original)
abstracted_log = read_as_log_csv(filepath_abstracted)

# Print each log under its heading.
for heading, event_log in (
    ("Original Log:", original_log),
    ("\nAbstracted Log:", abstracted_log),
):
    print(heading)
    print(event_log)

## Mining Algorithms
* Alpha Miner
* Inductive Miner
* Heuristic Miner
* Directly-Follows Graph

In [None]:
# Alpha Miner on both logs. Each call yields three values (presumably the net
# plus its initial and final marking), unpacked for the evaluation cells.
alpha_original = a_miner(original_log)
net_alpha_orig, im_alpha_orig, fm_alpha_orig = alpha_original

alpha_abstracted = a_miner(abstracted_log)
net_alpha, im_alpha, fm_alpha = alpha_abstracted

In [None]:
# Inductive Miner on both logs. Each call yields three values (presumably the net
# plus its initial and final marking), unpacked for the evaluation cells.
inductive_original = ind_miner(original_log)
net_ind_orig, im_ind_orig, fm_ind_orig = inductive_original

inductive_abstracted = ind_miner(abstracted_log)
net_ind, im_ind, fm_ind = inductive_abstracted

In [None]:
# Heuristic Miner on both logs. Each call yields three values (presumably the net
# plus its initial and final marking), unpacked for the evaluation cells.
heuristic_original = heu_miner(original_log)
net_heu_orig, im_heu_orig, fm_heu_orig = heuristic_original

heuristic_abstracted = heu_miner(abstracted_log)
net_heu, im_heu, fm_heu = heuristic_abstracted

In [None]:
# Directly-Follows Graph for the original log, then for the abstracted log.
# The second call is kept as the cell's last expression, so any value it returns
# is still shown as the cell output.
dfg_miner(original_log)
dfg_miner(abstracted_log)

## Process Model Metrics
* Fitness
* Precision
* Generalization
* Simplicity

In [None]:
# Combined evaluation: all four metrics in one call, Heuristic Miner models.
heu_original_args = (original_log, net_heu_orig, im_heu_orig, fm_heu_orig)
heu_abstracted_args = (abstracted_log, net_heu, im_heu, fm_heu)

print("Original Log with Heuristic Miner:")
evaluation_metric(*heu_original_args)

print("\nAbstracted Log with Heuristic Miner")
evaluation_metric(*heu_abstracted_args)

In [None]:
# Combined evaluation of the Inductive Miner models on both logs.
ind_original_args = (original_log, net_ind_orig, im_ind_orig, fm_ind_orig)
ind_abstracted_args = (abstracted_log, net_ind, im_ind, fm_ind)

print("Original Log with Inductive Miner:")
evaluation_metric(*ind_original_args)

print("\nAbstracted Log with Inductive Miner")
evaluation_metric(*ind_abstracted_args)

In [None]:
# Fitness of the Heuristic Miner models, original versus abstracted log.
heu_original_args = (original_log, net_heu_orig, im_heu_orig, fm_heu_orig)
heu_abstracted_args = (abstracted_log, net_heu, im_heu, fm_heu)

print("Original Log with Heuristic Miner:")
fitness_metric(*heu_original_args)

print("\nAbstracted Log with Heuristic Miner")
fitness_metric(*heu_abstracted_args)

In [None]:
# Precision of the Heuristic Miner models, original versus abstracted log.
heu_original_args = (original_log, net_heu_orig, im_heu_orig, fm_heu_orig)
heu_abstracted_args = (abstracted_log, net_heu, im_heu, fm_heu)

print("Original Log with Heuristic Miner:")
precision_metric(*heu_original_args)

print("\nAbstracted Log with Heuristic Miner")
precision_metric(*heu_abstracted_args)

In [None]:
# Generalization of the Heuristic Miner models, original versus abstracted log.
heu_original_args = (original_log, net_heu_orig, im_heu_orig, fm_heu_orig)
heu_abstracted_args = (abstracted_log, net_heu, im_heu, fm_heu)

print("Original Log with Heuristic Miner:")
generalization_metric(*heu_original_args)

print("\nAbstracted Log with Heuristic Miner")
generalization_metric(*heu_abstracted_args)

In [None]:
# Simplicity of the Heuristic Miner models, original versus abstracted log.
heu_original_args = (original_log, net_heu_orig, im_heu_orig, fm_heu_orig)
heu_abstracted_args = (abstracted_log, net_heu, im_heu, fm_heu)

print("Original Log with Heuristic Miner:")
simplicity_metric(*heu_original_args)

print("\nAbstracted Log with Heuristic Miner")
simplicity_metric(*heu_abstracted_args)