In [None]:
# Enable IPython's autoreload extension. Mode 2 reloads all imported modules
# before each cell is executed, so edits to the project source files take
# effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# NOTE: the order of the star imports is kept as-is on purpose; a later star
# import shadows same-named objects from an earlier one.
from event_clustering.preprocessing import *
from event_clustering.clustering import *
from event_clustering.feature_engineering import *
from event_clustering.postprocessing import *
from event_clustering.process_mining import *

from sklearn.cluster import *
from sklearn.feature_extraction.text import TfidfVectorizer
import sklearn.metrics as metrics

# Loading, Preprocessing, and Analyzing

In [None]:
# Path of the XES event log used throughout the notebook.
file_path = 'data/DomesticDeclarations.xes'

# Read the log first, then hand it to the project's preprocessing step;
# the result is the working frame `df` used by all following cells.
loaded_log = load(file_path)
df = preprocess(loaded_log)

In [None]:
# Run the project's analyze() helper on the preprocessed frame; its return
# value is discarded, so any report it produces must come from the helper itself
# (presumably printed summary statistics — confirm in event_clustering).
analyze(df)
# Last expression of the cell: display the first 10 rows as a sanity check.
df.head(10)

In [None]:
# Restrict the experiment to the first 1000 rows (events) of the log.
# An explicit copy is taken because the following cells call helpers that
# discard their return value and therefore must add columns to `df` in place;
# writing into a slice of the full frame raises pandas' SettingWithCopyWarning
# and is not guaranteed to behave consistently.
# NOTE(review): the cut is purely positional, so the last trace in the subset
# may be incomplete — confirm this is acceptable for the analysis.
df = df.iloc[:1000].copy()

# Feature Generation

In [None]:
# Derive event-level features. Both helpers are called only for their side
# effects (return values are discarded), so they are expected to modify `df`
# in place.
add_event_type_representative(df)
# NOTE(review): the positional arguments (1, 1, True) are opaque at the call
# site — check add_event_reference's signature and consider passing keywords.
add_event_reference(df, 1, 1, True)

In [None]:
# Add timestamp-derived features; the return value is discarded, so the helper
# is expected to modify `df` in place.
# NOTE(review): the commented-out join in the encoding section refers to a
# 'feature_day_nr' column — presumably created here; confirm.
add_timestamp_features(df)

# Encoding

In [None]:
# Encode the two columns used for clustering.
# The TF-IDF vectorizer is created explicitly (with scikit-learn's built-in
# English stop-word list) and passed to the project's tfidf_encode helper.
# TfidfVectorizer is imported explicitly in the import cell instead of relying
# on a star import to leak the name into the notebook namespace.
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
df_cn = tfidf_encode(df, 'concept:name', tfidf_vectorizer)

# One-hot encoding of the 'org:role' column via the project helper.
df_role = one_hot_encode(df, 'org:role')

In [None]:
# Assemble the clustering feature matrix: the TF-IDF encoding of the activity
# name joined with the one-hot encoded role columns.
df_vectorized = df_cn.join(df_role)
# Optional additional feature, currently disabled:
#df_vectorized = df_vectorized.join(df['feature_day_nr'])

# Clustering + Evaluation

In [None]:
# Explore how many clusters suit the encoded feature matrix.
# NOTE(review): the second argument (10) is presumably the upper bound on the
# number of clusters tried — confirm against find_optimal_clusters.
find_optimal_clusters(df_vectorized, 10)

In [None]:
# Configure mini-batch k-means; the fixed random_state keeps the clustering
# reproducible between runs.
km = MiniBatchKMeans(
    n_clusters=4,
    init_size=1024,
    batch_size=2048,
    random_state=20,
)

# Fit the model and obtain one cluster label per row of the feature matrix.
clusters = km.fit_predict(df_vectorized)

In [None]:
# Visualise the cluster assignment for the feature matrix.
# NOTE(review): judging by its name, plot_pca projects the features with PCA
# before plotting — confirm in event_clustering.
plot_pca(df_vectorized, clusters)

In [None]:
# Mean silhouette coefficient over all samples for the k-means labelling
# (ranges from -1 to 1; higher values indicate better-separated clusters).
metrics.silhouette_score(df_vectorized, clusters)

In [None]:
# Set the cluster label as a column in the original df.
# NOTE(review): the helper receives the fitted model `km` rather than the
# `clusters` array computed earlier; presumably it derives the labels from
# df_vectorized itself — confirm in event_clustering.
df = add_cluster_label(df, df_vectorized, km)
# Display the first rows to check the new column.
df.head(5)

In [None]:
# TODO: inspect which events were assigned to which cluster.

In [None]:
# Overwrite concept:name with the mapped cluster representative, producing the
# abstracted event frame.
# NOTE(review): the "concept:name_representative" and "cluster_label" columns
# are presumably created by add_event_type_representative and
# add_cluster_label respectively — confirm.
abstracted_df = replace_with_representative(df, "concept:name_representative", "cluster_label")
# Display the first rows of the abstracted frame.
abstracted_df.head(5)

In [None]:
# Write the abstracted dataframe to a CSV file. The same path is reused further
# down (as `filepath_abstracted`) to read the abstracted data back as an event log.
csv_filename = "data/abstracted_df_domestic_declarations.csv"
write_to_csv(abstracted_df, csv_filename, index=False)

# Compare Process Models from Original Dataset to Abstracted Dataset

In [None]:
# Read the original and the abstracted dataset as event logs.
# NOTE(review): original_log is read from the complete XES file, whereas the
# abstracted CSV is derived from `df`, which an earlier cell truncates to its
# first 1000 rows — the two logs may therefore not cover the same events;
# confirm that this comparison is intended.
filepath_original = "data/DomesticDeclarations.xes"
filepath_abstracted = csv_filename

original_log = read_as_log_xes(filepath_original)
abstracted_log = read_as_log_csv(filepath_abstracted)

# Print both log objects for a quick visual comparison.
print("Original Log:")
print(original_log)
print("\nAbstracted Log:")
print(abstracted_log)

## Mining Algorithms
* Alpha Miner
* Inductive Miner
* Heuristic Miner
* Directly-Follows Graph

In [None]:
# Alpha Miner: discover one model per log.
# a_miner returns a 3-tuple; judging by the variable names it is presumably
# (Petri net, initial marking, final marking) — confirm in event_clustering.
net_alpha_orig, im_alpha_orig, fm_alpha_orig = a_miner(original_log)
net_alpha, im_alpha, fm_alpha = a_miner(abstracted_log)

In [None]:
# Inductive Miner: discover one model per log.
# ind_miner returns a 3-tuple; judging by the variable names it is presumably
# (Petri net, initial marking, final marking) — confirm in event_clustering.
net_ind_orig, im_ind_orig, fm_ind_orig = ind_miner(original_log)
net_ind, im_ind, fm_ind = ind_miner(abstracted_log)

In [None]:
# Heuristic Miner: discover one model per log. These are the models that the
# metric cells further down evaluate.
# heu_miner returns a 3-tuple; judging by the variable names it is presumably
# (Petri net, initial marking, final marking) — confirm in event_clustering.
net_heu_orig, im_heu_orig, fm_heu_orig = heu_miner(original_log)
net_heu, im_heu, fm_heu = heu_miner(abstracted_log)

In [None]:
# Directly-Follows Graph for both logs.
# The return values are not stored; only the second call's return value (if
# any) becomes the cell output, so any rendering of the first graph must happen
# inside dfg_miner itself — confirm.
dfg_miner(original_log)
dfg_miner(abstracted_log)

## Process Model Metrics
* Fitness
* Precision
* Generalization
* Simplicity

In [None]:
# Fitness of the Heuristic Miner models against their respective logs.
# NOTE(review): the first call's return value is discarded; unless
# fitness_metric prints its result, only the abstracted-log value is shown as
# the cell output — confirm.
print("Original Log with Heuristic Miner:")
fitness_metric(original_log, net_heu_orig, im_heu_orig, fm_heu_orig)

print("\nAbstracted Log with Heuristic Miner")
fitness_metric(abstracted_log, net_heu, im_heu, fm_heu)

In [None]:
# Precision of the Heuristic Miner models against their respective logs.
# NOTE(review): the first call's return value is discarded; unless
# precision_metric prints its result, only the abstracted-log value is shown
# as the cell output — confirm.
print("Original Log with Heuristic Miner:")
precision_metric(original_log, net_heu_orig, im_heu_orig, fm_heu_orig)

print("\nAbstracted Log with Heuristic Miner")
precision_metric(abstracted_log, net_heu, im_heu, fm_heu)

In [None]:
# Generalization of the Heuristic Miner models against their respective logs.
# NOTE(review): the first call's return value is discarded; unless
# generalization_metric prints its result, only the abstracted-log value is
# shown as the cell output — confirm.
print("Original Log with Heuristic Miner:")
generalization_metric(original_log, net_heu_orig, im_heu_orig, fm_heu_orig)

print("\nAbstracted Log with Heuristic Miner")
generalization_metric(abstracted_log, net_heu, im_heu, fm_heu)

In [None]:
# Simplicity of the Heuristic Miner models.
# NOTE(review): the first call's return value is discarded; unless
# simplicity_metric prints its result, only the abstracted-log value is shown
# as the cell output — confirm.
print("Original Log with Heuristic Miner:")
simplicity_metric(original_log, net_heu_orig, im_heu_orig, fm_heu_orig)

print("\nAbstracted Log with Heuristic Miner")
simplicity_metric(abstracted_log, net_heu, im_heu, fm_heu)

In [None]:
# Evaluation --> all four metrics in one call, for the Heuristic Miner models.
# NOTE(review): the first call's return value is discarded; unless
# evaluation_metric prints its result, only the abstracted-log value is shown
# as the cell output — confirm.
print("Original Log with Heuristic Miner:")
evaluation_metric(original_log, net_heu_orig, im_heu_orig, fm_heu_orig)

print("\nAbstracted Log with Heuristic Miner")
evaluation_metric(abstracted_log, net_heu, im_heu, fm_heu)