In [None]:
# Enable IPython's autoreload extension in mode 2, so that edited source files
# are reloaded automatically instead of requiring a kernel restart.
%load_ext autoreload
%autoreload 2

In [None]:
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import graphviz

from event_clustering.preprocessing import *
from event_clustering.clustering import *
from event_clustering.postprocessing import *
from event_clustering.process_mining import *

from sklearn.cluster import *
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn import tree
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import sklearn.metrics as metrics
import sklearn.metrics.cluster as cluster_metrics

from pm4py.algo.discovery.inductive import algorithm as inductive_miner
#http://pm4py.pads.rwth-aachen.de/documentation/conformance-checking/evaluation-log-model/
from pm4py.evaluation import factory as evaluation_factory

import scipy.cluster.hierarchy as sch

# Load Encoded Events

In [None]:
# Maps each logical column role (key) to the name of that column in your dataset (value).
# To adapt the notebook to another dataset, replace the values and leave the keys as they are.
column_name_map = {
    "timestamp": "time:timestamp",
    "caseid": "case:id",
    "eventname": "concept:name",
    "resource": "org:resource",
    "role": "org:role",
}

In [None]:
# PREFIXES of the feature names. The evaluation code automatically includes every
# encoded column whose name contains one of these entries.
# Edit the list to offer other features; the trailing comments give each entry's
# position, which is the index used in the experiment definitions.
feature_names = [
    "concept:name",               # 0
    "org:role",                   # 1
    "feature_position_relative",  # 2
    "feature_position_window",    # 3
    "feature_time_to_successor",  # 4
    "feature_time_of_day",        # 5
]

In [None]:
# Experiment definitions: one inner list per experiment.
# Every number in an inner list is an index into feature_names and selects that
# feature for the experiment.
experiments = [
    [0],
    [0, 1],
    [0, 2],
    [0, 1, 2],
    [0, 3],
    [0, 1, 3],
    [0, 1, 2, 3],
]

In [None]:
# Load and preprocess the dataset.
# Set data_folder and file_name to the folder / base file name you want to analyze.
data_folder = 'data/'
file_name = 'DomesticDeclarations'

# event log: loaded from '<file_name>.xes' and preprocessed with the column name map
xes_path = f'{data_folder}{file_name}.xes'
df = preprocess(load(xes_path), column_name_map)

# encoded dataset: read from '<file_name>_encoded.csv' in the same folder
encoded_csv_path = f'{data_folder}{file_name}_encoded.csv'
df_encoded = pd.read_csv(encoded_csv_path)

# keep the columns of the preprocessed (un-encoded) dataset
original_df_columns = df.columns

# Clustering

### KMeans

In [None]:
# find the optimal clusters using the elbow method
# NOTE(review): the second argument (10) is presumably the maximum number of
# clusters to try - confirm against the definition of find_optimal_clusters
find_optimal_clusters(df_encoded, 10)

In [None]:
# Number of clusters passed to the clustering in the evaluation below
# (set it e.g. to the cluster number determined with the elbow method above)
cluster_nr = 6

## Evaluation

In [None]:
# This code runs the experiments you defined and stores the results for the
# different metrics and the process models in the results folder.
results_folder = 'results/'
# np.save() and open() below fail when the target folder is missing, so create it up front
os.makedirs(results_folder, exist_ok=True)


def _write_text_file(path, value):
    """Write str(value) to the file at path, replacing any existing file."""
    # Mode "w" truncates an existing file, so no separate exists/remove step is
    # needed; the context manager closes the handle even if the write fails.
    with open(path, "w") as f:
        f.write(str(value))


# The original log, its process model and its model metrics do not depend on the
# experiment, so they are read / mined / evaluated once instead of once per experiment.
original_log = read_as_log_xes(data_folder + file_name + '.xes')
net_ind_orig, im_ind_orig, fm_ind_orig = inductive_miner.apply(original_log)
model_metrics_original = evaluation_factory.apply(original_log, net_ind_orig, im_ind_orig, fm_ind_orig)

for experiment in experiments:

    selected_feature_names = []
    # get the columns for each feature
    column_names = []
    for name_index in experiment:
        feature_name = feature_names[name_index]
        # the names (without ':') become part of the result file names below
        selected_feature_names.append(feature_name.replace(":", ""))
        # a column is selected when the feature name occurs in the column name
        column_names.extend([x for x in df_encoded.columns if feature_name in x])
    # Drop repeats while keeping the order: a column matching several feature
    # names would otherwise be selected (and therefore weighted) more than once.
    column_names = list(dict.fromkeys(column_names))

    experiment_name = file_name + '_exp_' + '+'.join(selected_feature_names)

    # select the feature columns that are used in the experiment from the dataset
    df_experiment = df_encoded[column_names]

    experiment_results_path_prefix = results_folder + experiment_name

    # run the clustering (fixed random_state keeps the labels reproducible)
    km = MiniBatchKMeans(n_clusters=cluster_nr, init_size=1024, batch_size=2048, random_state=20)
    cluster_labels = km.fit_predict(df_experiment)
    # save the cluster labels, so we can use them later to evaluate the data manually
    np.save(experiment_results_path_prefix + '_cluster_labels', cluster_labels)

    # plot pca
    plot_pca(df_experiment, cluster_labels, experiment_results_path_prefix + '_pcaplot')

    # calculate and store silhouette score
    silhouette_path = experiment_results_path_prefix + "_silhouette.txt"
    silhouette = metrics.silhouette_score(df_experiment, cluster_labels)
    _write_text_file(silhouette_path, silhouette)

    # calculate and store completeness score (original event names vs. cluster labels)
    completeness_path = experiment_results_path_prefix + "_completeness.txt"
    completeness = metrics.completeness_score(df[column_name_map['eventname']], cluster_labels)
    _write_text_file(completeness_path, completeness)

    # generate the abstracted dataset: the event name is replaced by 'c_<cluster label>'
    abstracted_df = df.copy()
    abstracted_df[column_name_map['eventname']] = ['c_' + str(x) for x in cluster_labels]
    # NOTE(review): the return value is discarded, so this presumably merges in
    # place - confirm against the definition of merge_consecutive_same_events
    merge_consecutive_same_events(abstracted_df, column_name_map['caseid'], column_name_map['eventname'])

    # write abstracted dataset as csv file
    filepath_abstracted = data_folder + experiment_name + '_abstracted.csv'
    abstracted_df.to_csv(filepath_abstracted, index=False)

    # read the abstracted log with pm4py (the original log was read before the loop)
    abstracted_log = read_as_log_csv(filepath_abstracted)

    # mine process model for the abstracted dataset
    net_ind, im_ind, fm_ind = inductive_miner.apply(abstracted_log)

    # visualize and store the petri nets for both models
    visualize_as_petri_net(net_ind_orig, im_ind_orig, fm_ind_orig, experiment_results_path_prefix + '_petrinet.svg')
    visualize_as_petri_net(net_ind, im_ind, fm_ind, experiment_results_path_prefix + '_petrinet_abstracted.svg')

    # determine the model metrics for the abstracted model
    model_metrics_abstracted = evaluation_factory.apply(abstracted_log, net_ind, im_ind, fm_ind)

    # store the model metrics for the original model
    model_metrics_path = experiment_results_path_prefix + "_model_metrics.txt"
    _write_text_file(model_metrics_path, model_metrics_original)

    # store the model metrics for the abstracted model
    model_metrics_abstracted_path = experiment_results_path_prefix + "_model_metrics_abstracted.txt"
    _write_text_file(model_metrics_abstracted_path, model_metrics_abstracted)

In [None]:
# Manual check of the cluster labels: group the events by cluster label and
# collect, per cluster, the set of concept:name values assigned to it.
# Change experiment_results_path_prefix to inspect the labels of another experiment.
cluster_labels = np.load(f'{experiment_results_path_prefix}_cluster_labels.npy')

# copy of the dataset with the labels attached as an extra column
df2 = df.assign(cluster_labels=cluster_labels)

event_names_by_cluster = df2.groupby('cluster_labels')[column_name_map['eventname']]
eval_manually = event_names_by_cluster.apply(set)
eval_manually