In [None]:
# helper code to reload source files automatically:
# load IPython's autoreload extension and switch it to mode 2, so that edits to
# imported source modules are picked up without restarting the kernel
%load_ext autoreload
%autoreload 2

In [None]:
import os

import pandas as pd
from event_clustering.preprocessing import *

# Loading, Preprocessing, and Analyzing
- load and preprocess the dataset
- analyze the dataset to determine which features you want to generate and how they should be encoded

In [None]:
# Maps the logical column names used in this notebook (keys) to the column
# names of your dataset (values). Adapt it to your dataset by replacing only
# the values; the keys are looked up by name later on (e.g. 'eventname', 'role').
column_name_map = dict(
    timestamp='time:timestamp',
    caseid='case:id',
    eventname='concept:name',
    resource='org:resource',
    role='org:role',
)

In [None]:
# load and preprocess the dataset

# specify your data folder and the filename (without extension) you want to analyze
data_folder = 'data/'
file_name = 'DomesticDeclarations'

# Build the path with os.path.join instead of plain string concatenation:
# a separator is only inserted when data_folder does not already end with one,
# so 'data/' yields exactly the same path as before and 'data' no longer
# collapses into 'dataDomesticDeclarations.xes' on this line.
xes_path = os.path.join(data_folder, file_name + '.xes')
df = preprocess(load(xes_path), column_name_map)

In [None]:
# analyze the structure of the dataset and some high-level insights on it
# set include_casetime to True if you want to get insights about the length and duration of the cases in your dataset
analyze(df, column_name_map, show_examples=False, include_casetime=False)

# Feature Generation
- generate the additional features

In [None]:
# add a reference to the neighboring events of each event, together with the
# calculated time difference to them, once for offset 1 and once for offset -1.
# NOTE(review): 'neighbor_event_timedif_1' is later binned as
# 'feature_time_to_successor', so offset 1 presumably means the successor and
# -1 the predecessor — confirm in add_neighbor_event.
add_neighbor_event(df, 1, column_name_map)
add_neighbor_event(df, -1, column_name_map)

In [None]:
# add one-hot encoded start, middle and end event features using the neighbor reference
# NOTE(review): presumably relies on the neighbor columns added by
# add_neighbor_event, so that cell has to be run first — confirm.
add_event_position_relative_feature(df, column_name_map)

In [None]:
# add one-hot encoded start and end event features using a window length.
# Insert the desired window lengths in seconds (3600 seconds = one hour).
add_event_position_window_feature(df, column_name_map, start_window_length=3600, end_window_length=3600)

In [None]:
# add one-hot encoded time-of-day features
add_time_of_day_feature(df, column_name_map)

In [None]:
# save the df with generated features as csv, so it can be used in the next step.
# os.path.join only inserts a path separator when data_folder does not already
# end with one, so the target path is correct with or without a trailing slash.
df.to_csv(os.path.join(data_folder, file_name + '_features.csv'), index=False)

# Encoding
- encode the features you want to use

In [None]:
# load the dataframe with features already added (see previous step) and run
# it through preprocess again.
# os.path.join only inserts a path separator when data_folder does not already
# end with one, so the source path is correct with or without a trailing slash.
features_path = os.path.join(data_folder, file_name + '_features.csv')
df = preprocess(pd.read_csv(features_path), column_name_map)

In [None]:
# TF-IDF encoding of the event names, vectorizer configured with stop_words='english'
# NOTE(review): TfidfVectorizer is not imported explicitly in this notebook;
# presumably it arrives through the star import from
# event_clustering.preprocessing — confirm.
df_name = tfidf_encode(
    df,
    column_name_map['eventname'],
    TfidfVectorizer(stop_words='english'),
)

# features depending on dataset
df_role = one_hot_encode(df, column_name_map['role'])
#df_resource = one_hot_encode(df, column_name_map['resource'])

# generated features: select the feature columns that already exist in df
df_position_relative = df[[
    'feature_position_relative_beginning',
    'feature_position_relative_middle',
    'feature_position_relative_end',
]]
df_position_window = df[[
    'feature_position_window_start',
    'feature_position_window_end',
]]
# NOTE(review): 10 is presumably the number of bins — confirm against binning()
df_time_to_successor = binning(
    df,
    'neighbor_event_timedif_1',
    10,
    'feature_time_to_successor',
)
# all columns selected by filter_column_names for 'feature_time_of_day'
df_time_of_day = df[filter_column_names(df, 'feature_time_of_day')]

In [None]:
# define the features you want to combine into your final vector:
# start from the encoded event names and join every further feature frame onto
# it, in this order (add or remove .join(...) steps to change the selection)
df_encoded = (
    df_name
    .join(df_role)
    .join(df_position_relative)
    .join(df_position_window)
    .join(df_time_to_successor)
    .join(df_time_of_day)
)

# optional features: uncomment to join them as well
# NOTE(review): df_resource is only created if its line in the encoding cell is
# uncommented, and df_act_name is not defined anywhere in the visible notebook.
#df_encoded = df_encoded.join(df_resource)
#df_encoded = df_encoded.join(df["case:SUMleges"].fillna(0))
#df_encoded = df_encoded.join(df_act_name)

In [None]:
# inspect the column names of the combined, encoded feature frame
df_encoded.columns

In [None]:
# save the encoded events as csv, so it can be used in the evaluation.
# os.path.join only inserts a path separator when data_folder does not already
# end with one, so the target path is correct with or without a trailing slash.
df_encoded.to_csv(os.path.join(data_folder, file_name + '_encoded.csv'), index=False)