In [None]:
# Reload source files automatically: with autoreload mode 2, modules are
# re-imported before each cell executes, so edits to imported project code
# take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd

from event_clustering.preprocessing import *

# Loading, Preprocessing, and Analyzing
- load and preprocess the dataset
- analyze the dataset to determine which features you want to generate and how they should be encoded

In [None]:
# Maps the generic column keys used throughout this notebook to the column
# labels of the event log. Adapt the values so they suit your dataset.
column_name_map = dict(
    timestamp='time:timestamp',
    caseid='case:id',
    eventname='concept:name',
    resource='org:resource',
    role='org:role',
)

In [None]:
# Name of the event log (without extension); it is reused further down to
# build the names of the CSV files written by this notebook.
file_name = 'DomesticDeclarations'

# Read the XES log from the data folder, then run it through preprocessing
# together with the column name map.
xes_path = 'data/' + file_name + '.xes'
df = preprocess(load(xes_path), column_name_map)

In [None]:
# Report the structure of the dataset along with some high-level insights;
# the show_examples and include_casetime options are both disabled here.
analyze(
    df,
    column_name_map,
    show_examples=False,
    include_casetime=False,
)

In [None]:
# Keep only the first 1000 rows of the log for the remaining steps.
# The explicit copy detaches the subset from the full frame: the feature
# generation calls further down are invoked for their effect on df, and
# writing new columns into a plain slice triggers pandas'
# SettingWithCopyWarning and leaves it unclear which frame is modified.
# NOTE(review): a purely positional cut may split the last case in the middle — confirm this is acceptable.
df = df[:1000].copy()

# Feature Generation
- generate the features you want to use

In [None]:
# Generate the neighbor-event features (neighboring event and time
# difference to it), once per neighbor offset: first 1, then -1.
for neighbor_offset in (1, -1):
    add_neighbor_event(df, neighbor_offset, column_name_map)

In [None]:
# One-hot encoded start and end events determined via a window length.
# Both window lengths are given in seconds; insert the window you want.
determine_time_frame_feature(
    df,
    column_name_map,
    start_window_length=3600,
    end_window_length=3600,
)

In [None]:
# add one hot encoded start, middle and end events using neighbor reference
# (called for its effect on df; the return value is not used here)
# NOTE(review): presumably the source of the 'feature_position_*' columns selected in the Encoding section — confirm
determine_start_end_event_feature(df, column_name_map)

In [None]:
# add timestamp features (called for its effect on df; the return value is not used here)
# NOTE(review): presumably the source of the 'feature_day_nr' and 'feature_time_*' columns used in the Encoding section — confirm
add_timestamp_features(df, column_name_map)

In [None]:
# Persist the frame with all generated features as CSV (row index omitted)
# so that the next step can pick it up from disk.
features_csv_path = 'data/' + file_name + '_features.csv'
df.to_csv(features_csv_path, index=False)

In [None]:
# display the column labels of df to check which columns the frame now contains
df.columns

# Encoding
- encode the features you want to use

In [None]:
# Read the feature CSV written in the previous step back from disk and pass
# it through preprocessing again together with the column name map.
features_csv_path = 'data/' + file_name + '_features.csv'
df_features_raw = pd.read_csv(features_csv_path)
df = preprocess(df_features_raw, column_name_map)

In [None]:
# TF-IDF encoding of the event names, using the built-in English stop word list.
# NOTE(review): TfidfVectorizer is not imported explicitly in the visible cells; presumably it is provided by the star import — confirm
event_name_vectorizer = TfidfVectorizer(stop_words='english')
df_name = tfidf_encode(df, column_name_map['eventname'], event_name_vectorizer)

# One-hot encodings; the resource encoding is optional and disabled by default.
#df_resource = one_hot_encode(df, column_name_map['resource'])
df_role = one_hot_encode(df, column_name_map['role'])

# Feature columns that are taken over from df as they are.
position_columns = ['feature_position_beginning', 'feature_position_middle', 'feature_position_end']
window_columns = ['feature_window_start', 'feature_window_end']
df_event_position = df[position_columns]
df_time_window = df[window_columns]

# Binned time difference to the neighbor event.
# NOTE(review): the argument 10 is presumably the number of bins — confirm against binning()
df_time = binning(df, 'feature_timedif_neighbor_event_1', 10, 'feature_timedif_to_neighbor')

# Columns selected through the 'feature_time_' name filter.
time_of_day_columns = filter_column_names(df, 'feature_time_')
df_time_of_day = df[time_of_day_columns]

In [None]:
# Assemble the encoded frame: begin with the TF-IDF event-name columns and
# join every further feature block onto it, in the order listed.
# Uncomment entries to add more features you want to use in your encoding.
feature_blocks = [
    #df_resource,  # also requires the df_resource line in the previous cell to be uncommented
    #df["case:SUMleges"].fillna(0),
    #df_act_name,  # NOTE(review): not created in the visible cells — define it before enabling
    df_event_position,
    df_time_window,
    df_role,
    df_time,
    df['feature_day_nr'],
    df_time_of_day,
]

df_encoded = df_name
for feature_block in feature_blocks:
    df_encoded = df_encoded.join(feature_block)

In [None]:
# display the column labels of the assembled encoding
df_encoded.columns

In [None]:
# Store the encoded events as CSV (row index omitted) for use in the evaluation.
# Intended file name structure: [Dataset-name]_[Feature-groups]_encoded.csv
# NOTE(review): the path built here carries only the dataset name; no feature-group part is inserted — confirm whether it should be added by hand
encoded_csv_path = 'data/' + file_name + '_encoded.csv'
df_encoded.to_csv(encoded_csv_path, index=False)