In [1]:
from utility import log_config as lg
from jinja2 import Template
import pandas as pd
import numpy as np
import pickle
import torch
from itertools import chain, repeat, islice

Load & Preprocess Event Log

In [2]:
log_name = "helpdesk"
log = pd.read_csv(f'../event_log/{log_name}.csv')

# Characters stripped from the *values* of the label columns (not from the
# column names) so that every label becomes a single token.
_LABEL_SEPARATORS = (' ', '+', '-', '_')


def _strip_separators(values):
    """Return `values` (a string Series) with every separator character removed.

    `regex=False` is passed explicitly: the separators are literal characters,
    and '+' is a regex quantifier, so relying on the pandas-version-dependent
    default of `Series.str.replace` is fragile.
    """
    for separator in _LABEL_SEPARATORS:
        values = values.str.replace(separator, '', regex=False)
    return values


log['activity'] = _strip_separators(log['activity'])

# The resource column is left untouched for the 'sepsis' log.
if log_name != 'sepsis':
    # NOTE(review): astype(str) runs before fillna below, so a missing resource
    # becomes the string 'nan' rather than 'UNK' -- confirm this is intended.
    log['resource'] = _strip_separators(log['resource'].astype(str))

# Replace every remaining missing value with the placeholder label 'UNK'.
log = log.fillna('UNK')

In [3]:
print(log.head())
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would emit an extra "None" line).
log.info()

     case            activity resource                timestamp customer  \
0  Case 1   Assignseriousness   Value1  2012/10/09 14:50:17.000  Value 1   
1  Case 1  Takeinchargeticket   Value1  2012/10/09 14:51:01.000  Value 1   
2  Case 1  Takeinchargeticket   Value2  2012/10/12 15:02:56.000  Value 1   
3  Case 1       Resolveticket   Value1  2012/10/25 11:54:26.000  Value 1   
4  Case 1              Closed   Value3  2012/11/09 12:54:39.000  Value 1   

   product responsiblesection seriousness2 servicelevel servicetype  \
0  Value 1            Value 1      Value 1      Value 1     Value 1   
1  Value 1            Value 1      Value 1      Value 1     Value 1   
2  Value 1            Value 1      Value 1      Value 2     Value 1   
3  Value 1            Value 1      Value 1      Value 2     Value 1   
4  Value 1            Value 1      Value 1      Value 2     Value 1   

  supportsection workgroup  
0        Value 1   Value 1  
1        Value 1   Value 1  
2        Value 1   Value 1  


Compute Timestamp Features

In [4]:
def extract_timestamp_features(group):
    """Add elapsed-time features (in seconds) to the events of one case.

    Parameters
    ----------
    group : pandas.DataFrame
        Events of a single case. Must contain a datetime-typed 'timestamp'
        column and at least one row.

    Returns
    -------
    pandas.DataFrame
        A new frame sorted by 'timestamp' (ascending) with two extra float
        columns:
        - 'timesincelastevent': seconds since the previous event (0.0 for the
          first event).
        - 'timesincecasestart': seconds since the earliest event of the group.
        The frame passed in is not modified, because `sort_values` returns a
        new object.
    """
    timestamp_col = 'timestamp'
    group = group.sort_values(timestamp_col, ascending=True)
    timestamps = group[timestamp_col]
    start_date = timestamps.iloc[0]
    no_delay = pd.Timedelta(seconds=0)

    # diff() yields NaT for the first row; treat it as zero elapsed time.
    # .dt.total_seconds() is the vectorized equivalent of dividing every
    # Timedelta by one second in a Python-level apply().
    group["timesincelastevent"] = timestamps.diff().fillna(no_delay).dt.total_seconds()
    group["timesincecasestart"] = (timestamps - start_date).fillna(no_delay).dt.total_seconds()

    return group

# Parse the timestamp strings once so the per-case arithmetic works on datetimes.
log['timestamp'] = pd.to_datetime(log['timestamp'])

# Compute the elapsed-time columns case by case; group_keys=False keeps the
# case id out of the resulting index.
events_by_case = log.groupby('case', group_keys=False)
log = events_by_case.apply(extract_timestamp_features)


  log = log.groupby('case', group_keys=False).apply(extract_timestamp_features)


In [5]:
# Preview the first five events, now including the two elapsed-time columns.
print(log.head(n=5))

     case            activity resource           timestamp customer  product  \
0  Case 1   Assignseriousness   Value1 2012-10-09 14:50:17  Value 1  Value 1   
1  Case 1  Takeinchargeticket   Value1 2012-10-09 14:51:01  Value 1  Value 1   
2  Case 1  Takeinchargeticket   Value2 2012-10-12 15:02:56  Value 1  Value 1   
3  Case 1       Resolveticket   Value1 2012-10-25 11:54:26  Value 1  Value 1   
4  Case 1              Closed   Value3 2012-11-09 12:54:39  Value 1  Value 1   

  responsiblesection seriousness2 servicelevel servicetype supportsection  \
0            Value 1      Value 1      Value 1     Value 1        Value 1   
1            Value 1      Value 1      Value 1     Value 1        Value 1   
2            Value 1      Value 1      Value 2     Value 1        Value 1   
3            Value 1      Value 1      Value 2     Value 1        Value 1   
4            Value 1      Value 1      Value 2     Value 1        Value 1   

  workgroup  timesincelastevent  timesincecasestart  
0 

Train-Test Split

In [6]:
# Share of cases (ordered by the time of their first event) used for training.
TRAIN_FRACTION = 0.66

# Order the cases chronologically by their earliest event; mergesort is stable,
# so cases with identical start times keep their grouped order.
grouped = log.groupby("case")
start_timestamps = (
    grouped["timestamp"]
    .min()
    .reset_index()
    .sort_values("timestamp", ascending=True, kind="mergesort")
)

# The earliest TRAIN_FRACTION of the cases form the training set; the split is
# made on whole cases, so no case contributes events to both sets.
n_train_cases = int(TRAIN_FRACTION * len(start_timestamps))
train_ids = list(start_timestamps["case"].iloc[:n_train_cases])

is_train_event = log["case"].isin(train_ids)
train = log[is_train_event].sort_values("timestamp", ascending=True, kind='mergesort')
test = log[~is_train_event].sort_values("timestamp", ascending=True, kind='mergesort')

# Every event must land in exactly one of the two sets.
assert len(train) + len(test) == len(log)

print("Train Size:", len(train))
print("Test Size:", len(test))

Train Size: 14193
Test Size: 7155


Generate Prefix Histories

Name correspondences:
- history_train ---- train
- len_prefix_train ---- len_train
- dict_suffix_train ---- suffix_train
- _dict_label_train[lg.log[__log_name]['target']] ---- label_train

In [7]:
def pad_infinite(iterable, padding=None):
    """Return an iterator over `iterable` followed by `padding` repeated forever."""
    endless_filler = repeat(padding)
    return chain(iterable, endless_filler)

def pad(iterable, size, padding=None):
    """Return an iterator yielding exactly `size` items.

    Items come from `iterable` first; if it is shorter than `size` the
    remainder is filled with `padding`, and if it is longer it is truncated.
    """
    padded_stream = pad_infinite(iterable, padding)
    return islice(padded_stream, size)


In [13]:
def _strip_spaces(value):
    """Return `value` without spaces if it is a string, otherwise unchanged."""
    return value.replace(' ', '') if isinstance(value, str) else value


def gen_prefix_history(df, log_name):
    """Build textual prefix histories and padded activity suffixes per case.

    For every case in `df` (cases in order of first appearance, events in row
    order) and for every prefix of that case, one history string is produced:
    the rendered event template of each event in the prefix, each followed by a
    space, then the rendered trace template of the prefix's last event.

    Parameters
    ----------
    df : pandas.DataFrame
        Event log with 'case' and 'activity' columns plus every column named
        in the event/trace attribute lists of `lg.log[log_name]`.
    log_name : str
        Key into `lg.log` selecting the templates and attribute lists.

    Returns
    -------
    tuple
        `(list_seq, dict_event_label, list_len_prefix, dict_len_label)`:
        - list_seq: one history string per prefix.
        - dict_event_label: event attribute -> list.
          NOTE(review): these lists are created empty and never filled by this
          function -- confirm whether label extraction is missing here.
        - list_len_prefix: number of events in each prefix (1-based).
        - dict_len_label: position i -> activity occurring i events after the
          end of each prefix (i = 0 is the next activity), filled with
          'ENDactivity' up to the length of the longest case.
    """
    print('Generating prefix history...')

    log_config = lg.log[log_name]
    event_attributes = log_config['event_attribute']
    trace_attributes = log_config['trace_attribute']
    event_template = Template(log_config['event_template'])
    trace_template = Template(log_config['trace_template'])

    # Length of the longest case. Computed once: it does not change inside the
    # loops below. default=0 lets an empty frame yield empty results.
    max_trace_len = int(max(df['case'].value_counts(), default=0))

    list_seq = []
    list_len_prefix = []
    dict_event_label = {attribute: [] for attribute in event_attributes}
    dict_len_label = {position: [] for position in range(max_trace_len)}

    for _, group_data in df.groupby('case', sort=False):
        event_dict_hist = {}
        trace_dict_hist = {}
        event_text = ''
        activity_list = []

        for len_prefix, (_, row) in enumerate(group_data.iterrows(), start=1):
            activity_list.append(row['activity'])

            for attribute in event_attributes:
                event_dict_hist[attribute] = _strip_spaces(row[attribute])
            event_text += event_template.render(event_dict_hist) + ' '

            for attribute in trace_attributes:
                trace_dict_hist[attribute] = _strip_spaces(row[attribute])
            trace_text = trace_template.render(trace_dict_hist)

            list_seq.append(event_text + trace_text)
            list_len_prefix.append(len_prefix)

        # Shift by one so entry k is the activity following the k-th event;
        # the last event is followed by the end marker.
        next_activities = activity_list[1:] + ['ENDactivity']

        for start in range(len(next_activities)):
            suffix = pad(next_activities[start:], max_trace_len, 'ENDactivity')
            for position, activity in enumerate(suffix):
                dict_len_label[position].append(activity)

    return list_seq, dict_event_label, list_len_prefix, dict_len_label

# Generate histories, label containers, prefix lengths and suffix labels for
# both splits.
history_train, dict_label_train, len_prefix_train, dict_suffix_train = gen_prefix_history(train, log_name)
history_test, dict_label_test, len_prefix_test, dict_suffix_test = gen_prefix_history(test, log_name)

# Turn the training label lists into tensors: 'timesincecasestart' becomes a
# column vector, every other attribute is mapped through label2id.
# NOTE(review): `label2id` is not defined in the visible cells. As written,
# gen_prefix_history returns these lists empty, so the lookup never executes;
# it would raise NameError as soon as the lists are populated. Only the
# training labels are converted here.
for attribute, raw_labels in dict_label_train.items():
    if attribute == 'timesincecasestart':
        dict_label_train[attribute] = torch.tensor(raw_labels).view(-1, 1)
    else:
        encoded_labels = [label2id[attribute].get(label) for label in raw_labels]
        dict_label_train[attribute] = torch.tensor(encoded_labels)

Generating prefix history...
Generating prefix history...


In [15]:
section_break = "\n" + "="*50 + "\n"

# Show the first three generated history strings.
print("First three history_train values:")
for position, hist in enumerate(history_train[:3], start=1):
    print(f"History {position}: {hist}")
print(section_break)

# Show the prefix lengths belonging to those histories.
print("First three len_prefix_train values:")
print(len_prefix_train[:3])
print(section_break)

# Show the first three labels of the configured target attribute only.
target_key = lg.log[log_name]['target']
print(f"First three values for target '{target_key}':")
print(dict_label_train[target_key][:3])


# Show the first three labels stored at every suffix position.
print("First three values in dict_suffix_train for each index:")
for suffix_position, labels in dict_suffix_train.items():
    print(f"Index {suffix_position}: {labels[:3]}")


First three history_train values:
History 1: Value2 performed Assignseriousness 0.0 seconds ago from the beginning of the trace. Value3 managed the request for the Value3 of Value63 with service Value1 of level Value2. Section Value4 led by Value5
History 2: Value2 performed Assignseriousness 0.0 seconds ago from the beginning of the trace. Value3 managed the request for the Value3 of Value63 with service Value1 of level Value2. Value2 performed Takeinchargeticket 1383122.0 seconds ago from the beginning of the trace. Value3 managed the request for the Value3 of Value63 with service Value1 of level Value2. Section Value4 led by Value5
History 3: Value2 performed Assignseriousness 0.0 seconds ago from the beginning of the trace. Value3 managed the request for the Value3 of Value63 with service Value1 of level Value2. Value2 performed Takeinchargeticket 1383122.0 seconds ago from the beginning of the trace. Value3 managed the request for the Value3 of Value63 with service Value1 of level

Serialization

In [11]:
def serialize_object(obj, file_name):
    """Pickle `obj` into the file at `file_name`, replacing any existing file."""
    with open(file_name, mode='wb') as sink:
        pickle.dump(obj, sink)

# Persist histories, prefix lengths and suffix labels for both splits, in this
# order. The dict_label_* objects are not written by this cell.
artifacts_to_save = [
    (history_train, 'history_train.pkl'),
    (history_test, 'history_test.pkl'),
    (len_prefix_train, 'len_prefix_train.pkl'),
    (len_prefix_test, 'len_prefix_test.pkl'),
    (dict_suffix_train, 'suffix_train.pkl'),
    (dict_suffix_test, 'suffix_test.pkl'),
]
for artifact, target_file in artifacts_to_save:
    serialize_object(artifact, target_file)
