In [1]:
#Function to Encode and Store Distinct Traces with Their Frequencies
import pandas as pd
import pickle
from itertools import chain, repeat, islice
import torch

In [2]:
log_name = 'sepsis'
TYPE = 'all'
log = pd.read_csv('../event_log/'+log_name+'.csv')
distinct_traces = {}
label2id = {}
id2label = {}

In [3]:
print("Processing log to extract distinct traces...")
# Clean activity names
log['activity'] = log['activity'].str.replace(' ', '').str.replace('+', '').str.replace('-', '').str.replace('_', '')
print(log['activity'])

Processing log to extract distinct traces...
0        ERRegistration
1            Leucocytes
2                   CRP
3            LacticAcid
4              ERTriage
              ...      
15209               CRP
15210          ReleaseA
15211    ERRegistration
15212          ERTriage
15213    ERSepsisTriage
Name: activity, Length: 15214, dtype: object


In [4]:
# Extract unique traces
#grouped = log.groupby('case', sort=False)['activity'].apply(list)
grouped = log.groupby('case', sort=False)['activity'].apply(lambda x: tuple(x)) 
print(grouped)

case
1       (ERRegistration, Leucocytes, CRP, LacticAcid, ...
2       (ERRegistration, ERTriage, CRP, LacticAcid, Le...
3       (ERRegistration, ERTriage, ERSepsisTriage, Leu...
4       (ERRegistration, ERTriage, ERSepsisTriage, CRP...
5       (ERRegistration, ERTriage, ERSepsisTriage, IVL...
                              ...                        
1046    (ERRegistration, ERTriage, ERSepsisTriage, Leu...
1047           (ERRegistration, ERTriage, ERSepsisTriage)
1048           (ERRegistration, ERTriage, ERSepsisTriage)
1049    (ERRegistration, ERTriage, ERSepsisTriage, Leu...
1050           (ERRegistration, ERTriage, ERSepsisTriage)
Name: activity, Length: 1050, dtype: object


In [5]:
# Count distinct traces
total_traces = len(grouped)  # Total number of traces in the event log
trace_counts = grouped.value_counts()  # Get count of each unique trace
distinct_traces = grouped.nunique()  # Count distinct traces
print(total_traces)
print(distinct_traces)
print(trace_counts)

1050
846
activity
(ERRegistration, ERTriage, ERSepsisTriage)                                                                                                                                                                                                                                                                                                                                                                                                               35
(ERRegistration, ERTriage, ERSepsisTriage, Leucocytes, CRP)                                                                                                                                                                                                                                                                                                                                                                                              24
(ERRegistration, ERTriage, ERSepsisTriage, CRP, Leucocytes)                                   

In [6]:
# Calculate frequency as percentage
distinct_traces = {trace: (count / total_traces) * 100 for trace, count in trace_counts.items()}
#print(distinct_traces)

In [7]:
# Generate label mappings
all_activities = set(chain(*distinct_traces.keys()))
print(len(all_activities))
print(all_activities)

16
{'AdmissionNC', 'ReleaseC', 'AdmissionIC', 'IVAntibiotics', 'LacticAcid', 'ReleaseA', 'ReleaseD', 'ERTriage', 'IVLiquid', 'ReturnER', 'ERRegistration', 'CRP', 'Leucocytes', 'ReleaseE', 'ERSepsisTriage', 'ReleaseB'}


In [8]:
with open('../semantic_data/' + log_name + '/' + log_name + '_id2label_'+TYPE+'.pkl', 'rb') as f:
    id2label = pickle.load(f)

with open('../semantic_data/' + log_name + '/' + log_name + '_label2id_'+TYPE+'.pkl', 'rb') as f:
    label2id = pickle.load(f)

In [9]:
print(label2id)
print(id2label)

{'activity': {'ERRegistration': 0, 'Leucocytes': 1, 'CRP': 2, 'LacticAcid': 3, 'ERTriage': 4, 'ERSepsisTriage': 5, 'IVLiquid': 6, 'IVAntibiotics': 7, 'AdmissionNC': 8, 'ReleaseA': 9, 'ReturnER': 10, 'AdmissionIC': 11, 'ReleaseB': 12, 'ReleaseC': 13, 'ReleaseD': 14, 'ReleaseE': 15, 'ENDactivity': 16}, 'orggroup': {'A': 0, 'B': 1, 'C': 2, 'D': 3, 'E': 4, 'F': 5, 'G': 6, 'H': 7, '?': 8, 'I': 9, 'J': 10, 'K': 11, 'L': 12, 'M': 13, 'N': 14, 'O': 15, 'P': 16, 'Q': 17, 'R': 18, 'S': 19, 'T': 20, 'U': 21, 'V': 22, 'W': 23, 'X': 24, 'Y': 25, 'ENDorggroup': 26}, 'Leucocytes': {np.float64(0.0): 0, np.float64(9.6): 1, np.float64(8.7): 2, np.float64(10.7): 3, np.float64(13.0): 4, np.float64(11.3): 5, np.float64(10.9): 6, np.float64(13.8): 7, np.float64(15.4): 8, np.float64(18.1): 9, np.float64(7.4): 10, np.float64(7.7): 11, np.float64(7.6): 12, np.float64(8.5): 13, np.float64(18.8): 14, np.float64(24.1): 15, np.float64(7.9): 16, np.float64(9.7): 17, np.float64(6.4): 18, np.float64(11.0): 19, np.flo

In [10]:
# Encode traces using label2id
encoded_traces = {tuple(label2id[a] for a in trace): freq for trace, freq in distinct_traces.items()}
# print(len(encoded_traces[0]), encoded_traces[0])
# print(len(encoded_traces[10]), encoded_traces[10])
# print(len(encoded_traces[20]), encoded_traces[20])

KeyError: 'ERRegistration'

In [None]:
# Convert to tensors
tensor_traces = {torch.tensor(trace): freq for trace, freq in encoded_traces.items()}
print(tensor_traces)

In [None]:
print(len(tensor_traces))

In [None]:
def save_data(filename, data):
    with open(f'../semantic_data/{log_name}/{filename}', 'wb') as f:
        pickle.dump(data, f)

In [None]:
save_data("/distinct_traces.pkl", tensor_traces)