## Loading the data

In [1]:
import pm4py
from label_functions import *
import editdistance
from functions import *


# import data
def import_xes(file_path):
    event_log = pm4py.read_xes(file_path)
    start_activities = pm4py.get_start_activities(event_log)
    end_activities = pm4py.get_end_activities(event_log)
    print("Start activities: {}\nEnd activities: {}".format(start_activities, end_activities))
    return event_log

event_log = import_xes("data/Hospital_log.xes.gz")
print(type(event_log))
print(event_log)

parsing log, completed traces ::   0%|          | 0/1143 [00:00<?, ?it/s]

Start activities: {'1e consult poliklinisch': 71, 'inwend.geneesk.  korte kaart kosten-out': 8, 'verlosk.-gynaec. korte kaart kosten-out': 198, 'vervolgconsult poliklinisch': 234, 'verlosk.-gynaec.   jaarkaart kosten-out': 122, 'cytologisch onderzoek - ectocervix -': 28, 'echografie  - genitalia interna': 44, 'aanname laboratoriumonderzoek': 225, 'ligdagen - alle spec.beh.kinderg.-reval.': 39, 'telefonisch consult': 15, 'behandeltijd - eenheid t3 - megavolt': 1, 'ct abdomen': 2, 'behandeltijd - eenheid t2 - megavolt': 1, 'thorax': 12, 'e.c.g.      - elektrocardiografie': 63, 'cytologisch onderzoek - vagina -': 12, 'mammografie thoraxwand': 3, 'histologisch onderzoek - biopten nno': 19, 'dagverpleging - alle spec.beh.kind.-rev.': 5, 'inwend.geneesk.    jaarkaart kosten-out': 1, 'coupe ter inzage': 19, 'immunopathologisch onderzoek': 10, 'vagina      - scopie incl.evt.vulvabiops': 3, 'punctie tbv cytologisch onderzoek door p': 3, 'cytologisch onderzoek - buiktumorpunctie': 1, 'vrw.gesl.o

## Get traces

In [None]:
from functions import *

# set parameters
extract_prefixes = True
prefix_max_len = 21 # maximum length of prefixes
prefix_steps = 5 # number of steps to skip when extracting prefixes
n_most_similar = 100 # number of most similar historic traces to include
X_cols = ['case:Diagnosis code', 'case:Treatment code']
lf = label_function_1
n_validation = 1000 # number of validation runs (LOOCV)

# get all traces
all_traces = event_log.groupby('case:concept:name')['Activity code'].apply(list)
print("These are all traces")
print("Type of all_traces", type(all_traces))
print(all_traces)

if extract_prefixes:
    # expand each list into prefixes
    all_traces = all_traces.apply(lambda x: prefix_extraction(x, prefix_max_len, prefix_steps))
    all_traces = all_traces.explode()

print("After prefix extraction")
print(all_traces)

# get a label function map
lf_map = get_lf_map(event_log, lf)
print("Lf map")
print("Type of lf_map:", type(lf_map))
print(lf_map)


These are all traces
Type of all_traces <class 'pandas.core.series.Series'>
case:concept:name
00000000    [410100, 419100, 10107, 339486E, 410100, 41910...
00000001    [10113, 410100, 419100, 10107, 411100, 419100,...
00000002    [10113, 411100, 411100, 419100, 411100, 411100...
00000003    [10113, 370000, 378149, 379999, 411100, 419100...
00000004    [10107, 355201, 370000, 370000, 370000, 370401...
                                  ...                        
00001138    [339486E, 370000, 370000, 370000, 370401, 3704...
00001139    [355111, 370000, 370000, 370401, 370401C, 3704...
00001140    [370000, 370000, 370000, 370401, 370401C, 3704...
00001141    [330001B, 370000, 370000, 370000, 370401, 3704...
00001142    [370000, 370000, 370000, 370401, 370401C, 3704...
Name: Activity code, Length: 1143, dtype: object
After prefix extraction
case:concept:name
00000000                                             [410100]
00000000     [410100, 419100, 10107, 339486E, 410100, 419100]
00000000 

## Representing traces as frequency vectors (part 1)
### Each activity that is in the trace prefix is counted and its value is replaced by its frequency
### Example: trace prefix: [A, B, B, C] -> trace prefix: [1, 2, 1]


In [None]:
from collections import Counter
import pandas as pd

# Function to encode each trace as a list of frequencies
def encode_trace_frequencies(trace):
    # Count occurrences of each activity
    activity_counts = Counter(trace)
    # Return a list of counts in the order they appear in the original trace
    return [activity_counts[activity] for activity in trace]

# Apply the encoding function to each trace in the Series
encoded_traces = all_traces.apply(encode_trace_frequencies)

# Display the encoded Series
print(type(encoded_traces))
print(all_traces)
print(encoded_traces)
print(encoded_traces.shape)

<class 'pandas.core.series.Series'>
case:concept:name
00000000                                             [410100]
00000000     [410100, 419100, 10107, 339486E, 410100, 419100]
00000000    [410100, 419100, 10107, 339486E, 410100, 41910...
00000000    [410100, 419100, 10107, 339486E, 410100, 41910...
00000000    [410100, 419100, 10107, 339486E, 410100, 41910...
                                  ...                        
00001142                                             [370000]
00001142    [370000, 370000, 370000, 370401, 370401C, 370402]
00001142    [370000, 370000, 370000, 370401, 370401C, 3704...
00001142    [370000, 370000, 370000, 370401, 370401C, 3704...
00001142    [370000, 370000, 370000, 370401, 370401C, 3704...
Name: Activity code, Length: 4408, dtype: object
[2, 2, 1, 1, 2, 2, 1, 1, 1, 2, 2]
(4408,)


  print(encoded_traces.loc["00000000"][2])


##  Representing traces as frequency vectors part 2
### Each trace predix will be represented as a frequency vector based on all activities. Each trace prefix will have length 673. 
### Example: trace prefix: [A, B, B, C] -> trace prefix: [ 0,1, 2, 1, 0, 0 , 0 .... 0, 0]


In [None]:
import numpy as np
import pandas as pd
from collections import Counter

all_activities = event_log["Activity code"].value_counts().index.tolist()
print("All activities")
print(all_activities)

# Convert all_activities list to a dictionary mapping each activity to its index position
activity_to_index = {activity: idx for idx, activity in enumerate(all_activities)}

# Function to encode each trace as a frequency vector based on all_activities
def encode_trace_full_vector(trace):
    # Initialize a frequency vector of zeros with length equal to all_activities
    frequency_vector = np.zeros(len(all_activities), dtype=int)
    # Count occurrences of each activity in the trace
    activity_counts = Counter(trace)
    # Populate the frequency vector
    for activity, count in activity_counts.items():
        if activity in activity_to_index:  # Only update if the activity is in all_activities
            index = activity_to_index[activity]
            frequency_vector[index] = count
    return frequency_vector

# Apply the encoding function to each trace in the Series
encoded_traces_full = all_traces.apply(encode_trace_full_vector)

print(encoded_traces_full)


All activities
[370000, 40014, 613000, 379999, 614400, 411100, 370443, 370407.0, 370442, 370419, '370712B', '370715A', 370701, 419100, 610001, '377498A', 370402, 375075, '370701S', 370403, 370606, 370604, 378607, '370488G', '370488E', '378453A', '370504A', 370423, 372417, 415100, '370488J', 378458, '370401C', '378619A', 410100, '370443S', 710170, '370442S', '377121S', 370424, '370715S', 372414, '370407C', 370440, 370401, 370711, 612000, 378452, 386002, '370419S', '375138A', '387042A', 40016, 619600, '370426S', '370402S', 378149, 686405, '370505A', '376480A', '376482S', 376400.0, 378858, '376482C', 370420, 370501.0, 390550, 356133, 350503, 387090, '330001B', 370111, 356134, 370421, '370737C', 378720, 709999, 355111, 10107, 390003, '370737Z', 410500, 390520, '370403S', '339988E', '370489S', 339956, '370488S', 390183, 10307, '339486E', 375004, 355401, 386042, '375003A', '370401S', 355201, '370423T', 370415, '370111S', '370737S', 356132, 359999, '378453S', 388170, '370707S', 387001, '37241

## Clustering the traces as vector of occurences using DBSCAN 

In [None]:
import pandas as pd
from sklearn.cluster import DBSCAN

# Convert the Series to a 2D array where each row is a trace vector
trace_data = np.vstack(encoded_traces_full.values)  # Stack each list as a row in a 2D array

# Apply DBSCAN clustering
dbscan = DBSCAN(eps=3, min_samples=2)
cluster_labels = dbscan.fit_predict(trace_data)

# Step 4: Create a DataFrame to store the trace IDs and their corresponding cluster labels
clustered_traces = pd.DataFrame({
    'case:concept:name': encoded_traces_full.index,
    'cluster_label': cluster_labels,
    'activities': encoded_traces_full.values  # Original list of activities for each trace
})

# Set trace_id as the index for easy lookup if needed
clustered_traces.set_index('case:concept:name', inplace=True)

# Display the result
print(clustered_traces)

# Get unique values and their counts in the 'cluster_label' column
cluster_counts = clustered_traces['cluster_label'].value_counts()

print("Unique cluster labels and their counts:")
print(cluster_counts)


          cluster_label                                         activities
trace_id                                                                  
00000000              0  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
00000000              0  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, ...
00000000              0  [2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, ...
00000000              0  [2, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 2, 0, ...
00000000              0  [3, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 2, 0, ...
...                 ...                                                ...
00001142              0  [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
00001142              0  [3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
00001142              0  [3, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, ...
00001142              0  [3, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, ...
00001142              0  [3, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, ...

[4408 rows x 2 columns]


## Split the traces based on cluster

In [35]:
# Group by the cluster labels
clustered_series = {}
for cluster_label, group in clustered_traces.groupby('cluster_label'):
    # Step 2: Create a Pandas Series for each cluster
    clustered_series[cluster_label] = pd.Series(data=group['activities'].values, 
                                                index=group.index, 
                                                name=f'Cluster_{cluster_label}')

# Display each cluster's separate Series
for label, series in clustered_series.items():
    print(f"\nCluster {label}:\n", series)



Cluster -1:
 trace_id
00000005    [0, 4, 4, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 1, 0, ...
00000005    [0, 7, 6, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 1, 0, ...
00000069    [3, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
00000072    [5, 0, 0, 1, 0, 2, 0, 1, 0, 1, 1, 1, 0, 1, 0, ...
00000079    [2, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, ...
00000110    [5, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ...
00000132    [4, 1, 0, 1, 0, 3, 0, 0, 0, 1, 0, 0, 0, 1, 0, ...
00000140    [5, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ...
00000364    [5, 2, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, ...
00000371    [0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ...
00000457    [4, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 2, 0, ...
00000464    [6, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, ...
00000498    [1, 0, 0, 1, 0, 7, 0, 0, 0, 0, 0, 0, 0, 4, 0, ...
00000598    [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, ...
00000600    [3, 3, 3, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...
00000671    [5, 1, 0, 2, 0, 1, 0, 0, 0, 0, 0, 0

## Train model for each of the clusters (not yet finished)

In [28]:
clustered_traces = clustered_series[0]
df_predictions = []
for i in range(n_validation):   
    running_trace = clustered_traces.sample(1)
    historic_traces = clustered_traces.drop(running_trace.index)

    running_trace_length = len(running_trace.iloc[0])

    # calculate similarity scores
    similarity_scores = get_similarity(running_trace, historic_traces, similarity=True)

    # get n most similar traces
    sorted_similar_traces = sorted(similarity_scores.items(), key=lambda x: x[1], reverse=True)
    most_similar_traces = sorted_similar_traces[:n_most_similar]

    # calculate average similarity score
    avg_similarity = np.mean([x[1] for x in most_similar_traces])

    # get label function
    similar_traces_list = [x[0] for x in most_similar_traces]

    # get y_train and y_test
    y_train = [lf_map[trace] for trace in similar_traces_list]
    y_test = int(lf_map[running_trace.index[0]])

    # train model
    clf, le_dict = train_model(X_cols, event_log, similar_traces_list, lf_map)

    # predict label
    y_pred, y_prob = predict_label(X_cols, event_log, running_trace, clf, le_dict)

    results = [running_trace_length, avg_similarity, sum(y_train), y_test, y_pred, y_prob]
    df_predictions.append(results)

col_names = ['trace_len', 'avg_similarity', 'pos_y_train', 'y_test', 'y_pred', 'y_prob']
df_predictions = pd.DataFrame(df_predictions, columns=col_names)

# accuracy from most common label
acc1 = max(lf_map.mean(), 1-lf_map.mean())

# accuracy from most common label among similar traces
acc2 = ((df_predictions['pos_y_train'] > n_most_similar/2) == df_predictions['y_test']).mean()

# accuracy from decision tree model
acc3 = (df_predictions['y_test'] == df_predictions['y_pred']).mean()

# print accuracies
print(f'Accuracy from most common label: {acc1:.3f}')
print(f'Accuracy from most common label among similar traces: {acc2:.3f}')
print(f'Accuracy from decision tree model: {acc3:.3f}')

  X_train = X_train.fillna('Missing', axis=1)
  X_test = X_test.fillna('Missing', axis=1)
  X_train = X_train.fillna('Missing', axis=1)
  X_train = X_train.fillna('Missing', axis=1)
  X_train = X_train.fillna('Missing', axis=1)
  X_train = X_train.fillna('Missing', axis=1)
  X_train = X_train.fillna('Missing', axis=1)
  X_train = X_train.fillna('Missing', axis=1)
  X_train = X_train.fillna('Missing', axis=1)
  X_train = X_train.fillna('Missing', axis=1)
  X_test = X_test.fillna('Missing', axis=1)
  X_train = X_train.fillna('Missing', axis=1)
  X_train = X_train.fillna('Missing', axis=1)
  X_train = X_train.fillna('Missing', axis=1)
  X_train = X_train.fillna('Missing', axis=1)
  X_train = X_train.fillna('Missing', axis=1)
  X_train = X_train.fillna('Missing', axis=1)
  X_train = X_train.fillna('Missing', axis=1)
  X_train = X_train.fillna('Missing', axis=1)
  X_train = X_train.fillna('Missing', axis=1)
  X_train = X_train.fillna('Missing', axis=1)
  X_train = X_train.fillna('Missing', 

Accuracy from most common label: 0.598
Accuracy from most common label among similar traces: 0.749
Accuracy from decision tree model: 0.773


  X_test = X_test.fillna('Missing', axis=1)


## Running trace should choose cluster 