## Loading the data

In [2]:
import pm4py
from label_functions import *
import editdistance

# import data
def import_xes(file_path):
    event_log = pm4py.read_xes(file_path)
    start_activities = pm4py.get_start_activities(event_log)
    end_activities = pm4py.get_end_activities(event_log)
    print("Start activities: {}\nEnd activities: {}".format(start_activities, end_activities))
    return event_log

event_log = import_xes("data/Hospital_log.xes.gz")
print(event_log)

parsing log, completed traces ::   0%|          | 0/1143 [00:00<?, ?it/s]

Start activities: {'1e consult poliklinisch': 71, 'inwend.geneesk.  korte kaart kosten-out': 8, 'verlosk.-gynaec. korte kaart kosten-out': 198, 'vervolgconsult poliklinisch': 234, 'verlosk.-gynaec.   jaarkaart kosten-out': 122, 'cytologisch onderzoek - ectocervix -': 28, 'echografie  - genitalia interna': 44, 'aanname laboratoriumonderzoek': 225, 'ligdagen - alle spec.beh.kinderg.-reval.': 39, 'telefonisch consult': 15, 'behandeltijd - eenheid t3 - megavolt': 1, 'ct abdomen': 2, 'behandeltijd - eenheid t2 - megavolt': 1, 'thorax': 12, 'e.c.g.      - elektrocardiografie': 63, 'cytologisch onderzoek - vagina -': 12, 'mammografie thoraxwand': 3, 'histologisch onderzoek - biopten nno': 19, 'dagverpleging - alle spec.beh.kind.-rev.': 5, 'inwend.geneesk.    jaarkaart kosten-out': 1, 'coupe ter inzage': 19, 'immunopathologisch onderzoek': 10, 'vagina      - scopie incl.evt.vulvabiops': 3, 'punctie tbv cytologisch onderzoek door p': 3, 'cytologisch onderzoek - buiktumorpunctie': 1, 'vrw.gesl.o

## Selecting Top 7 activities

In [16]:
# Get the 7 most frequent activities in the 'concept:name' column
top_activities = event_log["concept:name"].value_counts().head(7).index.tolist()
top_activities_code = event_log["Activity code"].value_counts().head(7).index.tolist()
# Display the list of top 7 activities
print("Top 7 most popular activities:", top_activities)
print("Top 7 most popular activities code:", top_activities_code)


Top 7 most popular activities: ['aanname laboratoriumonderzoek', 'ligdagen - alle spec.beh.kinderg.-reval.', '190205 klasse 3b        a205', 'ordertarief', '190101 bovenreg.toesl.  a101', 'vervolgconsult poliklinisch', 'kalium potentiometrisch']
Top 7 most popular activities code: ['370000', '40014', '613000', '379999', '614400', '411100', '370443']


## Getting all traces

In [3]:
# get all traces
all_traces = event_log.groupby('case:concept:name')['Activity code'].apply(list)
print("All traces ")
print(all_traces)

All traces 
case:concept:name
00000000    [410100, 419100, 10107, 339486E, 410100, 41910...
00000001    [10113, 410100, 419100, 10107, 411100, 419100,...
00000002    [10113, 411100, 411100, 419100, 411100, 411100...
00000003    [10113, 370000, 378149, 379999, 411100, 419100...
00000004    [10107, 355201, 370000, 370000, 370000, 370401...
                                  ...                        
00001138    [339486E, 370000, 370000, 370000, 370401, 3704...
00001139    [355111, 370000, 370000, 370401, 370401C, 3704...
00001140    [370000, 370000, 370000, 370401, 370401C, 3704...
00001141    [330001B, 370000, 370000, 370000, 370401, 3704...
00001142    [370000, 370000, 370000, 370401, 370401C, 3704...
Name: Activity code, Length: 1143, dtype: object


## We could encode it as according to the section 2.3 from the paper. Please see a portion from section 2.3.1 from the paper

#### patterns composed of a single log event. For example, in the scenario in Fig. 1, we can represent the alphabet of the events as an ordered vector L ¼ hA; C; D;M; P; R; S; V i. In this case, trace t1 will be encoded as a vector of frequencies h3; 2; 2; 1; 1; 1; 0; 0i, obtained by replacing each symbol of the alphabet in vector L by its frequency in trace t

In [None]:
import pandas as pd

# Identify the top 7 activities
top_activities = event_log["Activity code"].value_counts().index.tolist()
print("Length of top_activities: ", len(top_activities))

# Define a function to encode each trace as a frequency vector
def encode_trace(trace, top_activities):
    # Count occurrences of each activity in the trace
    activity_counts = {activity: trace.count(activity) for activity in top_activities}
    # Return the counts as a list in the order of top_activities
    return [activity_counts[activity] for activity in top_activities]

# Apply this encoding to each trace in all_traces
encoded_traces = all_traces.apply(lambda trace: encode_trace(trace, top_activities))

# Convert to DataFrame for readability
encoded_traces_df = pd.DataFrame(encoded_traces.tolist(), columns=top_activities, index=all_traces.index)


# Display the encoded traces
print("Encoded Traces as Frequency Vectors:")
print(encoded_traces_df.shape)
print(encoded_traces_df)



Encoded Traces as Frequency Vectors:
(1143, 673)
                   370000  40014  613000  379999  614400  411100  370443  \
case:concept:name                                                          
00000000                8      9       8       5       0       0       2   
00000001               27     12      12      16      12       6       1   
00000002                1      0       0       1       0      12       0   
00000003                1      0       0       1       0      12       0   
00000004               21     13       0      14      13       8       3   
...                   ...    ...     ...     ...     ...     ...     ...   
00001138                3      0       0       1       0       0       1   
00001139                2      0       0       1       0       0       1   
00001140                3      0       0       1       0       0       1   
00001141                3      0       0       1       0       0       1   
00001142                3      0       

## Clustering the traces as vector of occurences

In [5]:
from sklearn.cluster import KMeans

# Choose the number of clusters, e.g., K = 5
k = 3
kmeans = KMeans(n_clusters=k, random_state=42)
clusters = kmeans.fit_predict(encoded_traces_df)

# Add cluster labels to the original DataFrame
encoded_traces_df['Cluster'] = clusters

# Display the clusters assigned to each trace
print(encoded_traces_df)

# Count occurrences of each cluster
kmeans_cluster_counts = encoded_traces_df['Cluster'].value_counts()

print("Occurrences of each K-Means cluster:")
print(kmeans_cluster_counts)

TypeError: Feature names are only supported if all input features have string names, but your input has ['float', 'int', 'str'] as feature name / column name types. If you want feature names to be stored and validated, you must convert them all to strings, by using X.columns = X.columns.astype(str) for example. Otherwise you can remove feature / column names from your input data, or convert them all to a non-string data type.

## Defining features


In [4]:

# Assuming your DataFrame is named 'df'
X_cols = ['case:Diagnosis code', 'case:Treatment code']

print(event_log.columns)
# Create a regex pattern to match any column that contains one of the X_cols elements as a substring
pattern = '|'.join([f"{col}" for col in X_cols])

# Filter columns that match the pattern
filtered_df = event_log.filter(regex=pattern)

print("Filtered DataFrame based on Diagnosis or Treatment code columns:")
print(filtered_df)

Index(['org:group', 'Number of executions', 'Specialism code', 'concept:name',
       'Producer code', 'Section', 'Activity code', 'time:timestamp',
       'lifecycle:transition', 'case:End date',
       ...
       'case:Treatment code:14', 'case:Treatment code:15', 'case:Diagnosis:15',
       'case:Diagnosis:14', 'case:Diagnosis:11', 'case:Diagnosis:13',
       'case:Diagnosis:12', 'case:Diagnosis code:14', 'case:Diagnosis code:13',
       'case:Diagnosis code:15'],
      dtype='object', length=128)
Filtered DataFrame based on Diagnosis or Treatment code columns:
        case:Treatment code:2  case:Treatment code:1 case:Diagnosis code:2  \
0                        23.0                   13.0                   106   
1                        23.0                   13.0                   106   
2                        23.0                   13.0                   106   
3                        23.0                   13.0                   106   
4                        23.0          

In [None]:
def get_similarity(running_trace, traces, similarity=True):
    '''
    Calculate edit distances between running trace and all traces in the log.
    Convert to similarity score if similarity=True.
    '''
    # convert running trace to list if it is a series
    if isinstance(running_trace, pd.Series):
        running_trace = running_trace.iloc[0]
    
    # calculate edit distances
    str_edit_dist = []
    for case, activities in traces.items():
        distance = editdistance.eval(running_trace, activities)
        if similarity:
            similarity_score = 1 - distance / max(len(running_trace), len(activities))
            str_edit_dist.append((case, similarity_score))
        else:
            str_edit_dist.append((case, distance))
    return dict(str_edit_dist)


In [None]:
import numpy as np 

running_trace = all_traces.iloc[[i]]
historic_traces = all_traces.drop(running_trace.index)

# TODO: prefix extraction of historic traces?

# calculate similarity scores
similarity_scores = get_similarity(running_trace, historic_traces, similarity=True)

# get n most similar traces
n = 100
sorted_similar_traces = sorted(similarity_scores.items(), key=lambda x: x[1], reverse=True)
most_similar_traces = sorted_similar_traces[:n]

# calculate average similarity score
avg_similarity = np.mean([x[1] for x in most_similar_traces])
#print_label = 'Average similarity score' ; print(f'{print_label:<25}: {avg_similarity:.3f}')

# get label function
similar_traces_list = [x[0] for x in most_similar_traces]
lf_map = get_lf_map(event_log, label_function_1)

# get y_train and y_test
y_train = [lf_map[trace] for trace in similar_traces_list]
#print_label = 'Traces in class 1' ; print(f'{print_label:<25}: {sum(y_train)}/{len(y_train)}')
y_test = int(lf_map[running_trace.index[0]])
#print_label = 'True class running trace' ; print(f'{print_label:<25}: {y_test}')

# train model
X_cols = ['case:Diagnosis code', 'case:Treatment code']

sorted_similar_traces = sorted(similarity_scores.items(), key=lambda x: x[1], reverse=True)
most_similar_traces = sorted_similar_traces[:n]

# calculate average similarity score
avg_similarity = np.mean([x[1] for x in most_similar_traces])
print_label = 'Average similarity score' ; print(f'{print_label:<25}: {avg_similarity:.3f}')

# train model
X_cols = ['case:Diagnosis code', 'case:Treatment code']

# Get last event for each trace in similar_traces_list (training set)
X_train = event_log.groupby('case:concept:name')[X_cols].last().loc[similar_traces_list]

# Fill missing values and convert to string
X_train = X_train.fillna('Missing', axis=1)
for col in X_cols:
    X_train[col] = X_train[col].astype(str)
