In [16]:
import pm4py
from label_functions import *
import editdistance
from functions import *


# import data
def import_xes(file_path):
    event_log = pm4py.read_xes(file_path)
    start_activities = pm4py.get_start_activities(event_log)
    end_activities = pm4py.get_end_activities(event_log)
    print("Start activities: {}\nEnd activities: {}".format(start_activities, end_activities))
    return event_log

event_log = import_xes("data/Hospital_log.xes.gz")
print(type(event_log))
print(event_log)

parsing log, completed traces ::   0%|          | 0/1143 [00:00<?, ?it/s]

Start activities: {'1e consult poliklinisch': 71, 'inwend.geneesk.  korte kaart kosten-out': 8, 'verlosk.-gynaec. korte kaart kosten-out': 198, 'vervolgconsult poliklinisch': 234, 'verlosk.-gynaec.   jaarkaart kosten-out': 122, 'cytologisch onderzoek - ectocervix -': 28, 'echografie  - genitalia interna': 44, 'aanname laboratoriumonderzoek': 225, 'ligdagen - alle spec.beh.kinderg.-reval.': 39, 'telefonisch consult': 15, 'behandeltijd - eenheid t3 - megavolt': 1, 'ct abdomen': 2, 'behandeltijd - eenheid t2 - megavolt': 1, 'thorax': 12, 'e.c.g.      - elektrocardiografie': 63, 'cytologisch onderzoek - vagina -': 12, 'mammografie thoraxwand': 3, 'histologisch onderzoek - biopten nno': 19, 'dagverpleging - alle spec.beh.kind.-rev.': 5, 'inwend.geneesk.    jaarkaart kosten-out': 1, 'coupe ter inzage': 19, 'immunopathologisch onderzoek': 10, 'vagina      - scopie incl.evt.vulvabiops': 3, 'punctie tbv cytologisch onderzoek door p': 3, 'cytologisch onderzoek - buiktumorpunctie': 1, 'vrw.gesl.o

In [17]:
def encode_data_attributes(event_log, prefixes, attributes):
    # Extract data attributes for the last event in each prefix
    encoded_data = []
    for prefix in prefixes:
        last_event = prefix[-1]  # Get last event of the prefix
        data_vector = [event_log.loc[event_log['Activity code'] == last_event, attr].values[0] for attr in attributes]
        encoded_data.append(data_vector)
    return encoded_data

In [18]:
from functions import *
from sklearn.cluster import DBSCAN
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from Levenshtein import distance as edit_distance
import numpy as np

# Step 1: Sort traces temporally by the first event's timestamp and split 80/20
event_log_sorted = event_log.sort_values(by='time:timestamp') 
unique_cases = event_log_sorted['case:concept:name'].unique()

# Split cases into training and testing sets based on 80/20 split
train_cases = unique_cases[:int(0.8 * len(unique_cases))]
test_cases = unique_cases[int(0.8 * len(unique_cases)):]

# Separate event log into training and testing logs
train_log = event_log_sorted[event_log_sorted['case:concept:name'].isin(train_cases)]
test_log = event_log_sorted[event_log_sorted['case:concept:name'].isin(test_cases)]

In [19]:

# Step 2: Extract prefixes from training set with specified step sizes
def extract_prefixes(trace, max_len=21, step=5):
    return [trace[:i] for i in range(1, max_len + 1, step) if i <= len(trace)]

# Group and extract prefixes from training traces
train_traces = train_log.groupby('case:concept:name')['Activity code'].apply(list)
all_prefixes = train_traces.apply(lambda x: extract_prefixes(x)).explode()

# Apply label function to each prefix to create lf_map
#lf_map = {i: label_function_1(prefix) for i, prefix in enumerate(all_prefixes)}
lf_map = train_log.groupby('case:concept:name')['Activity code'].apply(label_function_1)

# Convert prefixes to sequence encoding for DBSCAN
sequence_encoded_prefixes = [' '.join(map(str, prefix)) for prefix in all_prefixes]

In [38]:

# Step 3: Compute edit distance matrix for the training set only
n_train = len(sequence_encoded_prefixes)
train_distance_matrix = np.zeros((n_train, n_train))
for i in range(n_train):
    for j in range(i + 1, n_train):
        dist = edit_distance(sequence_encoded_prefixes[i], sequence_encoded_prefixes[j])
        train_distance_matrix[i, j] = dist
        train_distance_matrix[j, i] = dist  # Symmetric matrix

# Step 4: Apply DBSCAN on the training set using the specified parameters
#dbscan = DBSCAN(eps=0.125, min_samples=4, metric='precomputed')
#dbscan = DBSCAN(eps=0.15, min_samples=6, metric='precomputed')
dbscan = DBSCAN(eps=0.2, min_samples=6, metric='precomputed')
train_cluster_labels = dbscan.fit_predict(train_distance_matrix)

In [39]:
import numpy as np
from sklearn.preprocessing import LabelEncoder

# Define columns for data attributes
X_cols = ['case:Diagnosis code', 'case:Treatment code']

# Initialize label encoders with a placeholder for unknown values
label_encoders = {col: LabelEncoder() for col in X_cols}

# Fit each encoder to the unique values in the corresponding column of the training log
for col in X_cols:
    label_encoders[col].fit(train_log[col].astype(str))  # Ensure data is treated as strings

# Define a function to safely transform data using LabelEncoder, handling unknown values
def safe_transform(encoder, value):
    if value in encoder.classes_:
        return encoder.transform([value])[0]
    else:
        # Adding the unknown label on the fly (or set it to a pre-defined value, like -1)
        return -1  # or handle as desired for unknowns

# Step 2: Modify encode_data_attributes function with safe_transform

def encode_data_attributes(event_log, prefixes, attributes):
    encoded_data = []
    for prefix in prefixes:
        last_event = prefix[-1]  # Get the last event of the prefix
        # Apply safe encoding for each attribute
        data_vector = [safe_transform(label_encoders[attr], event_log.loc[event_log['Activity code'] == last_event, attr].values[0]) for attr in attributes]
        encoded_data.append(data_vector)
    return encoded_data



In [42]:
# Columns used for classifier training
X_cols = ['case:Diagnosis code', 'case:Treatment code']

# Define a threshold for the minimum number of samples required -- OPTIONAL -- TO IMPROVE ROBUSTNESS
min_samples_threshold = 10

# Step 5: Train a classifier for each cluster in the training set
clusters = {}
for cluster_label in np.unique(train_cluster_labels):
    if cluster_label == -1:
        continue  # Skip noise points

    # Get indices of prefixes in the current cluster
    cluster_indices = [i for i, label in enumerate(train_cluster_labels) if label == cluster_label]
    X_cluster = [encode_data_attributes(train_log, [all_prefixes.iloc[i]], X_cols)[0] for i in cluster_indices]
    y_cluster = [lf_map[all_prefixes.index[i]] for i in cluster_indices]  # Get labels for each prefix
    
    # Skip training for clusters with fewer samples than the threshold -- OPTIONAL -- TO IMPROVE ROBUSTNESS
    if len(X_cluster) < min_samples_threshold:
        print(f"Skipping small cluster {cluster_label} with {len(X_cluster)} samples.")
        continue
    
     # Print cluster size for inspection
    print(f"Cluster {cluster_label}: Size of X_cluster = {len(X_cluster)}, Size of y_cluster = {len(y_cluster)}")
    

    # Check if the cluster size is too small
    #if len(X_cluster) < 5:
        #print(f"Warning: Cluster {cluster_label} has very few samples ({len(X_cluster)}). Consider adjusting DBSCAN parameters.")
    
    # Train decision tree classifier for the current cluster
    clf = DecisionTreeClassifier()
    clf.fit(X_cluster, y_cluster)
    clusters[cluster_label] = clf  # Store classifier by cluster label

Cluster 0: Size of X_cluster = 47, Size of y_cluster = 47
Cluster 1: Size of X_cluster = 178, Size of y_cluster = 178
Cluster 2: Size of X_cluster = 47, Size of y_cluster = 47
Cluster 3: Size of X_cluster = 12, Size of y_cluster = 12
Cluster 4: Size of X_cluster = 163, Size of y_cluster = 163
Cluster 5: Size of X_cluster = 18, Size of y_cluster = 18
Cluster 6: Size of X_cluster = 120, Size of y_cluster = 120
Skipping small cluster 7 with 9 samples.
Cluster 8: Size of X_cluster = 10, Size of y_cluster = 10
Skipping small cluster 9 with 9 samples.
Cluster 10: Size of X_cluster = 13, Size of y_cluster = 13
Cluster 11: Size of X_cluster = 14, Size of y_cluster = 14
Cluster 12: Size of X_cluster = 12, Size of y_cluster = 12
Skipping small cluster 13 with 6 samples.
Cluster 14: Size of X_cluster = 41, Size of y_cluster = 41
Skipping small cluster 15 with 6 samples.
Cluster 16: Size of X_cluster = 12, Size of y_cluster = 12
Cluster 17: Size of X_cluster = 14, Size of y_cluster = 14
Skipping s

In [43]:
# Step 6: Define functions for finding the closest cluster and for reliable prediction

def find_closest_cluster(running_trace, train_prefixes, train_cluster_labels):
    # Convert running trace to sequence encoding
    running_sequence = ' '.join(map(str, running_trace))
    # Calculate edit distances between the running trace and each training prefix
    distances = [edit_distance(running_sequence, train_seq) for train_seq in train_prefixes]
    # Get the closest training prefix and its corresponding cluster label
    closest_idx = np.argmin(distances)
    closest_cluster = train_cluster_labels[closest_idx]
    return closest_cluster, closest_idx

def predict_with_reliability(clf, X_test, thresholds=[0.5, 0.6], min_class_support=3):
    probs = clf.predict_proba([X_test])[0]
    predicted_class = np.argmax(probs)
    class_support = np.sum(clf.classes_ == predicted_class)
    class_prob = probs[predicted_class]
    if class_support >= min_class_support and any(class_prob >= t for t in thresholds):
        return predicted_class, class_prob
    return None, None

In [44]:

# Step 7: Replay each test trace for prediction
for test_trace in test_log.groupby('case:concept:name')['Activity code'].apply(list):
    reliable_prediction = False
    # Incrementally generate prefixes every 5 events (replay logic)
    for i in range(1, len(test_trace) + 1, 5):  # Replay every 5 events
        current_prefix = test_trace[:i]
        
        # Find the closest cluster using minimum edit distance
        closest_cluster, closest_idx = find_closest_cluster(current_prefix, sequence_encoded_prefixes, train_cluster_labels)
        
        if closest_cluster == -1:
            continue  # Skip if the closest cluster is noise

        clf = clusters.get(closest_cluster)
        if clf:
            X_test = encode_data_attributes(test_log, [current_prefix], X_cols)[0]
            predicted_class, confidence = predict_with_reliability(clf, X_test)
            print(predicted_class,confidence)
            if predicted_class is not None:
                print(f"Reliable prediction achieved: {predicted_class} with confidence {confidence}")
                reliable_prediction = True
                break  # Stop if a reliable prediction is made
    
    if not reliable_prediction:
        print("Prediction failure: No reliable prediction achieved by end of trace.")


None None
Prediction failure: No reliable prediction achieved by end of trace.
Prediction failure: No reliable prediction achieved by end of trace.
None None
Prediction failure: No reliable prediction achieved by end of trace.
Prediction failure: No reliable prediction achieved by end of trace.
None None
Prediction failure: No reliable prediction achieved by end of trace.
None None
Prediction failure: No reliable prediction achieved by end of trace.
Prediction failure: No reliable prediction achieved by end of trace.
Prediction failure: No reliable prediction achieved by end of trace.
None None
Prediction failure: No reliable prediction achieved by end of trace.
None None
Prediction failure: No reliable prediction achieved by end of trace.
None None
Prediction failure: No reliable prediction achieved by end of trace.
None None
Prediction failure: No reliable prediction achieved by end of trace.
None None
Prediction failure: No reliable prediction achieved by end of trace.
None None
Pre