In [None]:
import pm4py
from pm4py.objects.log.importer.xes import importer as xes_importer
import random
import pickle
import os
import pandas as pd
import numpy as np

In [None]:
# Root directory of the project data. Defaults to the original location; set the
# SSD_PROJECT_DIR environment variable to run the notebook on another machine
# without editing this cell.
base_dir = os.environ.get('SSD_PROJECT_DIR', r'C:\SSDProject')
log_folder = os.path.join(base_dir, 'raw datasets')  # input event logs (<dataset>.xes)
dict_folder = os.path.join(base_dir, 'ssd_dicts')    # pickled dicts (<dataset>.pkl) looked up by ssd_id
dummy_folder = os.path.join(base_dir, 'dummy')       # output CSVs with the prefix index and split flags
case_column = 'concept:name'          # trace attribute holding the case identifier
timestamp_column = 'time:timestamp'   # event attribute holding the event timestamp
split_ratio = 0.8  # share of cases (earliest by start time) used for train + validation
val_ratio = 0.2    # share of the train + validation cases held out for validation
ssd_id = 1         # key of the entry to read from the pickled dict

In [None]:
def _build_prefix_frame(event_log, case_key, time_key):
    """Return one row per (case, prefix length) with its remaining time in days."""
    columns = {'case_id': [], 'prefix_length': [], 'start': [], 'rem_time': []}
    for trace in event_log:
        case_length = len(trace)
        if case_length == 0:
            # An empty trace has no first/last event to read a timestamp from.
            continue
        case_id = trace.attributes.get(case_key)
        start_date = trace[0].get(time_key)
        end_date = trace[case_length - 1].get(time_key)
        # Prefix lengths run from 2 to case_length - 1: the complete trace is
        # left out, and so are prefixes consisting of a single event.
        for prefix_length in range(2, case_length):
            # The last event of the prefix is the reference point in time.
            event_time = trace[prefix_length - 1].get(time_key)
            rem_time = (end_date - event_time).total_seconds() / 3600 / 24
            columns['case_id'].append(case_id)
            columns['prefix_length'].append(prefix_length)
            columns['start'].append(start_date)
            columns['rem_time'].append(rem_time)
    return pd.DataFrame(columns)


def _flag_case_split(prefix_df, train_val_share, val_share):
    """Add 0/1 'train', 'val' and 'test' columns from a chronological split of the cases."""
    # Cases are ordered by the timestamp of their first event; every prefix of a
    # case lands in the same partition.
    ordered_case_ids = (
        prefix_df.sort_values(by='start')['case_id'].drop_duplicates().tolist()
    )
    train_val_end = int(len(ordered_case_ids) * train_val_share)
    train_end = int(train_val_end * (1 - val_share))
    partitions = {
        'train': ordered_case_ids[:train_end],
        'val': ordered_case_ids[train_end:train_val_end],
        'test': ordered_case_ids[train_val_end:],
    }
    for flag, case_ids in partitions.items():
        # isin hashes the ids once instead of scanning a Python list per row.
        prefix_df[flag] = prefix_df['case_id'].isin(case_ids).astype('int64')
    return prefix_df


def get_training_test_dataframes(dataset_name):
    """Build prefix-level train/validation/test frames for a dataset and its subset.

    Reads ``<log_folder>/<dataset_name>.xes`` and the pickled dict
    ``<dict_folder>/<dataset_name>.pkl``. The dict entry stored under ``ssd_id``
    is used as the list of case ids to keep in the subset log (presumably a
    sampled sub-dataset; confirm against the code that writes the pickle).

    For both the full log and the subset log, one row is created per case and
    prefix length (2 .. case length - 1) holding the case id, the prefix length,
    the case start timestamp and the remaining time in days. Cases are split
    chronologically by start time: the first ``split_ratio`` share is train +
    validation, of which the last ``val_ratio`` share is validation; the rest is
    test. Both flagged frames are written to ``dummy_folder`` as
    ``<dataset_name>.csv`` and ``<dataset_name>_ssd_.csv``.

    Args:
        dataset_name: File stem shared by the ``.xes`` log and the ``.pkl`` dict.

    Returns:
        Tuple ``(train_val_df, test_df, train_val_ssd_df, test_ssd_df)`` of
        independent DataFrame copies; the first two come from the full log,
        the last two from the subset log.

    Raises:
        KeyError: If the pickled dict has no entry for ``ssd_id``.
    """
    log_path = os.path.join(log_folder, dataset_name + '.xes')
    dict_path = os.path.join(dict_folder, dataset_name + '.pkl')
    df_path = os.path.join(dummy_folder, dataset_name + '.csv')
    df_ssd_path = os.path.join(dummy_folder, dataset_name + '_ssd_.csv')

    log = xes_importer.apply(log_path)
    # NOTE(security): pickle.load can execute arbitrary code; only load
    # dictionaries produced by this project.
    with open(dict_path, 'rb') as f:
        ssd_dict = pickle.load(f)
    selected_cases = ssd_dict.get(ssd_id)
    if selected_cases is None:
        raise KeyError(f'ssd_id {ssd_id!r} not found in {dict_path}')
    subset_log = pm4py.filter_trace_attribute_values(log, case_column, selected_cases)

    index_df = _flag_case_split(
        _build_prefix_frame(log, case_column, timestamp_column), split_ratio, val_ratio
    )
    ssd_df = _flag_case_split(
        _build_prefix_frame(subset_log, case_column, timestamp_column), split_ratio, val_ratio
    )

    os.makedirs(dummy_folder, exist_ok=True)
    index_df.to_csv(df_path, index=False)
    ssd_df.to_csv(df_ssd_path, index=False)

    # Copies, so that callers can add columns without SettingWithCopyWarning.
    train_val_df = index_df[(index_df['train'] == 1) | (index_df['val'] == 1)].copy()
    test_df = index_df[index_df['test'] == 1].copy()
    # Number of distinct cases in train + validation and in test (full log).
    print(len(train_val_df['case_id'].unique()), len(test_df['case_id'].unique()))
    train_val_ssd_df = ssd_df[(ssd_df['train'] == 1) | (ssd_df['val'] == 1)].copy()
    test_ssd_df = ssd_df[ssd_df['test'] == 1].copy()
    return train_val_df, test_df, train_val_ssd_df, test_ssd_df

In [None]:
def get_closest_average(key, dictionary):
    """Look up ``key``, falling back to the mean over the two nearest keys.

    Args:
        key: Numeric key to look up (a prefix length in this notebook).
        dictionary: Mapping from numeric keys to numeric values.

    Returns:
        ``dictionary[key]`` when the key is present. Otherwise the mean of the
        values stored under the two keys with the smallest absolute distance to
        ``key``, or the single stored value when the mapping has one entry.
        Keys at equal distance are taken in the mapping's insertion order.

    Raises:
        ValueError: If ``dictionary`` is empty, since no fallback exists.
    """
    if key in dictionary:
        return dictionary[key]
    if not dictionary:
        raise ValueError('cannot look up the closest key in an empty dictionary')
    keys = np.array(list(dictionary.keys()))
    # A stable sort keeps the choice among equally distant keys deterministic.
    closest_keys = keys[np.argsort(np.abs(keys - key), kind='stable')[:2]]
    if len(closest_keys) == 1:
        return dictionary[closest_keys[0]]
    return np.mean([dictionary[k] for k in closest_keys])

In [None]:
dataset_name = 'BPI_Challenge_2012' # BPIC20_DomesticDeclarations BPIC20_InternationalDeclarations BPIC20RFP BPIC20PTC BPIC20TPD BPIC15_1 BPIC13I BPIC12 HelpDesk Sepsis

train_val_df, test_df, train_val_ssd_df, test_ssd_df = get_training_test_dataframes(dataset_name)
prefix_lengths = train_val_df['prefix_length'].drop_duplicates().tolist()
prefix_ssd_lengths = train_val_ssd_df['prefix_length'].drop_duplicates().tolist()

# Dummy baseline: for each prefix length, predict the mean remaining time
# observed for that length in the train + validation cases.
dummy_dict = {
    length: train_val_df.loc[train_val_df['prefix_length'] == length, 'rem_time'].mean()
    for length in prefix_lengths
}
dummy_ssd_dict = {
    length: train_val_ssd_df.loc[train_val_ssd_df['prefix_length'] == length, 'rem_time'].mean()
    for length in prefix_ssd_lengths
}

# Prefix lengths never seen in training fall back to the nearest seen lengths.
# assign() builds a new frame, so no column is written into a filtered slice
# (which would raise SettingWithCopyWarning).
test_df = test_df.assign(
    dummy_predictions=test_df['prefix_length'].apply(
        lambda length: get_closest_average(length, dummy_dict)
    )
)
test_ssd_df = test_ssd_df.assign(
    dummy_predictions=test_ssd_df['prefix_length'].apply(
        lambda length: get_closest_average(length, dummy_ssd_dict)
    )
)

# Mean absolute error of the baseline, in days, on the full log and on the subset log.
MAE = (test_df['rem_time'] - test_df['dummy_predictions']).abs().mean()
MAE_SSD = (test_ssd_df['rem_time'] - test_ssd_df['dummy_predictions']).abs().mean()
print(MAE, MAE_SSD)