In [None]:
# We import the necessary packages at the beginning
import os
import math
from statistics import mean,stdev
import pm4py
from pm4py.objects.conversion.log import converter as log_converter
from pm4py.objects.conversion.bpmn import converter as bpmn_converter
from sklearn.impute import SimpleImputer
import copy
import numpy as np
import pandas as pd
import pickle
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from imblearn.under_sampling import OneSidedSelection
from sklearn.preprocessing import StandardScaler    
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
import sklearn
import tqdm
import time
import xgboost as xgb

In [None]:
# Lists the files of a folder and returns the path of the one that was picked.
def ask_for_path(rel_path='', index = -1):
    """Print the contents of a folder and return the path of the chosen entry.

    Parameters
    ----------
    rel_path : str
        Text appended verbatim to ``os.getcwd()`` to form the folder path. The
        pieces are concatenated, not joined, so the value needs its own leading
        and trailing separators (e.g. ``"/../data/"``).
    index : int
        Position of the wanted entry in the ``os.listdir`` result. The default
        ``-1`` asks the user interactively via ``input``.

    Returns
    -------
    str
        ``os.getcwd() + rel_path + <chosen file name>``.
    """
    print("The following files are available in the input folder:\n")

    folder = os.getcwd() + rel_path
    candidates = os.listdir(folder)
    for position, entry in enumerate(candidates):
        print(f"{position} - {entry}")

    if index != -1:
        # Non-interactive mode: the caller already knows the position.
        print('Automatic Iteration.')
        choice = index
    else:
        choice = input("Please choose from the list above which of the files shall be transformed by typing the corresponding number.")

    return folder + candidates[int(choice)]

In [None]:
# Helper that renders a Petri net with frequency annotations and saves it to disk.
def output_petri_net(net, initial_marking, final_marking, file_name, label):
    """Render a Petri net (frequency variant) and save it as an image file.

    Parameters
    ----------
    net, initial_marking, final_marking
        The Petri net and its markings, as returned by the pm4py converters.
    file_name : str
        Output file name without extension.
    label : str
        Caption attached to the rendered graph.

    Notes
    -----
    Reads the notebook-level globals ``log`` (used for the frequency
    decoration), ``OUTPUT_FORMAT`` and ``REL_OUTPUT_PATH``; all three must be
    defined before this function is called. The file is written to
    ``os.getcwd() + REL_OUTPUT_PATH + file_name + "." + OUTPUT_FORMAT``.
    """
    # Imported here because the notebook's import cell never brings
    # `pn_visualizer` into scope; without this the function raises NameError.
    from pm4py.visualization.petri_net import visualizer as pn_visualizer

    # Frequency variant so that arcs/transitions are annotated with counts.
    parameters = {pn_visualizer.Variants.FREQUENCY.value.Parameters.FORMAT: OUTPUT_FORMAT, 'label':'The Round Table'}
    gviz = pn_visualizer.apply(net, initial_marking, final_marking, parameters=parameters,
                               variant=pn_visualizer.Variants.FREQUENCY, log=log)

    gviz.attr(label=label)
    pn_visualizer.save(gviz, os.getcwd() + REL_OUTPUT_PATH + file_name + "." + OUTPUT_FORMAT)

In [None]:
def get_output_path(file_name,REL_OUTPUT_PATH = "/Output Tree/"):
    """Return the current working directory, ``REL_OUTPUT_PATH`` and ``file_name`` concatenated.

    The parts are glued together verbatim, so ``REL_OUTPUT_PATH`` has to carry
    its own leading and trailing separators.
    """
    pieces = (os.getcwd(), REL_OUTPUT_PATH, file_name)
    return "".join(pieces)

In [None]:
# Converts the file at the given path (.csv, .xes or .dfg) into a pm4py log object.
def transform_to_log(file_path):
    """Load an event log from ``file_path``.

    Parameters
    ----------
    file_path : str
        Path to a ``.csv``, ``.xes`` or ``.dfg`` file.

    Returns
    -------
    The result of ``log_converter.apply`` for CSV input, of
    ``pm4py.convert_to_event_log`` for XES input, of ``pm4py.read_dfg`` for DFG
    input, or ``-1`` (after printing a message) for any other extension.

    Notes
    -----
    For the three known MobIS challenge CSV files the columns are renamed to
    the XES standard names. Any other CSV must already contain a
    ``time:timestamp`` column.
    """
    _, file_extension = os.path.splitext(file_path)
    base_name = os.path.basename(file_path)

    if file_extension == '.csv':
        # Read the path that was passed in (the previous version read the
        # notebook global `file`, which only matched by coincidence).
        # sep=None asks pandas to sniff the delimiter, which only the python
        # engine supports; naming it explicitly avoids the fallback warning.
        log_csv = pd.read_csv(file_path, sep=None, engine='python', encoding='utf-8-sig')

        mobis_end_stamped = ('mobis_challenge_log_2019.csv',
                             'mobis_challenge_log_2019_only_complete_cases.csv')
        mobis_start_stamped = ('mobis_challenge_log_2019_original.csv',)

        if base_name in mobis_end_stamped or base_name in mobis_start_stamped:
            log_csv['end'] = pd.to_datetime(log_csv['end'])
            log_csv['start'] = pd.to_datetime(log_csv['start'])
            log_csv['cost'] = log_csv['cost'].apply(pd.to_numeric, errors='coerce')
            if base_name in mobis_end_stamped:
                # These files use the activity end as event timestamp and
                # carry the cost as a case-level attribute.
                column_names = {'cost': 'case:cost', 'case': 'case:concept:name',
                                'activity': 'concept:name', 'end': 'time:timestamp',
                                'user': 'org:resource'}
            else:
                # The original file uses the activity start as event timestamp.
                column_names = {'case': 'case:concept:name', 'activity': 'concept:name',
                                'start': 'time:timestamp', 'user': 'org:resource'}
            log_csv = log_csv.rename(columns=column_names)

        log_csv['time:timestamp'] = pd.to_datetime(log_csv['time:timestamp'])
        log = log_converter.apply(log_csv)

    elif file_extension == '.xes':
        log = pm4py.read_xes(file_path)
        log = pm4py.convert_to_event_log(log)
    elif file_extension == '.dfg':
        log = pm4py.read_dfg(file_path)
    else:
        print("Current filetype is equal to {}. \nPlease input a file with any of the following extensions: - csv; - xes; - dfg".format(str(file_extension)))
        return -1

    return log

In [None]:
def get_all_activities_from_log(log):
    """Return the distinct ``concept:name`` values of a log in order of first appearance.

    Parameters
    ----------
    log : iterable of traces
        Each trace is an iterable of events; each event supports
        ``event['concept:name']``.

    Returns
    -------
    list
        Unique activity names, ordered by first occurrence.
    """
    seen = set()      # constant-time membership test instead of list.count per event
    activities = []   # keeps the first-appearance order
    for trace in log:
        for event in trace:
            name = event['concept:name']
            if name not in seen:
                seen.add(name)
                activities.append(name)
    return activities

In [None]:
# Enriches each trace, in place, with per-position attributes (event_i, resource_i, month_i)
# up to a given prefix length, plus the weekday of the first (and, for the full length, last) event.
def complex_index_encoding(log, pref_length=5):
    """Write prefix features into ``trace.attributes`` of every trace and return the same log.

    Parameters
    ----------
    log
        Iterable of traces; each trace is iterable over events and exposes a
        mutable ``attributes`` mapping. Events are indexed with
        ``'time:timestamp'``, ``'concept:name'`` and ``'org:resource'``.
    pref_length : int
        Number of leading events to encode. Values above the longest trace are
        clamped to the longest trace length (a message is printed).

    Returns
    -------
    The ``log`` object that was passed in; it is modified in place, so
    attributes written by earlier calls remain on the traces.
    """
    # Longest trace length in the log (events are counted by iterating).
    max_ev=0
    for trace in log:
        i=0
        for event in trace:
            i+=1
        if i>max_ev:
            max_ev=i
    
    if pref_length > max_ev:
        print('The prefix length is larger than the maximum trace length; Maximum trace length will be used.')
        pref_length = max_ev

    #weekdays
    # Indexed by datetime.weekday(), i.e. 0 = Monday ... 6 = Sunday.
    weekDaysMapping = ("Monday", "Tuesday",
                    "Wednesday", "Thursday",
                    "Friday", "Saturday",
                    "Sunday")

    # Weekday of the first event of every trace (the loop stops after one event).
    for trace in log:
        for event in trace:
            trace.attributes['weekday_start']=weekDaysMapping[event['time:timestamp'].weekday()]
            break
    # Only when the whole trace is encoded: the attribute is overwritten for
    # every event, so it ends up holding the weekday of the last event.
    if pref_length == max_ev:
        for trace in log:
            for event in trace:
                trace.attributes['weekday_end']=weekDaysMapping[event['time:timestamp'].weekday()]
    
    
    # Write event_i / resource_i / month_i for the first pref_length events and
    # remember each trace's event count in no_evs (keyed by trace position).
    j=0
    no_evs={}
    for trace in log:
        i=0
        for event in trace:
            i+=1
            if i==1:
                # Only referenced by the commented-out elapsed_time line below; currently unused.
                st_time=event['time:timestamp'].day+event['time:timestamp'].hour/24+event['time:timestamp'].minute/(24*60)
            if i<= pref_length:
                trace.attributes['event_'+str(i)]=event['concept:name']
                trace.attributes['resource_'+str(i)]=str(event['org:resource'])
                trace.attributes['month_'+str(i)]=(str(event['time:timestamp'].month)+'_'+str(event['time:timestamp'].year))
                #trace.attributes['elapsed_time']=event['time:timestamp'].day+event['time:timestamp'].hour/24+event['time:timestamp'].minute/(24*60)-st_time
        no_evs[j]=i
        j+=1

    # Pad event_k / resource_k with NaN for positions beyond a trace's length.
    # NOTE(review): j is reset to 0 here and never incremented inside the loop,
    # so no_evs[0] (the FIRST trace's length) decides the padding for EVERY
    # trace: if the first trace is shorter than max_ev, positions
    # no_evs[0]+1..max_ev are set to NaN on all traces, overwriting the
    # event_k/resource_k values written above; if the first trace is the
    # longest, nothing is padded at all. This looks unintended, but the column
    # layout produced by downstream encoding depends on it - confirm the
    # intended behaviour and adjust the consumers together with any fix.
    # month_k is not padded. Since pref_length <= max_ev at this point,
    # max(max_ev, pref_length) equals max_ev.
    j=0
    for trace in log:
        if no_evs[j]<max_ev:
            fill=no_evs[j]+1
            for k in range(fill,max(max_ev,pref_length)+1):
                trace.attributes['event_'+str(k)]=np.nan
                trace.attributes['resource_'+str(k)]=np.nan

    return log

In [None]:
import caffeine

In [None]:
##########
"""Settings"""
##########
# set the input and output path according to the files you want to select
# Both paths are appended verbatim to os.getcwd() (plain string concatenation
# in ask_for_path / output_petri_net), so they must start AND end with "/".
REL_INPUT_PATH = "/../BPIC12/" # here lie the event logs (.csv), the to-be model (.bpmn) and the already aligned traces (.pkl)
REL_OUTPUT_PATH = "/../BPIC12/"
# Image format used by output_petri_net, both as visualizer parameter and as file extension.
OUTPUT_FORMAT = "png"

In [None]:
# generate the log from the input path
# The second argument is the position of the file in the os.listdir() result of
# the input folder, so it depends on the folder contents of the machine.
# `file` is a notebook-level global that later cells overwrite.
file= ask_for_path(REL_INPUT_PATH,9) # adjust to your path
log=transform_to_log(file)
# Loaded a second time so that ref_log is an independent object:
# complex_index_encoding writes attributes into the log it is given.
ref_log=transform_to_log(file)


In [None]:
# Load the to-be BPMN model (picked by its position in the input folder listing),
# convert it to a Petri net with initial/final marking and display it.
# Note that this rebinds the notebook-level global `file`.
file= ask_for_path(REL_INPUT_PATH,0)# adjust to your path
bpmn_graph = pm4py.read_bpmn(file)
#pm4py.write_bpmn(bpmn_graph, "ru.bpmn", enable_layout=True)
net, initial_marking, final_marking = bpmn_converter.apply(bpmn_graph)
#net, initial_marking, final_marking=pm4py.read_pnml(file)
# pm4py.visualization.petri_net.visualizer(net, initial_marking, final_marking)
# output_petri_net(net, initial_marking, final_marking,'Basis_PN', 'test')
pm4py.view_petri_net(net, initial_marking, final_marking)

In [None]:
def generate_alignments_pkl(log, net, initial_marking, final_marking, output_file='aligned_traces_binet_12A.pkl'):
    """Align ``log`` against the Petri net, pickle the alignments and list the deviations.

    Parameters
    ----------
    log
        Event log passed to ``pm4py.conformance_diagnostics_alignments``.
    net, initial_marking, final_marking
        Petri net and markings of the to-be model.
    output_file : str
        Path of the pickle file the alignments are written to. Defaults to the
        file name that was previously hard-coded.

    Returns
    -------
    list of str
        ``str(move)`` of every distinct alignment move that is neither a silent
        move (second element ``None``) nor a synchronous move (both elements
        equal), in order of first appearance. The alignments themselves are
        only written to ``output_file``, not returned.
    """
    aligned_traces = pm4py.conformance_diagnostics_alignments(log, net, initial_marking, final_marking)

    dev = []
    seen = set()  # constant-time duplicate check; `dev` keeps first-appearance order
    for i, _ in enumerate(log):
        for move in aligned_traces[i]['alignment']:
            if move[1] is None or move[0] == move[1]:
                continue  # silent or synchronous move: not a deviation
            key = str(move)
            if key not in seen:
                seen.add(key)
                dev.append(key)

    # Context manager guarantees the handle is closed even if pickling fails.
    with open(output_file, 'wb') as f:
        pickle.dump(aligned_traces, f)
    return dev

In [None]:
#dev, aligned_traces=generate_alignments_pkl(log, net, initial_marking, final_marking)
#print(len(dev))
#dev

In [None]:
# Load previously computed alignments (see generate_alignments_pkl).
# This rebinds the global `file` to the pickle path; IDP_separate_CIBE reads
# `file` to derive the data-set name, so this cell must run before it.
# SECURITY: pickle.load executes arbitrary code embedded in the file - only
# load pickle files you created yourself.
file= ask_for_path(REL_INPUT_PATH,13)# adjust to your path
with open(file, 'rb') as f:
    aligned_traces=pickle.load(f)

In [None]:
## train data
class TrainData(Dataset):
    """Dataset that pairs each feature row with its label.

    ``X_data`` and ``y_data`` are stored as given and indexed in parallel; the
    length of the dataset is the length of ``X_data``.
    """

    def __init__(self, X_data, y_data):
        self.X_data, self.y_data = X_data, y_data

    def __len__ (self):
        return len(self.X_data)

    def __getitem__(self, index):
        features = self.X_data[index]
        target = self.y_data[index]
        return features, target

## test data
class TestData(Dataset):
    """Dataset over feature rows only (no labels), used for inference."""

    def __init__(self, X_data):
        self.X_data = X_data

    def __len__ (self):
        return len(self.X_data)

    def __getitem__(self, index):
        row = self.X_data[index]
        return row

In [None]:
# Feed-forward network with two hidden layers that outputs two logits.
class BinaryClassificationIndiv(nn.Module):
    """Feed-forward classifier: ``no_columns`` inputs -> 256 -> 256 -> 2 logits.

    Each hidden layer is Linear -> LeakyReLU -> LayerNorm (the attributes are
    called ``batchnorm*`` but hold ``nn.LayerNorm``). Dropout (p=0.1) is applied
    once, right before the output layer. ``forward`` returns raw logits; the
    ``Softmax`` attribute is created but not used in ``forward``.
    """

    def __init__(self, no_columns):
        super().__init__()
        hidden_width = 256
        # The number of input features is given by the caller via `no_columns`.
        self.layer_1 = nn.Linear(no_columns, hidden_width)
        self.activation1 = nn.LeakyReLU()
        self.layer_2 = nn.Linear(hidden_width, hidden_width)
        self.activation2 = nn.LeakyReLU()
        self.layer_out = nn.Linear(hidden_width, 2)

        self.dropout = nn.Dropout(p=0.1)
        self.batchnorm1 = nn.LayerNorm(hidden_width)
        self.batchnorm2 = nn.LayerNorm(hidden_width)
        self.Softmax = nn.Softmax()

    def forward(self, inputs):
        first_hidden = self.batchnorm1(self.activation1(self.layer_1(inputs)))
        second_hidden = self.batchnorm2(self.activation2(self.layer_2(first_hidden)))
        return self.layer_out(self.dropout(second_hidden))

In [None]:
class BPDP_LSTM(nn.Module):
    """Three embedding+LSTM branches (events, resources, months) plus a linear
    branch for trace attributes, concatenated into a 2-logit classifier.

    Each branch yields 32 features; the 4 x 32 = 128 features are layer-normalised
    and mapped to two logits. Only the last time step of each LSTM output is used.

    Note: the event branch applies its linear layer before selecting the last
    step and the activation, whereas the resource and month branches select the
    last step, apply the activation and then the linear layer. This asymmetry is
    reproduced as found. ``self.dropout`` is created but not applied in
    ``forward``. The LSTMs are built with ``dropout=0.1`` and ``num_layers=1``;
    per the PyTorch LSTM documentation dropout is only applied between stacked
    layers, so it has no effect here.
    """

    def __init__(self, vocab_events, vocab_resources, no_TA, vocab_month):
        super().__init__()
        self.embedding_e = nn.Embedding(vocab_events, 16) # here: set to 8 / 16
        self.activation1 = nn.LeakyReLU()
        self.lstm_e = nn.LSTM(input_size=16, hidden_size=64, num_layers=1, batch_first=True, dropout=0.1)
        self.linear_e = nn.Linear(64, 32)

        self.embedding_r = nn.Embedding(vocab_resources, 16)
        self.lstm_r = nn.LSTM(input_size=16, hidden_size=64, num_layers=1, batch_first=True, dropout=0.1)
        self.linear_r = nn.Linear(64, 32)

        self.embedding_m = nn.Embedding(vocab_month, 16)
        self.lstm_m = nn.LSTM(input_size=16, hidden_size=64, num_layers=1, batch_first=True, dropout=0.1)
        self.linear_m = nn.Linear(64, 32)

        self.linear_ta = nn.Linear(no_TA, 32)
        self.dropout = nn.Dropout(p=0.1)
        self.batchnorm1 = nn.LayerNorm(128)
        self.linear = nn.Linear(128, 2)

    def forward(self, evs, rs,tas, ms):
        # Event branch: linear on every step, then last step, then activation.
        event_seq, _ = self.lstm_e(self.embedding_e(evs))
        event_feat = self.activation1(self.linear_e(event_seq)[:, -1, :])

        # Resource branch: last step, activation, then linear.
        resource_seq, _ = self.lstm_r(self.embedding_r(rs))
        resource_feat = self.linear_r(self.activation1(resource_seq[:, -1, :]))

        # Month branch: same order as the resource branch.
        month_seq, _ = self.lstm_m(self.embedding_m(ms))
        month_feat = self.linear_m(self.activation1(month_seq[:, -1, :]))

        trace_feat = self.linear_ta(tas)

        merged = torch.cat((event_feat, resource_feat, month_feat, trace_feat), dim=1)
        return self.linear(self.batchnorm1(merged))

In [None]:
class LargerBinaryClassificationIndiv(nn.Module):
    """Feed-forward classifier: ``no_columns`` inputs -> 512 -> 256 -> 256 -> 2 logits.

    The first two hidden layers are Linear -> LeakyReLU -> LayerNorm (attributes
    named ``batchnorm*`` hold ``nn.LayerNorm``); the third hidden layer has no
    normalisation and is followed by dropout (p=0.1) and the output layer.
    ``forward`` returns raw logits; the ``Softmax`` attribute is not used there.
    """

    def __init__(self, no_columns):
        super().__init__()
        wide, narrow = 512, 256
        self.layer_1 = nn.Linear(no_columns, wide)
        self.activation1 = nn.LeakyReLU()
        self.layer_2 = nn.Linear(wide, narrow)
        self.activation2 = nn.LeakyReLU()
        self.layer_3 = nn.Linear(narrow, narrow)
        self.activation3 = nn.LeakyReLU()
        self.layer_out = nn.Linear(narrow, 2)

        self.dropout = nn.Dropout(p=0.1)
        self.batchnorm1 = nn.LayerNorm(wide)
        self.batchnorm2 = nn.LayerNorm(narrow)
        self.Softmax = nn.Softmax()

    def forward(self, inputs):
        first_hidden = self.batchnorm1(self.activation1(self.layer_1(inputs)))
        second_hidden = self.batchnorm2(self.activation2(self.layer_2(first_hidden)))
        third_hidden = self.activation3(self.layer_3(second_hidden))
        return self.layer_out(self.dropout(third_hidden))

In [None]:
class EarlyStopping():
    """Stop training when the validation loss has not improved for ``patience`` calls.

    Parameters
    ----------
    patience : int
        Number of consecutive non-improving calls after which ``__call__``
        returns ``True``.
    min_delta : float
        The loss has to drop by more than this amount to count as improvement.
    restore_best_weights : bool
        If true, the weights of the best model seen so far are loaded back into
        ``model`` when stopping.

    Attributes
    ----------
    best_model : deep copy of the model taken on the first call; afterwards its
        weights are refreshed on each improvement.
    best_loss : lowest validation loss seen so far.
    counter : number of consecutive non-improving calls.
    status : human-readable progress string, e.g. ``"3/10"`` or ``"Stopped on 10"``.
    """

    def __init__(self, patience=10, min_delta=0, restore_best_weights=True):
        self.patience = patience
        self.min_delta = min_delta
        self.restore_best_weights = restore_best_weights
        self.best_model = None
        self.best_loss = None
        self.counter = 0
        self.status = ""

    def __call__(self, model, val_loss):
        """Record ``val_loss`` for ``model``; return ``True`` when training should stop."""
        if self.best_loss is None:
            # First call: this model is the best by definition.
            self.best_loss = val_loss
            self.best_model = copy.deepcopy(model)
        elif self.best_loss - val_loss > self.min_delta:
            self.best_loss = val_loss
            self.counter = 0
            self.best_model.load_state_dict(model.state_dict())
        else:
            # Everything that is not an improvement counts against patience.
            # This includes a NaN loss: any comparison with NaN is False, so
            # the previous explicit `<=` test never matched it and a diverged
            # run could never be stopped.
            self.counter += 1
            if self.counter >= self.patience:
                self.status = f"Stopped on {self.counter}"
                if self.restore_best_weights:
                    model.load_state_dict(self.best_model.state_dict())
                return True
        self.status = f"{self.counter}/{self.patience}"
        return False

In [None]:
# just for printing the accuracy during training - only informational
def binary_acc(y_pred, y_test):
    """Return a rounded agreement percentage between predictions and one-hot labels.

    Parameters
    ----------
    y_pred : torch.Tensor
        Raw logits; softmax is taken over the last dimension.
    y_test : torch.Tensor
        Targets with the same shape as ``y_pred``.

    Returns
    -------
    torch.Tensor
        ``round(100 * matching_elements / y_test.shape[0])``. Matches are
        counted per element, not per row, so a fully correct batch with two
        classes yields 200; the training loop divides by the number of classes.
    """
    # Explicit `dim`: the implicit choice is deprecated and warns on every call.
    # For the (batch, classes) logits used in this notebook the last dimension
    # is the one the implicit rule picked.
    y_pred_tag = torch.round(torch.nn.functional.softmax(y_pred, dim=-1))

    correct_results_sum = (y_pred_tag == y_test).sum().float()
    acc = correct_results_sum / y_test.shape[0]
    return torch.round(acc * 100)

In [None]:
def IDP_separate_CIBE(log, ref_log, aligned_traces, split=1/3, u_sample=True, early_stop=True,explained=False):
    """Train and evaluate one feed-forward classifier per deviation type on prefix encodings.

    For every distinct non-synchronous, non-silent alignment move ("deviation")
    the function builds, for each prefix length, a label "this deviation still
    lies ahead", encodes every prefix with ``complex_index_encoding`` plus
    one-hot encoding, trains a ``BinaryClassificationIndiv`` model and writes
    the evaluation to an Excel workbook.

    Parameters
    ----------
    log
        Event log whose traces are encoded prefix by prefix (modified in place
        by ``complex_index_encoding``).
    ref_log
        Independent copy of the same log; encoded once at full length to obtain
        the reference feature columns.
    aligned_traces
        Alignment result, indexable by trace position; each entry has an
        ``'alignment'`` list of ``(log_move, model_move)`` tuples.
    split : float
        Fraction of traces held out as test set.
    u_sample : bool
        If true, the training traces are undersampled per deviation with
        ``OneSidedSelection``.
    early_stop : bool
        If true, the model is trained (up to 300 epochs with early stopping).
        If false, NO training takes place and the freshly initialised model is
        evaluated (behaviour kept from the original implementation).
    explained : bool
        If true, SHAP summary plots are saved per deviation.

    Returns
    -------
    None. Writes ``<cwd>/BPDP_Classifier/<name>_BPDP_CIBE_classification_testcount.xlsx``
    (sheets Distribution, Metrics, Earliness, Position) and, optionally, PNG
    files under ``<cwd>/BPDP_Classifier/ShapValues/``.

    Notes
    -----
    * Reads the notebook-level global ``file`` to derive the data-set name used
      for data-set specific column handling and for output file names.
    * Silences all warnings process-wide via ``warnings.simplefilter('ignore')``.
    * Many cells are written with chained indexing (``df[col][row] = value``),
      kept from the original. This relies on pandas returning a view and does
      not work under pandas copy-on-write; port to ``df.loc[row, col]`` before
      upgrading pandas.
    * The feature scaler is fitted on the training split only and then applied
      to validation and test data, so all splits share one scale.
    """
    # `file` is the global set by the cell that loads the alignment pickle.
    xt,z =os.path.split(file)
    import warnings
    warnings.simplefilter('ignore')
    #### get information whether deviation happened after prefix length in DF
    i=0
    dev=[] # stores all deviations that happened
    for trace in log:
        for move in aligned_traces[i]['alignment']:
            if move[1] == None or move[0]==move[1]:
                continue # we do not care for synchronous or silent moves
            if not str(move) in dev:
                dev.append(str(move))
        i+=1

    y_cum_test={} # dict that stores label for each prefix and deviation combinations; keys are prefix length, entries are Data Frames with index = trace and columns = deviation
    dev_df=pd.DataFrame(data=0,columns=dev, index=range(len(log))) # Data Frame that stores the information whether a deviation happened for each trace on trace level
    event_order={} # dict with event sequences for each trace
    event_count={} # dict with trace length for each trace
    max_ev=0 # will be maximum trace length
    k=0
    for trace in log:
        event_order[k]=[]
        i=0
        for event in trace:
            i+=1
            event_order[k].append(event['concept:name'])
        if i>max_ev:
            max_ev=i
        event_count[k]=len(event_order[k])
        k+=1
    i=0
    for trace in log:
        for move in aligned_traces[i]['alignment']:
            if move[1] == None or move[0]==move[1]:
                continue
            dev_df[str(move)][i]=1
        i+=1
    for ev in range(1,max_ev+1):
        y_cum_test[ev]=dev_df.copy() # initialize each prefix length with all traces and information whether deviation happened
    for ev in range(1,max_ev+1):
        drop_idx=[]
        for trace_idx in range(len(log)):
            if event_count[trace_idx]< ev:
                drop_idx.append(trace_idx) # drop all trace labels that do not go until prefix length
        y_cum_test[ev]=y_cum_test[ev].drop(drop_idx)
    i=0
    for trace in log:
        no_moves=len(aligned_traces[i]['alignment'])
        j=no_moves-1 # iterator over moves in alignment, starting at the end
        m=len(event_order[i]) # iterator over event sequence, starting at the end
        while j >=0:
            if aligned_traces[i]['alignment'][j][1] == None: # if silent move, just go one move further to the beginning in the alignment
                j-=1
            elif aligned_traces[i]['alignment'][j][0]==aligned_traces[i]['alignment'][j][1]:
                if event_order[i][m-1]==aligned_traces[i]['alignment'][j][0]: # if synchronous move, just go one move further to the beginning in the alignment and one event further to the beginning in the event sequence
                    j-=1
                    m-=1
            elif event_order[i][m-1]==aligned_traces[i]['alignment'][j][0]: # log move detected
                for q in range(m,max_ev+1):
                    y_cum_test[q][str(aligned_traces[i]['alignment'][j])][i]=0 # set all prefixes from the current m to the maximum prefix length in this trace to 0 because deviation happened here but not afterwards
                j-=1
                m-=1
            elif m==max_ev:
                j-=1
            else: # model move detected
                for q in range(m+1,max_ev+1):
                    y_cum_test[q][str(aligned_traces[i]['alignment'][j])][i]=0 # set all prefixes after the current m to the maximum prefix length in this trace to 0 because deviation happened between m and m+1 but not afterwards
                j-=1
        i+=1
    ### y_cum_test holds information whether deviation happened after prefix length


    ## ref_log will have all attributes that will be the columns for X_test and X_train
    ref_log = complex_index_encoding(ref_log, 4000) # prepare a log with the maximum length of the feature vector from CIBE to know to pad other feature vectors
    ref_dataframe1 = pm4py.convert_to_dataframe(ref_log)
    ref_dataframe=ref_dataframe1.drop_duplicates(subset=['case:concept:name'])
    ref_dataframe=ref_dataframe.filter(like='case:', axis=1)
    ref_dataframe=ref_dataframe.drop('case:concept:name', axis=1)
    ref_dataframe.columns = ref_dataframe.columns.str.replace('case:', '')
    ref_dataframe=ref_dataframe.reset_index()
    ref_raw_dat=ref_dataframe.drop('index', axis=1)
    ## dataset-specific preparation (i.e., redundant attributes, conversion to numeric)
    if z=='aligned_traces_12A.pkl' or z=='aligned_traces_12O.pkl' or z=='aligned_traces_12AO.pkl':
        ref_raw_dat['AMOUNT_REQ']= pd.to_numeric(ref_raw_dat['AMOUNT_REQ'])
        ref_clean_dat=ref_raw_dat.drop('REG_DATE', axis=1)
    elif z=='aligned_traces_20int.pkl':
        ref_clean_dat=ref_raw_dat.drop(['Permit travel permit number','DeclarationNumber','travel permit number','id','Permit ID', 'Permit id'], axis=1)
    elif z=='aligned_traces_20dom.pkl':
        ref_clean_dat=ref_raw_dat.drop(['DeclarationNumber','id'], axis=1)
    elif z=='aligned_traces_20prep.pkl':
        ref_clean_dat=ref_raw_dat.drop(['RfpNumber','Rfp_id','Permit travel permit number','Permit id'], axis=1)
    elif z=='aligned_traces_20RfP.pkl':
        ref_clean_dat=ref_raw_dat.drop(['RfpNumber','Rfp_id'], axis=1)
    else:
        ref_clean_dat=ref_raw_dat.copy()
    ref_enc_dat=pd.get_dummies(ref_clean_dat)



    EPOCHS = 30
    BATCH_SIZE = 128
    LEARNING_RATE=0.0001

    X_cum={}
    metrics=pd.DataFrame(data=0, columns=dev, index=['Precision', 'Recall', 'Support', 'ROC_AUC','LenTrain', 'LenTrain_beforeUS_0', 'LenTrain_beforeUS_1', 'LenTrain_afterUS_0', 'LenTrain_afterUS_1'])

    # prepare X for all prefix lengths
    for prefix in range(1,max_ev+1):
        complex_index_encoding(log,prefix)
        dataframe1 = pm4py.convert_to_dataframe(log)
        dataframe=dataframe1.drop_duplicates(subset=['case:concept:name'])
        dataframe=dataframe.filter(like='case:', axis=1)
        dataframe=dataframe.drop('case:concept:name', axis=1)
        dataframe.columns = dataframe.columns.str.replace('case:', '')
        dataframe=dataframe.reset_index()
        raw_dat=dataframe.drop('index', axis=1)
        if z=='aligned_traces_12A.pkl' or z=='aligned_traces_12O.pkl' or z=='aligned_traces_12AO.pkl':
            raw_dat['AMOUNT_REQ']= pd.to_numeric(raw_dat['AMOUNT_REQ'])
            clean_dat=raw_dat.drop('REG_DATE', axis=1)
        elif z=='aligned_traces_20int.pkl':
            clean_dat=raw_dat.drop(['Permit travel permit number','DeclarationNumber','travel permit number','id','Permit ID', 'Permit id'], axis=1)
        elif z=='aligned_traces_20dom.pkl':
            clean_dat=raw_dat.drop(['DeclarationNumber','id'], axis=1)
        elif z=='aligned_traces_20prep.pkl':
            clean_dat=raw_dat.drop(['RfpNumber','Rfp_id','Permit travel permit number','Permit id'], axis=1)
        elif z=='aligned_traces_20RfP.pkl':
            clean_dat=raw_dat.drop(['RfpNumber','Rfp_id'], axis=1)
        else:
            clean_dat=raw_dat.copy()
        enc_dat=pd.get_dummies(clean_dat)
        # Bring every prefix into exactly the reference column layout: missing
        # columns are added as 0 (padding to the maximum length) and all
        # columns are put into the reference order. Merely appending the
        # missing columns leaves their position dependent on the prefix, and
        # the per-prefix arrays are stacked row-wise by position further down.
        # Columns unknown to the reference are dropped, so the width always
        # equals the model's input size.
        enc_dat=enc_dat.reindex(columns=ref_enc_dat.columns, fill_value=0)
        imp = SimpleImputer(missing_values=np.nan, strategy='constant',fill_value=0)
        enc_dat=pd.DataFrame(data=imp.fit_transform(enc_dat),columns=enc_dat.columns)

        X_cum[prefix]=enc_dat.copy()
        drop_idx=[]
        for trace_idx in range(len(log)):
            if event_count[trace_idx]< prefix:
                drop_idx.append(trace_idx)

        X_cum[prefix]=X_cum[prefix].drop(drop_idx)

    # class weights for the loss: index 0 = deviation, index 1 = no deviation
    positive_weights = {}
    negative_weights = {}
    for label in dev:
        positive_weights[label] = 16
        negative_weights[label] = 1



    path=(os.getcwd()+'/BPDP_Classifier') # output path
    os.makedirs(path, exist_ok=True) # the Excel writer does not create missing folders
    xt,z =os.path.split(file)

    writer = pd.ExcelWriter(path+'/'+z+'_BPDP_CIBE_classification_testcount.xlsx', engine="xlsxwriter")

    x_train_idx, x_test_idx, y_train_idx, y_test_idx = train_test_split(range(len(log)), range(len(log)), test_size=split, random_state=0)
    # Untouched copy of the training traces: x_train_idx is re-derived from it
    # for every deviation (it is narrowed by undersampling and by the
    # validation split inside the loop).
    full_train_idx = list(x_train_idx)

    for d in dev:
        metrics[str('NoDev'+d)]=0
        metrics[d]['LenTrain_beforeUS_1']=sum(dev_df[d][i] for i in x_train_idx)
        metrics[d]['LenTrain_beforeUS_0']=len(x_train_idx)-sum(dev_df[d][i] for i in x_train_idx)

    # true position of each deviation per test trace (0 = deviation does not occur)
    dev_position = pd.DataFrame(index=x_test_idx, columns=dev, data=0)
    for d in dev:
        for idx in x_test_idx:
            for i in range(1, event_count[idx]+1):
                if y_cum_test[i][d][idx]==1: dev_position[d][idx]=i+1
    dev_position_pred = pd.DataFrame(index=x_test_idx, columns=dev, data=0)
    earliness={}

    dev_distribution = pd.DataFrame(data=0, index=['Training','Test'], columns=dev)
    for d in dev:
        dev_distribution[d]['Training']=sum(dev_df[d][i] for i in x_train_idx)
        dev_distribution[d]['Test']=sum(dev_df[d][i] for i in x_test_idx)

    dev_distribution.to_excel(writer, sheet_name=('Distribution'))

    dev_trained=[]
    # Defined up front so the summary after the loop also works when no
    # deviation qualifies for training.
    to_be_checked_idx={}
    for d in dev:
        if dev_distribution[d]['Training'] ==0:
            metrics[d]='No Deviation in Training Set'
            continue
        elif dev_distribution[d]['Test'] ==0:
            metrics[d]='No Deviation in Test Set'
            continue
        else:
            dev_trained.append(d)


        # two-column label frame per prefix: [deviation ahead, no deviation ahead]
        Y_cum_dev={}
        for prefix in range(1,max_ev+1):
            Y_cum_dev[prefix]=pd.DataFrame(y_cum_test[prefix][d])
            Y_cum_dev[prefix]['NoDev']=0
            for i in Y_cum_dev[prefix].index.values.tolist():
                Y_cum_dev[prefix]['NoDev'][i]=1-y_cum_test[prefix][d][i]
            if prefix==1:
                print(Y_cum_dev[prefix].columns)

        if u_sample:
            imb_ref_enc_dat=ref_enc_dat.copy()
            imb_ref_enc_dat['ind']=0
            for i in range(len(imb_ref_enc_dat)):
                imb_ref_enc_dat['ind'][i]=i # carry the trace position through the resampling
            imb_traces=pd.DataFrame(data=0, columns=['Dev'], index = range(len(log)))
            for trace in range(len(log)):
                if dev_df[d][trace]>0:
                    imb_traces['Dev'][trace]=1


            imb_traces=imb_traces.drop(x_test_idx)
            imb_ref_enc_dat=imb_ref_enc_dat.drop(x_test_idx)
            imp = SimpleImputer(missing_values=np.nan, strategy='constant',fill_value=0)
            imb_ref_enc_dat=pd.DataFrame(data=imp.fit_transform(imb_ref_enc_dat),columns=imb_ref_enc_dat.columns)

            oss=OneSidedSelection(random_state=0,n_seeds_S=250,n_neighbors=7)

            X_resampled, y_resampled = oss.fit_resample(imb_ref_enc_dat, imb_traces)

            x_train_idx= list(X_resampled['ind'])
            y_train_idx= list(X_resampled['ind'])
            train_dev_count=imb_traces['Dev'].sum()
        else:
            # Without undersampling start from the full training split each
            # time; previously this path raised NameError on `imb_traces` and
            # shrank the training set with every deviation.
            x_train_idx=list(full_train_idx)
            y_train_idx=list(full_train_idx)
            train_dev_count=sum(dev_df[d][i] for i in full_train_idx)

        print('index length ', len(x_train_idx),len(x_test_idx),len(y_train_idx),len(y_test_idx))
        metrics[d]['LenTrain']=len(x_train_idx)
        metrics[d]['LenTrain_afterUS_1']=train_dev_count
        metrics[d]['LenTrain_afterUS_0']=len(x_train_idx)-train_dev_count
        # validation set for early stopping
        x_train_idx, x_val_idx, y_train_idx, y_val_idx = train_test_split(x_train_idx, x_train_idx, test_size=0.2, random_state=0)


        enumerated_trace_idx={}
        for prefix in range(1,max_ev+1):
            drop_idx=[]
            for trace_idx in range(len(log)):
                if event_count[trace_idx]< prefix:
                    drop_idx.append(trace_idx) # drop all trace encoding that do not go until prefix length

            x_te=X_cum[prefix].loc[[j for j in list(set(y_test_idx)-set(drop_idx))]].to_numpy().astype(float)
            x_tr=X_cum[prefix].loc[[j for j in list(set(y_train_idx)-set(drop_idx))]].to_numpy().astype(float)
            x_va=X_cum[prefix].loc[[j for j in list(set(y_val_idx)-set(drop_idx))]].to_numpy().astype(float)
            y_te=Y_cum_dev[prefix].loc[[j for j in list(set(y_test_idx)-set(drop_idx))]].to_numpy().astype(float)
            y_tr=Y_cum_dev[prefix].loc[[j for j in list(set(y_train_idx)-set(drop_idx))]].to_numpy().astype(float)
            y_va=Y_cum_dev[prefix].loc[[j for j in list(set(y_val_idx)-set(drop_idx))]].to_numpy().astype(float)
            enumerated_trace_idx[prefix]=list(set(y_test_idx)-set(drop_idx))
            print('subset length ',prefix, len(x_te),len(x_tr),len(y_te),len(y_tr))

            if prefix ==1:
                X_train = x_tr
                X_test = x_te
                y_train = y_tr
                y_test = y_te
                X_val = x_va
                y_val = y_va
            else:
                X_train = np.append( X_train, x_tr, axis=0)
                X_test = np.append( X_test, x_te, axis=0)
                y_train = np.append( y_train, y_tr, axis=0)
                y_test = np.append( y_test, y_te, axis=0)
                y_val = np.append( y_val, y_va, axis=0)
                X_val = np.append( X_val, x_va, axis=0)# combine all X data from all prefixes into one array
        print(d, len(X_train),len(y_train),len(X_val),len(y_val),len(X_test),len(y_test))


        print('split done')
        # Fit the scaler on the training data only and reuse it for validation
        # and test data. Fitting a separate scaler per split maps the same raw
        # value to different inputs depending on the split and uses test-set
        # statistics.
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_val = scaler.transform(X_val)
        X_test = scaler.transform(X_test)

        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        model = BinaryClassificationIndiv(no_columns=len(ref_enc_dat.loc[0]))
        model.to(device)
        # The class weights must live on the same device as the logits.
        weights = torch.FloatTensor(list([positive_weights[d], negative_weights[d]])).to(device)
        criterion = nn.CrossEntropyLoss(weight=weights)
        optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

        if early_stop:
            EPOCHS=300
            model.train()
            train_data = TrainData(torch.FloatTensor(X_train),
                                torch.FloatTensor(y_train))

            train_loader = DataLoader(dataset=train_data, batch_size=BATCH_SIZE, shuffle=False)

            # Validation tensors are created once and placed next to the model.
            X_val_t = torch.FloatTensor(X_val).to(device)
            y_val_t = torch.FloatTensor(y_val).to(device)

            es = EarlyStopping()
            done = False

            epoch = 0
            while epoch<EPOCHS and not done:
                epoch += 1
                steps = list(enumerate(train_loader))
                pbar = tqdm.tqdm(steps)
                model.train()
                epoch_acc = 0
                epoch_loss=0
                for i, (x_batch, y_batch) in pbar:
                    optimizer.zero_grad()
                    y_batch_pred = model(x_batch.to(device))

                    loss = criterion(y_batch_pred, y_batch.to(device))

                    # binary_acc counts matches per element; divide by the number of classes
                    acc = binary_acc(y_batch_pred, y_batch.to(device))/len(y_batch_pred[0])

                    loss.backward()
                    optimizer.step()
                    epoch_acc += acc.item()

                    loss, current = loss.item(), (i + 1)* len(x_batch)
                    epoch_loss+=loss
                    if i == len(steps)-1:
                        # End of epoch: validate without building an autograd
                        # graph and keep the loss as a plain float.
                        model.eval()
                        with torch.no_grad():
                            pred = model(X_val_t)
                            vloss = criterion(pred, y_val_t).item()
                        if es(model,vloss): done = True
                        pbar.set_description(f"Epoch: {epoch}, tloss: {epoch_loss/len(train_loader)}, Acc: {epoch_acc/len(train_loader):.3f}, vloss: {vloss:>7f}, EStop:[{es.status}]")
                    else:
                        pbar.set_description(f"Epoch: {epoch}, tloss {epoch_loss/len(train_loader):}, Acc: {epoch_acc/len(train_loader):.3f}")

        model.eval()
        test_data = TestData(torch.FloatTensor(X_test))
        test_loader = DataLoader(dataset=test_data, batch_size=1)

        y_pred_list = []

        with torch.no_grad():
            for X_batch in test_loader:
                X_batch = X_batch.to(device)
                # logits are (batch, classes): normalise over the class axis
                y_test_pred = torch.nn.functional.softmax(model(X_batch), dim=1)

                y_pred_tag = torch.round(y_test_pred)
                y_pred_list.append(y_pred_tag.cpu().numpy())

        y_pred_list = [a.squeeze().tolist() for a in y_pred_list]

        # CM[0] = confusion matrix of the deviation class, CM[1] = of the no-deviation class
        CM = sklearn.metrics.multilabel_confusion_matrix(y_test, y_pred_list)

        metrics[d]['Precision']=CM[0][1][1]/(CM[0][1][1]+CM[0][0][1])
        metrics[d]['Recall']=CM[0][1][1]/(CM[0][1][1]+CM[0][1][0])
        metrics[d]['Support']=CM[0][1][1]+CM[0][1][0]
        try:
            metrics[d]['ROC_AUC'] =  sklearn.metrics.roc_auc_score(y_test, y_pred_list, average='weighted')
        except Exception as er:
            metrics[d]['ROC_AUC'] = er
        metrics[str('NoDev'+d)]['Precision']=CM[1][1][1]/(CM[1][1][1]+CM[1][0][1])
        metrics[str('NoDev'+d)]['Recall']=CM[1][1][1]/(CM[1][1][1]+CM[1][1][0])
        metrics[str('NoDev'+d)]['Support']=CM[1][1][1]+CM[1][1][0]
        print(CM)

        # For every test trace: row positions of its prefixes inside the
        # stacked test arrays (prefix blocks are concatenated in order).
        to_be_checked_idx={}
        for idx in x_test_idx:
            cum_idx=0
            for prefix in range(1,event_count[idx]+1):
                if prefix==1:
                    to_be_checked_idx[idx]=[enumerated_trace_idx[1].index(idx)]
                else:
                    to_be_checked_idx[idx].append(enumerated_trace_idx[prefix].index(idx)+cum_idx)
                cum_idx+=len(enumerated_trace_idx[prefix])


        for prefix in range(1, max_ev+1):
            for idx in to_be_checked_idx.keys():
                if event_count[idx]>= prefix:
                    if prefix==1:
                        dev_position_pred[d][idx]=y_pred_list[to_be_checked_idx[idx][prefix-1]][0]
                    else:
                        if y_pred_list[to_be_checked_idx[idx][prefix-1]][0]==1 and y_pred_list[to_be_checked_idx[idx][prefix-2]][0]==0:
                            if dev_position[d][idx]<= prefix:
                                dev_position_pred[d][idx]=dev_position[d][idx]
                            else:
                                dev_position_pred[d][idx]=prefix



        # earliness: mean ratio of predicted to true deviation position
        earliness[d]=0
        tobe_devs=0
        for idx in to_be_checked_idx.keys():
            if dev_position[d][idx]==0 or dev_position_pred[d][idx]==0:
                continue
            tobe_devs+=1
            earliness[d]+=dev_position_pred[d][idx]/dev_position[d][idx]
        if not tobe_devs==0:
            earliness[d]=earliness[d]/tobe_devs


        if explained:
            import shap
            import matplotlib.pyplot as plt
            os.makedirs(path+'/ShapValues', exist_ok=True)
            np.random.seed(42)
            # background sample: at most 1000 rows (sampling without
            # replacement fails when fewer rows are available)
            n_background = min(1000, X_train.shape[0])
            e = shap.DeepExplainer(model, torch.FloatTensor(X_train[np.random.choice(X_train.shape[0], n_background, replace=False)]).to(device))

            # explain only true positives of the deviation class
            shap_idx=[]
            for j in range(len(y_pred_list)):
                if y_pred_list[j][0]==y_test[j][0]==1:
                    shap_idx.append(j)
            shap_values = e.shap_values(torch.FloatTensor(X_test[shap_idx]).to(device))
            fig=shap.summary_plot(shap_values[0], X_test[shap_idx], plot_type = 'dot', feature_names = enc_dat.columns, max_display=10, plot_size=(10,5), show=False)
            plt.savefig(path+'/ShapValues/Dev_'+z+'_'+d+'.png')
            plt.close()

            fig=shap.summary_plot(shap_values[1], X_test[shap_idx], plot_type = 'dot', feature_names = enc_dat.columns, max_display=10, plot_size=(10,5), show=False)
            plt.savefig(path+'/ShapValues/NoDev_'+z+'_'+d+'.png')
            plt.close()


        print(metrics)

    # average true deviation position over the test traces in which it occurs
    avg_dev_pos={}
    for d in dev:
        if dev_distribution[d]['Test'] ==0:
            metrics[d]='No Deviation in Test Set'
            continue
        if dev_distribution[d]['Training'] ==0:
            metrics[d]='No Deviation in Training Set'
            continue
        devs=0
        positions=0
        for idx in to_be_checked_idx.keys():
            if dev_position[d][idx]>0:
                devs+=1
                positions+=dev_position[d][idx]
        if devs ==0:
            continue
        avg_dev_pos[d]=positions/devs

    metrics.to_excel(writer, sheet_name=('Metrics'))
    df=pd.DataFrame(data=earliness, index=[0])
    df.to_excel(writer, sheet_name=('Earliness'))
    df=pd.DataFrame(data=avg_dev_pos, index=[0])
    df.to_excel(writer, sheet_name=('Position'))

    writer.close()

In [None]:
# Run the per-deviation feed-forward pipeline. The function reads the global
# `file`, so the cell that loads the alignment pickle must have been the last
# one to assign it. Results are written under <cwd>/BPDP_Classifier/.
IDP_separate_CIBE(log, ref_log, aligned_traces, split=1/3, u_sample=True, early_stop=True,explained=False)

In [None]:
def IDP_separate_LSTM_CIBE(log, ref_log, aligned_traces, split=1/3, u_sample=True, early_stop=True,relevance_ths = .5):
    xt, z = os.path.split(file)
    import warnings
    warnings.simplefilter('ignore')
    #### get information whether deviation happened after prefix length in DF
    i = 0
    dev = []  # stores all deviations that happened
    for trace in log:
        no_moves = len(aligned_traces[i]['alignment'])
        for j in range(0, len(aligned_traces[i]['alignment'])):
            if aligned_traces[i]['alignment'][j][1] == None or aligned_traces[i]['alignment'][j][0] == \
                    aligned_traces[i]['alignment'][j][1]:
                next  # we do not care for simultaneous or silent moves
            else:
                if not str(aligned_traces[i]['alignment'][j]) in dev:
                    dev.append(str(aligned_traces[i]['alignment'][j]))
        i += 1

    y_cum_test = {}  # dict that stores label for each prefix and deviation combinations; keys are prefix length, entries are Data Frames with index = trace and columns = deviation
    dev_df = pd.DataFrame(data=0, columns=dev, index=range(
        len(log)))  # Data Frame that stores the information whether a deviation happened for each trace on trace level
    event_order = {}  # dict with event sequences for each trace
    event_count = {}  # dict with trace length for each trace
    max_ev = 0  # will be maximum trace length
    k = 0
    for trace in log:
        event_order[k] = []
        i = 0
        for event in trace:
            i += 1
            event_order[k].append(event['concept:name'])
        if i > max_ev:
            max_ev = i
        event_count[k] = len(event_order[k])
        k += 1
    i = 0
    for trace in log:
        no_moves = len(aligned_traces[i]['alignment'])
        for j in range(0, len(aligned_traces[i]['alignment'])):
            if aligned_traces[i]['alignment'][j][1] == None or aligned_traces[i]['alignment'][j][0] == \
                    aligned_traces[i]['alignment'][j][1]:
                next
            else:
                dev_df[str(aligned_traces[i]['alignment'][j])][i] = 1
        i += 1
    for ev in range(1, max_ev + 1):
        y_cum_test[
            ev] = dev_df.copy()  # initialize each prefix length with all traces and information whether deviation happened
    for ev in range(1, max_ev + 1):
        drop_idx = []
        for trace_idx in range(len(log)):
            if event_count[trace_idx] < ev:
                drop_idx.append(trace_idx)  # drop all trace labels that do not go until prefix length
        y_cum_test[ev] = y_cum_test[ev].drop(drop_idx)
    i = 0
    for trace in log:
        no_moves = len(aligned_traces[i]['alignment'])
        j = no_moves - 1  # iterator over moves in alignment, starting at the end
        m = len(event_order[i])  # iterator over event sequence, starting at the end
        while j >= 0:
            if aligned_traces[i]['alignment'][j][
                1] == None:  # if silent move, just go one move further to the beginning in the alignment
                j -= 1
            elif aligned_traces[i]['alignment'][j][0] == aligned_traces[i]['alignment'][j][1]:
                if event_order[i][m - 1] == aligned_traces[i]['alignment'][j][
                    0]:  # if synchronous move, just go one move further to the beginning in the alignment and one event forther to the beginning in the event sequence
                    j -= 1
                    m -= 1
            elif event_order[i][m - 1] == aligned_traces[i]['alignment'][j][0]:  # log move detected
                for q in range(m, max_ev + 1):
                    y_cum_test[q][str(aligned_traces[i]['alignment'][j])][
                        i] = 0  # set all prefixes from the current m to the maximum prefix length in this trace to 0 because deviation happened here but not afterwards
                j -= 1
                m -= 1
            elif m == max_ev:
                j -= 1
            else:  # model move deteceted
                for q in range(m + 1, max_ev + 1):
                    y_cum_test[q][str(aligned_traces[i]['alignment'][j])][
                        i] = 0  # set all prefixes after the current m to the maximum prefix length in this trace to 0 because deviation happened between m and m+1 but not afterwards
                j -= 1
        i += 1
    ### y_cum_test holds information whether deviation happened after prefix length
    x_train_idx, x_test_idx, y_train_idx, y_test_idx = train_test_split(range(len(log)), range(len(log)), test_size=split,
                                                                        random_state=0)
    trainin_dev_df = dev_df.loc[x_train_idx]
    trainin_dev_df.corr()
    corrMatrix = trainin_dev_df.corr()

    corrMatrix.loc[:, :] = np.tril(corrMatrix, k=-1)  # borrowed from Karl D's answer

    already_in = set()
    max_combs_l = []
    for col in corrMatrix:
        perfect_corr = corrMatrix[col][corrMatrix[col] >= relevance_ths].index.tolist()
        if perfect_corr and col not in already_in:
            already_in.update(set(perfect_corr))
            perfect_corr.append(col)
            max_combs_l.append(perfect_corr)
    test_counts = {}
    for comb in max_combs_l:
        for y in range(len(comb)):
            test_counts[comb[y]] = dev_df.loc[x_test_idx].sum()[comb[y]]
        if any(dev_df.loc[x_test_idx].sum()[comb[y]] == 0 for y in range(len(comb))):
            max_combs_l.remove(comb)
            print(comb)

    max_combs = {}
    for comb in max_combs_l:
        max_combs[str(comb)] = comb

    y_cum_test_combs = {}
    for prefix in range(1, max_ev + 1):
        y_cum_test_combs[prefix] = y_cum_test[prefix].copy(deep=True)
        for comb in max_combs.keys():
            y_cum_test_combs[prefix][comb] = 0
            for i in list(y_cum_test_combs[prefix].index):
                if event_count[i] < prefix:
                    continue
                if all(y_cum_test_combs[prefix][j][i] == 1 for j in max_combs[comb]):
                    for j in max_combs[comb]:
                        y_cum_test_combs[prefix][j][i] = 0
                    y_cum_test_combs[prefix][comb][i] = 1
    trainin_dev_df.sum()

    y_cum_test_o_combs = {}
    for prefix in range(1, max_ev + 1):
        y_cum_test_o_combs[prefix] = y_cum_test_combs[prefix][list(max_combs.keys())]


    ## ref_log will have all attributes that will be the columns for X_test and X_train
    ref_log = complex_index_encoding(ref_log,
                                     4000)  # prepare a log with the maximum length of the feature vector from CIBE to know to pad other feature vectors
    ref_dataframe1 = pm4py.convert_to_dataframe(ref_log)
    ref_dataframe = ref_dataframe1.drop_duplicates(subset=['case:concept:name'])
    ref_dataframe = ref_dataframe.filter(like='case:', axis=1)
    ref_dataframe = ref_dataframe.drop('case:concept:name', axis=1)
    ref_dataframe.columns = ref_dataframe.columns.str.replace('case:', '')
    ref_dataframe = ref_dataframe.reset_index()
    ref_raw_dat = ref_dataframe.drop('index', axis=1)
    ## dataset-specific preparation (i.e., dropping redundant attributes, conversion to numeric)
    if z == 'aligned_traces_12A.pkl' or z == 'aligned_traces_12O.pkl' or z == 'aligned_traces_12AO.pkl':
        ref_raw_dat['AMOUNT_REQ'] = pd.to_numeric(ref_raw_dat['AMOUNT_REQ'])
        ref_clean_dat = ref_raw_dat.drop('REG_DATE', axis=1)
    elif z == 'aligned_traces_20int.pkl':
        ref_clean_dat = ref_raw_dat.drop(
            ['Permit travel permit number', 'DeclarationNumber', 'travel permit number', 'id', 'Permit ID', 'Permit id'],
            axis=1)
    elif z == 'aligned_traces_20dom.pkl':
        ref_clean_dat = ref_raw_dat.drop(['DeclarationNumber', 'id'], axis=1)
    elif z == 'aligned_traces_20prep.pkl':
        ref_clean_dat = ref_raw_dat.drop(['RfpNumber', 'Rfp_id', 'Permit travel permit number', 'Permit id'], axis=1)
    elif z == 'aligned_traces_20RfP.pkl':
        ref_clean_dat = ref_raw_dat.drop(['RfpNumber', 'Rfp_id'], axis=1)
    else:
        ref_clean_dat = ref_raw_dat.copy()

    ref_enc_dat = ref_clean_dat.copy()

    BATCH_SIZE = 128
    LEARNING_RATE = 0.00001

    X_cum = {}
    metrics = pd.DataFrame(data=0, columns=dev, index=['Precision', 'Recall', 'Support', 'ROC_AUC','Time'])

    # prepare X for all prefix lengths
    for prefix in range(1, max_ev + 1):
        complex_index_encoding(log, prefix)
        dataframe1 = pm4py.convert_to_dataframe(log)
        dataframe = dataframe1.drop_duplicates(subset=['case:concept:name'])
        dataframe = dataframe.filter(like='case:', axis=1)
        dataframe = dataframe.drop('case:concept:name', axis=1)
        dataframe.columns = dataframe.columns.str.replace('case:', '')
        dataframe = dataframe.reset_index()
        raw_dat = dataframe.drop('index', axis=1)
        if z == 'aligned_traces_12A.pkl' or z == 'aligned_traces_12O.pkl' or z == 'aligned_traces_12AO.pkl':
            raw_dat['AMOUNT_REQ'] = pd.to_numeric(raw_dat['AMOUNT_REQ'])
            clean_dat = raw_dat.drop('REG_DATE', axis=1)
        elif z == 'aligned_traces_20int.pkl':
            clean_dat = raw_dat.drop(
                ['Permit travel permit number', 'DeclarationNumber', 'travel permit number', 'id', 'Permit ID',
                 'Permit id'], axis=1)
        elif z == 'aligned_traces_20dom.pkl':
            clean_dat = raw_dat.drop(['DeclarationNumber', 'id'], axis=1)
        elif z == 'aligned_traces_20prep.pkl':
            clean_dat = raw_dat.drop(['RfpNumber', 'Rfp_id', 'Permit travel permit number', 'Permit id'], axis=1)
        elif z == 'aligned_traces_20RfP.pkl':
            clean_dat = raw_dat.drop(['RfpNumber', 'Rfp_id'], axis=1)
        else:
            clean_dat = raw_dat.copy()
        enc_dat = clean_dat.copy()
        for key in ref_enc_dat.columns:
            if not key in enc_dat.columns:
                enc_dat[key] = 'No'  # pad all prefixes to maximum lengths with 0
        imp = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value='No')
        enc_dat = pd.DataFrame(data=imp.fit_transform(enc_dat), columns=enc_dat.columns)

        X_cum[prefix] = enc_dat.copy()
        drop_idx = []
        for trace_idx in range(len(log)):
            if event_count[trace_idx] < prefix:
                drop_idx.append(trace_idx)

        X_cum[prefix] = X_cum[prefix].drop(drop_idx)

    for d in dev:
        metrics[str('NoDev' + d)] = 0

    path = (os.getcwd() + '/BPDP_LSTM')  # output path
    xt, z = os.path.split(file)

    writer = pd.ExcelWriter('BPDP_LSTM' + '/' + z + '_BPDP_LSTM_time_stopped.xlsx', engine="xlsxwriter")

    x_train_idx, x_test_idx, y_train_idx, y_test_idx = train_test_split(range(len(log)), range(len(log)), test_size=split,
                                                                        random_state=0)

    x_train_idx_c, x_test_idx_c, y_train_idx_c, y_test_idx_c = train_test_split(range(len(log)), range(len(log)),
                                                                                test_size=split,
                                                                                random_state=0)
    x_train_idx_c, x_val_idx_c, y_train_idx_c, y_val_idx_c = train_test_split(x_train_idx_c, x_train_idx_c, test_size=0.2,
                                                                              random_state=0)

    dev_position = pd.DataFrame(index=x_test_idx, columns=dev, data=0)
    for d in dev:
        for idx in x_test_idx:
            for i in range(1, event_count[idx] + 1):
                if y_cum_test[i][d][idx] == 1: dev_position[d][idx] = i + 1
    dev_position_pred = pd.DataFrame(index=x_test_idx, columns=dev, data=0)
    earliness = {}

    dev_distribution = pd.DataFrame(data=0, index=['Training', 'Test'], columns=dev)
    for d in dev:
        dev_distribution[d]['Training'] = sum(dev_df[d][i] for i in x_train_idx)
        dev_distribution[d]['Test'] = sum(dev_df[d][i] for i in x_test_idx)

    dev_distribution.to_excel(writer, sheet_name=('Distribution'))


    def flatten_comprehension(matrix):
        """Flatten one level of nesting.

        Walks over every row of ``matrix`` in order and collects the items of
        each row into a single new list, which is returned.
        """
        flattened = []
        for row in matrix:
            # extend() consumes the row item by item, preserving order
            flattened.extend(row)
        return flattened


    ref_enc_dat

    evs_c = []
    resource_c = []
    month_c = []
    trace_attr = []
    for ca in X_cum[1].columns:
        if ca.startswith('event'): evs_c.append(ca)
    for ca in X_cum[1].columns:
        if ca.startswith('resource'): resource_c.append(ca)
    for ca in X_cum[1].columns:
        if ca.startswith('month'): month_c.append(ca)
    for ca in X_cum[1].columns:
        if not (ca in evs_c or ca in resource_c or ca in month_c): trace_attr.append(ca)
    print(evs_c)
    print(resource_c)
    print(month_c)
    print(trace_attr)

    X_events = {}
    X_resource = {}
    X_month = {}
    X_tracea = {}
    for prefix in range(1, max_ev + 1):
        X_events[prefix] = X_cum[prefix][evs_c]
        X_resource[prefix] = X_cum[prefix][resource_c]
        X_month[prefix] = X_cum[prefix][month_c]
        X_tracea[prefix] = X_cum[prefix][trace_attr]

    cat_tas = []
    for cat in X_tracea[1].columns:
        if type(X_tracea[1][cat][0]) == str:
            cat_tas.append(cat)

    uniques_cats = {}
    for cat in cat_tas:
        uniques_cats[cat] = []
    for prefix in range(1, max_ev + 1):
        for cat in cat_tas:
            for reals in list(X_tracea[prefix][cat].unique()):
                if not reals in uniques_cats[cat]:
                    uniques_cats[cat].append(reals)

    for cat in cat_tas:
        for prefix in range(1, max_ev + 1):
            for j in list(X_tracea[prefix].index):
                X_tracea[prefix][cat][j] = uniques_cats[cat].index(X_tracea[prefix][cat][j])

    positive_weights = {}
    negative_weights = {}
    for label in dev:
        positive_weights[label] = 16
        negative_weights[label] = 1
    models_collect = {}
    dev_trained = []
    outputs_train = pd.DataFrame()
    outputs_test = pd.DataFrame()
    outputs_val = pd.DataFrame()

    for d in dev:
        time_start = time.clock()
        if dev_distribution[d]['Training'] == 0:
            metrics[d] = 'No Deviation in Training Set'
            continue
        elif dev_distribution[d]['Test'] == 0:
            metrics[d] = 'No Deviation in Test Set'
            continue

        Y_cum_dev = {}
        for prefix in range(1, max_ev + 1):
            Y_cum_dev[prefix] = pd.DataFrame(y_cum_test[prefix][d])
            Y_cum_dev[prefix]['NoDev'] = 0
            for i in Y_cum_dev[prefix].index.values.tolist():
                Y_cum_dev[prefix]['NoDev'][i] = 1 - y_cum_test[prefix][d][i]
            if prefix == 1:
                print(Y_cum_dev[prefix].columns)

        if u_sample:
            imb_ref_enc_dat = ref_enc_dat.copy()
            imb_ref_enc_dat = pd.get_dummies(imb_ref_enc_dat)
            imb_ref_enc_dat['ind'] = 0
            for i in range(len(imb_ref_enc_dat)):
                imb_ref_enc_dat['ind'][i] = i
            imb_traces = pd.DataFrame(data=0, columns=['Dev'], index=range(len(log)))
            for trace in range(len(log)):
                if dev_df[d][trace] > 0:
                    imb_traces['Dev'][trace] = 1

            imb_traces = imb_traces.drop(x_test_idx)
            imb_ref_enc_dat = imb_ref_enc_dat.drop(x_test_idx)
            imp = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=0)
            imb_ref_enc_dat = pd.DataFrame(data=imp.fit_transform(imb_ref_enc_dat), columns=imb_ref_enc_dat.columns)

            oss = OneSidedSelection(random_state=0, n_seeds_S=250, n_neighbors=7)

            X_resampled, y_resampled = oss.fit_resample(imb_ref_enc_dat, imb_traces)

            x_train_idx = list(X_resampled['ind'])
            y_train_idx = list(X_resampled['ind'])

        print('index length ', len(x_train_idx), len(x_test_idx), len(y_train_idx), len(y_test_idx))

        # validation set for early stopping
        x_train_idx, x_val_idx, y_train_idx, y_val_idx = train_test_split(x_train_idx, x_train_idx, test_size=0.2,
                                                                          random_state=0)

        enumerated_trace_idx = {}
        cum_trace_idxs = []
        pref_list = []
        pref_list_train_c = []
        pref_list_test_c = []
        pref_list_val_c = []
        for prefix in range(1, max_ev + 1):
            drop_idx = []
            for trace_idx in range(len(log)):
                if event_count[trace_idx] < prefix:
                    drop_idx.append(trace_idx)  # drop all trace encoding that do not go until prefix length

            x_te = X_events[prefix].loc[[j for j in list(set(y_test_idx) - set(drop_idx))]].to_numpy()
            x_tr = X_events[prefix].loc[[j for j in list(set(y_train_idx) - set(drop_idx))]].to_numpy()
            x_va = X_events[prefix].loc[[j for j in list(set(y_val_idx) - set(drop_idx))]].to_numpy()
            y_te = Y_cum_dev[prefix].loc[[j for j in list(set(y_test_idx) - set(drop_idx))]].to_numpy().astype(float)
            y_tr = Y_cum_dev[prefix].loc[[j for j in list(set(y_train_idx) - set(drop_idx))]].to_numpy().astype(float)
            y_va = Y_cum_dev[prefix].loc[[j for j in list(set(y_val_idx) - set(drop_idx))]].to_numpy().astype(float)
            x_te_c = X_events[prefix].loc[[j for j in list(set(y_test_idx_c) - set(drop_idx))]].to_numpy()
            x_tr_c = X_events[prefix].loc[[j for j in list(set(y_train_idx_c) - set(drop_idx))]].to_numpy()
            x_va_c = X_events[prefix].loc[[j for j in list(set(y_val_idx_c) - set(drop_idx))]].to_numpy()
            y_te_c = y_cum_test_o_combs[prefix].loc[[j for j in list(set(y_test_idx) - set(drop_idx))]].to_numpy().astype(
                float)
            y_tr_c = y_cum_test_o_combs[prefix].loc[
                [j for j in list(set(y_train_idx_c) - set(drop_idx))]].to_numpy().astype(float)
            y_va_c = y_cum_test_o_combs[prefix].loc[[j for j in list(set(y_val_idx_c) - set(drop_idx))]].to_numpy().astype(
                float)
            cum_trace_idxs.append(list(set(y_test_idx) - set(drop_idx)))
            pref_list.append([prefix] * len(list(set(y_test_idx) - set(drop_idx))))
            enumerated_trace_idx[prefix] = list(set(y_test_idx) - set(drop_idx))
            pref_list_train_c.append([prefix] * len(list(set(y_train_idx_c) - set(drop_idx))))
            pref_list_test_c.append([prefix] * len(list(set(y_test_idx) - set(drop_idx))))
            pref_list_val_c.append([prefix] * len(list(set(y_val_idx_c) - set(drop_idx))))
            print('subset length ', prefix, len(x_te), len(x_tr), len(y_te), len(y_tr))

            if prefix == 1:
                X_train_event = x_tr
                X_test_event = x_te
                y_train = y_tr
                y_test = y_te
                X_val_event = x_va
                y_val = y_va
                X_train_event_c = x_tr_c
                X_test_event_c = x_te_c
                X_val_event_c = x_va_c
                y_train_c = y_tr_c
                y_test_c = y_te_c
                y_val_c = y_va_c
            else:
                X_train_event = np.append(X_train_event, x_tr, axis=0)
                X_test_event = np.append(X_test_event, x_te, axis=0)
                y_train = np.append(y_train, y_tr, axis=0)
                y_test = np.append(y_test, y_te, axis=0)
                y_val = np.append(y_val, y_va, axis=0)
                X_val_event = np.append(X_val_event, x_va, axis=0)
                X_train_event_c = np.append(X_train_event_c, x_tr_c, axis=0)
                X_test_event_c = np.append(X_test_event_c, x_te_c, axis=0)
                X_val_event_c = np.append(X_val_event_c, x_va_c, axis=0)
                y_train_c = np.append(y_train_c, y_tr_c, axis=0)
                y_test_c = np.append(y_test_c, y_te_c, axis=0)
                y_val_c = np.append(y_val_c, y_va_c, axis=0)  # combine all X data from all prefixes into one array
        print(d, len(X_train_event), len(y_train), len(y_train_c), len(X_val_event), len(y_val), len(y_val_c),
              len(X_test_event), len(y_test), len(y_test_c))

        for prefix in range(1, max_ev + 1):
            drop_idx = []
            for trace_idx in range(len(log)):
                if event_count[trace_idx] < prefix:
                    drop_idx.append(trace_idx)  # drop all trace encoding that do not go until prefix length

            x_te = X_resource[prefix].loc[[j for j in list(set(y_test_idx) - set(drop_idx))]].to_numpy()
            x_tr = X_resource[prefix].loc[[j for j in list(set(y_train_idx) - set(drop_idx))]].to_numpy()
            x_va = X_resource[prefix].loc[[j for j in list(set(y_val_idx) - set(drop_idx))]].to_numpy()
            x_te_c = X_resource[prefix].loc[[j for j in list(set(y_test_idx_c) - set(drop_idx))]].to_numpy()
            x_tr_c = X_resource[prefix].loc[[j for j in list(set(y_train_idx_c) - set(drop_idx))]].to_numpy()
            x_va_c = X_resource[prefix].loc[[j for j in list(set(y_val_idx_c) - set(drop_idx))]].to_numpy()
            print('subset length ', prefix, len(x_te), len(x_tr), len(y_te), len(y_tr))

            if prefix == 1:
                X_train_resource = x_tr
                X_test_resource = x_te
                X_val_resource = x_va
                X_train_resource_c = x_tr_c
                X_test_resource_c = x_te_c
                X_val_resource_c = x_va_c
            else:
                X_train_resource = np.append(X_train_resource, x_tr, axis=0)
                X_test_resource = np.append(X_test_resource, x_te, axis=0)
                X_val_resource = np.append(X_val_resource, x_va, axis=0)
                X_train_resource_c = np.append(X_train_resource_c, x_tr_c, axis=0)
                X_test_resource_c = np.append(X_test_resource_c, x_te_c, axis=0)
                X_val_resource_c = np.append(X_val_resource_c, x_va_c,
                                             axis=0)  # combine all X data from all prefixes into one array
        print(d, len(X_train_resource), len(y_train), len(X_val_resource), len(y_val), len(X_test_resource), len(y_test))

        for prefix in range(1, max_ev + 1):
            drop_idx = []
            for trace_idx in range(len(log)):
                if event_count[trace_idx] < prefix:
                    drop_idx.append(trace_idx)  # drop all trace encoding that do not go until prefix length

            x_te = X_month[prefix].loc[[j for j in list(set(y_test_idx) - set(drop_idx))]].to_numpy()
            x_tr = X_month[prefix].loc[[j for j in list(set(y_train_idx) - set(drop_idx))]].to_numpy()
            x_va = X_month[prefix].loc[[j for j in list(set(y_val_idx) - set(drop_idx))]].to_numpy()
            x_te_c = X_month[prefix].loc[[j for j in list(set(y_test_idx_c) - set(drop_idx))]].to_numpy()
            x_tr_c = X_month[prefix].loc[[j for j in list(set(y_train_idx_c) - set(drop_idx))]].to_numpy()
            x_va_c = X_month[prefix].loc[[j for j in list(set(y_val_idx_c) - set(drop_idx))]].to_numpy()
            print('subset length ', prefix, len(x_te), len(x_tr), len(y_te), len(y_tr))

            if prefix == 1:
                X_train_m = x_tr
                X_test_m = x_te
                X_val_m = x_va
                X_train_m_c = x_tr_c
                X_test_m_c = x_te_c
                X_val_m_c = x_va_c
            else:
                X_train_m = np.append(X_train_m, x_tr, axis=0)
                X_test_m = np.append(X_test_m, x_te, axis=0)
                X_val_m = np.append(X_val_m, x_va, axis=0)
                X_train_m_c = np.append(X_train_m_c, x_tr_c, axis=0)
                X_test_m_c = np.append(X_test_m_c, x_te_c, axis=0)
                X_val_m_c = np.append(X_val_m_c, x_va_c, axis=0)  # combine all X data from all prefixes into one array
        print(d, len(X_train_m), len(y_train), len(X_val_m), len(y_val), len(X_test_m), len(y_test))

        print('split done')

        for prefix in range(1, max_ev + 1):
            drop_idx = []
            for trace_idx in range(len(log)):
                if event_count[trace_idx] < prefix:
                    drop_idx.append(trace_idx)  # drop all trace encoding that do not go until prefix length

            x_te = X_tracea[prefix].loc[[j for j in list(set(y_test_idx) - set(drop_idx))]].to_numpy()
            x_tr = X_tracea[prefix].loc[[j for j in list(set(y_train_idx) - set(drop_idx))]].to_numpy()
            x_va = X_tracea[prefix].loc[[j for j in list(set(y_val_idx) - set(drop_idx))]].to_numpy()
            x_te_c = X_tracea[prefix].loc[[j for j in list(set(y_test_idx_c) - set(drop_idx))]].to_numpy()
            x_tr_c = X_tracea[prefix].loc[[j for j in list(set(y_train_idx_c) - set(drop_idx))]].to_numpy()
            x_va_c = X_tracea[prefix].loc[[j for j in list(set(y_val_idx_c) - set(drop_idx))]].to_numpy()
            print('subset length ', prefix, len(x_te), len(x_tr), len(y_te), len(y_tr))

            if prefix == 1:
                X_train_TA = x_tr
                X_test_TA = x_te
                X_val_TA = x_va
                X_train_TA_c = x_tr_c
                X_test_TA_c = x_te_c
                X_val_TA_c = x_va_c
            else:
                X_train_TA = np.append(X_train_TA, x_tr, axis=0)
                X_test_TA = np.append(X_test_TA, x_te, axis=0)
                X_val_TA = np.append(X_val_TA, x_va, axis=0)
                X_train_TA_c = np.append(X_train_TA_c, x_tr_c, axis=0)
                X_test_TA_c = np.append(X_test_TA_c, x_te_c, axis=0)
                X_val_TA_c = np.append(X_val_TA_c, x_va_c, axis=0)  # combine all X data from all prefixes into one array
        print(d, len(X_train_TA), len(y_train), len(X_val_TA), len(y_val), len(X_test_TA), len(y_test))

        events_encoder = list(
            np.unique(np.append(np.append(X_train_event_c, X_test_event_c, axis=0), X_val_event_c, axis=0)))
        events_encoder.index('No')
        resource_encoder = list(
            np.unique(np.append(np.append(X_train_resource_c, X_test_resource_c, axis=0), X_val_resource_c, axis=0)))
        resource_encoder.index('No')
        cat_ecnoders = {}
        for cat in cat_tas:
            cat_ecnoders[cat] = list(np.unique(np.append(np.append(X_train_TA_c, X_test_TA_c, axis=0), X_val_TA_c, axis=0)))

        month_encoder = list(np.unique(np.append(np.append(X_train_m_c, X_test_m_c, axis=0), X_val_m_c, axis=0)))
        month_encoder
        for i in range(len(X_test_event)):
            for j in range(len(X_test_event[0])):
                X_test_event[i][j] = events_encoder.index(X_test_event[i][j])
            for j in range(len(X_test_resource[0])):
                X_test_resource[i][j] = resource_encoder.index(X_test_resource[i][j])
            for j in range(len(X_test_m[0])):
                X_test_m[i][j] = month_encoder.index(X_test_m[i][j])
        for i in range(len(X_train_event)):
            for j in range(len(X_train_event[0])):
                X_train_event[i][j] = events_encoder.index(X_train_event[i][j])
            for j in range(len(X_train_resource[0])):
                X_train_resource[i][j] = resource_encoder.index(X_train_resource[i][j])
            for j in range(len(X_train_m[0])):
                X_train_m[i][j] = month_encoder.index(X_train_m[i][j])
        for i in range(len(X_val_event)):
            for j in range(len(X_val_event[0])):
                X_val_event[i][j] = events_encoder.index(X_val_event[i][j])
            for j in range(len(X_val_resource[0])):
                X_val_resource[i][j] = resource_encoder.index(X_val_resource[i][j])
            for j in range(len(X_val_m[0])):
                X_val_m[i][j] = month_encoder.index(X_val_m[i][j])

        # for combs_output
        for i in range(len(X_test_event_c)):
            for j in range(len(X_test_event_c[0])):
                X_test_event_c[i][j] = events_encoder.index(X_test_event_c[i][j])
            for j in range(len(X_test_resource[0])):
                X_test_resource_c[i][j] = resource_encoder.index(X_test_resource_c[i][j])
            for j in range(len(X_test_m[0])):
                X_test_m_c[i][j] = month_encoder.index(X_test_m_c[i][j])
        for i in range(len(X_train_event_c)):
            for j in range(len(X_train_event_c[0])):
                X_train_event_c[i][j] = events_encoder.index(X_train_event_c[i][j])
            for j in range(len(X_train_resource[0])):
                X_train_resource_c[i][j] = resource_encoder.index(X_train_resource_c[i][j])
            for j in range(len(X_train_m[0])):
                X_train_m_c[i][j] = month_encoder.index(X_train_m_c[i][j])
        for i in range(len(X_val_event_c)):
            for j in range(len(X_val_event_c[0])):
                X_val_event_c[i][j] = events_encoder.index(X_val_event_c[i][j])
            for j in range(len(X_val_resource_c[0])):
                X_val_resource_c[i][j] = resource_encoder.index(X_val_resource_c[i][j])
            for j in range(len(X_val_m_c[0])):
                X_val_m_c[i][j] = month_encoder.index(X_val_m_c[i][j])
        scaler = StandardScaler()
        X_test_TA = scaler.fit_transform(X_test_TA)
        X_train_TA = scaler.fit_transform(X_train_TA)
        X_val_TA = scaler.fit_transform(X_val_TA)
        X_test_TA_c = scaler.fit_transform(X_test_TA_c)
        X_train_TA_c = scaler.fit_transform(X_train_TA_c)
        X_val_TA_c = scaler.fit_transform(X_val_TA_c)
        X_test_event = X_test_event.astype(int)
        X_train_event = X_train_event.astype(int)
        X_val_event = X_val_event.astype(int)
        X_test_resource = X_test_resource.astype(int)
        X_train_resource = X_train_resource.astype(int)
        X_val_resource = X_val_resource.astype(int)
        X_test_m = X_test_m.astype(int)
        X_train_m = X_train_m.astype(int)
        X_val_m = X_val_m.astype(int)
        #for combs again
        X_test_event_c = X_test_event_c.astype(int)
        X_train_event_c = X_train_event_c.astype(int)
        X_val_event_c = X_val_event_c.astype(int)
        X_test_resource_c = X_test_resource_c.astype(int)
        X_train_resource_c = X_train_resource_c.astype(int)
        X_val_resource_c = X_val_resource_c.astype(int)
        X_test_m_c = X_test_m_c.astype(int)
        X_train_m_c = X_train_m_c.astype(int)
        X_val_m_c = X_val_m_c.astype(int)

        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        model = BPDP_LSTM(vocab_events=len(events_encoder), vocab_resources=len(resource_encoder), no_TA=len(X_train_TA[0]),
                          vocab_month=len(month_encoder))
        model.to(device)
        weights = torch.FloatTensor(list([positive_weights[d], negative_weights[d]]))
        criterion = nn.CrossEntropyLoss(weight=weights)
        optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
        torch.autograd.set_detect_anomaly(True)
        if early_stop:
            EPOCHS = 300
            model.train()
            train_data_event = TrainData(torch.FloatTensor(X_train_event),
                                         torch.FloatTensor(y_train))
            train_data_resource = TestData(torch.FloatTensor(X_train_resource))
            train_data_TA = TestData(torch.FloatTensor(X_train_TA))
            train_data_m = TestData(torch.FloatTensor(X_train_m))
            train_loader_event = DataLoader(dataset=train_data_event, batch_size=BATCH_SIZE, shuffle=False)
            train_loader_resource = DataLoader(dataset=train_data_resource, batch_size=BATCH_SIZE, shuffle=False)
            train_loader_TA = DataLoader(dataset=train_data_TA, batch_size=BATCH_SIZE, shuffle=False)
            train_loader_m = DataLoader(dataset=train_data_m, batch_size=BATCH_SIZE, shuffle=False)

            es = EarlyStopping()
            done = False

            epoch = 0
            while epoch < EPOCHS and not done:
                epoch += 1
                steps = list(enumerate(train_loader_event))
                pbar = tqdm.tqdm(steps)
                steps_r = list((train_loader_resource))
                steps_ta = list((train_loader_TA))
                steps_m = list((train_loader_m))
                model.train()
                epoch_acc = 0
                epoch_loss = 0
                for i, (x_batch, y_batch) in pbar:
                    optimizer.zero_grad()
                    y_batch_pred = model(x_batch.to(torch.int64).to(device), steps_r[i].to(torch.int64).to(device),
                                         steps_ta[i].to(torch.float).to(device), steps_m[i].to(torch.int64).to(device))

                    loss = criterion(y_batch_pred, y_batch.to(device))

                    acc = binary_acc(y_batch_pred, y_batch.to(device)) / len(y_batch_pred[0])

                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
                    optimizer.step()
                    epoch_acc += acc.item()

                    loss, current = loss.item(), (i + 1) * len(x_batch)
                    epoch_loss += loss
                    if i == len(steps) - 1:
                        model.eval()
                        pred = model(torch.FloatTensor(X_val_event).to(torch.int64),
                                     torch.FloatTensor(X_val_resource).to(torch.int64),
                                     torch.FloatTensor(X_val_TA).to(torch.float),
                                     torch.FloatTensor(X_val_m).to(torch.int64))
                        vloss = criterion(pred, torch.FloatTensor(y_val))
                        if es(model, vloss): done = True
                        pbar.set_description(
                            f"Epoch: {epoch}, tloss: {epoch_loss / len(train_loader_event)}, Acc: {epoch_acc / len(train_loader_event):.3f}, vloss: {vloss:>7f}, EStop:[{es.status}]")
                    else:
                        pbar.set_description(
                            f"Epoch: {epoch}, tloss {epoch_loss / len(train_loader_event):}, Acc: {epoch_acc / len(train_loader_event):.3f}")

        y_batch_pred
        model.eval()
        test_data_event = TestData(torch.FloatTensor(X_test_event))
        test_data_resource = TestData(torch.FloatTensor(X_test_resource))
        test_data_TA = TestData(torch.FloatTensor(X_test_TA))
        test_data_m = TestData(torch.FloatTensor(X_test_m))
        test_loader_event = DataLoader(dataset=test_data_event, batch_size=1)
        test_loader_resource = DataLoader(dataset=test_data_resource, batch_size=1)
        test_loader_TA = DataLoader(dataset=test_data_TA, batch_size=1)
        test_loader_m = DataLoader(dataset=test_data_m, batch_size=1)
        iterations_r = iter(test_loader_resource)
        iterations_ta = iter(test_loader_TA)
        iterations_m = iter(test_loader_m)
        y_pred_list = []

        with torch.no_grad():
            for i, X_batch in enumerate(test_loader_event):
                X_batch = X_batch.to(device).to(torch.int64)
                y_test_pred = torch.nn.functional.softmax(
                    model(X_batch, next(iterations_r).to(torch.int64), next(iterations_ta).to(torch.float),
                          next(iterations_m).to(torch.int64)))
                y_pred_tag = torch.round(y_test_pred)
                y_pred_list.append(y_pred_tag.cpu().numpy())

        y_pred_list = [a.squeeze().tolist() for a in y_pred_list]

        CM = sklearn.metrics.multilabel_confusion_matrix(y_test, y_pred_list)

        time_elapsed = time_start - time.clock()
        metrics[d]['Time']=time_elapsed

        metrics[d]['Precision'] = CM[0][1][1] / (CM[0][1][1] + CM[0][0][1])
        metrics[d]['Recall'] = CM[0][1][1] / (CM[0][1][1] + CM[0][1][0])
        metrics[d]['Support'] = CM[0][1][1] + CM[0][1][0]
        try:
            metrics[d]['ROC_AUC'] = sklearn.metrics.roc_auc_score(y_test, y_pred_list, average='weighted')
        except Exception as er:
            metrics[d]['ROC_AUC'] = er
        metrics[str('NoDev' + d)]['Precision'] = CM[1][1][1] / (CM[1][1][1] + CM[1][0][1])
        metrics[str('NoDev' + d)]['Recall'] = CM[1][1][1] / (CM[1][1][1] + CM[1][1][0])
        metrics[str('NoDev' + d)]['Support'] = CM[1][1][1] + CM[1][1][0]
        print(CM)

        print(metrics)
        models_collect[d] = model

        X_test_event_c = X_test_event_c.astype(int)
        X_train_event_c = X_train_event_c.astype(int)
        X_val_event_c = X_val_event_c.astype(int)
        X_test_resource_c = X_test_resource_c.astype(int)
        X_train_resource_c = X_train_resource_c.astype(int)
        X_val_resource_c = X_val_resource_c.astype(int)
        X_test_m_c = X_test_m_c.astype(int)
        X_train_m_c = X_train_m_c.astype(int)
        X_val_m_c = X_val_m_c.astype(int)

        train_data_f_combs_event = TestData(torch.FloatTensor(X_train_event_c))
        train_loader_f_combs_event = DataLoader(dataset=train_data_f_combs_event, batch_size=len(X_train_event_c))
        train_data_f_combs_resource = TestData(torch.FloatTensor(X_train_resource_c))
        train_loader_f_combs_resource = DataLoader(dataset=train_data_f_combs_resource, batch_size=len(X_train_resource_c))
        train_data_f_combs_m = TestData(torch.FloatTensor(X_train_m_c))
        train_loader_f_combs_m = DataLoader(dataset=train_data_f_combs_m, batch_size=len(X_train_m_c))
        train_data_f_combs_TA = TestData(torch.FloatTensor(X_train_TA_c))
        train_loader_f_combs_TA = DataLoader(dataset=train_data_f_combs_TA, batch_size=len(X_train_TA_c))

        y_output_train = []
        with torch.no_grad():
            steps_r = list((train_loader_f_combs_resource))
            steps_ta = list((train_loader_f_combs_TA))
            steps_m = list((train_loader_f_combs_m))
            for i, X_batch in enumerate(train_loader_f_combs_event):
                y_test_pred = model(X_batch.to(torch.int64).to(device), steps_r[i].to(torch.int64).to(device),
                                    steps_ta[i].to(torch.float).to(device), steps_m[i].to(torch.int64).to(device))
                y_output_train.append(y_test_pred.numpy())

        test_data_f_combs_event = TestData(torch.FloatTensor(X_test_event_c))
        test_loader_f_combs_event = DataLoader(dataset=test_data_f_combs_event, batch_size=len(X_test_event_c))
        test_data_f_combs_resource = TestData(torch.FloatTensor(X_test_resource_c))
        test_loader_f_combs_resource = DataLoader(dataset=test_data_f_combs_resource, batch_size=len(X_test_resource_c))
        test_data_f_combs_m = TestData(torch.FloatTensor(X_test_m_c))
        test_loader_f_combs_m = DataLoader(dataset=test_data_f_combs_m, batch_size=len(X_test_m_c))
        test_data_f_combs_TA = TestData(torch.FloatTensor(X_test_TA_c))
        test_loader_f_combs_TA = DataLoader(dataset=test_data_f_combs_TA, batch_size=len(X_test_TA_c))

        y_output_test = []
        with torch.no_grad():
            steps_r = list((test_loader_f_combs_resource))
            steps_ta = list((test_loader_f_combs_TA))
            steps_m = list((test_loader_f_combs_m))
            for i, X_batch in enumerate(test_loader_f_combs_event):
                y_test_pred = model(X_batch.to(torch.int64).to(device), steps_r[i].to(torch.int64).to(device),
                                    steps_ta[i].to(torch.float).to(device), steps_m[i].to(torch.int64).to(device))
                y_output_test.append(y_test_pred.numpy())

        val_data_f_combs_event = TestData(torch.FloatTensor(X_val_event_c))
        val_loader_f_combs_event = DataLoader(dataset=val_data_f_combs_event, batch_size=len(X_val_event_c))
        val_data_f_combs_resource = TestData(torch.FloatTensor(X_val_resource_c))
        val_loader_f_combs_resource = DataLoader(dataset=val_data_f_combs_resource, batch_size=len(X_val_resource_c))
        val_data_f_combs_m = TestData(torch.FloatTensor(X_val_m_c))
        val_loader_f_combs_m = DataLoader(dataset=val_data_f_combs_m, batch_size=len(X_val_m_c))
        val_data_f_combs_TA = TestData(torch.FloatTensor(X_val_TA_c))
        val_loader_f_combs_TA = DataLoader(dataset=val_data_f_combs_TA, batch_size=len(X_val_TA_c))

        y_output_val = []
        with torch.no_grad():
            steps_r = list((val_loader_f_combs_resource))
            steps_ta = list((val_loader_f_combs_TA))
            steps_m = list((val_loader_f_combs_m))
            for i, X_batch in enumerate(val_loader_f_combs_event):
                y_test_pred = model(X_batch.to(torch.int64).to(device), steps_r[i].to(torch.int64).to(device),
                                    steps_ta[i].to(torch.float).to(device), steps_m[i].to(torch.int64).to(device))
                y_output_val.append(y_test_pred.numpy())

        outputs_train['NoDev' + str(d)] = y_output_train[0][:, 0]
        outputs_train['Dev' + str(d)] = y_output_train[0][:, 1]
        outputs_test['NoDev' + str(d)] = y_output_test[0][:, 0]
        outputs_test['Dev' + str(d)] = y_output_test[0][:, 1]
        outputs_val['NoDev' + str(d)] = y_output_val[0][:, 0]
        outputs_val['Dev' + str(d)] = y_output_val[0][:, 1]
        if d == dev[0]:
            outputs_train['prefix_length'] = flatten_comprehension(pref_list_train_c)
            outputs_test['prefix_length'] = flatten_comprehension(pref_list_test_c)
            outputs_val['prefix_length'] = flatten_comprehension(pref_list_val_c)


    metrics.to_excel(writer, sheet_name=('Metrics'))
    writer.close()

In [None]:
# Entry-point cell: calls IDP_separate_LSTM_CIBE with three positional arguments.
# `log`, `ref_log` and `aligned_traces` are not defined in this cell, so they must
# already exist in the kernel from earlier cells (Restart & Run All order matters).
IDP_separate_LSTM_CIBE(log, ref_log, aligned_traces)

In [None]:
def IDP_collective_LSTM_CIBE(log, ref_log, aligned_traces, u_sample = True,early_stop = True,explained = False,split = 1 / 3):
    xt, z = os.path.split(file)
    import warnings
    warnings.simplefilter('ignore')
    #### get information whether deviation happened after prefix length in DF
    i = 0
    dev = []  # stores all deviations that happened
    for trace in log:
        no_moves = len(aligned_traces[i]['alignment'])
        for j in range(0, len(aligned_traces[i]['alignment'])):
            if aligned_traces[i]['alignment'][j][1] == None or aligned_traces[i]['alignment'][j][0] == \
                    aligned_traces[i]['alignment'][j][1]:
                next  # we do not care for simultaneous or silent moves
            else:
                if not str(aligned_traces[i]['alignment'][j]) in dev:
                    dev.append(str(aligned_traces[i]['alignment'][j]))
        i += 1

    y_cum_test = {}  # dict that stores label for each prefix and deviation combinations; keys are prefix length, entries are Data Frames with index = trace and columns = deviation
    dev_df = pd.DataFrame(data=0, columns=dev, index=range(
        len(log)))  # Data Frame that stores the information whether a deviation happened for each trace on trace level
    event_order = {}  # dict with event sequences for each trace
    event_count = {}  # dict with trace length for each trace
    max_ev = 0  # will be maximum trace length
    k = 0
    for trace in log:
        event_order[k] = []
        i = 0
        for event in trace:
            i += 1
            event_order[k].append(event['concept:name'])
        if i > max_ev:
            max_ev = i
        event_count[k] = len(event_order[k])
        k += 1
    i = 0
    for trace in log:
        no_moves = len(aligned_traces[i]['alignment'])
        for j in range(0, len(aligned_traces[i]['alignment'])):
            if aligned_traces[i]['alignment'][j][1] == None or aligned_traces[i]['alignment'][j][0] == \
                    aligned_traces[i]['alignment'][j][1]:
                next
            else:
                dev_df[str(aligned_traces[i]['alignment'][j])][i] = 1
        i += 1
    for ev in range(1, max_ev + 1):
        y_cum_test[
            ev] = dev_df.copy()  # initialize each prefix length with all traces and information whether deviation happened
    for ev in range(1, max_ev + 1):
        drop_idx = []
        for trace_idx in range(len(log)):
            if event_count[trace_idx] < ev:
                drop_idx.append(trace_idx)  # drop all trace labels that do not go until prefix length
        y_cum_test[ev] = y_cum_test[ev].drop(drop_idx)
    i = 0
    for trace in log:
        no_moves = len(aligned_traces[i]['alignment'])
        j = no_moves - 1  # iterator over moves in alignment, starting at the end
        m = len(event_order[i])  # iterator over event sequence, starting at the end
        while j >= 0:
            if aligned_traces[i]['alignment'][j][
                1] == None:  # if silent move, just go one move further to the beginning in the alignment
                j -= 1
            elif aligned_traces[i]['alignment'][j][0] == aligned_traces[i]['alignment'][j][1]:
                if event_order[i][m - 1] == aligned_traces[i]['alignment'][j][
                    0]:  # if synchronous move, just go one move further to the beginning in the alignment and one event forther to the beginning in the event sequence
                    j -= 1
                    m -= 1
            elif event_order[i][m - 1] == aligned_traces[i]['alignment'][j][0]:  # log move detected
                for q in range(m, max_ev + 1):
                    y_cum_test[q][str(aligned_traces[i]['alignment'][j])][
                        i] = 0  # set all prefixes from the current m to the maximum prefix length in this trace to 0 because deviation happened here but not afterwards
                j -= 1
                m -= 1
            elif m == max_ev:
                j -= 1
            else:  # model move deteceted
                for q in range(m + 1, max_ev + 1):
                    y_cum_test[q][str(aligned_traces[i]['alignment'][j])][
                        i] = 0  # set all prefixes after the current m to the maximum prefix length in this trace to 0 because deviation happened between m and m+1 but not afterwards
                j -= 1
        i += 1
    ### y_cum_test holds information whether deviation happened after prefix length
    x_train_idx, x_test_idx, y_train_idx, y_test_idx = train_test_split(range(len(log)), range(len(log)), test_size=split,
                                                                        random_state=0)
    trainin_dev_df = dev_df.loc[x_train_idx]

    trainin_dev_df.corr()
    corrMatrix = trainin_dev_df.corr()

    corrMatrix.loc[:, :] = np.tril(corrMatrix, k=-1)  # borrowed from Karl D's answer

    already_in = set()
    max_combs_l = []
    for col in corrMatrix:
        perfect_corr = corrMatrix[col][corrMatrix[col] >= relevance_ths].index.tolist()
        if perfect_corr and col not in already_in:
            already_in.update(set(perfect_corr))
            perfect_corr.append(col)
            max_combs_l.append(perfect_corr)

    test_counts = {}
    for comb in max_combs_l:
        for y in range(len(comb)):
            test_counts[comb[y]] = dev_df.loc[x_test_idx].sum()[comb[y]]
        if any(dev_df.loc[x_test_idx].sum()[comb[y]] == 0 for y in range(len(comb))):
            max_combs_l.remove(comb)
            print(comb)

    max_combs = {}
    for comb in max_combs_l:
        max_combs[str(comb)] = comb

    y_cum_test_combs = {}
    for prefix in range(1, max_ev + 1):
        y_cum_test_combs[prefix] = y_cum_test[prefix].copy(deep=True)
        for comb in max_combs.keys():
            y_cum_test_combs[prefix][comb] = 0
            for i in list(y_cum_test_combs[prefix].index):
                if event_count[i] < prefix:
                    continue
                if all(y_cum_test_combs[prefix][j][i] == 1 for j in max_combs[comb]):
                    for j in max_combs[comb]:
                        y_cum_test_combs[prefix][j][i] = 0
                    y_cum_test_combs[prefix][comb][i] = 1
    trainin_dev_df.sum()
    y_cum_test_o_combs = {}
    for prefix in range(1, max_ev + 1):
        y_cum_test_o_combs[prefix] = y_cum_test_combs[prefix][list(max_combs.keys())]
    y_cum_test_o_combs[1].sum()

    ## ref_log will have all attributes that will be the columns for X_test and X_train
    ref_log = complex_index_encoding(ref_log,
                                     4000)  # prepare a log with the maximum length of the feature vector from CIBE to know to pad other feature vectors
    ref_dataframe1 = pm4py.convert_to_dataframe(ref_log)
    ref_dataframe = ref_dataframe1.drop_duplicates(subset=['case:concept:name'])
    ref_dataframe = ref_dataframe.filter(like='case:', axis=1)
    ref_dataframe = ref_dataframe.drop('case:concept:name', axis=1)
    ref_dataframe.columns = ref_dataframe.columns.str.replace('case:', '')
    ref_dataframe = ref_dataframe.reset_index()
    ref_raw_dat = ref_dataframe.drop('index', axis=1)
    ## dataset-specific preparation (i.e., redundant attributes, conversion to numeric)
    if z == 'aligned_traces_12A.pkl' or z == 'aligned_traces_12O.pkl' or z == 'aligned_traces_12AO.pkl':
        ref_raw_dat['AMOUNT_REQ'] = pd.to_numeric(ref_raw_dat['AMOUNT_REQ'])
        ref_clean_dat = ref_raw_dat.drop('REG_DATE', axis=1)
    elif z == 'aligned_traces_20int.pkl':
        ref_clean_dat = ref_raw_dat.drop(
            ['Permit travel permit number', 'DeclarationNumber', 'travel permit number', 'id', 'Permit ID', 'Permit id'],
            axis=1)
    elif z == 'aligned_traces_20dom.pkl':
        ref_clean_dat = ref_raw_dat.drop(['DeclarationNumber', 'id'], axis=1)
    elif z == 'aligned_traces_20prep.pkl':
        ref_clean_dat = ref_raw_dat.drop(['RfpNumber', 'Rfp_id', 'Permit travel permit number', 'Permit id'], axis=1)
    elif z == 'aligned_traces_20RfP.pkl':
        ref_clean_dat = ref_raw_dat.drop(['RfpNumber', 'Rfp_id'], axis=1)
    else:
        ref_clean_dat = ref_raw_dat.copy()

    ref_enc_dat = ref_clean_dat.copy()

    BATCH_SIZE = 128
    LEARNING_RATE = 0.00001

    X_cum = {}
    metrics = pd.DataFrame(data=0, columns=dev, index=['Precision', 'Recall', 'Support', 'ROC_AUC'])

    # prepare X for all prefix lengths
    for prefix in range(1, max_ev + 1):
        complex_index_encoding(log, prefix)
        dataframe1 = pm4py.convert_to_dataframe(log)
        dataframe = dataframe1.drop_duplicates(subset=['case:concept:name'])
        dataframe = dataframe.filter(like='case:', axis=1)
        dataframe = dataframe.drop('case:concept:name', axis=1)
        dataframe.columns = dataframe.columns.str.replace('case:', '')
        dataframe = dataframe.reset_index()
        raw_dat = dataframe.drop('index', axis=1)
        if z == 'aligned_traces_12A.pkl' or z == 'aligned_traces_12O.pkl' or z == 'aligned_traces_12AO.pkl':
            raw_dat['AMOUNT_REQ'] = pd.to_numeric(raw_dat['AMOUNT_REQ'])
            clean_dat = raw_dat.drop('REG_DATE', axis=1)
        elif z == 'aligned_traces_20int.pkl':
            clean_dat = raw_dat.drop(
                ['Permit travel permit number', 'DeclarationNumber', 'travel permit number', 'id', 'Permit ID',
                 'Permit id'], axis=1)
        elif z == 'aligned_traces_20dom.pkl':
            clean_dat = raw_dat.drop(['DeclarationNumber', 'id'], axis=1)
        elif z == 'aligned_traces_20prep.pkl':
            clean_dat = raw_dat.drop(['RfpNumber', 'Rfp_id', 'Permit travel permit number', 'Permit id'], axis=1)
        elif z == 'aligned_traces_20RfP.pkl':
            clean_dat = raw_dat.drop(['RfpNumber', 'Rfp_id'], axis=1)
        else:
            clean_dat = raw_dat.copy()
        enc_dat = clean_dat.copy()
        for key in ref_enc_dat.columns:
            if not key in enc_dat.columns:
                enc_dat[key] = 'No'  # pad all prefixes to maximum lengths with 0
        imp = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value='No')
        enc_dat = pd.DataFrame(data=imp.fit_transform(enc_dat), columns=enc_dat.columns)

        X_cum[prefix] = enc_dat.copy()
        drop_idx = []
        for trace_idx in range(len(log)):
            if event_count[trace_idx] < prefix:
                drop_idx.append(trace_idx)

        X_cum[prefix] = X_cum[prefix].drop(drop_idx)

    for d in dev:
        metrics[str('NoDev' + d)] = 0

    path = (os.getcwd() + '/BPDP_LSTM')  # output path
    xt, z = os.path.split(file)

    writer = pd.ExcelWriter(path + '/' + z + '_BPDP_CIBE_classification.xlsx', engine="xlsxwriter")

    x_train_idx, x_test_idx, y_train_idx, y_test_idx = train_test_split(range(len(log)), range(len(log)), test_size=split,
                                                                        random_state=0)

    x_train_idx_c, x_test_idx_c, y_train_idx_c, y_test_idx_c = train_test_split(range(len(log)), range(len(log)),
                                                                                test_size=split,
                                                                                random_state=0)
    x_train_idx_c, x_val_idx_c, y_train_idx_c, y_val_idx_c = train_test_split(x_train_idx_c, x_train_idx_c, test_size=0.2,
                                                                              random_state=0)

    dev_position = pd.DataFrame(index=x_test_idx, columns=dev, data=0)
    for d in dev:
        for idx in x_test_idx:
            for i in range(1, event_count[idx] + 1):
                if y_cum_test[i][d][idx] == 1: dev_position[d][idx] = i + 1
    dev_position_pred = pd.DataFrame(index=x_test_idx, columns=dev, data=0)
    earliness = {}

    dev_distribution = pd.DataFrame(data=0, index=['Training', 'Test'], columns=dev)
    for d in dev:
        dev_distribution[d]['Training'] = sum(dev_df[d][i] for i in x_train_idx)
        dev_distribution[d]['Test'] = sum(dev_df[d][i] for i in x_test_idx)

    dev_distribution.to_excel(writer, sheet_name=('Distribution'))


    def flatten_comprehension(matrix):
        """Flatten an iterable of iterables by exactly one level.

        Args:
            matrix: An iterable whose elements ("rows") are themselves iterable.

        Returns:
            A new list holding the items of every row, in row order. Items that
            are themselves containers are kept as-is (no recursive flattening).
        """
        flattened = []
        for row in matrix:
            # extend() walks the row and appends each item, preserving order
            flattened.extend(row)
        return flattened


    ref_enc_dat

    evs_c = []
    resource_c = []
    month_c = []
    trace_attr = []
    for ca in X_cum[1].columns:
        if ca.startswith('event'): evs_c.append(ca)
    for ca in X_cum[1].columns:
        if ca.startswith('resource'): resource_c.append(ca)
    for ca in X_cum[1].columns:
        if ca.startswith('month'): month_c.append(ca)
    for ca in X_cum[1].columns:
        if not (ca in evs_c or ca in resource_c or ca in month_c): trace_attr.append(ca)
    print(evs_c)
    print(resource_c)
    print(month_c)
    print(trace_attr)

    X_events = {}
    X_resource = {}
    X_month = {}
    X_tracea = {}
    for prefix in range(1, max_ev + 1):
        X_events[prefix] = X_cum[prefix][evs_c]
        X_resource[prefix] = X_cum[prefix][resource_c]
        X_month[prefix] = X_cum[prefix][month_c]
        X_tracea[prefix] = X_cum[prefix][trace_attr]

    cat_tas = []
    for cat in X_tracea[1].columns:
        if type(X_tracea[1][cat][0]) == str:
            cat_tas.append(cat)
    cat_tas
    uniques_cats = {}
    for cat in cat_tas:
        uniques_cats[cat] = []
    for prefix in range(1, max_ev + 1):
        for cat in cat_tas:
            for reals in list(X_tracea[prefix][cat].unique()):
                if not reals in uniques_cats[cat]:
                    uniques_cats[cat].append(reals)
    uniques_cats
    X_tracea[1]
    for cat in cat_tas:
        for prefix in range(1, max_ev + 1):
            for j in list(X_tracea[prefix].index):
                X_tracea[prefix][cat][j] = uniques_cats[cat].index(X_tracea[prefix][cat][j])
    X_tracea[1]
    X_events[1]
    positive_weights = {}
    negative_weights = {}
    for label in dev:
        positive_weights[label] = 8
        negative_weights[label] = 1
    models_collect = {}
    dev_trained = []
    outputs_train = pd.DataFrame()
    outputs_test = pd.DataFrame()
    outputs_val = pd.DataFrame()

    Y_cum_dev = {}
    for prefix in range(1, max_ev + 1):
        Y_cum_dev[prefix] = pd.DataFrame(y_cum_test[prefix][d])
        Y_cum_dev[prefix]['NoDev'] = 0
        for i in Y_cum_dev[prefix].index.values.tolist():
            Y_cum_dev[prefix]['NoDev'][i] = 1 - y_cum_test[prefix][d][i]
        if prefix == 1:
            print(Y_cum_dev[prefix].columns)

    if u_sample:
        imb_ref_enc_dat = pd.get_dummies(ref_clean_dat)
        imb_ref_enc_dat['ind'] = 0
        for i in range(len(imb_ref_enc_dat)):
            imb_ref_enc_dat['ind'][i] = i

        imb_traces = pd.DataFrame(data=0, columns=['Dev'], index=range(len(log)))
        for trace in range(len(log)):
            if dev_df.loc[trace].sum() > 0:
                imb_traces['Dev'][trace] = 1

        imb_traces = imb_traces.drop(x_test_idx)
        imb_ref_enc_dat = imb_ref_enc_dat.drop(x_test_idx)
        imp = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=0)
        imb_ref_enc_dat = pd.DataFrame(data=imp.fit_transform(imb_ref_enc_dat), columns=imb_ref_enc_dat.columns)

        oss = OneSidedSelection(random_state=0, n_seeds_S=250, n_neighbors=7)

        X_resampled, y_resampled = oss.fit_resample(imb_ref_enc_dat, imb_traces)

        x_train_idx = list(X_resampled['ind'])
        y_train_idx = list(X_resampled['ind'])

    print('index length ', len(x_train_idx), len(x_test_idx), len(y_train_idx), len(y_test_idx))

    # validation set for early stopping
    x_train_idx, x_val_idx, y_train_idx, y_val_idx = train_test_split(x_train_idx, x_train_idx, test_size=0.2,

                                                                      random_state=0)

    enumerated_trace_idx = {}
    cum_trace_idxs = []
    pref_list = []
    pref_list_train_c = []
    pref_list_test_c = []
    pref_list_val_c = []
    for prefix in range(1, max_ev + 1):
        drop_idx = []
        for trace_idx in range(len(log)):
            if event_count[trace_idx] < prefix:
                drop_idx.append(trace_idx)  # drop all trace encoding that do not go until prefix length

        x_te = X_events[prefix].loc[[j for j in list(set(y_test_idx) - set(drop_idx))]].to_numpy()
        x_tr = X_events[prefix].loc[[j for j in list(set(y_train_idx) - set(drop_idx))]].to_numpy()
        x_va = X_events[prefix].loc[[j for j in list(set(y_val_idx) - set(drop_idx))]].to_numpy()
        y_te = y_cum_test[prefix].loc[[j for j in list(set(y_test_idx) - set(drop_idx))]].to_numpy().astype(float)
        y_tr = y_cum_test[prefix].loc[[j for j in list(set(y_train_idx) - set(drop_idx))]].to_numpy().astype(float)
        y_va = y_cum_test[prefix].loc[[j for j in list(set(y_val_idx) - set(drop_idx))]].to_numpy().astype(float)
        x_te_c = X_events[prefix].loc[[j for j in list(set(y_test_idx_c) - set(drop_idx))]].to_numpy()
        x_tr_c = X_events[prefix].loc[[j for j in list(set(y_train_idx_c) - set(drop_idx))]].to_numpy()
        x_va_c = X_events[prefix].loc[[j for j in list(set(y_val_idx_c) - set(drop_idx))]].to_numpy()
        y_te_c = y_cum_test_o_combs[prefix].loc[[j for j in list(set(y_test_idx) - set(drop_idx))]].to_numpy().astype(
            float)
        y_tr_c = y_cum_test_o_combs[prefix].loc[
            [j for j in list(set(y_train_idx_c) - set(drop_idx))]].to_numpy().astype(float)
        y_va_c = y_cum_test_o_combs[prefix].loc[[j for j in list(set(y_val_idx_c) - set(drop_idx))]].to_numpy().astype(
            float)
        cum_trace_idxs.append(list(set(y_test_idx) - set(drop_idx)))
        pref_list.append([prefix] * len(list(set(y_test_idx) - set(drop_idx))))
        enumerated_trace_idx[prefix] = list(set(y_test_idx) - set(drop_idx))
        pref_list_train_c.append([prefix] * len(list(set(y_train_idx_c) - set(drop_idx))))
        pref_list_test_c.append([prefix] * len(list(set(y_test_idx) - set(drop_idx))))
        pref_list_val_c.append([prefix] * len(list(set(y_val_idx_c) - set(drop_idx))))
        print('subset length ', prefix, len(x_te), len(x_tr), len(y_te), len(y_tr))

        if prefix == 1:
            X_train_event = x_tr
            X_test_event = x_te
            y_train = y_tr
            y_test = y_te
            X_val_event = x_va
            y_val = y_va
            X_train_event_c = x_tr_c
            X_test_event_c = x_te_c
            X_val_event_c = x_va_c
            y_train_c = y_tr_c
            y_test_c = y_te_c
            y_val_c = y_va_c
        else:
            X_train_event = np.append(X_train_event, x_tr, axis=0)
            X_test_event = np.append(X_test_event, x_te, axis=0)
            y_train = np.append(y_train, y_tr, axis=0)
            y_test = np.append(y_test, y_te, axis=0)
            y_val = np.append(y_val, y_va, axis=0)
            X_val_event = np.append(X_val_event, x_va, axis=0)
            X_train_event_c = np.append(X_train_event_c, x_tr_c, axis=0)
            X_test_event_c = np.append(X_test_event_c, x_te_c, axis=0)
            X_val_event_c = np.append(X_val_event_c, x_va_c, axis=0)
            y_train_c = np.append(y_train_c, y_tr_c, axis=0)
            y_test_c = np.append(y_test_c, y_te_c, axis=0)
            y_val_c = np.append(y_val_c, y_va_c, axis=0)  # combine all X data from all prefixes into one array
    print(d, len(X_train_event), len(y_train), len(y_train_c), len(X_val_event), len(y_val), len(y_val_c),
          len(X_test_event), len(y_test), len(y_test_c))

    for prefix in range(1, max_ev + 1):
        drop_idx = []
        for trace_idx in range(len(log)):
            if event_count[trace_idx] < prefix:
                drop_idx.append(trace_idx)  # drop all trace encoding that do not go until prefix length

        x_te = X_resource[prefix].loc[[j for j in list(set(y_test_idx) - set(drop_idx))]].to_numpy()
        x_tr = X_resource[prefix].loc[[j for j in list(set(y_train_idx) - set(drop_idx))]].to_numpy()
        x_va = X_resource[prefix].loc[[j for j in list(set(y_val_idx) - set(drop_idx))]].to_numpy()
        x_te_c = X_resource[prefix].loc[[j for j in list(set(y_test_idx_c) - set(drop_idx))]].to_numpy()
        x_tr_c = X_resource[prefix].loc[[j for j in list(set(y_train_idx_c) - set(drop_idx))]].to_numpy()
        x_va_c = X_resource[prefix].loc[[j for j in list(set(y_val_idx_c) - set(drop_idx))]].to_numpy()
        print('subset length ', prefix, len(x_te), len(x_tr), len(y_te), len(y_tr))

        if prefix == 1:
            X_train_resource = x_tr
            X_test_resource = x_te
            X_val_resource = x_va
            X_train_resource_c = x_tr_c
            X_test_resource_c = x_te_c
            X_val_resource_c = x_va_c
        else:
            X_train_resource = np.append(X_train_resource, x_tr, axis=0)
            X_test_resource = np.append(X_test_resource, x_te, axis=0)
            X_val_resource = np.append(X_val_resource, x_va, axis=0)
            X_train_resource_c = np.append(X_train_resource_c, x_tr_c, axis=0)
            X_test_resource_c = np.append(X_test_resource_c, x_te_c, axis=0)
            X_val_resource_c = np.append(X_val_resource_c, x_va_c,
                                         axis=0)  # combine all X data from all prefixes into one array
    print(d, len(X_train_resource), len(y_train), len(X_val_resource), len(y_val), len(X_test_resource), len(y_test))

    for prefix in range(1, max_ev + 1):
        drop_idx = []
        for trace_idx in range(len(log)):
            if event_count[trace_idx] < prefix:
                drop_idx.append(trace_idx)  # drop all trace encoding that do not go until prefix length

        x_te = X_month[prefix].loc[[j for j in list(set(y_test_idx) - set(drop_idx))]].to_numpy()
        x_tr = X_month[prefix].loc[[j for j in list(set(y_train_idx) - set(drop_idx))]].to_numpy()
        x_va = X_month[prefix].loc[[j for j in list(set(y_val_idx) - set(drop_idx))]].to_numpy()
        x_te_c = X_month[prefix].loc[[j for j in list(set(y_test_idx_c) - set(drop_idx))]].to_numpy()
        x_tr_c = X_month[prefix].loc[[j for j in list(set(y_train_idx_c) - set(drop_idx))]].to_numpy()
        x_va_c = X_month[prefix].loc[[j for j in list(set(y_val_idx_c) - set(drop_idx))]].to_numpy()
        print('subset length ', prefix, len(x_te), len(x_tr), len(y_te), len(y_tr))

        if prefix == 1:
            X_train_m = x_tr
            X_test_m = x_te
            X_val_m = x_va
            X_train_m_c = x_tr_c
            X_test_m_c = x_te_c
            X_val_m_c = x_va_c
        else:
            X_train_m = np.append(X_train_m, x_tr, axis=0)
            X_test_m = np.append(X_test_m, x_te, axis=0)
            X_val_m = np.append(X_val_m, x_va, axis=0)
            X_train_m_c = np.append(X_train_m_c, x_tr_c, axis=0)
            X_test_m_c = np.append(X_test_m_c, x_te_c, axis=0)
            X_val_m_c = np.append(X_val_m_c, x_va_c, axis=0)  # combine all X data from all prefixes into one array
    print(d, len(X_train_m), len(y_train), len(X_val_m), len(y_val), len(X_test_m), len(y_test))

    print('split done')

    for prefix in range(1, max_ev + 1):
        drop_idx = []
        for trace_idx in range(len(log)):
            if event_count[trace_idx] < prefix:
                drop_idx.append(trace_idx)  # drop all trace encoding that do not go until prefix length

        x_te = X_tracea[prefix].loc[[j for j in list(set(y_test_idx) - set(drop_idx))]].to_numpy()
        x_tr = X_tracea[prefix].loc[[j for j in list(set(y_train_idx) - set(drop_idx))]].to_numpy()
        x_va = X_tracea[prefix].loc[[j for j in list(set(y_val_idx) - set(drop_idx))]].to_numpy()
        x_te_c = X_tracea[prefix].loc[[j for j in list(set(y_test_idx_c) - set(drop_idx))]].to_numpy()
        x_tr_c = X_tracea[prefix].loc[[j for j in list(set(y_train_idx_c) - set(drop_idx))]].to_numpy()
        x_va_c = X_tracea[prefix].loc[[j for j in list(set(y_val_idx_c) - set(drop_idx))]].to_numpy()
        print('subset length ', prefix, len(x_te), len(x_tr), len(y_te), len(y_tr))

        if prefix == 1:
            X_train_TA = x_tr
            X_test_TA = x_te
            X_val_TA = x_va
            X_train_TA_c = x_tr_c
            X_test_TA_c = x_te_c
            X_val_TA_c = x_va_c
        else:
            X_train_TA = np.append(X_train_TA, x_tr, axis=0)
            X_test_TA = np.append(X_test_TA, x_te, axis=0)
            X_val_TA = np.append(X_val_TA, x_va, axis=0)
            X_train_TA_c = np.append(X_train_TA_c, x_tr_c, axis=0)
            X_test_TA_c = np.append(X_test_TA_c, x_te_c, axis=0)
            X_val_TA_c = np.append(X_val_TA_c, x_va_c, axis=0)  # combine all X data from all prefixes into one array
    print(d, len(X_train_TA), len(y_train), len(X_val_TA), len(y_val), len(X_test_TA), len(y_test))

    events_encoder = list(
        np.unique(np.append(np.append(X_train_event_c, X_test_event_c, axis=0), X_val_event_c, axis=0)))
    events_encoder.index('No')
    resource_encoder = list(
        np.unique(np.append(np.append(X_train_resource_c, X_test_resource_c, axis=0), X_val_resource_c, axis=0)))
    resource_encoder.index('No')
    cat_ecnoders = {}
    for cat in cat_tas:
        cat_ecnoders[cat] = list(np.unique(np.append(np.append(X_train_TA_c, X_test_TA_c, axis=0), X_val_TA_c, axis=0)))

    month_encoder = list(np.unique(np.append(np.append(X_train_m_c, X_test_m_c, axis=0), X_val_m_c, axis=0)))
    month_encoder
    for i in range(len(X_test_event)):
        for j in range(len(X_test_event[0])):
            X_test_event[i][j] = events_encoder.index(X_test_event[i][j])
        for j in range(len(X_test_resource[0])):
            X_test_resource[i][j] = resource_encoder.index(X_test_resource[i][j])
        for j in range(len(X_test_m[0])):
            X_test_m[i][j] = month_encoder.index(X_test_m[i][j])
    for i in range(len(X_train_event)):
        for j in range(len(X_train_event[0])):
            X_train_event[i][j] = events_encoder.index(X_train_event[i][j])
        for j in range(len(X_train_resource[0])):
            X_train_resource[i][j] = resource_encoder.index(X_train_resource[i][j])
        for j in range(len(X_train_m[0])):
            X_train_m[i][j] = month_encoder.index(X_train_m[i][j])
    for i in range(len(X_val_event)):
        for j in range(len(X_val_event[0])):
            X_val_event[i][j] = events_encoder.index(X_val_event[i][j])
        for j in range(len(X_val_resource[0])):
            X_val_resource[i][j] = resource_encoder.index(X_val_resource[i][j])
        for j in range(len(X_val_m[0])):
            X_val_m[i][j] = month_encoder.index(X_val_m[i][j])

    # for combs_output
    for i in range(len(X_test_event_c)):
        for j in range(len(X_test_event_c[0])):
            X_test_event_c[i][j] = events_encoder.index(X_test_event_c[i][j])
        for j in range(len(X_test_resource[0])):
            X_test_resource_c[i][j] = resource_encoder.index(X_test_resource_c[i][j])
        for j in range(len(X_test_m[0])):
            X_test_m_c[i][j] = month_encoder.index(X_test_m_c[i][j])
    for i in range(len(X_train_event_c)):
        for j in range(len(X_train_event_c[0])):
            X_train_event_c[i][j] = events_encoder.index(X_train_event_c[i][j])
        for j in range(len(X_train_resource[0])):
            X_train_resource_c[i][j] = resource_encoder.index(X_train_resource_c[i][j])
        for j in range(len(X_train_m[0])):
            X_train_m_c[i][j] = month_encoder.index(X_train_m_c[i][j])
    for i in range(len(X_val_event_c)):
        for j in range(len(X_val_event_c[0])):
            X_val_event_c[i][j] = events_encoder.index(X_val_event_c[i][j])
        for j in range(len(X_val_resource_c[0])):
            X_val_resource_c[i][j] = resource_encoder.index(X_val_resource_c[i][j])
        for j in range(len(X_val_m_c[0])):
            X_val_m_c[i][j] = month_encoder.index(X_val_m_c[i][j])
    scaler = StandardScaler()
    X_test_TA = scaler.fit_transform(X_test_TA)
    X_train_TA = scaler.fit_transform(X_train_TA)
    X_val_TA = scaler.fit_transform(X_val_TA)
    X_test_TA_c = scaler.fit_transform(X_test_TA_c)
    X_train_TA_c = scaler.fit_transform(X_train_TA_c)
    X_val_TA_c = scaler.fit_transform(X_val_TA_c)
    X_test_event = X_test_event.astype(int)
    X_train_event = X_train_event.astype(int)
    X_val_event = X_val_event.astype(int)
    X_test_resource = X_test_resource.astype(int)
    X_train_resource = X_train_resource.astype(int)
    X_val_resource = X_val_resource.astype(int)
    X_test_m = X_test_m.astype(int)
    X_train_m = X_train_m.astype(int)
    X_val_m = X_val_m.astype(int)
    #for combs again
    X_test_event_c = X_test_event_c.astype(int)
    X_train_event_c = X_train_event_c.astype(int)
    X_val_event_c = X_val_event_c.astype(int)
    X_test_resource_c = X_test_resource_c.astype(int)
    X_train_resource_c = X_train_resource_c.astype(int)
    X_val_resource_c = X_val_resource_c.astype(int)
    X_test_m_c = X_test_m_c.astype(int)
    X_train_m_c = X_train_m_c.astype(int)
    X_val_m_c = X_val_m_c.astype(int)

    y_train


    class BPDP_LSTM_SC(nn.Module):
        """Four-branch network producing one raw score per output label.

        Three categorical index sequences (events, resources, months) each go
        through their own embedding -> single-layer LSTM -> linear branch and
        are reduced to the representation of the last time step. The numeric
        trace attributes go through one linear layer. The four branch outputs
        are concatenated, layer-normalised and projected to ``no_devs`` values.
        No sigmoid/softmax is applied inside the model; callers receive logits.

        Args:
            vocab_events: number of rows in the event embedding table.
            vocab_resources: number of rows in the resource embedding table.
            no_TA: number of input features of the trace-attribute layer.
            vocab_month: number of rows in the month embedding table.
            no_devs: size of the output vector.
            embedding_dim: width of all three embeddings (default 16; the
                original inline note suggested trying 8 or 16).
            hidden_size: hidden size of each LSTM (default 64).
            branch_dim: output width of each of the four branches (default 32);
                the concatenated representation has ``4 * branch_dim`` features.
        """

        def __init__(self, vocab_events, vocab_resources, no_TA, vocab_month, no_devs,
                     embedding_dim=16, hidden_size=64, branch_dim=32):
            super().__init__()
            # NOTE: nn.LSTM only applies its ``dropout`` argument between stacked
            # layers. With num_layers=1 it has no effect and merely triggers a
            # UserWarning, so the argument is intentionally not passed here.
            self.embedding_e = nn.Embedding(vocab_events, embedding_dim)
            self.activation1 = nn.LeakyReLU()
            self.lstm_e = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_size, num_layers=1,
                                  batch_first=True)
            self.linear_e = nn.Linear(hidden_size, branch_dim)
            self.embedding_r = nn.Embedding(vocab_resources, embedding_dim)
            self.lstm_r = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_size, num_layers=1,
                                  batch_first=True)
            self.linear_r = nn.Linear(hidden_size, branch_dim)
            self.embedding_m = nn.Embedding(vocab_month, embedding_dim)
            self.lstm_m = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_size, num_layers=1,
                                  batch_first=True)
            self.linear_m = nn.Linear(hidden_size, branch_dim)
            self.linear_ta = nn.Linear(no_TA, branch_dim)
            # Defined but currently not applied in forward().
            self.dropout = nn.Dropout(p=0.1)
            # Despite the attribute name this is a LayerNorm; the name is kept so
            # that state_dict keys stay unchanged.
            self.batchnorm1 = nn.LayerNorm(4 * branch_dim)
            self.linear = nn.Linear(4 * branch_dim, no_devs)

        def forward(self, evs, rs, tas, ms):
            """Compute the output logits for one batch.

            Args:
                evs: integer index tensor of shape (batch, seq) for events.
                rs: integer index tensor of shape (batch, seq) for resources.
                tas: float tensor of shape (batch, no_TA) with trace attributes.
                ms: integer index tensor of shape (batch, seq) for months.

            Returns:
                Tensor of shape (batch, no_devs) with unnormalised scores.
            """
            # Events: last LSTM step -> linear -> activation. Taking the last
            # step before the projection avoids projecting every time step.
            # NOTE(review): this branch activates AFTER the projection, whereas
            # the resource and month branches activate BEFORE it. The original
            # order is preserved; confirm whether the asymmetry is intended.
            evs = self.embedding_e(evs)
            evs, _ = self.lstm_e(evs)
            evs = self.linear_e(evs[:, -1, :])
            evs = self.activation1(evs)

            # Resources: last LSTM step -> activation -> linear.
            rs = self.embedding_r(rs)
            rs, _ = self.lstm_r(rs)
            rs = self.activation1(rs[:, -1, :])
            rs = self.linear_r(rs)

            # Months: last LSTM step -> activation -> linear.
            ms = self.embedding_m(ms)
            ms, _ = self.lstm_m(ms)
            ms = self.activation1(ms[:, -1, :])
            ms = self.linear_m(ms)

            # Trace attributes: plain linear projection.
            tas = self.linear_ta(tas)

            # Concatenate in the order events, resources, months, trace attributes.
            fin = torch.cat((evs, rs, ms, tas), dim=1)
            fin = self.batchnorm1(fin)
            return self.linear(fin)


    labels = dev  # ['label_1', ...., 'label_6']

    positives = {}
    negatives = {}
    for label in labels:
        positives[label] = sum(dev_df[label] == 1)
        negatives[label] = sum(dev_df[label] == 0)
    max_Plabel = max(positives.values())
    max_Nlabel = max(negatives.values())
    max_label = max(max_Plabel, max_Nlabel)
    pir = {}
    nir = {}
    pirlbl = {}
    nirlbl = {}
    for label in labels:
        pir[label] = max(positives[label], negatives[label]) / positives[label]
        nir[label] = max(positives[label], negatives[label]) / negatives[label]
        pirlbl[label] = max_label / positives[label]
        nirlbl[label] = max_label / negatives[label]
    positive_weights = {}
    negative_weights = {}
    for label in labels:
        positive_weights[label] = mean(pir.values()) ** (1 / (2 * math.e)) + np.log(pirlbl[label])
        negative_weights[label] = mean(nir.values()) ** (1 / (2 * math.e)) + np.log(nirlbl[label])
    positive_weights

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    model = BPDP_LSTM_SC(vocab_events=len(events_encoder), vocab_resources=len(resource_encoder), no_TA=len(X_train_TA[0]),
                         vocab_month=len(month_encoder), no_devs=len(y_train[0]))
    model.to(device)
    weights = torch.FloatTensor(list(positive_weights.values()))
    criterion = nn.CrossEntropyLoss(weight=weights)
    optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
    torch.autograd.set_detect_anomaly(True)
    if early_stop:
        EPOCHS = 100
        model.train()
        train_data_event = TrainData(torch.FloatTensor(X_train_event),
                                     torch.FloatTensor(y_train))
        train_data_resource = TestData(torch.FloatTensor(X_train_resource))
        train_data_TA = TestData(torch.FloatTensor(X_train_TA))
        train_data_m = TestData(torch.FloatTensor(X_train_m))
        train_loader_event = DataLoader(dataset=train_data_event, batch_size=BATCH_SIZE, shuffle=False)
        train_loader_resource = DataLoader(dataset=train_data_resource, batch_size=BATCH_SIZE, shuffle=False)
        train_loader_TA = DataLoader(dataset=train_data_TA, batch_size=BATCH_SIZE, shuffle=False)
        train_loader_m = DataLoader(dataset=train_data_m, batch_size=BATCH_SIZE, shuffle=False)

        es = EarlyStopping()
        done = False

        epoch = 0
        while epoch < EPOCHS and not done:
            epoch += 1
            steps = list(enumerate(train_loader_event))
            pbar = tqdm.tqdm(steps)
            steps_r = list((train_loader_resource))
            steps_ta = list((train_loader_TA))
            steps_m = list((train_loader_m))
            model.train()
            epoch_acc = 0
            epoch_loss = 0
            for i, (x_batch, y_batch) in pbar:
                optimizer.zero_grad()
                y_batch_pred = model(x_batch.to(torch.int64).to(device), steps_r[i].to(torch.int64).to(device),
                                     steps_ta[i].to(torch.float).to(device), steps_m[i].to(torch.int64).to(device))

                loss = criterion(y_batch_pred, y_batch.to(device))

                acc = binary_acc(y_batch_pred, y_batch.to(device)) / len(y_batch_pred[0])

                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
                optimizer.step()
                epoch_acc += acc.item()

                loss, current = loss.item(), (i + 1) * len(x_batch)
                epoch_loss += loss
                if i == len(steps) - 1:
                    model.eval()
                    pred = model(torch.FloatTensor(X_val_event).to(torch.int64),
                                 torch.FloatTensor(X_val_resource).to(torch.int64),
                                 torch.FloatTensor(X_val_TA).to(torch.float),
                                 torch.FloatTensor(X_val_m).to(torch.int64))
                    vloss = criterion(pred, torch.FloatTensor(y_val))
                    if es(model, vloss): done = True
                    pbar.set_description(
                        f"Epoch: {epoch}, tloss: {epoch_loss / len(train_loader_event)}, Acc: {epoch_acc / len(train_loader_event):.3f}, vloss: {vloss:>7f}, EStop:[{es.status}]")
                else:
                    pbar.set_description(
                        f"Epoch: {epoch}, tloss {epoch_loss / len(train_loader_event):}, Acc: {epoch_acc / len(train_loader_event):.3f}")

    y_batch_pred
    model.eval()
    test_data_event = TestData(torch.FloatTensor(X_test_event))
    test_data_resource = TestData(torch.FloatTensor(X_test_resource))
    test_data_TA = TestData(torch.FloatTensor(X_test_TA))
    test_data_m = TestData(torch.FloatTensor(X_test_m))
    test_loader_event = DataLoader(dataset=test_data_event, batch_size=1)
    test_loader_resource = DataLoader(dataset=test_data_resource, batch_size=1)
    test_loader_TA = DataLoader(dataset=test_data_TA, batch_size=1)
    test_loader_m = DataLoader(dataset=test_data_m, batch_size=1)
    iterations_r = iter(test_loader_resource)
    iterations_ta = iter(test_loader_TA)
    iterations_m = iter(test_loader_m)
    y_pred_list = []

    with torch.no_grad():
        for i, X_batch in enumerate(test_loader_event):
            X_batch = X_batch.to(device).to(torch.int64)
            y_test_pred = torch.nn.functional.sigmoid(
                model(X_batch, next(iterations_r).to(torch.int64), next(iterations_ta).to(torch.float),
                      next(iterations_m).to(torch.int64)))
            y_pred_tag = torch.round(y_test_pred)
            y_pred_list.append(y_pred_tag.cpu().numpy())

    y_pred_list = [a.squeeze().tolist() for a in y_pred_list]

    CM = sklearn.metrics.multilabel_confusion_matrix(y_test, y_pred_list)

    for i, d in enumerate(dev):
        metrics[d]['Precision'] = CM[i][1][1] / (CM[i][1][1] + CM[i][0][1])
        metrics[d]['Recall'] = CM[i][1][1] / (CM[i][1][1] + CM[i][1][0])
        metrics[d]['Support'] = (CM[i][1][1] + CM[i][1][0])
        try:
            metrics[d]['ROC_AUC'] = sklearn.metrics.roc_auc_score(y_test[:, i], np.array(y_pred_list)[:, i],
                                                                  average='macro')
        except Exception as er:
            metrics[d]['ROC_AUC'] = er
        metrics[str('NoDev' + d)]['Precision'] = CM[i][0][0] / (CM[i][0][0] + CM[i][1][0])
        metrics[str('NoDev' + d)]['Recall'] = CM[i][0][0] / (CM[i][0][0] + CM[i][0][1])
        metrics[str('NoDev' + d)]['Support'] = CM[i][0][0] + CM[i][0][1]
    print(CM)

    print(metrics)
    writer = pd.ExcelWriter('BPDP_LSTM/' + z + '_BPDP_LSTM_SC_1.xlsx', engine="xlsxwriter")
    metrics.to_excel(writer, sheet_name=('Metrics'))

    writer.close()

In [None]:
def IDP_separate_DPP_CIBE(log, ref_log, aligned_traces, split=1/3, u_sample=True, early_stop=True,relevance_ths = .5):
    xt, z = os.path.split(file)
    import warnings
    warnings.simplefilter('ignore')

    #### get information whether deviation happened after prefix length in DF
    i = 0
    dev = []  # stores all deviations that happened
    for trace in log:
        no_moves = len(aligned_traces[i]['alignment'])
        for j in range(0, len(aligned_traces[i]['alignment'])):
            if aligned_traces[i]['alignment'][j][1] == None or aligned_traces[i]['alignment'][j][0] == \
                    aligned_traces[i]['alignment'][j][1]:
                next  # we do not care for simultaneous or silent moves
            else:
                if not str(aligned_traces[i]['alignment'][j]) in dev:
                    dev.append(str(aligned_traces[i]['alignment'][j]))
        i += 1

    y_cum_test = {}  # dict that stores label for each prefix and deviation combinations; keys are prefix length, entries are Data Frames with index = trace and columns = deviation
    dev_df = pd.DataFrame(data=0, columns=dev, index=range(
        len(log)))  # Data Frame that stores the information whether a deviation happened for each trace on trace level
    event_order = {}  # dict with event sequences for each trace
    event_count = {}  # dict with trace length for each trace
    max_ev = 0  # will be maximum trace length
    k = 0
    for trace in log:
        event_order[k] = []
        i = 0
        for event in trace:
            i += 1
            event_order[k].append(event['concept:name'])
        if i > max_ev:
            max_ev = i
        event_count[k] = len(event_order[k])
        k += 1
    i = 0
    for trace in log:
        no_moves = len(aligned_traces[i]['alignment'])
        for j in range(0, len(aligned_traces[i]['alignment'])):
            if aligned_traces[i]['alignment'][j][1] == None or aligned_traces[i]['alignment'][j][0] == \
                    aligned_traces[i]['alignment'][j][1]:
                next
            else:
                dev_df[str(aligned_traces[i]['alignment'][j])][i] = 1
        i += 1
    for ev in range(1, max_ev + 1):
        y_cum_test[
            ev] = dev_df.copy()  # initialize each prefix length with all traces and information whether deviation happened
    for ev in range(1, max_ev + 1):
        drop_idx = []
        for trace_idx in range(len(log)):
            if event_count[trace_idx] < ev:
                drop_idx.append(trace_idx)  # drop all trace labels that do not go until prefix length
        y_cum_test[ev] = y_cum_test[ev].drop(drop_idx)
    i = 0
    for trace in log:
        no_moves = len(aligned_traces[i]['alignment'])
        j = no_moves - 1  # iterator over moves in alignment, starting at the end
        m = len(event_order[i])  # iterator over event sequence, starting at the end
        while j >= 0:
            if aligned_traces[i]['alignment'][j][
                1] == None:  # if silent move, just go one move further to the beginning in the alignment
                j -= 1
            elif aligned_traces[i]['alignment'][j][0] == aligned_traces[i]['alignment'][j][1]:
                if event_order[i][m - 1] == aligned_traces[i]['alignment'][j][
                    0]:  # if synchronous move, just go one move further to the beginning in the alignment and one event forther to the beginning in the event sequence
                    j -= 1
                    m -= 1
            elif event_order[i][m - 1] == aligned_traces[i]['alignment'][j][0]:  # log move detected
                for q in range(m, max_ev + 1):
                    y_cum_test[q][str(aligned_traces[i]['alignment'][j])][
                        i] = 0  # set all prefixes from the current m to the maximum prefix length in this trace to 0 because deviation happened here but not afterwards
                j -= 1
                m -= 1
            elif m == max_ev:
                j -= 1
            else:  # model move deteceted
                for q in range(m + 1, max_ev + 1):
                    y_cum_test[q][str(aligned_traces[i]['alignment'][j])][
                        i] = 0  # set all prefixes after the current m to the maximum prefix length in this trace to 0 because deviation happened between m and m+1 but not afterwards
                j -= 1
        i += 1
    ### y_cum_test holds information whether deviation happened after prefix length


    x_train_idx, x_test_idx, y_train_idx, y_test_idx = train_test_split(range(len(log)), range(len(log)), test_size=split,
                                                                        random_state=0)
    trainin_dev_df = dev_df.loc[x_train_idx]
    trainin_dev_df
    trainin_dev_df.corr()
    corrMatrix = trainin_dev_df.corr()

    corrMatrix.loc[:, :] = np.tril(corrMatrix, k=-1)  # borrowed from Karl D's answer

    already_in = set()
    max_combs_l = []
    for col in corrMatrix:
        perfect_corr = corrMatrix[col][corrMatrix[col] >= relevance_ths].index.tolist()
        if perfect_corr and col not in already_in:
            already_in.update(set(perfect_corr))
            perfect_corr.append(col)
            max_combs_l.append(perfect_corr)
    max_combs_l
    test_counts = {}
    for comb in max_combs_l:
        for y in range(len(comb)):
            test_counts[comb[y]] = dev_df.loc[x_test_idx].sum()[comb[y]]
        if any(dev_df.loc[x_test_idx].sum()[comb[y]] == 0 for y in range(len(comb))):
            max_combs_l.remove(comb)
            print(comb)
    max_combs_l
    test_counts
    max_combs = {}
    for comb in max_combs_l:
        max_combs[str(comb)] = comb
    max_combs
    y_cum_test[1]
    y_cum_test_combs = {}
    for prefix in range(1, max_ev + 1):
        y_cum_test_combs[prefix] = y_cum_test[prefix].copy(deep=True)
        for comb in max_combs.keys():
            y_cum_test_combs[prefix][comb] = 0
            for i in list(y_cum_test_combs[prefix].index):
                if event_count[i] < prefix:
                    continue
                if all(y_cum_test_combs[prefix][j][i] == 1 for j in max_combs[comb]):
                    for j in max_combs[comb]:
                        y_cum_test_combs[prefix][j][i] = 0
                    y_cum_test_combs[prefix][comb][i] = 1
    trainin_dev_df.sum()
    dev_df.loc[x_test_idx].sum()[max_combs_l[0][0]]
    pi = 4
    print(y_cum_test_combs[pi].sum())
    print(y_cum_test[pi].sum())
    y_cum_test_o_combs = {}
    for prefix in range(1, max_ev + 1):
        y_cum_test_o_combs[prefix] = y_cum_test_combs[prefix][list(max_combs.keys())]
    y_cum_test_o_combs[1].sum()
    y_cum_test_combs[1]

    ## ref_log will have all attributes that will be the columns for X_test and X_train
    ref_log = complex_index_encoding(ref_log,
                                     4000)  # prepare a log with the maximum length of the feature vector from CIBE to know to pad other feature vectors
    ref_dataframe1 = pm4py.convert_to_dataframe(ref_log)
    ref_dataframe = ref_dataframe1.drop_duplicates(subset=['case:concept:name'])
    ref_dataframe = ref_dataframe.filter(like='case:', axis=1)
    ref_dataframe = ref_dataframe.drop('case:concept:name', axis=1)
    ref_dataframe.columns = ref_dataframe.columns.str.replace('case:', '')
    ref_dataframe = ref_dataframe.reset_index()
    ref_raw_dat = ref_dataframe.drop('index', axis=1)
    ## dataset-specific preparation (i.e., redundant attributes, convertion to numeric)
    if z == 'aligned_traces_12A.pkl' or z == 'aligned_traces_12O.pkl' or z == 'aligned_traces_12AO.pkl':
        ref_raw_dat['AMOUNT_REQ'] = pd.to_numeric(ref_raw_dat['AMOUNT_REQ'])
        ref_clean_dat = ref_raw_dat.drop('REG_DATE', axis=1)
    elif z == 'aligned_traces_20int.pkl':
        ref_clean_dat = ref_raw_dat.drop(
            ['Permit travel permit number', 'DeclarationNumber', 'travel permit number', 'id', 'Permit ID', 'Permit id'],
            axis=1)
    elif z == 'aligned_traces_20dom.pkl':
        ref_clean_dat = ref_raw_dat.drop(['DeclarationNumber', 'id'], axis=1)
    elif z == 'aligned_traces_20prep.pkl':
        ref_clean_dat = ref_raw_dat.drop(['RfpNumber', 'Rfp_id', 'Permit travel permit number', 'Permit id'], axis=1)
    elif z == 'aligned_traces_20RfP.pkl':
        ref_clean_dat = ref_raw_dat.drop(['RfpNumber', 'Rfp_id'], axis=1)
    else:
        ref_clean_dat = ref_raw_dat.copy()
    ref_enc_dat = pd.get_dummies(ref_clean_dat)

    EPOCHS = 30
    BATCH_SIZE = 128
    LEARNING_RATE = 0.0001

    X_cum = {}
    metrics = pd.DataFrame(data=0, columns=dev, index=['Precision', 'Recall', 'Support', 'ROC_AUC'])

    # prepare X for all prefix lengths
    for prefix in range(1, max_ev + 1):
        complex_index_encoding(log, prefix)
        dataframe1 = pm4py.convert_to_dataframe(log)
        dataframe = dataframe1.drop_duplicates(subset=['case:concept:name'])
        dataframe = dataframe.filter(like='case:', axis=1)
        dataframe = dataframe.drop('case:concept:name', axis=1)
        dataframe.columns = dataframe.columns.str.replace('case:', '')
        dataframe = dataframe.reset_index()
        raw_dat = dataframe.drop('index', axis=1)
        if z == 'aligned_traces_12A.pkl' or z == 'aligned_traces_12O.pkl' or z == 'aligned_traces_12AO.pkl':
            raw_dat['AMOUNT_REQ'] = pd.to_numeric(raw_dat['AMOUNT_REQ'])
            clean_dat = raw_dat.drop('REG_DATE', axis=1)
        elif z == 'aligned_traces_20int.pkl':
            clean_dat = raw_dat.drop(
                ['Permit travel permit number', 'DeclarationNumber', 'travel permit number', 'id', 'Permit ID',
                 'Permit id'], axis=1)
        elif z == 'aligned_traces_20dom.pkl':
            clean_dat = raw_dat.drop(['DeclarationNumber', 'id'], axis=1)
        elif z == 'aligned_traces_20prep.pkl':
            clean_dat = raw_dat.drop(['RfpNumber', 'Rfp_id', 'Permit travel permit number', 'Permit id'], axis=1)
        elif z == 'aligned_traces_20RfP.pkl':
            clean_dat = raw_dat.drop(['RfpNumber', 'Rfp_id'], axis=1)
        else:
            clean_dat = raw_dat.copy()
        enc_dat = pd.get_dummies(clean_dat)
        for key in ref_enc_dat.columns:
            if not key in enc_dat.columns:
                enc_dat[key] = 0  # pad all prefixes to maximum lengths with 0
        imp = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=0)
        enc_dat = pd.DataFrame(data=imp.fit_transform(enc_dat), columns=enc_dat.columns)

        X_cum[prefix] = enc_dat.copy()
        drop_idx = []
        for trace_idx in range(len(log)):
            if event_count[trace_idx] < prefix:
                drop_idx.append(trace_idx)

        X_cum[prefix] = X_cum[prefix].drop(drop_idx)

    for d in dev:
        metrics[str('NoDev' + d)] = 0

    path = (os.getcwd() + '/BPDP_Classifier')  # output path
    xt, z = os.path.split(file)

    writer = pd.ExcelWriter(path + '/' + z + '_BPDP_CIBE_classification.xlsx', engine="xlsxwriter")

    x_train_idx, x_test_idx, y_train_idx, y_test_idx = train_test_split(range(len(log)), range(len(log)), test_size=split,
                                                                        random_state=0)

    x_train_idx_c, x_test_idx_c, y_train_idx_c, y_test_idx_c = train_test_split(range(len(log)), range(len(log)),
                                                                                test_size=split,
                                                                                random_state=0)
    x_train_idx_c, x_val_idx_c, y_train_idx_c, y_val_idx_c = train_test_split(x_train_idx_c, x_train_idx_c, test_size=0.2,
                                                                              random_state=0)

    dev_position = pd.DataFrame(index=x_test_idx, columns=dev, data=0)
    for d in dev:
        for idx in x_test_idx:
            for i in range(1, event_count[idx] + 1):
                if y_cum_test[i][d][idx] == 1: dev_position[d][idx] = i + 1
    dev_position_pred = pd.DataFrame(index=x_test_idx, columns=dev, data=0)
    earliness = {}

    dev_distribution = pd.DataFrame(data=0, index=['Training', 'Test'], columns=dev)
    for d in dev:
        dev_distribution[d]['Training'] = sum(dev_df[d][i] for i in x_train_idx)
        dev_distribution[d]['Test'] = sum(dev_df[d][i] for i in x_test_idx)

    dev_distribution.to_excel(writer, sheet_name=('Distribution'))


    def flatten_comprehension(matrix):
        """Flatten one nesting level: return the items of every row of ``matrix`` in a single list."""
        flattened = []
        for row in matrix:
            # extend() walks the row exactly like the inner loop of a nested comprehension
            flattened.extend(row)
        return flattened


    positive_weights = {}
    negative_weights = {}
    for label in dev:
        positive_weights[label] = 16
        negative_weights[label] = 1
    models_collect = {}
    dev_trained = []
    outputs_train = pd.DataFrame()
    outputs_test = pd.DataFrame()
    outputs_val = pd.DataFrame()

    for d in dev:
        if dev_distribution[d]['Training'] == 0:
            metrics[d] = 'No Deviation in Training Set'
            continue
        elif dev_distribution[d]['Test'] == 0:
            metrics[d] = 'No Deviation in Test Set'
            continue
        else:
            dev_trained.append(d)

        Y_cum_dev = {}
        for prefix in range(1, max_ev + 1):
            Y_cum_dev[prefix] = pd.DataFrame(y_cum_test[prefix][d])
            Y_cum_dev[prefix]['NoDev'] = 0
            for i in Y_cum_dev[prefix].index.values.tolist():
                Y_cum_dev[prefix]['NoDev'][i] = 1 - y_cum_test[prefix][d][i]
            if prefix == 1:
                print(Y_cum_dev[prefix].columns)

        if u_sample:
            imb_ref_enc_dat = ref_enc_dat.copy()
            imb_ref_enc_dat['ind'] = 0
            for i in range(len(imb_ref_enc_dat)):
                imb_ref_enc_dat['ind'][i] = i
            imb_traces = pd.DataFrame(data=0, columns=['Dev'], index=range(len(log)))
            for trace in range(len(log)):
                if dev_df[d][trace] > 0:
                    imb_traces['Dev'][trace] = 1

            imb_traces = imb_traces.drop(x_test_idx)
            imb_ref_enc_dat = imb_ref_enc_dat.drop(x_test_idx)
            imp = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=0)
            imb_ref_enc_dat = pd.DataFrame(data=imp.fit_transform(imb_ref_enc_dat), columns=imb_ref_enc_dat.columns)

            oss = OneSidedSelection(random_state=0, n_seeds_S=250, n_neighbors=7)

            X_resampled, y_resampled = oss.fit_resample(imb_ref_enc_dat, imb_traces)

            x_train_idx = list(X_resampled['ind'])
            y_train_idx = list(X_resampled['ind'])

        print('index length ', len(x_train_idx), len(x_test_idx), len(y_train_idx), len(y_test_idx))

        # validation set for early stopping
        x_train_idx, x_val_idx, y_train_idx, y_val_idx = train_test_split(x_train_idx, x_train_idx, test_size=0.2,
                                                                          random_state=0)
        enumerated_trace_idx = {}
        cum_trace_idxs = []
        pref_list = []
        pref_list_train_c = []
        pref_list_test_c = []
        pref_list_val_c = []
        for prefix in range(1, max_ev + 1):
            drop_idx = []
            for trace_idx in range(len(log)):
                if event_count[trace_idx] < prefix:
                    drop_idx.append(trace_idx)  # drop all trace encoding that do not go until prefix length

            x_te = X_cum[prefix].loc[[j for j in list(set(y_test_idx) - set(drop_idx))]].to_numpy().astype(float)
            x_tr = X_cum[prefix].loc[[j for j in list(set(y_train_idx) - set(drop_idx))]].to_numpy().astype(float)
            x_va = X_cum[prefix].loc[[j for j in list(set(y_val_idx) - set(drop_idx))]].to_numpy().astype(float)
            y_te = Y_cum_dev[prefix].loc[[j for j in list(set(y_test_idx) - set(drop_idx))]].to_numpy().astype(float)
            y_tr = Y_cum_dev[prefix].loc[[j for j in list(set(y_train_idx) - set(drop_idx))]].to_numpy().astype(float)
            y_va = Y_cum_dev[prefix].loc[[j for j in list(set(y_val_idx) - set(drop_idx))]].to_numpy().astype(float)
            x_te_c = X_cum[prefix].loc[[j for j in list(set(y_test_idx_c) - set(drop_idx))]].to_numpy().astype(float)
            x_tr_c = X_cum[prefix].loc[[j for j in list(set(y_train_idx_c) - set(drop_idx))]].to_numpy().astype(float)
            x_va_c = X_cum[prefix].loc[[j for j in list(set(y_val_idx_c) - set(drop_idx))]].to_numpy().astype(float)
            y_te_c = y_cum_test_o_combs[prefix].loc[[j for j in list(set(y_test_idx) - set(drop_idx))]].to_numpy().astype(
                float)
            y_tr_c = y_cum_test_o_combs[prefix].loc[
                [j for j in list(set(y_train_idx_c) - set(drop_idx))]].to_numpy().astype(float)
            y_va_c = y_cum_test_o_combs[prefix].loc[[j for j in list(set(y_val_idx_c) - set(drop_idx))]].to_numpy().astype(
                float)
            cum_trace_idxs.append(list(set(y_test_idx) - set(drop_idx)))
            pref_list.append([prefix] * len(list(set(y_test_idx) - set(drop_idx))))
            pref_list_train_c.append([prefix] * len(list(set(y_train_idx_c) - set(drop_idx))))
            pref_list_test_c.append([prefix] * len(list(set(y_test_idx) - set(drop_idx))))
            pref_list_val_c.append([prefix] * len(list(set(y_val_idx_c) - set(drop_idx))))
            enumerated_trace_idx[prefix] = list(set(y_test_idx) - set(drop_idx))
            print('subset length ', prefix, len(x_te), len(x_tr), len(y_te), len(y_tr))

            if prefix == 1:
                X_train = x_tr
                X_test = x_te
                y_train = y_tr
                y_test = y_te
                X_val = x_va
                y_val = y_va
                X_train_c = x_tr_c
                X_test_c = x_te_c
                X_val_c = x_va_c
                y_train_c = y_tr_c
                y_test_c = y_te_c
                y_val_c = y_va_c
            else:
                X_train = np.append(X_train, x_tr, axis=0)
                X_test = np.append(X_test, x_te, axis=0)
                y_train = np.append(y_train, y_tr, axis=0)
                y_test = np.append(y_test, y_te, axis=0)
                y_val = np.append(y_val, y_va, axis=0)
                X_val = np.append(X_val, x_va, axis=0)
                X_train_c = np.append(X_train_c, x_tr_c, axis=0)
                X_test_c = np.append(X_test_c, x_te_c, axis=0)
                X_val_c = np.append(X_val_c, x_va_c, axis=0)
                y_train_c = np.append(y_train_c, y_tr_c, axis=0)
                y_test_c = np.append(y_test_c, y_te_c, axis=0)
                y_val_c = np.append(y_val_c, y_va_c, axis=0)  # combine all X data from all prefixes into one array
        print(d, len(X_train), len(y_train), len(y_train_c), len(X_val), len(y_val), len(y_val_c), len(X_test), len(y_test),
              len(y_test_c))

        print('split done')
        scaler = StandardScaler()
        X_test = scaler.fit_transform(X_test)
        X_train = scaler.fit_transform(X_train)
        X_val = scaler.fit_transform(X_val)
        X_test_c = scaler.fit_transform(X_test_c)
        X_train_c = scaler.fit_transform(X_train_c)
        X_val_c = scaler.fit_transform(X_val_c)

        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        model = BinaryClassificationIndiv(no_columns=len(ref_enc_dat.loc[0]))
        model.to(device)
        weights = torch.FloatTensor(list([positive_weights[d], negative_weights[d]]))
        criterion = nn.CrossEntropyLoss(weight=weights)
        optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

        if early_stop:
            EPOCHS = 300
            model.train()
            train_data = TrainData(torch.FloatTensor(X_train),
                                   torch.FloatTensor(y_train))

            train_loader = DataLoader(dataset=train_data, batch_size=BATCH_SIZE, shuffle=False)

            X_val = torch.FloatTensor(X_val)

            es = EarlyStopping()
            done = False

            epoch = 0
            while epoch < EPOCHS and not done:
                epoch += 1
                steps = list(enumerate(train_loader))
                pbar = tqdm.tqdm(steps)
                model.train()
                epoch_acc = 0
                epoch_loss = 0
                for i, (x_batch, y_batch) in pbar:
                    optimizer.zero_grad()
                    y_batch_pred = model(x_batch.to(device))

                    loss = criterion(y_batch_pred, y_batch.to(device))

                    acc = binary_acc(y_batch_pred, y_batch.to(device)) / len(y_batch_pred[0])

                    loss.backward()
                    optimizer.step()
                    epoch_acc += acc.item()

                    loss, current = loss.item(), (i + 1) * len(x_batch)
                    epoch_loss += loss
                    if i == len(steps) - 1:
                        model.eval()
                        pred = model(X_val)
                        vloss = criterion(pred, torch.FloatTensor(y_val))
                        if es(model, vloss): done = True
                        pbar.set_description(
                            f"Epoch: {epoch}, tloss: {epoch_loss / len(train_loader)}, Acc: {epoch_acc / len(train_loader):.3f}, vloss: {vloss:>7f}, EStop:[{es.status}]")
                    else:
                        pbar.set_description(
                            f"Epoch: {epoch}, tloss {epoch_loss / len(train_loader):}, Acc: {epoch_acc / len(train_loader):.3f}")

        model.eval()
        test_data = TestData(torch.FloatTensor(X_test))
        test_loader = DataLoader(dataset=test_data, batch_size=1)

        y_pred_list = []
        y_confidence_list = []
        with torch.no_grad():
            for X_batch in test_loader:
                X_batch = X_batch.to(device)
                y_test_pred = torch.nn.functional.softmax(model(X_batch))
                y_confidence_list.append(y_test_pred.numpy())
                y_pred_tag = torch.round(y_test_pred)
                y_pred_list.append(y_pred_tag.cpu().numpy())

        y_pred_list = [a.squeeze().tolist() for a in y_pred_list]

        CM = sklearn.metrics.multilabel_confusion_matrix(y_test, y_pred_list)

        cum_trace_idxs = flatten_comprehension(cum_trace_idxs)
        pref_list = flatten_comprehension(pref_list)
        if d == dev[0]:
            y_test_cum = pd.DataFrame(data=0, columns=dev, index=range(len(y_test)))
            y_test_cum['trace_idx'] = cum_trace_idxs
            y_test_cum['prefix_length'] = pref_list
            y_pred_cum = pd.DataFrame(data=0, columns=dev, index=range(len(y_test)))
            y_pred_cum['trace_idx'] = cum_trace_idxs
            y_pred_cum['prefix_length'] = pref_list
        y_pred_cum[str('confidence' + d)] = y_confidence_list
        y_test_cum[d] = list(y_test)
        y_pred_cum[d] = y_pred_list

        metrics[d]['Precision'] = CM[0][1][1] / (CM[0][1][1] + CM[0][0][1])
        metrics[d]['Recall'] = CM[0][1][1] / (CM[0][1][1] + CM[0][1][0])
        metrics[d]['Support'] = CM[0][1][1] + CM[0][1][0]
        try:
            metrics[d]['ROC_AUC'] = sklearn.metrics.roc_auc_score(y_test, y_pred_list, average='weighted')
        except Exception as er:
            metrics[d]['ROC_AUC'] = er
        metrics[str('NoDev' + d)]['Precision'] = CM[1][1][1] / (CM[1][1][1] + CM[1][0][1])
        metrics[str('NoDev' + d)]['Recall'] = CM[1][1][1] / (CM[1][1][1] + CM[1][1][0])
        metrics[str('NoDev' + d)]['Support'] = CM[1][1][1] + CM[1][1][0]
        print(CM)

        print(metrics)
        models_collect[d] = model

        train_data_f_combs = TestData(torch.FloatTensor(X_train_c))
        train_loader_f_combs = DataLoader(dataset=train_data_f_combs, batch_size=len(X_train_c))

        y_output_train = []
        with torch.no_grad():
            for X_batch in train_loader_f_combs:
                X_batch = X_batch.to(device)
                y_test_pred = torch.nn.functional.softmax(model(X_batch))
                y_output_train.append(y_test_pred.numpy())

        test_data_f_combs = TestData(torch.FloatTensor(X_test_c))
        test_loader_f_combs = DataLoader(dataset=test_data_f_combs, batch_size=len(X_test_c))

        y_output_test = []
        with torch.no_grad():
            for X_batch in test_loader_f_combs:
                X_batch = X_batch.to(device)
                y_test_pred = torch.nn.functional.softmax(model(X_batch))
                y_output_test.append(y_test_pred.numpy())

        val_data_f_combs = TestData(torch.FloatTensor(X_val_c))
        val_loader_f_combs = DataLoader(dataset=val_data_f_combs, batch_size=len(X_val_c))

        y_output_val = []
        with torch.no_grad():
            for X_batch in val_loader_f_combs:
                X_batch = X_batch.to(device)
                y_test_pred = torch.nn.functional.softmax(model(X_batch))
                y_output_val.append(y_test_pred.numpy())

        outputs_train['NoDev' + str(d)] = y_output_train[0][:, 0]
        outputs_train['Dev' + str(d)] = y_output_train[0][:, 1]
        outputs_test['NoDev' + str(d)] = y_output_test[0][:, 0]
        outputs_test['Dev' + str(d)] = y_output_test[0][:, 1]
        outputs_val['NoDev' + str(d)] = y_output_val[0][:, 0]
        outputs_val['Dev' + str(d)] = y_output_val[0][:, 1]
        if d == dev[0]:
            outputs_train['prefix_length'] = flatten_comprehension(pref_list_train_c)
            outputs_test['prefix_length'] = flatten_comprehension(pref_list_test_c)
            outputs_val['prefix_length'] = flatten_comprehension(pref_list_val_c)
    outputs_test

    metrics.to_excel(writer, sheet_name=('Metrics'))

    writer.close()
    y_test_c.sum()
    unique_rel_combs = list(max_combs.keys())
    metrics_comb = pd.DataFrame(data=0, columns=unique_rel_combs, index=['Precision', 'Recall', 'Support', 'ROC_AUC'])
    for d in unique_rel_combs:
        metrics_comb[str('NoDev' + d)] = 0
    metrics_comb


    class Ensemble_Stack_Combs(nn.Module):
        """Stacking head that maps ensemble features to one raw score per label.

        Active path: ``layer_1 -> LeakyReLU -> LayerNorm -> Dropout -> layer_out``.
        The result is returned as raw logits; no sigmoid/softmax is applied here.

        Args:
            no_columns: Width of the input feature vector.
            no_devs: Number of output scores (one per label column).
        """

        def __init__(self, no_columns, no_devs):
            super(Ensemble_Stack_Combs, self).__init__()
            self.layer_1 = nn.Linear(no_columns, 64)
            self.activation1 = nn.LeakyReLU()
            # layer_2, activation2, batchnorm2 and Sigmoid are not used by forward().
            # They stay registered so that the state_dict keys and the order in
            # which random weights are drawn remain unchanged.
            # NOTE(review): layer_2 expects 128 inputs while layer_1 emits 64, so it
            # cannot be chained after layer_1 without changing one of the widths.
            self.layer_2 = nn.Linear(128, 64)
            self.activation2 = nn.LeakyReLU()
            self.layer_out = nn.Linear(64, no_devs)

            self.dropout = nn.Dropout(p=0.1)
            # Despite the attribute names, these are LayerNorm modules.
            self.batchnorm1 = nn.LayerNorm(64)
            self.batchnorm2 = nn.LayerNorm(64)
            self.Sigmoid = nn.Sigmoid()

        def forward(self, inputs):
            """Return raw logits of shape ``(..., no_devs)`` for ``inputs``.

            Args:
                inputs: Float tensor whose last dimension equals ``no_columns``.
            """
            # Apply layer_1 exactly once; a second, discarded call would only
            # double the cost of the first linear layer.
            x = self.activation1(self.layer_1(inputs))
            x = self.batchnorm1(x)
            x = self.dropout(x)
            return self.layer_out(x)


    class Ensemble_Stack_Combs_Single(nn.Module):
        """Two-hidden-layer stacking head producing two raw logits per sample.

        Each hidden stage is ``Linear -> LeakyReLU -> LayerNorm`` (512 then 128
        units); dropout is applied once, right before the output layer.

        Args:
            no_columns: Width of the input feature vector.
        """

        def __init__(self, no_columns):
            super().__init__()
            hidden_wide, hidden_narrow, n_classes = 512, 128, 2

            # Modules are registered in a fixed order so that parameter
            # initialisation and state_dict layout stay stable.
            self.layer_1 = nn.Linear(in_features=no_columns, out_features=hidden_wide)
            self.activation1 = nn.LeakyReLU()
            self.layer_2 = nn.Linear(in_features=hidden_wide, out_features=hidden_narrow)
            self.activation2 = nn.LeakyReLU()
            self.layer_out = nn.Linear(in_features=hidden_narrow, out_features=n_classes)

            self.dropout = nn.Dropout(p=0.1)
            # Despite the attribute names, these are LayerNorm modules.
            self.batchnorm1 = nn.LayerNorm(hidden_wide)
            self.batchnorm2 = nn.LayerNorm(hidden_narrow)

        def forward(self, inputs):
            """Run both hidden stages, then dropout and the output layer."""
            hidden = inputs
            stages = (
                (self.layer_1, self.activation1, self.batchnorm1),
                (self.layer_2, self.activation2, self.batchnorm2),
            )
            for linear, activation, norm in stages:
                hidden = norm(activation(linear(hidden)))
            return self.layer_out(self.dropout(hidden))


    len(y_train_c)
    positives = {}
    negatives = {}
    for i, label in enumerate(unique_rel_combs):
        positives[label] = y_train_c[:, i].sum()
        negatives[label] = len(y_train_c) - y_train_c[:, i].sum()
    max_Plabel = max(positives.values())
    max_Nlabel = max(negatives.values())
    max_label = max(max_Plabel, max_Nlabel)
    pir = {}
    nir = {}
    pirlbl = {}
    nirlbl = {}
    for label in unique_rel_combs:
        pir[label] = min((max(positives[label], negatives[label]) / positives[label]), 10000)
        nir[label] = max(positives[label], negatives[label]) / negatives[label]
        pirlbl[label] = max_label / positives[label]
        nirlbl[label] = max_label / negatives[label]
    pw_combs = {}
    nw_combs = {}
    for label in unique_rel_combs:
        pw_combs[label] = min((mean(pir.values()) ** (4 / (2 ** math.e)) + (np.log(pirlbl[label]))), 200)
        #pw_combs[label] =  min(2*(mean(pir.values()) ** ((2* math.e)) + (np.log(pirlbl[label]))), 100000)
        nw_combs[label] = mean(nir.values()) ** (1 / (2 * math.e)) + np.log(nirlbl[label])
    pw_combs

    if len(y_train_c[0]) == 1:
        df_y_train = pd.DataFrame(y_train_c)
        df_y_train['dev'] = 0
        for a in range(len(df_y_train)):
            df_y_train['dev'][a] = max(df_y_train.loc[a])
        df_y_train
        df_X_train = pd.DataFrame(outputs_train)
        df_X_train['ind'] = 0
        for ew in range(len(df_X_train)):
            df_X_train['ind'][ew] = ew
        df_X_train
        oss = OneSidedSelection(random_state=0, n_seeds_S=250, n_neighbors=7)

        x_resampled, y_resampled = oss.fit_resample(df_X_train, df_y_train['dev'])
        x_resampled
        outputs_train_us = df_X_train.loc[list(x_resampled['ind'])].drop('ind', axis=1)
        scaler = StandardScaler()
        outputs_train_us = scaler.fit_transform(outputs_train_us)
        outputs_val = scaler.fit_transform(outputs_val)
        outputs_test = scaler.fit_transform(outputs_test)
        y_train_c_us = df_y_train.loc[list(x_resampled['ind'])].drop('dev', axis=1)
        outputs_train_us

        model = Ensemble_Stack_Combs_Single(no_columns=len(outputs_train_us[0]))
        model.to(device)
        weights = torch.FloatTensor([8, 1])
        criterion = nn.CrossEntropyLoss(weight=weights)
        optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
        for i, urc in enumerate(unique_rel_combs):
            y_train_c_us['NoDev']=0
            for je in list(y_train_c_us.index):
                y_train_c_us['NoDev'][je]=1-y_train_c_us[i][je]
            y_val = np.array(y_val_c[:, i], )
            y_val = np.column_stack((y_val, 1 - y_val))
            y_test = np.array(y_test_c[:, i], )
            y_test = np.column_stack((y_test, 1 - y_test))

            if early_stop:
                EPOCHS = 300
                model.train()
                train_data = TrainData(torch.FloatTensor(outputs_train_us),
                                       torch.FloatTensor(y_train_c_us.to_numpy()))

                train_loader = DataLoader(dataset=train_data, batch_size=BATCH_SIZE, shuffle=False)

                X_val = torch.FloatTensor(outputs_val)

                es = EarlyStopping()
                done = False

                epoch = 0
                while epoch < EPOCHS and not done:
                    epoch += 1
                    steps = list(enumerate(train_loader))
                    pbar = tqdm.tqdm(steps)
                    model.train()
                    epoch_acc = 0
                    epoch_loss = 0
                    for i, (x_batch, y_batch) in pbar:
                        optimizer.zero_grad()
                        y_batch_pred = model(x_batch.to(device))

                        loss = criterion(y_batch_pred, y_batch.to(device))

                        acc = binary_acc(y_batch_pred, y_batch.to(device)) / len(y_batch_pred[0])

                        loss.backward()
                        optimizer.step()
                        epoch_acc += acc.item()

                        loss, current = loss.item(), (i + 1) * len(x_batch)
                        epoch_loss += loss
                        if i == len(steps) - 1:
                            model.eval()
                            pred = model(X_val)
                            vloss = criterion(pred, torch.FloatTensor(y_val))
                            if es(model, vloss): done = True
                            pbar.set_description(
                                f"Epoch: {epoch}, tloss: {epoch_loss / len(train_loader)}, Acc: {epoch_acc / len(train_loader):.3f}, vloss: {vloss:>7f}, EStop:[{es.status}]")
                        else:
                            pbar.set_description(
                                f"Epoch: {epoch}, tloss {epoch_loss / len(train_loader):}, Acc: {epoch_acc / len(train_loader):.3f}")

            model.eval()
            test_data = TestData(torch.FloatTensor(outputs_test))
            test_loader = DataLoader(dataset=test_data, batch_size=1)

            y_pred_list = []
            y_confidence_list = []
            with torch.no_grad():
                for X_batch in test_loader:
                    X_batch = X_batch.to(device)
                    y_test_pred = torch.nn.functional.softmax(model(X_batch))
                    y_confidence_list.append(y_test_pred.numpy())
                    y_pred_tag = torch.round(y_test_pred)
                    y_pred_list.append(y_pred_tag.cpu().numpy())

            y_pred_list = [a.squeeze().tolist() for a in y_pred_list]

            CM = sklearn.metrics.multilabel_confusion_matrix(y_test, y_pred_list)
            print(CM)

            metrics_comb[urc]['Precision'] = CM[0][1][1] / (CM[0][1][1] + CM[0][0][1])
            metrics_comb[urc]['Recall'] = CM[0][1][1] / (CM[0][1][1] + CM[0][1][0])
            metrics_comb[urc]['Support'] = CM[0][1][1] + CM[0][1][0]
            try:
                metrics_comb[urc]['ROC_AUC'] = sklearn.metrics.roc_auc_score(y_test, y_pred_list, average='weighted')
            except Exception as er:
                metrics_comb[urc]['ROC_AUC'] = er
            metrics_comb[str('NoDev' + urc)]['Precision'] = CM[1][1][1] / (CM[1][1][1] + CM[1][0][1])
            metrics_comb[str('NoDev' + urc)]['Recall'] = CM[1][1][1] / (CM[1][1][1] + CM[1][1][0])
            metrics_comb[str('NoDev' + urc)]['Support'] = CM[1][1][1] + CM[1][1][0]
    else:
        df_y_train = pd.DataFrame(y_train_c)
        df_y_train['dev'] = 0
        for a in range(len(df_y_train)):
            df_y_train['dev'][a] = max(df_y_train.loc[a])
        df_y_train
        df_X_train = pd.DataFrame(outputs_train)
        df_X_train['ind'] = 0
        for ew in range(len(df_X_train)):
            df_X_train['ind'][ew] = ew
        df_X_train
        oss = OneSidedSelection(random_state=0, n_seeds_S=250, n_neighbors=7)

        x_resampled, y_resampled = oss.fit_resample(df_X_train, df_y_train['dev'])
        x_resampled
        outputs_train_us = df_X_train.loc[list(x_resampled['ind'])].drop('ind', axis=1)
        scaler = StandardScaler()
        outputs_train_us = scaler.fit_transform(outputs_train_us)
        outputs_val = scaler.fit_transform(outputs_val)
        outputs_test = scaler.fit_transform(outputs_test)
        y_train_c_us = df_y_train.loc[list(x_resampled['ind'])].drop('dev', axis=1)
        outputs_train_us
        model = Ensemble_Stack_Combs(no_columns=len(outputs_train_us[0]),
                                     no_devs=len(y_train_c_us.loc[list(y_train_c_us.index)[0]]))
        #model = Ensemble_Stack_Combs(no_columns=len(outputs_train.loc[0]), no_devs=len(y_train_c[0]))
        model.to(device)
        criterion = nn.CrossEntropyLoss(weight=torch.FloatTensor(list(pw_combs.values())))
        #criterion = nn.MultiLabelSoftMarginLoss()
        optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

        if early_stop:
            EPOCHS = 300
            model.train()
            train_data = TrainData(torch.FloatTensor(outputs_train_us), torch.FloatTensor(y_train_c_us.to_numpy()))
            #train_data = TrainData(torch.FloatTensor(outputs_train.to_numpy()),torch.FloatTensor(y_train_c))

            train_loader = DataLoader(dataset=train_data, batch_size=BATCH_SIZE, shuffle=False)

            X_val = torch.FloatTensor(outputs_val)

            es = EarlyStopping()
            done = False

            epoch = 0
            while epoch < EPOCHS and not done:
                epoch += 1
                steps = list(enumerate(train_loader))
                pbar = tqdm.tqdm(steps)
                model.train()
                epoch_acc = 0
                epoch_loss = 0
                for i, (x_batch, y_batch) in pbar:
                    optimizer.zero_grad()
                    y_batch_pred = model(x_batch.to(device))

                    loss = criterion(y_batch_pred, y_batch.to(device))

                    acc = binary_acc(y_batch_pred, y_batch.to(device)) / len(y_batch_pred[0])

                    loss.backward()
                    optimizer.step()
                    epoch_acc += acc.item()

                    loss, current = loss.item(), (i + 1) * len(x_batch)
                    epoch_loss += loss
                    if i == len(steps) - 1:
                        model.eval()
                        pred = model(X_val)
                        vloss = criterion(pred, torch.FloatTensor(y_val_c))
                        if es(model, vloss): done = True
                        pbar.set_description(
                            f"Epoch: {epoch}, tloss: {epoch_loss / len(train_loader)}, Acc: {epoch_acc / len(train_loader):.3f}, vloss: {vloss:>7f}, EStop:[{es.status}]")
                    else:
                        pbar.set_description(
                            f"Epoch: {epoch}, tloss {epoch_loss / len(train_loader):}, Acc: {epoch_acc / len(train_loader):.3f}")

        model.eval()
        test_data = TestData(torch.FloatTensor(outputs_test))
        test_loader = DataLoader(dataset=test_data, batch_size=1)

        y_pred_list = []
        y_confidence_list = []
        with torch.no_grad():
            for X_batch in test_loader:
                X_batch = X_batch.to(device)
                y_test_pred = torch.nn.functional.sigmoid(model(X_batch))
                y_confidence_list.append(y_test_pred.numpy())
                y_pred_tag = torch.round(y_test_pred)
                y_pred_list.append(y_pred_tag.cpu().numpy())

        y_pred_list = [a.squeeze().tolist() for a in y_pred_list]

        CM = sklearn.metrics.multilabel_confusion_matrix(y_test_c, y_pred_list)
        print(CM)
        for i, urc in enumerate(unique_rel_combs):
            metrics_comb[urc]['Precision'] = CM[i][1][1] / (CM[i][1][1] + CM[i][0][1])
            metrics_comb[urc]['Recall'] = CM[i][1][1] / (CM[i][1][1] + CM[i][1][0])
            metrics_comb[urc]['Support'] = (CM[i][1][1] + CM[i][1][0])
            try:
                metrics_comb[urc]['ROC_AUC'] = sklearn.metrics.roc_auc_score(y_test_c[:, i], np.array(y_pred_list)[:, i],
                                                                             average='macro')
            except Exception as er:
                metrics_comb[urc]['ROC_AUC'] = er
            metrics_comb[str('NoDev' + urc)]['Precision'] = CM[i][0][0] / (CM[i][0][0] + CM[i][1][0])
            metrics_comb[str('NoDev' + urc)]['Recall'] = CM[i][0][0] / (CM[i][0][0] + CM[i][0][1])
            metrics_comb[str('NoDev' + urc)]['Support'] = CM[i][0][0] + CM[i][0][1]
    metrics_comb
    writer = pd.ExcelWriter('BPDP_combinations/' + z + '_BPDP_stacked_CIBE_combinations_FFN_w32.xlsx', engine="xlsxwriter")
    metrics_comb.to_excel(writer, sheet_name=('Metrics'))

    writer.close()

In [None]:
# Run IDP_separate_DPP_CIBE on the event log, the reference log and the alignments.
# `log`, `ref_log` and `aligned_traces` are not defined in this cell, so they must
# already exist in the kernel (presumably created by earlier cells -- verify on a
# fresh Restart & Run All).
IDP_separate_DPP_CIBE(log, ref_log, aligned_traces)

In [None]:
def IDP_separate_CIBE_confidence(log, ref_log, aligned_traces, split=1/3, u_sample=True, early_stop=True,explained=False):
    xt, z = os.path.split(file)
    import warnings
    warnings.simplefilter('ignore')
    #### get information whether deviation happened after prefix length in DF
    i = 0
    dev = []  # stores all deviations that happened
    for trace in log:
        no_moves = len(aligned_traces[i]['alignment'])
        for j in range(0, len(aligned_traces[i]['alignment'])):
            if aligned_traces[i]['alignment'][j][1] == None or aligned_traces[i]['alignment'][j][0] == \
                    aligned_traces[i]['alignment'][j][1]:
                next  # we do not care for simultaneous or silent moves
            else:
                if not str(aligned_traces[i]['alignment'][j]) in dev:
                    dev.append(str(aligned_traces[i]['alignment'][j]))
        i += 1

    y_cum_test = {}  # dict that stores label for each prefix and deviation combinations; keys are prefix length, entries are Data Frames with index = trace and columns = deviation
    dev_df = pd.DataFrame(data=0, columns=dev, index=range(
        len(log)))  # Data Frame that stores the information whether a deviation happened for each trace on trace level
    event_order = {}  # dict with event sequences for each trace
    event_count = {}  # dict with trace length for each trace
    max_ev = 0  # will be maximum trace length
    k = 0
    for trace in log:
        event_order[k] = []
        i = 0
        for event in trace:
            i += 1
            event_order[k].append(event['concept:name'])
        if i > max_ev:
            max_ev = i
        event_count[k] = len(event_order[k])
        k += 1
    i = 0
    for trace in log:
        no_moves = len(aligned_traces[i]['alignment'])
        for j in range(0, len(aligned_traces[i]['alignment'])):
            if aligned_traces[i]['alignment'][j][1] == None or aligned_traces[i]['alignment'][j][0] == \
                    aligned_traces[i]['alignment'][j][1]:
                next
            else:
                dev_df[str(aligned_traces[i]['alignment'][j])][i] = 1
        i += 1
    for ev in range(1, max_ev + 1):
        y_cum_test[
            ev] = dev_df.copy()  # initialize each prefix length with all traces and information whether deviation happened
    for ev in range(1, max_ev + 1):
        drop_idx = []
        for trace_idx in range(len(log)):
            if event_count[trace_idx] < ev:
                drop_idx.append(trace_idx)  # drop all trace labels that do not go until prefix length
        y_cum_test[ev] = y_cum_test[ev].drop(drop_idx)
    i = 0
    for trace in log:
        no_moves = len(aligned_traces[i]['alignment'])
        j = no_moves - 1  # iterator over moves in alignment, starting at the end
        m = len(event_order[i])  # iterator over event sequence, starting at the end
        while j >= 0:
            if aligned_traces[i]['alignment'][j][
                1] == None:  # if silent move, just go one move further to the beginning in the alignment
                j -= 1
            elif aligned_traces[i]['alignment'][j][0] == aligned_traces[i]['alignment'][j][1]:
                if event_order[i][m - 1] == aligned_traces[i]['alignment'][j][
                    0]:  # if synchronous move, just go one move further to the beginning in the alignment and one event forther to the beginning in the event sequence
                    j -= 1
                    m -= 1
            elif event_order[i][m - 1] == aligned_traces[i]['alignment'][j][0]:  # log move detected
                for q in range(m, max_ev + 1):
                    y_cum_test[q][str(aligned_traces[i]['alignment'][j])][
                        i] = 0  # set all prefixes from the current m to the maximum prefix length in this trace to 0 because deviation happened here but not afterwards
                j -= 1
                m -= 1
            elif m == max_ev:
                j -= 1
            else:  # model move deteceted
                for q in range(m + 1, max_ev + 1):
                    y_cum_test[q][str(aligned_traces[i]['alignment'][j])][
                        i] = 0  # set all prefixes after the current m to the maximum prefix length in this trace to 0 because deviation happened between m and m+1 but not afterwards
                j -= 1
        i += 1
    ### y_cum_test holds information whether deviation happened after prefix length


    ## ref_log will have all attributes that will be the columns for X_test and X_train
    ref_log = complex_index_encoding(ref_log,
                                     4000)  # prepare a log with the maximum length of the feature vector from CIBE to know to pad other feature vectors
    ref_dataframe1 = pm4py.convert_to_dataframe(ref_log)
    ref_dataframe = ref_dataframe1.drop_duplicates(subset=['case:concept:name'])
    ref_dataframe = ref_dataframe.filter(like='case:', axis=1)
    ref_dataframe = ref_dataframe.drop('case:concept:name', axis=1)
    ref_dataframe.columns = ref_dataframe.columns.str.replace('case:', '')
    ref_dataframe = ref_dataframe.reset_index()
    ref_raw_dat = ref_dataframe.drop('index', axis=1)
    ## dataset-specific preparation (i.e., redundant attributes, convertion to numeric)
    if z == 'aligned_traces_12A.pkl' or z == 'aligned_traces_12O.pkl' or z == 'aligned_traces_12AO.pkl':
        ref_raw_dat['AMOUNT_REQ'] = pd.to_numeric(ref_raw_dat['AMOUNT_REQ'])
        ref_clean_dat = ref_raw_dat.drop('REG_DATE', axis=1)
    elif z == 'aligned_traces_20int.pkl':
        ref_clean_dat = ref_raw_dat.drop(
            ['Permit travel permit number', 'DeclarationNumber', 'travel permit number', 'id', 'Permit ID', 'Permit id'],
            axis=1)
    elif z == 'aligned_traces_20dom.pkl':
        ref_clean_dat = ref_raw_dat.drop(['DeclarationNumber', 'id'], axis=1)
    elif z == 'aligned_traces_20prep.pkl':
        ref_clean_dat = ref_raw_dat.drop(['RfpNumber', 'Rfp_id', 'Permit travel permit number', 'Permit id'], axis=1)
    elif z == 'aligned_traces_20RfP.pkl':
        ref_clean_dat = ref_raw_dat.drop(['RfpNumber', 'Rfp_id'], axis=1)
    else:
        ref_clean_dat = ref_raw_dat.copy()
    ref_enc_dat = pd.get_dummies(ref_clean_dat)

    EPOCHS = 30
    BATCH_SIZE = 128
    LEARNING_RATE = 0.0001

    X_cum = {}
    metrics = pd.DataFrame(data=0, columns=dev, index=['Precision', 'Recall', 'Support', 'ROC_AUC'])

    # prepare X for all prefix lengths
    for prefix in range(1, max_ev + 1):
        complex_index_encoding(log, prefix)
        dataframe1 = pm4py.convert_to_dataframe(log)
        dataframe = dataframe1.drop_duplicates(subset=['case:concept:name'])
        dataframe = dataframe.filter(like='case:', axis=1)
        dataframe = dataframe.drop('case:concept:name', axis=1)
        dataframe.columns = dataframe.columns.str.replace('case:', '')
        dataframe = dataframe.reset_index()
        raw_dat = dataframe.drop('index', axis=1)
        if z == 'aligned_traces_12A.pkl' or z == 'aligned_traces_12O.pkl' or z == 'aligned_traces_12AO.pkl':
            raw_dat['AMOUNT_REQ'] = pd.to_numeric(raw_dat['AMOUNT_REQ'])
            clean_dat = raw_dat.drop('REG_DATE', axis=1)
        elif z == 'aligned_traces_20int.pkl':
            clean_dat = raw_dat.drop(
                ['Permit travel permit number', 'DeclarationNumber', 'travel permit number', 'id', 'Permit ID',
                 'Permit id'], axis=1)
        elif z == 'aligned_traces_20dom.pkl':
            clean_dat = raw_dat.drop(['DeclarationNumber', 'id'], axis=1)
        elif z == 'aligned_traces_20prep.pkl':
            clean_dat = raw_dat.drop(['RfpNumber', 'Rfp_id', 'Permit travel permit number', 'Permit id'], axis=1)
        elif z == 'aligned_traces_20RfP.pkl':
            clean_dat = raw_dat.drop(['RfpNumber', 'Rfp_id'], axis=1)
        else:
            clean_dat = raw_dat.copy()
        enc_dat = pd.get_dummies(clean_dat)
        for key in ref_enc_dat.columns:
            if not key in enc_dat.columns:
                enc_dat[key] = 0  # pad all prefixes to maximum lengths with 0
        imp = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=0)
        enc_dat = pd.DataFrame(data=imp.fit_transform(enc_dat), columns=enc_dat.columns)

        X_cum[prefix] = enc_dat.copy()
        drop_idx = []
        for trace_idx in range(len(log)):
            if event_count[trace_idx] < prefix:
                drop_idx.append(trace_idx)

        X_cum[prefix] = X_cum[prefix].drop(drop_idx)

    positive_weights = {}
    negative_weights = {}
    for label in dev:
        positive_weights[label] = 16
        negative_weights[label] = 1

    for d in dev:
        metrics[str('NoDev' + d)] = 0

    path = (os.getcwd() + '/BPDP_Classifier')  # output path
    xt, z = os.path.split(file)

    writer = pd.ExcelWriter(path + '/' + z + '_BPDP_CIBE_classification.xlsx', engine="xlsxwriter")

    x_train_idx, x_test_idx, y_train_idx, y_test_idx = train_test_split(range(len(log)), range(len(log)), test_size=split,
                                                                        random_state=0)

    dev_position = pd.DataFrame(index=x_test_idx, columns=dev, data=0)
    for d in dev:
        for idx in x_test_idx:
            for i in range(1, event_count[idx] + 1):
                if y_cum_test[i][d][idx] == 1: dev_position[d][idx] = i + 1
    dev_position_pred = pd.DataFrame(index=x_test_idx, columns=dev, data=0)
    earliness = {}

    dev_distribution = pd.DataFrame(data=0, index=['Training', 'Test'], columns=dev)
    for d in dev:
        dev_distribution[d]['Training'] = sum(dev_df[d][i] for i in x_train_idx)
        dev_distribution[d]['Test'] = sum(dev_df[d][i] for i in x_test_idx)

    dev_distribution.to_excel(writer, sheet_name=('Distribution'))


    def flatten_comprehension(matrix):
        """Flatten one nesting level: return a new list holding every item of every row of ``matrix``, in order."""
        flat_items = []
        for current_row in matrix:
            # extend() walks the row exactly like the inner loop of a nested comprehension
            flat_items.extend(current_row)
        return flat_items


    dev_trained = []
    for d in dev:
        if dev_distribution[d]['Training'] == 0:
            metrics[d] = 'No Deviation in Training Set'
            continue
        elif dev_distribution[d]['Test'] == 0:
            metrics[d] = 'No Deviation in Test Set'
            continue
        else:
            dev_trained.append(d)

        Y_cum_dev = {}
        for prefix in range(1, max_ev + 1):
            Y_cum_dev[prefix] = pd.DataFrame(y_cum_test[prefix][d])
            Y_cum_dev[prefix]['NoDev'] = 0
            for i in Y_cum_dev[prefix].index.values.tolist():
                Y_cum_dev[prefix]['NoDev'][i] = 1 - y_cum_test[prefix][d][i]
            if prefix == 1:
                print(Y_cum_dev[prefix].columns)

        if u_sample:
            imb_ref_enc_dat = ref_enc_dat.copy()
            imb_ref_enc_dat['ind'] = 0
            for i in range(len(imb_ref_enc_dat)):
                imb_ref_enc_dat['ind'][i] = i
            imb_traces = pd.DataFrame(data=0, columns=['Dev'], index=range(len(log)))
            for trace in range(len(log)):
                if dev_df[d][trace] > 0:
                    imb_traces['Dev'][trace] = 1

            imb_traces = imb_traces.drop(x_test_idx)
            imb_ref_enc_dat = imb_ref_enc_dat.drop(x_test_idx)
            imp = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=0)
            imb_ref_enc_dat = pd.DataFrame(data=imp.fit_transform(imb_ref_enc_dat), columns=imb_ref_enc_dat.columns)

            oss = OneSidedSelection(random_state=0, n_seeds_S=250, n_neighbors=7)

            X_resampled, y_resampled = oss.fit_resample(imb_ref_enc_dat, imb_traces)

            x_train_idx = list(X_resampled['ind'])
            y_train_idx = list(X_resampled['ind'])

        print('index length ', len(x_train_idx), len(x_test_idx), len(y_train_idx), len(y_test_idx))

        # validation set for early stopping
        x_train_idx, x_val_idx, y_train_idx, y_val_idx = train_test_split(x_train_idx, x_train_idx, test_size=0.2,
                                                                          random_state=0)

        enumerated_trace_idx = {}
        cum_trace_idxs = []
        pref_list = []
        for prefix in range(1, max_ev + 1):
            drop_idx = []
            for trace_idx in range(len(log)):
                if event_count[trace_idx] < prefix:
                    drop_idx.append(trace_idx)  # drop all trace encoding that do not go until prefix length

            x_te = X_cum[prefix].loc[[j for j in list(set(y_test_idx) - set(drop_idx))]].to_numpy().astype(float)
            x_tr = X_cum[prefix].loc[[j for j in list(set(y_train_idx) - set(drop_idx))]].to_numpy().astype(float)
            x_va = X_cum[prefix].loc[[j for j in list(set(y_val_idx) - set(drop_idx))]].to_numpy().astype(float)
            y_te = Y_cum_dev[prefix].loc[[j for j in list(set(y_test_idx) - set(drop_idx))]].to_numpy().astype(float)
            y_tr = Y_cum_dev[prefix].loc[[j for j in list(set(y_train_idx) - set(drop_idx))]].to_numpy().astype(float)
            y_va = Y_cum_dev[prefix].loc[[j for j in list(set(y_val_idx) - set(drop_idx))]].to_numpy().astype(float)
            cum_trace_idxs.append(list(set(y_test_idx) - set(drop_idx)))
            pref_list.append([prefix] * len(list(set(y_test_idx) - set(drop_idx))))
            enumerated_trace_idx[prefix] = list(set(y_test_idx) - set(drop_idx))
            print('subset length ', prefix, len(x_te), len(x_tr), len(y_te), len(y_tr))

            if prefix == 1:
                X_train = x_tr
                X_test = x_te
                y_train = y_tr
                y_test = y_te
                X_val = x_va
                y_val = y_va
            else:
                X_train = np.append(X_train, x_tr, axis=0)
                X_test = np.append(X_test, x_te, axis=0)
                y_train = np.append(y_train, y_tr, axis=0)
                y_test = np.append(y_test, y_te, axis=0)
                y_val = np.append(y_val, y_va, axis=0)
                X_val = np.append(X_val, x_va, axis=0)  # combine all X data from all prefixes into one array
        print(d, len(X_train), len(y_train), len(X_val), len(y_val), len(X_test), len(y_test))

        print('split done')
        scaler = StandardScaler()
        X_test = scaler.fit_transform(X_test)
        X_train = scaler.fit_transform(X_train)
        X_val = scaler.fit_transform(X_val)

        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        model = BinaryClassificationIndiv(no_columns=len(ref_enc_dat.loc[0]))
        model.to(device)
        weights = torch.FloatTensor(list([positive_weights[d], negative_weights[d]]))
        criterion = nn.CrossEntropyLoss(weight=weights)
        optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

        if early_stop:
            EPOCHS = 300
            model.train()
            train_data = TrainData(torch.FloatTensor(X_train),
                                   torch.FloatTensor(y_train))

            train_loader = DataLoader(dataset=train_data, batch_size=BATCH_SIZE, shuffle=False)

            X_val = torch.FloatTensor(X_val)

            es = EarlyStopping()
            done = False

            epoch = 0
            while epoch < EPOCHS and not done:
                epoch += 1
                steps = list(enumerate(train_loader))
                pbar = tqdm.tqdm(steps)
                model.train()
                epoch_acc = 0
                epoch_loss = 0
                for i, (x_batch, y_batch) in pbar:
                    optimizer.zero_grad()
                    y_batch_pred = model(x_batch.to(device))

                    loss = criterion(y_batch_pred, y_batch.to(device))

                    acc = binary_acc(y_batch_pred, y_batch.to(device)) / len(y_batch_pred[0])

                    loss.backward()
                    optimizer.step()
                    epoch_acc += acc.item()

                    loss, current = loss.item(), (i + 1) * len(x_batch)
                    epoch_loss += loss
                    if i == len(steps) - 1:
                        model.eval()
                        pred = model(X_val)
                        vloss = criterion(pred, torch.FloatTensor(y_val))
                        if es(model, vloss): done = True
                        pbar.set_description(
                            f"Epoch: {epoch}, tloss: {epoch_loss / len(train_loader)}, Acc: {epoch_acc / len(train_loader):.3f}, vloss: {vloss:>7f}, EStop:[{es.status}]")
                    else:
                        pbar.set_description(
                            f"Epoch: {epoch}, tloss {epoch_loss / len(train_loader):}, Acc: {epoch_acc / len(train_loader):.3f}")

        model.eval()
        test_data = TestData(torch.FloatTensor(X_test))
        test_loader = DataLoader(dataset=test_data, batch_size=1)

        y_pred_list = []
        y_confidence_list = []
        with torch.no_grad():
            for X_batch in test_loader:
                X_batch = X_batch.to(device)
                y_test_pred = torch.nn.functional.softmax(model(X_batch))
                y_confidence_list.append(y_test_pred.numpy())
                y_pred_tag = torch.round(y_test_pred)
                y_pred_list.append(y_pred_tag.cpu().numpy())

        y_pred_list = [a.squeeze().tolist() for a in y_pred_list]

        CM = sklearn.metrics.multilabel_confusion_matrix(y_test, y_pred_list)

        cum_trace_idxs = flatten_comprehension(cum_trace_idxs)
        pref_list = flatten_comprehension(pref_list)
        if d == dev[0]:
            y_test_cum = pd.DataFrame(data=0, columns=dev, index=range(len(y_test)))
            y_test_cum['trace_idx'] = cum_trace_idxs
            y_test_cum['prefix_length'] = pref_list
            y_pred_cum = pd.DataFrame(data=0, columns=dev, index=range(len(y_test)))
            y_pred_cum['trace_idx'] = cum_trace_idxs
            y_pred_cum['prefix_length'] = pref_list
        y_pred_cum[str('confidence' + d)] = y_confidence_list
        y_test_cum[d] = list(y_test)
        y_pred_cum[d] = y_pred_list

        metrics[d]['Precision'] = CM[0][1][1] / (CM[0][1][1] + CM[0][0][1])
        metrics[d]['Recall'] = CM[0][1][1] / (CM[0][1][1] + CM[0][1][0])
        metrics[d]['Support'] = CM[0][1][1] + CM[0][1][0]
        try:
            metrics[d]['ROC_AUC'] = sklearn.metrics.roc_auc_score(y_test, y_pred_list, average='weighted')
        except Exception as er:
            metrics[d]['ROC_AUC'] = er
        metrics[str('NoDev' + d)]['Precision'] = CM[1][1][1] / (CM[1][1][1] + CM[1][0][1])
        metrics[str('NoDev' + d)]['Recall'] = CM[1][1][1] / (CM[1][1][1] + CM[1][1][0])
        metrics[str('NoDev' + d)]['Support'] = CM[1][1][1] + CM[1][1][0]
        print(CM)

        to_be_checked_idx = {}
        for idx in x_test_idx:
            cum_idx = 0
            for prefix in range(1, event_count[idx] + 1):
                if prefix == 1:
                    to_be_checked_idx[idx] = [enumerated_trace_idx[1].index(idx)]
                else:
                    to_be_checked_idx[idx].append(enumerated_trace_idx[prefix].index(idx) + cum_idx)
                cum_idx += len(enumerated_trace_idx[prefix])

        for prefix in range(1, max_ev + 1):
            for idx in to_be_checked_idx.keys():
                if event_count[idx] >= prefix:
                    if prefix == 1:
                        dev_position_pred[d][idx] = y_pred_list[to_be_checked_idx[idx][prefix - 1]][0]
                    else:
                        if y_pred_list[to_be_checked_idx[idx][prefix - 1]][0] == 1 and \
                                y_pred_list[to_be_checked_idx[idx][prefix - 2]][0] == 0:
                            if dev_position[d][idx] <= prefix:
                                dev_position_pred[d][idx] = dev_position[d][idx]
                            else:
                                dev_position_pred[d][idx] = prefix

        earliness[d] = 0
        tobe_devs = 0
        for idx in to_be_checked_idx.keys():
            if dev_position[d][idx] == 0 or dev_position_pred[d][idx] == 0:
                continue
            tobe_devs += 1
            earliness[d] += dev_position_pred[d][idx] / dev_position[d][idx]
        if not tobe_devs == 0:
            earliness[d] = earliness[d] / tobe_devs

        if explained:
            import shap
            import matplotlib.pyplot as plt

            np.random.seed(42)
            e = shap.DeepExplainer(model,
                                   torch.FloatTensor(X_train[np.random.choice(X_train.shape[0], 1000, replace=False)]))

            shap_idx = []
            for j in range(len(y_pred_list)):
                if y_pred_list[j][0] == y_test[j][0] == 1:
                    shap_idx.append(j)
            shap_values = e.shap_values(torch.FloatTensor(X_test[shap_idx]))
            fig = shap.summary_plot(shap_values[0], X_test[shap_idx], plot_type='dot', feature_names=enc_dat.columns,
                                    max_display=10, plot_size=(10, 5), show=False)
            plt.savefig(path + '/ShapValues/Dev_' + z + '_' + d + '.png')
            plt.close()

            fig = shap.summary_plot(shap_values[1], X_test[shap_idx], plot_type='dot', feature_names=enc_dat.columns,
                                    max_display=10, plot_size=(10, 5), show=False)
            plt.savefig(path + '/ShapValues/NoDev_' + z + '_' + d + '.png')
            plt.close()

        print(metrics)

    avg_dev_pos = {}
    for d in dev:
        if dev_distribution[d]['Test'] == 0:
            metrics[d] = 'No Deviation in Test Set'
            continue
        if dev_distribution[d]['Training'] == 0:
            metrics[d] = 'No Deviation in Training Set'
            continue
        devs = 0
        positions = 0
        for idx in to_be_checked_idx.keys():
            if dev_position[d][idx] > 0:
                devs += 1
                positions += dev_position[d][idx]
        if devs == 0:
            continue
        avg_dev_pos[d] = positions / devs

    metrics.to_excel(writer, sheet_name=('Metrics'))
    df = pd.DataFrame(data=earliness, index=[0])
    df.to_excel(writer, sheet_name=('Earliness'))
    df = pd.DataFrame(data=avg_dev_pos, index=[0])
    df.to_excel(writer, sheet_name=('Position'))

    writer.close()
    writer = pd.ExcelWriter(path + '/' + z + '_BPDP_CIBE_confidence.xlsx', engine="xlsxwriter")
    for i in range(len(y_pred_cum)):
        for d in dev_trained:
            y_pred_cum[str('confidence' + d)][i] = y_pred_cum[str('confidence' + d)][i][0][0]
    confidences = [.5, .6, .7, .8]
    for confidence in confidences:
        conf_pred = pd.DataFrame(columns=dev, index=range(len(y_pred_cum)))
        auc_conf_pred = pd.DataFrame(columns=dev, index=range(len(y_pred_cum)))
        auc_test = pd.DataFrame(columns=dev, index=range(len(y_pred_cum)))
        for d in dev_trained:
            for i in range(len(y_pred_cum)):
                if y_pred_cum[str('confidence' + d)][i] >= confidence:
                    conf_pred[d][i] = np.array([1.0, 0.0], dtype=float)
                else:
                    conf_pred[d][i] = np.array([0.0, 1.0], dtype=float)
                auc_conf_pred[d][i] = conf_pred[d][i][0]
                auc_test[d][i] = y_test_cum[d][i][0]

            CM = sklearn.metrics.multilabel_confusion_matrix(list(y_test_cum[d]), list(conf_pred[d]))

            metrics[d]['Precision'] = CM[0][1][1] / (CM[0][1][1] + CM[0][0][1])
            metrics[d]['Recall'] = CM[0][1][1] / (CM[0][1][1] + CM[0][1][0])
            metrics[d]['Support'] = CM[0][1][1] + CM[0][1][0]
            try:
                metrics[d]['ROC_AUC'] = sklearn.metrics.roc_auc_score(list(auc_test[d]), list(auc_conf_pred[d]),
                                                                      average='weighted')
            except Exception as er:
                metrics[d]['ROC_AUC'] = er
            metrics[str('NoDev' + d)]['Precision'] = CM[1][1][1] / (CM[1][1][1] + CM[1][0][1])
            metrics[str('NoDev' + d)]['Recall'] = CM[1][1][1] / (CM[1][1][1] + CM[1][1][0])
            metrics[str('NoDev' + d)]['Support'] = CM[1][1][1]+CM[1][1][0]
        metrics.to_excel(writer, sheet_name=(str('Unfiltered' + str(confidence))))

        conf_pred = pd.DataFrame(columns=dev, index=range(len(y_pred_cum)))
        conf_test = pd.DataFrame(columns=dev, index=range(len(y_pred_cum)))
        for d in dev_trained:
            conf_pred = y_pred_cum[
                (y_pred_cum[str('confidence' + d)] > confidence) | (y_pred_cum[str('confidence' + d)] < 1 - confidence)]
            conf_test = y_test_cum.loc[conf_pred.index]
            auc_test = []
            auc_pred = []
            for i in conf_pred.index:
                auc_test.append(conf_test[d][i][0])
                auc_pred.append(conf_pred[d][i][0])
            CM = sklearn.metrics.multilabel_confusion_matrix(list(conf_test[d]), list(conf_pred[d]))

            metrics[d]['Precision'] = CM[0][1][1] / (CM[0][1][1] + CM[0][0][1])
            metrics[d]['Recall'] = CM[0][1][1] / (CM[0][1][1] + CM[0][1][0])
            metrics[d]['Support'] = CM[0][1][1] + CM[0][1][0]
            try:
                metrics[d]['ROC_AUC'] = sklearn.metrics.roc_auc_score(auc_test, auc_pred, average='weighted')
            except Exception as er:
                metrics[d]['ROC_AUC'] = er
            metrics[str('NoDev' + d)]['Precision'] = CM[1][1][1] / (CM[1][1][1] + CM[1][0][1])
            metrics[str('NoDev' + d)]['Recall'] = CM[1][1][1] / (CM[1][1][1] + CM[1][1][0])
            metrics[str('NoDev' + d)]['Support'] = CM[1][1][1] + CM[1][1][0]
        metrics.to_excel(writer, sheet_name=(str('Filtered' + str(confidence))))
    writer.close()

In [None]:
# Cell entry point: invoke IDP_separate_CIBE_confidence on the notebook-level
# `log`, `ref_log` and `aligned_traces` objects.
# NOTE(review): this relies on earlier cells having defined the function and all
# three variables — confirm it survives Restart Kernel -> Run All.
IDP_separate_CIBE_confidence(log, ref_log, aligned_traces)

In [None]:
def catboost_patterns(log, ref_log, aligned_traces, split=1/3, relevance_ths = .5):
    xt, z = os.path.split(file)
    import warnings
    warnings.simplefilter('ignore')
    #### get information whether deviation happened after prefix length in DF
    i = 0
    dev = []  # stores all deviations that happened
    for trace in log:
        no_moves = len(aligned_traces[i]['alignment'])
        for j in range(0, len(aligned_traces[i]['alignment'])):
            if aligned_traces[i]['alignment'][j][1] == None or aligned_traces[i]['alignment'][j][0] == \
                    aligned_traces[i]['alignment'][j][1]:
                next  # we do not care for simultaneous or silent moves
            else:
                if not str(aligned_traces[i]['alignment'][j]) in dev:
                    dev.append(str(aligned_traces[i]['alignment'][j]))
        i += 1

    y_cum_test = {}  # dict that stores label for each prefix and deviation combinations; keys are prefix length, entries are Data Frames with index = trace and columns = deviation
    dev_df = pd.DataFrame(data=0, columns=dev, index=range(
        len(log)))  # Data Frame that stores the information whether a deviation happened for each trace on trace level
    event_order = {}  # dict with event sequences for each trace
    event_count = {}  # dict with trace length for each trace
    max_ev = 0  # will be maximum trace length
    k = 0
    for trace in log:
        event_order[k] = []
        i = 0
        for event in trace:
            i += 1
            event_order[k].append(event['concept:name'])
        if i > max_ev:
            max_ev = i
        event_count[k] = len(event_order[k])
        k += 1
    i = 0
    for trace in log:
        no_moves = len(aligned_traces[i]['alignment'])
        for j in range(0, len(aligned_traces[i]['alignment'])):
            if aligned_traces[i]['alignment'][j][1] == None or aligned_traces[i]['alignment'][j][0] == \
                    aligned_traces[i]['alignment'][j][1]:
                next
            else:
                dev_df[str(aligned_traces[i]['alignment'][j])][i] = 1
        i += 1
    for ev in range(1, max_ev + 1):
        y_cum_test[
            ev] = dev_df.copy()  # initialize each prefix length with all traces and information whether deviation happened
    for ev in range(1, max_ev + 1):
        drop_idx = []
        for trace_idx in range(len(log)):
            if event_count[trace_idx] < ev:
                drop_idx.append(trace_idx)  # drop all trace labels that do not go until prefix length
        y_cum_test[ev] = y_cum_test[ev].drop(drop_idx)
    i = 0
    for trace in log:
        no_moves = len(aligned_traces[i]['alignment'])
        j = no_moves - 1  # iterator over moves in alignment, starting at the end
        m = len(event_order[i])  # iterator over event sequence, starting at the end
        while j >= 0:
            if aligned_traces[i]['alignment'][j][
                1] == None:  # if silent move, just go one move further to the beginning in the alignment
                j -= 1
            elif aligned_traces[i]['alignment'][j][0] == aligned_traces[i]['alignment'][j][1]:
                if event_order[i][m - 1] == aligned_traces[i]['alignment'][j][
                    0]:  # if synchronous move, just go one move further to the beginning in the alignment and one event forther to the beginning in the event sequence
                    j -= 1
                    m -= 1
            elif event_order[i][m - 1] == aligned_traces[i]['alignment'][j][0]:  # log move detected
                for q in range(m, max_ev + 1):
                    y_cum_test[q][str(aligned_traces[i]['alignment'][j])][
                        i] = 0  # set all prefixes from the current m to the maximum prefix length in this trace to 0 because deviation happened here but not afterwards
                j -= 1
                m -= 1
            elif m == max_ev:
                j -= 1
            else:  # model move deteceted
                for q in range(m + 1, max_ev + 1):
                    y_cum_test[q][str(aligned_traces[i]['alignment'][j])][
                        i] = 0  # set all prefixes after the current m to the maximum prefix length in this trace to 0 because deviation happened between m and m+1 but not afterwards
                j -= 1
        i += 1
    ### y_cum_test holds information whether deviation happened after prefix length


    x_train_idx, x_test_idx, y_train_idx, y_test_idx = train_test_split(range(len(log)), range(len(log)), test_size=split,
                                                                        random_state=0)
    trainin_dev_df = dev_df.loc[x_train_idx]
    trainin_dev_df
    trainin_dev_df.corr()
    corrMatrix = trainin_dev_df.corr()

    corrMatrix.loc[:, :] = np.tril(corrMatrix, k=-1)  # borrowed from Karl D's answer

    already_in = set()
    max_combs_l = []
    for col in corrMatrix:
        perfect_corr = corrMatrix[col][corrMatrix[col] >= relevance_ths].index.tolist()
        if perfect_corr and col not in already_in:
            already_in.update(set(perfect_corr))
            perfect_corr.append(col)
            max_combs_l.append(perfect_corr)
    max_combs_l
    test_counts = {}
    for comb in max_combs_l:
        for y in range(len(comb)):
            test_counts[comb[y]] = dev_df.loc[x_test_idx].sum()[comb[y]]
        if any(dev_df.loc[x_test_idx].sum()[comb[y]] == 0 for y in range(len(comb))):
            max_combs_l.remove(comb)
            print(comb)
    max_combs_l
    test_counts
    max_combs = {}
    for comb in max_combs_l:
        max_combs[str(comb)] = comb
    max_combs
    y_cum_test[1]
    y_cum_test_combs = {}
    for prefix in range(1, max_ev + 1):
        y_cum_test_combs[prefix] = y_cum_test[prefix].copy(deep=True)
        for comb in max_combs.keys():
            y_cum_test_combs[prefix][comb] = 0
            for i in list(y_cum_test_combs[prefix].index):
                if event_count[i] < prefix:
                    continue
                if all(y_cum_test_combs[prefix][j][i] == 1 for j in max_combs[comb]):
                    for j in max_combs[comb]:
                        y_cum_test_combs[prefix][j][i] = 0
                    y_cum_test_combs[prefix][comb][i] = 1
    trainin_dev_df.sum()
    dev_df.loc[x_test_idx].sum()[max_combs_l[0][0]]
    pi = 4
    print(y_cum_test_combs[pi].sum())
    print(y_cum_test[pi].sum())
    y_cum_test_o_combs = {}
    for prefix in range(1, max_ev + 1):
        y_cum_test_o_combs[prefix] = y_cum_test_combs[prefix][list(max_combs.keys())]
    y_cum_test_o_combs[1].sum()
    y_cum_test_combs[1]

    ## ref_log will have all attributes that will be the columns for X_test and X_train
    ref_log = complex_index_encoding(ref_log, 4000)
    ref_dataframe1 = pm4py.convert_to_dataframe(ref_log)
    ref_dataframe = ref_dataframe1.drop_duplicates(subset=['case:concept:name'])
    ref_dataframe = ref_dataframe.filter(like='case:', axis=1)
    ref_dataframe = ref_dataframe.drop('case:concept:name', axis=1)
    ref_dataframe.columns = ref_dataframe.columns.str.replace('case:', '')
    ref_dataframe = ref_dataframe.reset_index()
    ref_raw_dat = ref_dataframe.drop('index', axis=1)
    if z == 'aligned_traces_12A.pkl' or z == 'aligned_traces_12O.pkl' or z == 'aligned_traces_12AO.pkl':
        ref_raw_dat['AMOUNT_REQ'] = pd.to_numeric(ref_raw_dat['AMOUNT_REQ'])
        ref_clean_dat = ref_raw_dat.drop('REG_DATE', axis=1)
    elif z == 'aligned_traces_20int.pkl':
        ref_clean_dat = ref_raw_dat.drop(
            ['Permit travel permit number', 'DeclarationNumber', 'travel permit number', 'id', 'Permit ID', 'Permit id'],
            axis=1)
    elif z == 'aligned_traces_20dom.pkl':
        ref_clean_dat = ref_raw_dat.drop(['DeclarationNumber', 'id'], axis=1)
    elif z == 'aligned_traces_20prep.pkl':
        ref_clean_dat = ref_raw_dat.drop(['RfpNumber', 'Rfp_id', 'Permit travel permit number', 'Permit id'], axis=1)
    elif z == 'aligned_traces_20RfP.pkl':
        ref_clean_dat = ref_raw_dat.drop(['RfpNumber', 'Rfp_id'], axis=1)
    else:
        ref_clean_dat = ref_raw_dat.copy()
    ref_enc_dat = pd.get_dummies(ref_clean_dat)

    X_cum = {}
    metrics = pd.DataFrame(data=0, columns=dev,
                           index=['Precision', 'Recall', 'ROC_AUC'])

    # prepare X for all prefix lengths
    for prefix in range(1, max_ev + 1):
        complex_index_encoding(log, prefix)
        dataframe1 = pm4py.convert_to_dataframe(log)
        dataframe = dataframe1.drop_duplicates(subset=['case:concept:name'])
        dataframe = dataframe.filter(like='case:', axis=1)
        dataframe = dataframe.drop('case:concept:name', axis=1)
        dataframe.columns = dataframe.columns.str.replace('case:', '')
        dataframe = dataframe.reset_index()
        raw_dat = dataframe.drop('index', axis=1)
        if z == 'aligned_traces_12A.pkl' or z == 'aligned_traces_12O.pkl' or z == 'aligned_traces_12AO.pkl':
            raw_dat['AMOUNT_REQ'] = pd.to_numeric(raw_dat['AMOUNT_REQ'])
            clean_dat = raw_dat.drop('REG_DATE', axis=1)
        elif z == 'aligned_traces_20int.pkl':
            clean_dat = raw_dat.drop(
                ['Permit travel permit number', 'DeclarationNumber', 'travel permit number', 'id', 'Permit ID',
                 'Permit id'], axis=1)
        elif z == 'aligned_traces_20dom.pkl':
            clean_dat = raw_dat.drop(['DeclarationNumber', 'id'], axis=1)
        elif z == 'aligned_traces_20prep.pkl':
            clean_dat = raw_dat.drop(['RfpNumber', 'Rfp_id', 'Permit travel permit number', 'Permit id'], axis=1)
        elif z == 'aligned_traces_20RfP.pkl':
            clean_dat = raw_dat.drop(['RfpNumber', 'Rfp_id'], axis=1)
        else:
            clean_dat = raw_dat.copy()
        enc_dat = pd.get_dummies(clean_dat)
        for key in ref_enc_dat.columns:
            if not key in enc_dat.columns:
                enc_dat[key] = 0  # pad all prefixes to maximum lengths with 0
        imp = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=0)
        enc_dat = pd.DataFrame(data=imp.fit_transform(enc_dat), columns=enc_dat.columns)

        X_cum[prefix] = enc_dat.copy()
        drop_idx = []
        for trace_idx in range(len(log)):
            if event_count[trace_idx] < prefix:
                drop_idx.append(trace_idx)

        X_cum[prefix] = X_cum[prefix].drop(drop_idx)

    labels = dev  # ['label_1', ...., 'label_6']

    positive_weights = {}
    negative_weights = {}
    for label in labels:
        positive_weights[label] = 16
        negative_weights[label] = 1
        #positive_weights[label] = (mean(pir.values())+statistics.stdev(pir.values()))**(1/(2*math.e))+np.log(pir[label])
        #negative_weights[label] = (mean(nir.values())+statistics.stdev(nir.values()))**(1/(2*math.e))+np.log(nir[label])

    for d in dev:
        metrics[str('NoDev' + d)] = 0

    path = (os.getcwd() + '/CatBoost')
    xt, z = os.path.split(file)

    writer = pd.ExcelWriter(
        path + '/' + z + '_MC_catoost_ES_' + str(early_stop) + '_' + str(round(split, 2)) + '.xlsx',
        engine="xlsxwriter")

    x_train_idx, x_test_idx, y_train_idx, y_test_idx = train_test_split(range(len(log)), range(len(log)), test_size=split,
                                                                        random_state=0)

    dev_distribution = pd.DataFrame(data=0, index=['Training', 'Test'], columns=dev)

    for d in dev:
        dev_distribution[d]['Training'] = sum(dev_df[d][i] for i in x_train_idx)
        dev_distribution[d]['Test'] = sum(dev_df[d][i] for i in x_test_idx)

    dev_distribution.to_excel(writer, sheet_name=('Distribution'))

    outputs_train = pd.DataFrame()
    outputs_test = pd.DataFrame()
    outputs_val = pd.DataFrame()


    def flatten_comprehension(matrix):
        """Return a single list with the elements of each row of ``matrix`` concatenated (one level deep)."""
        collected = []
        for row in matrix:
            for element in row:
                collected.append(element)
        return collected


    x_train_idx_c, x_test_idx_c, y_train_idx_c, y_test_idx_c = train_test_split(range(len(log)), range(len(log)),
                                                                                test_size=split,
                                                                                random_state=0)
    x_train_idx_c, x_val_idx_c, y_train_idx_c, y_val_idx_c = train_test_split(x_train_idx_c, x_train_idx_c, test_size=0.2,
                                                                              random_state=0)

    x_train_idx, x_val_idx, y_train_idx, y_val_idx = train_test_split(x_train_idx, x_train_idx, test_size=0.2,
                                                                      random_state=0)

    for d in dev:

        Y_cum_dev = {}
        for prefix in range(1, max_ev + 1):
            Y_cum_dev[prefix] = pd.DataFrame(y_cum_test[prefix][d])
            Y_cum_dev[prefix]['NoDev'] = 0
            for i in Y_cum_dev[prefix].index.values.tolist():
                Y_cum_dev[prefix]['NoDev'][i] = 1 - y_cum_test[prefix][d][i]
            if prefix == 1:
                print(Y_cum_dev[prefix].columns)

        print('index length ', len(x_train_idx), len(x_test_idx), len(y_train_idx), len(y_test_idx))

        enumerated_trace_idx = {}
        cum_trace_idxs = []
        pref_list = []
        pref_list_train_c = []
        pref_list_test_c = []
        pref_list_val_c = []
        for prefix in range(1, max_ev + 1):
            drop_idx = []
            for trace_idx in range(len(log)):
                if event_count[trace_idx] < prefix:
                    drop_idx.append(trace_idx)  # drop all trace encoding that do not go until prefix length

        enumerated_trace_idx = {}
        cum_trace_idxs = []
        pref_list = []
        pref_list_train_c = []
        pref_list_test_c = []
        pref_list_val_c = []
        for prefix in range(1, max_ev + 1):
            drop_idx = []
            for trace_idx in range(len(log)):
                if event_count[trace_idx] < prefix:
                    drop_idx.append(trace_idx)  # drop all trace encoding that do not go until prefix length

            x_te = X_cum[prefix].loc[[j for j in list(set(y_test_idx) - set(drop_idx))]].to_numpy().astype(float)
            x_tr = X_cum[prefix].loc[[j for j in list(set(y_train_idx) - set(drop_idx))]].to_numpy().astype(float)
            x_va = X_cum[prefix].loc[[j for j in list(set(y_val_idx) - set(drop_idx))]].to_numpy().astype(float)
            y_te = Y_cum_dev[prefix].loc[[j for j in list(set(y_test_idx) - set(drop_idx))]].to_numpy().astype(float)
            y_tr = Y_cum_dev[prefix].loc[[j for j in list(set(y_train_idx) - set(drop_idx))]].to_numpy().astype(float)
            y_va = Y_cum_dev[prefix].loc[[j for j in list(set(y_val_idx) - set(drop_idx))]].to_numpy().astype(float)
            x_te_c = X_cum[prefix].loc[[j for j in list(set(y_test_idx_c) - set(drop_idx))]].to_numpy().astype(float)
            x_tr_c = X_cum[prefix].loc[[j for j in list(set(y_train_idx_c) - set(drop_idx))]].to_numpy().astype(float)
            x_va_c = X_cum[prefix].loc[[j for j in list(set(y_val_idx_c) - set(drop_idx))]].to_numpy().astype(float)
            y_te_c = y_cum_test_o_combs[prefix].loc[[j for j in list(set(y_test_idx) - set(drop_idx))]].to_numpy().astype(
                float)
            y_tr_c = y_cum_test_o_combs[prefix].loc[
                [j for j in list(set(y_train_idx_c) - set(drop_idx))]].to_numpy().astype(float)
            y_va_c = y_cum_test_o_combs[prefix].loc[[j for j in list(set(y_val_idx_c) - set(drop_idx))]].to_numpy().astype(
                float)
            cum_trace_idxs.append(list(set(y_test_idx) - set(drop_idx)))
            pref_list.append([prefix] * len(list(set(y_test_idx) - set(drop_idx))))
            pref_list_train_c.append([prefix] * len(list(set(y_train_idx_c) - set(drop_idx))))
            pref_list_test_c.append([prefix] * len(list(set(y_test_idx) - set(drop_idx))))
            pref_list_val_c.append([prefix] * len(list(set(y_val_idx_c) - set(drop_idx))))
            enumerated_trace_idx[prefix] = list(set(y_test_idx) - set(drop_idx))
            print('subset length ', prefix, len(x_te), len(x_tr), len(y_te), len(y_tr))

            if prefix == 1:
                X_train = x_tr
                X_test = x_te
                y_train = y_tr
                y_test = y_te
                X_val = x_va
                y_val = y_va
                X_train_c = x_tr_c
                X_test_c = x_te_c
                X_val_c = x_va_c
                y_train_c = y_tr_c
                y_test_c = y_te_c
                y_val_c = y_va_c
            else:
                X_train = np.append(X_train, x_tr, axis=0)
                X_test = np.append(X_test, x_te, axis=0)
                y_train = np.append(y_train, y_tr, axis=0)
                y_test = np.append(y_test, y_te, axis=0)
                y_val = np.append(y_val, y_va, axis=0)
                X_val = np.append(X_val, x_va, axis=0)
                X_train_c = np.append(X_train_c, x_tr_c, axis=0)
                X_test_c = np.append(X_test_c, x_te_c, axis=0)
                X_val_c = np.append(X_val_c, x_va_c, axis=0)
                y_train_c = np.append(y_train_c, y_tr_c, axis=0)
                y_test_c = np.append(y_test_c, y_te_c, axis=0)
                y_val_c = np.append(y_val_c, y_va_c, axis=0)  # combine all X data from all prefixes into one array
        print(d, len(X_train), len(y_train), len(y_train_c), len(X_val), len(y_val), len(y_val_c), len(X_test), len(y_test),
              len(y_test_c))

        cat_y_train = y_train[:, 0]
        cat_y_val = y_val[:, 0]
        cat_y_test = y_test[:, 0]
        cat_y_train
        from catboost import CatBoostClassifier

        catboost = CatBoostClassifier(verbose=False, random_state=0, scale_pos_weight=16, early_stopping_rounds=10)
        try:
            catboost.fit(X_train, cat_y_train)
            y_pred = catboost.predict(X_test)
            y_pred
            y_preds = np.stack((y_pred, 1 - y_pred), axis=1)
            y_pred_list = y_preds.tolist()
            y_pred_list

            CM = sklearn.metrics.multilabel_confusion_matrix(y_test, y_pred_list)

            metrics[d]['Precision'] = CM[0][1][1] / (CM[0][1][1] + CM[0][0][1])
            metrics[d]['Recall'] = CM[0][1][1] / (CM[0][1][1] + CM[0][1][0])
            try:
                metrics[d]['ROC_AUC'] = sklearn.metrics.roc_auc_score(y_test, y_pred_list, average='weighted')
            except Exception as er:
                metrics[d]['ROC_AUC'] = er
            metrics[str('NoDev' + d)]['Precision'] = CM[1][1][1] / (CM[1][1][1] + CM[1][0][1])
            metrics[str('NoDev' + d)]['Recall'] = CM[1][1][1] / (CM[1][1][1] + CM[1][1][0])

            print(CM)

            to_be_checked_idx = {}
            for idx in x_test_idx:
                cum_idx = 0
                for prefix in range(1, event_count[idx] + 1):
                    if prefix == 1:
                        to_be_checked_idx[idx] = [enumerated_trace_idx[1].index(idx)]
                    else:
                        to_be_checked_idx[idx].append(enumerated_trace_idx[prefix].index(idx) + cum_idx)
                    cum_idx += len(enumerated_trace_idx[prefix])

            y_output_train = catboost.predict_proba(X_train_c)
            y_output_test = catboost.predict_proba(X_test_c)
            y_output_val = catboost.predict_proba(X_val_c)

            outputs_train['NoDev' + str(d)] = y_output_train[:, 0]
            outputs_train['Dev' + str(d)] = y_output_train[:, 1]
            outputs_test['NoDev' + str(d)] = y_output_test[:, 0]
            outputs_test['Dev' + str(d)] = y_output_test[:, 1]
            outputs_val['NoDev' + str(d)] = y_output_val[:, 0]
            outputs_val['Dev' + str(d)] = y_output_val[:, 1]
            if d == dev[0]:
                outputs_train['prefix_length'] = flatten_comprehension(pref_list_train_c)
                outputs_test['prefix_length'] = flatten_comprehension(pref_list_test_c)
                outputs_val['prefix_length'] = flatten_comprehension(pref_list_val_c)

        except Exception as er:
            metrics[d] = er
    metrics.to_excel(writer, sheet_name=('Metrics'))

    writer.close()
    outputs_test
    y_test_c.sum()
    unique_rel_combs = list(max_combs.keys())
    metrics_comb = pd.DataFrame(data=0, columns=unique_rel_combs, index=['Precision', 'Recall', 'Support', 'ROC_AUC'])
    for d in unique_rel_combs:
        metrics_comb[str('NoDev' + d)] = 0
    outputs_train
    positives = {}
    negatives = {}
    for i, label in enumerate(unique_rel_combs):
        positives[label] = y_train_c[:, i].sum()
        negatives[label] = len(y_train_c) - y_train_c[:, i].sum()
    max_Plabel = max(positives.values())
    max_Nlabel = max(negatives.values())
    max_label = max(max_Plabel, max_Nlabel)
    pir = {}
    nir = {}
    pirlbl = {}
    nirlbl = {}
    for label in unique_rel_combs:
        pir[label] = min((max(positives[label], negatives[label]) / positives[label]), 10000)
        nir[label] = max(positives[label], negatives[label]) / negatives[label]
        pirlbl[label] = max_label / positives[label]
        nirlbl[label] = max_label / negatives[label]
    pw_combs = {}
    nw_combs = {}
    for label in unique_rel_combs:
        pw_combs[label] = min((mean(pir.values()) ** (4 / (2 ** math.e)) + (np.log(pirlbl[label]))), 200)
        #pw_combs[label] =  min(2*(mean(pir.values()) ** ((2* math.e)) + (np.log(pirlbl[label]))), 100000)
        nw_combs[label] = mean(nir.values()) ** (1 / (2 * math.e)) + np.log(nirlbl[label])
    pw_combs
    metrics_comb
    if len(y_train_c[0]) == 1:
        catboost = CatBoostClassifier(verbose=False, random_state=0, scale_pos_weight=list(pw_combs.values())[0],
                                      early_stopping_rounds=10)
        try:
            catboost.fit(outputs_train, y_train_c)
            y_pred = catboost.predict(outputs_test)
            y_pred
            y_preds = np.stack((y_pred, 1 - y_pred), axis=1)
            y_pred_list = y_preds.tolist()
            y_pred_list
            y_test_c = np.column_stack((y_test_c, 1 - y_test_c))

            CM = sklearn.metrics.multilabel_confusion_matrix(y_test_c, y_pred_list)
            print(CM)
            for i, urc in enumerate(unique_rel_combs):
                metrics_comb[urc]['Precision'] = CM[i][1][1] / (CM[i][1][1] + CM[i][0][1])
                metrics_comb[urc]['Recall'] = CM[i][1][1] / (CM[i][1][1] + CM[i][1][0])
                metrics_comb[urc]['Support'] = (CM[i][1][1] + CM[i][1][0])
                try:
                    metrics_comb[urc]['ROC_AUC'] = sklearn.metrics.roc_auc_score(y_test_c[:, i],
                                                                                 np.array(y_pred_list)[:, i],
                                                                                 average='macro')
                except Exception as er:
                    metrics_comb[urc]['ROC_AUC'] = er
                metrics_comb[str('NoDev' + urc)]['Precision'] = CM[i][0][0] / (CM[i][0][0] + CM[i][1][0])
                metrics_comb[str('NoDev' + urc)]['Recall'] = CM[i][0][0] / (CM[i][0][0] + CM[i][0][1])
                metrics_comb[str('NoDev' + urc)]['Support'] = CM[i][0][0] + CM[i][0][1]
        except Exception as er:
            metrics_comb[unique_rel_combs[0]] = er
    else:
        catboost = CatBoostClassifier(verbose=False, random_state=0, loss_function='MultiLogloss',
                                      early_stopping_rounds=10)
        try:
            metric_combs = unique_rel_combs
            for ri in range(len(y_train_c[0]) - 1, -1, -1):
                if y_train_c[:, ri].sum() == 0:
                    y_train_c = np.delete(y_train_c, ri, 1)
                    y_test_c = np.delete(y_test_c, ri, 1)
                    del metric_combs[ri]

            catboost.fit(outputs_train, y_train_c)
            y_pred = catboost.predict(outputs_test)

            y_pred_list = y_pred.tolist()

            CM = sklearn.metrics.multilabel_confusion_matrix(y_test_c, y_pred_list)
            print(CM)
            for i, urc in enumerate(metric_combs):
                metrics_comb[urc]['Precision'] = CM[i][1][1] / (CM[i][1][1] + CM[i][0][1])
                metrics_comb[urc]['Recall'] = CM[i][1][1] / (CM[i][1][1] + CM[i][1][0])
                metrics_comb[urc]['Support'] = (CM[i][1][1] + CM[i][1][0])
                try:
                    metrics_comb[urc]['ROC_AUC'] = sklearn.metrics.roc_auc_score(y_test_c[:, i],
                                                                                 np.array(y_pred_list)[:, i],
                                                                                 average='macro')
                except Exception as er:
                    metrics_comb[urc]['ROC_AUC'] = er
                metrics_comb[str('NoDev' + urc)]['Precision'] = CM[i][0][0] / (CM[i][0][0] + CM[i][1][0])
                metrics_comb[str('NoDev' + urc)]['Recall'] = CM[i][0][0] / (CM[i][0][0] + CM[i][0][1])
                metrics_comb[str('NoDev' + urc)]['Support'] = CM[i][0][0] + CM[i][0][1]

        except Exception as er:
            metrics_comb[unique_rel_combs[0]] = er
    writer = pd.ExcelWriter('BPDP_combinations/' + z + '_catboost.xlsx', engine="xlsxwriter")
    metrics_comb.to_excel(writer, sheet_name=('Metrics'))

    writer.close()

In [None]:
## Load the feature vectors from MPPN.
# `ask_for_path` lists the files found under os.getcwd() + REL_INPUT_PATH and, because an
# index is passed, selects entry 17 of that os.listdir() result without prompting.
# NOTE(review): os.listdir() order is not guaranteed to be stable, so the hard-coded 17 may
# point at a different file on another machine - confirm the printed listing.
# `file` deliberately stays a module-level name: the functions defined in the following
# cells read it (os.path.split(file)) to derive the names of their output workbooks.
file = ask_for_path(REL_INPUT_PATH, 17)  # adjust the index/path to your own input folder
# SECURITY: pickle.load executes arbitrary code embedded in the file - only open pickles
# that you created yourself or that come from a trusted source.
with open(file, 'rb') as f:
    pd_cases_fv = pickle.load(f)


def IDP_separate_MPPN(log, pd_cases_fv, aligned_traces, split=1 / 3, u_sample=True, early_stop=True):
    """Train and evaluate one binary deviation classifier per deviation type on MPPN feature vectors.

    Every alignment move that is neither silent (second element ``None``) nor synchronous
    (both elements equal) is a deviation type, identified by ``str(move)``. For each prefix
    length a label frame records whether such a deviation still occurs *after* that prefix.
    For every deviation type that occurs in both the training and the test traces a
    ``BinaryClassificationIndiv`` network is trained on the stacked prefixes of the training
    traces and evaluated on the stacked prefixes of the test traces.

    Results are written to
    ``<cwd>/BPDP_MPPN_Classifier/<basename of file>_BPDP_MPPN_classification.xlsx`` with the
    sheets ``Distribution``, ``Metrics``, ``Earliness``, ``Position`` and - only when a
    low-support note was recorded - ``Notes``.

    The function reads the module-level variable ``file`` (for the workbook name) and
    silences all Python warnings process-wide via ``warnings.simplefilter('ignore')``.

    Args:
        log: Sized iterable of traces; each trace is an iterable of events that can be
            indexed with ``'concept:name'``.
        pd_cases_fv: DataFrame with a default integer index and the columns
            ``'case:concept:name'``, ``'trace'`` (its length is used as the prefix length)
            and ``'fv'`` (the feature vector). Rows of one case are assumed to be
            contiguous and cases to appear in the same order as the traces in ``log``
            - NOTE(review): confirm against the MPPN export.
        aligned_traces: Sequence parallel to ``log``; ``aligned_traces[i]['alignment']`` is
            a list of 2-tuples whose first element is compared with the event names.
        split: Fraction of traces held out as test set (``test_size`` of ``train_test_split``).
        u_sample: If true, the training traces are undersampled per deviation with
            ``OneSidedSelection`` on the prefix-1 feature vectors.
        early_stop: If true, train for at most 300 epochs and stop as soon as
            ``EarlyStopping`` fires on the validation loss; otherwise train for a fixed
            30 epochs.

    Returns:
        None.

    Raises:
        ValueError: If a synchronous alignment move does not match the event sequence of
            its trace (previously this case looped forever).
    """
    import warnings
    warnings.simplefilter('ignore')
    z = os.path.basename(file)  # name of the loaded feature file, used for the workbook name

    EPOCHS = 30  # fixed number of epochs when early stopping is disabled
    EARLY_STOP_MAX_EPOCHS = 300  # upper bound on epochs when early stopping is enabled
    BATCH_SIZE = 128
    LEARNING_RATE = 0.0001
    # Class weights for CrossEntropyLoss: first entry weighs the deviation column, second the 'NoDev' column.
    POSITIVE_WEIGHT = 16
    NEGATIVE_WEIGHT = 1

    n_traces = len(log)

    #### collect all deviation types (log moves and model moves) that occur in the alignments
    dev = []
    for i in range(n_traces):
        for move in aligned_traces[i]['alignment']:
            if move[1] is None or move[0] == move[1]:
                continue  # we do not care for silent or synchronous moves
            if str(move) not in dev:
                dev.append(str(move))

    #### event sequence and length of every trace
    event_order = {}  # dict with event sequences for each trace
    event_count = {}  # dict with trace length for each trace
    for k, trace in enumerate(log):
        event_order[k] = [event['concept:name'] for event in trace]
        event_count[k] = len(event_order[k])
    max_ev = max(event_count.values(), default=0)  # maximum trace length

    # traces that are shorter than a given prefix length (they have no row for that prefix)
    too_short = {ev: [t for t in range(n_traces) if event_count[t] < ev] for ev in range(1, max_ev + 1)}

    #### trace-level information whether a deviation happened
    dev_df = pd.DataFrame(data=0, columns=dev, index=range(n_traces))
    for i in range(n_traces):
        for move in aligned_traces[i]['alignment']:
            if move[1] is None or move[0] == move[1]:
                continue
            dev_df.at[i, str(move)] = 1

    #### y_cum_test: keys are prefix lengths, values are frames (index = trace, columns = deviation)
    y_cum_test = {}
    for ev in range(1, max_ev + 1):
        # start from the trace-level labels and keep only traces that reach this prefix length
        y_cum_test[ev] = dev_df.copy().drop(index=too_short[ev])

    # Walk every alignment backwards and reset the label to 0 for all prefixes at or after
    # the position of the deviation. Only prefixes up to the trace's own length carry a row
    # for the trace, so the updates are limited to those.
    for i in range(n_traces):
        alignment = aligned_traces[i]['alignment']
        last_prefix = event_count[i]
        j = len(alignment) - 1  # iterator over moves in alignment, starting at the end
        m = len(event_order[i])  # iterator over event sequence, starting at the end
        while j >= 0:
            move = alignment[j]
            current_event = event_order[i][m - 1] if m > 0 else None
            if move[1] is None:  # silent move: just go one move further to the beginning
                j -= 1
            elif move[0] == move[1]:  # synchronous move: consume one move and one event
                if current_event != move[0]:
                    # Without this check neither j nor m would change and the loop would never end.
                    raise ValueError(
                        'Alignment of trace ' + str(i) + ' does not match its event sequence: synchronous move '
                        + str(move) + ' found at event position ' + str(m))
                j -= 1
                m -= 1
            elif current_event == move[0]:  # log move detected
                for q in range(m, last_prefix + 1):
                    # deviation happened at event m but not afterwards
                    y_cum_test[q].at[i, str(move)] = 0
                j -= 1
                m -= 1
            elif m == max_ev:  # model move after the last event of a maximum-length trace: no later prefix
                j -= 1
            else:  # model move detected
                for q in range(m + 1, last_prefix + 1):
                    # deviation happened between m and m+1 but not afterwards
                    y_cum_test[q].at[i, str(move)] = 0
                j -= 1
    ### y_cum_test holds information whether deviation happened after prefix length

    #### X_cum: for every prefix length one list with the feature vector of every case
    n_cases = len(pd_cases_fv['case:concept:name'].unique())
    X_cum = {pref: [np.nan] * n_cases for pref in range(1, max_ev + 1)}
    n_rows = len(pd_cases_fv)
    row = 0  # position of the current case
    for counter in range(n_rows):
        record = pd_cases_fv.loc[counter]
        prefix_len = len(record.trace)
        X_cum[prefix_len][row] = record.fv
        is_last_of_case = (counter == n_rows - 1 or
                           record['case:concept:name'] != pd_cases_fv.loc[counter + 1]['case:concept:name'])
        if is_last_of_case:
            # reuse the last available feature vector for all longer prefix lengths
            for missing in range(prefix_len + 1, max_ev + 1):
                X_cum[missing][row] = record.fv
            row += 1

    # object dtype: the cells later hold ints, floats, status strings and (for ROC_AUC) exceptions
    metrics = pd.DataFrame(data=0, columns=dev + ['NoDev' + d for d in dev],
                           index=['Precision', 'Recall', 'Support', 'ROC_AUC'], dtype=object)

    base_train_idx, x_test_idx, _, y_test_idx = train_test_split(range(n_traces), range(n_traces),
                                                                 test_size=split, random_state=0)

    # first prefix length (+1) from which on the deviation does not occur any more; 0 = no deviation
    dev_position = pd.DataFrame(index=x_test_idx, columns=dev, data=0)
    for d in dev:
        for idx in x_test_idx:
            for i in range(1, event_count[idx] + 1):
                if y_cum_test[i].at[idx, d] == 1:
                    dev_position.at[idx, d] = i + 1
    dev_position_pred = pd.DataFrame(index=x_test_idx, columns=dev, data=0)
    earliness = {}
    notes = {}  # low-support remarks per deviation

    dev_distribution = pd.DataFrame(data=0, index=['Training', 'Test'], columns=dev)
    for d in dev:
        dev_distribution.at['Training', d] = int(dev_df.loc[list(base_train_idx), d].sum())
        dev_distribution.at['Test', d] = int(dev_df.loc[list(x_test_idx), d].sum())

    # The test rows do not depend on the deviation, so they are determined once.
    enumerated_trace_idx = {}  # prefix length -> test traces in the row order of the stacked test matrix
    block_offset = {}  # prefix length -> first row of that prefix block in the stacked test matrix
    stacked_rows = 0
    for prefix in range(1, max_ev + 1):
        enumerated_trace_idx[prefix] = list(set(y_test_idx) - set(too_short[prefix]))
        block_offset[prefix] = stacked_rows
        stacked_rows += len(enumerated_trace_idx[prefix])
    row_in_block = {prefix: {t: pos for pos, t in enumerate(rows)}
                    for prefix, rows in enumerated_trace_idx.items()}
    # test trace -> rows of its prefixes 1..len(trace) in the stacked test matrix
    to_be_checked_idx = {}
    for idx in x_test_idx:
        if event_count[idx] >= 1:
            to_be_checked_idx[idx] = [block_offset[prefix] + row_in_block[prefix][idx]
                                      for prefix in range(1, event_count[idx] + 1)]

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    for d in dev:
        if dev_distribution.at['Test', d] == 0:
            metrics[d] = 'No Deviation in Test Set'
            continue
        if dev_distribution.at['Training', d] == 0:
            metrics[d] = 'No Deviation in Training Set'
            continue
        if dev_distribution.at['Test', d] / len(x_test_idx) <= 0.05:
            notes[d] = str(
                'Only very few deviations in Test Set:' + str(dev_distribution.at['Test', d] / len(x_test_idx)))
        elif dev_distribution.at['Training', d] / len(base_train_idx) <= 0.05:
            notes[d] = str(
                'Only very few deviations in Training Set:' + str(
                    dev_distribution.at['Training', d] / len(base_train_idx)))

        # two-column target per prefix: [deviation d, 'NoDev']
        Y_cum_dev = {}
        for prefix in range(1, max_ev + 1):
            labels = y_cum_test[prefix][[d]].copy()
            labels['NoDev'] = 1 - labels[d]
            Y_cum_dev[prefix] = labels
            if prefix == 1:
                print(labels.columns)

        # The candidate training traces are derived from the original split for every
        # deviation, so the split of one deviation never shrinks the data of the next one.
        if u_sample:
            imb_ref_enc_dat = pd.DataFrame(X_cum[1]).add_prefix("c")
            imb_ref_enc_dat['ind'] = np.arange(len(imb_ref_enc_dat), dtype=np.int64)  # remember the trace index
            imb_traces = pd.DataFrame({'Dev': (dev_df[d] > 0).astype('int64')})

            imb_traces = imb_traces.drop(x_test_idx)
            imb_ref_enc_dat = imb_ref_enc_dat.drop(x_test_idx)
            imp = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=0)
            imb_ref_enc_dat = pd.DataFrame(data=imp.fit_transform(imb_ref_enc_dat), columns=imb_ref_enc_dat.columns)

            oss = OneSidedSelection(random_state=0, n_seeds_S=250, n_neighbors=7)

            X_resampled, y_resampled = oss.fit_resample(imb_ref_enc_dat, imb_traces)

            # the imputer returns floats, the trace indices are integers
            candidate_idx = [int(v) for v in X_resampled['ind']]
        else:
            candidate_idx = list(base_train_idx)

        print('index length ', len(candidate_idx), len(x_test_idx), len(candidate_idx), len(y_test_idx))

        d_train_idx, d_val_idx = train_test_split(candidate_idx, test_size=0.2, random_state=0)

        x_tr_parts, x_va_parts, x_te_parts = [], [], []
        y_tr_parts, y_va_parts, y_te_parts = [], [], []
        for prefix in range(1, max_ev + 1):
            P = pd.DataFrame(X_cum[prefix]).add_prefix("c")
            short = set(too_short[prefix])  # traces that do not go until prefix length
            test_rows = enumerated_trace_idx[prefix]
            train_rows = list(set(d_train_idx) - short)
            val_rows = list(set(d_val_idx) - short)

            x_te = P.loc[test_rows].to_numpy().astype(float)
            x_tr = P.loc[train_rows].to_numpy().astype(float)
            x_va = P.loc[val_rows].to_numpy().astype(float)
            y_te = Y_cum_dev[prefix].loc[test_rows].to_numpy().astype(float)
            y_tr = Y_cum_dev[prefix].loc[train_rows].to_numpy().astype(float)
            y_va = Y_cum_dev[prefix].loc[val_rows].to_numpy().astype(float)
            print('subset length ', prefix, len(x_te), len(x_tr), len(y_te), len(y_tr))

            x_tr_parts.append(x_tr)
            x_va_parts.append(x_va)
            x_te_parts.append(x_te)
            y_tr_parts.append(y_tr)
            y_va_parts.append(y_va)
            y_te_parts.append(y_te)

        # combine the data from all prefixes into one array each
        X_train = np.concatenate(x_tr_parts, axis=0)
        X_val = np.concatenate(x_va_parts, axis=0)
        X_test = np.concatenate(x_te_parts, axis=0)
        y_train = np.concatenate(y_tr_parts, axis=0)
        y_val = np.concatenate(y_va_parts, axis=0)
        y_test = np.concatenate(y_te_parts, axis=0)
        print(d, len(X_train), len(y_train), len(X_val), len(y_val), len(X_test), len(y_test))

        print('split done')
        # Fit the scaler on the training data only and apply the same transformation to the
        # validation and test data (no information from held-out data, consistent scaling).
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_val = scaler.transform(X_val)
        X_test = scaler.transform(X_test)

        # NOTE(review): no_columns=128 is hard-coded; presumably the MPPN feature-vector width - confirm.
        model = BinaryClassificationIndiv(no_columns=128)
        model.to(device)
        weights = torch.FloatTensor([POSITIVE_WEIGHT, NEGATIVE_WEIGHT]).to(device)
        criterion = nn.CrossEntropyLoss(weight=weights)
        optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

        train_data = TrainData(torch.FloatTensor(X_train),
                               torch.FloatTensor(y_train))
        train_loader = DataLoader(dataset=train_data, batch_size=BATCH_SIZE, shuffle=False)

        max_epochs = EARLY_STOP_MAX_EPOCHS if early_stop else EPOCHS
        if early_stop:
            X_val_t = torch.FloatTensor(X_val).to(device)
            y_val_t = torch.FloatTensor(y_val).to(device)
            es = EarlyStopping()
        done = False

        epoch = 0
        while epoch < max_epochs and not done:
            epoch += 1
            steps = list(enumerate(train_loader))
            pbar = tqdm.tqdm(steps)
            model.train()
            epoch_acc = 0
            epoch_loss = 0
            for i, (x_batch, y_batch) in pbar:
                y_batch = y_batch.to(device)
                optimizer.zero_grad()
                y_batch_pred = model(x_batch.to(device))

                loss = criterion(y_batch_pred, y_batch)

                acc = binary_acc(y_batch_pred, y_batch) / len(y_batch_pred[0])

                loss.backward()
                optimizer.step()
                epoch_acc += acc.item()
                epoch_loss += loss.item()

                if early_stop and i == len(steps) - 1:
                    # validate once per epoch, after the last batch
                    model.eval()
                    with torch.no_grad():
                        vloss = criterion(model(X_val_t), y_val_t)
                    if es(model, vloss):
                        done = True
                    pbar.set_description(
                        f"Epoch: {epoch}, tloss: {epoch_loss / len(train_loader)}, Acc: {epoch_acc / len(train_loader):.3f}, vloss: {vloss:>7f}, EStop:[{es.status}]")
                else:
                    pbar.set_description(
                        f"Epoch: {epoch}, tloss {epoch_loss / len(train_loader):}, Acc: {epoch_acc / len(train_loader):.3f}")

        model.eval()
        test_data = TestData(torch.FloatTensor(X_test))
        test_loader = DataLoader(dataset=test_data, batch_size=1)

        y_pred_list = []

        with torch.no_grad():
            for X_batch in test_loader:
                X_batch = X_batch.to(device)
                # softmax over the class dimension, then round to a 0/1 decision per column
                y_test_pred = torch.nn.functional.softmax(model(X_batch), dim=-1)
                y_pred_tag = torch.round(y_test_pred)
                y_pred_list.append(y_pred_tag.cpu().numpy())

        y_pred_list = [a.squeeze().tolist() for a in y_pred_list]

        # CM[0] belongs to the deviation column, CM[1] to the 'NoDev' column;
        # each 2x2 matrix is [[TN, FP], [FN, TP]]
        CM = sklearn.metrics.multilabel_confusion_matrix(y_test, y_pred_list)

        metrics.at['Precision', d] = CM[0][1][1] / (CM[0][1][1] + CM[0][0][1])
        metrics.at['Recall', d] = CM[0][1][1] / (CM[0][1][1] + CM[0][1][0])
        metrics.at['Support', d] = CM[0][1][1] + CM[0][1][0]
        try:
            metrics.at['ROC_AUC', d] = sklearn.metrics.roc_auc_score(y_test, y_pred_list, average='weighted')
        except Exception as er:
            # keep the error in the sheet instead of aborting the whole evaluation
            metrics.at['ROC_AUC', d] = er
        metrics.at['Precision', 'NoDev' + d] = CM[1][1][1] / (CM[1][1][1] + CM[1][0][1])
        metrics.at['Recall', 'NoDev' + d] = CM[1][1][1] / (CM[1][1][1] + CM[1][1][0])
        metrics.at['Support', 'NoDev' + d] = CM[1][1][1] + CM[1][1][0]

        # predicted deviation position: the prediction at prefix 1, overwritten by every later
        # prefix at which the prediction switches from 0 to 1 (capped at the true position)
        for prefix in range(1, max_ev + 1):
            for idx in to_be_checked_idx.keys():
                if event_count[idx] >= prefix:
                    if prefix == 1:
                        dev_position_pred.at[idx, d] = int(y_pred_list[to_be_checked_idx[idx][prefix - 1]][0])
                    else:
                        if y_pred_list[to_be_checked_idx[idx][prefix - 1]][0] == 1 and \
                                y_pred_list[to_be_checked_idx[idx][prefix - 2]][0] == 0:
                            if dev_position.at[idx, d] <= prefix:
                                dev_position_pred.at[idx, d] = dev_position.at[idx, d]
                            else:
                                dev_position_pred.at[idx, d] = prefix

        # earliness: mean ratio of predicted to true position over traces where both are set
        earliness[d] = 0
        tobe_devs = 0
        for idx in to_be_checked_idx.keys():
            if dev_position.at[idx, d] == 0 or dev_position_pred.at[idx, d] == 0:
                continue
            tobe_devs += 1
            earliness[d] += dev_position_pred.at[idx, d] / dev_position.at[idx, d]
        if not tobe_devs == 0:
            earliness[d] = earliness[d] / tobe_devs

    # average true deviation position over the test traces, per evaluated deviation
    avg_dev_pos = {}
    for d in dev:
        if dev_distribution.at['Test', d] == 0 or dev_distribution.at['Training', d] == 0:
            continue
        positions = [dev_position.at[idx, d] for idx in to_be_checked_idx.keys() if dev_position.at[idx, d] > 0]
        if positions:
            avg_dev_pos[d] = sum(positions) / len(positions)

    path = (os.getcwd() + '/BPDP_MPPN_Classifier')
    os.makedirs(path, exist_ok=True)
    # the context manager closes (and thereby saves) the workbook even if writing a sheet fails
    with pd.ExcelWriter(path + '/' + z + '_BPDP_MPPN_classification.xlsx', engine="xlsxwriter") as writer:
        dev_distribution.to_excel(writer, sheet_name=('Distribution'))
        metrics.to_excel(writer, sheet_name=('Metrics'))
        pd.DataFrame(data=earliness, index=[0]).to_excel(writer, sheet_name=('Earliness'))
        pd.DataFrame(data=avg_dev_pos, index=[0]).to_excel(writer, sheet_name=('Position'))
        if notes:
            pd.DataFrame(data=notes, index=[0]).to_excel(writer, sheet_name=('Notes'))

In [None]:
def genga_benchmark(log, aligned_traces, c=2, alpha=1, split=1 / 3):
    """Evaluate a state-counting deviation predictor and export its metrics to Excel.

    Every non-synchronous, non-silent alignment move is treated as a deviation
    type. For each deviation type the function
      1. fits a ``DecisionTreeDiscretiser`` and bins the numeric features,
      2. builds a state string per event (activity history + binned features),
      3. counts on the training traces how often each state was visited and how
         often the visiting trace contained the deviation,
      4. predicts the deviation for every prefix of every test trace from these
         counts and stores precision / recall / ROC AUC.

    Parameters
    ----------
    log : event log (iterable of traces, indexable, with ``len``)
        Side effect: events receive the attributes ``duration``, ``trace`` and
        ``genga``; traces receive the attribute ``duration``.
    aligned_traces : sequence
        ``aligned_traces[i]['alignment']`` is the list of ``(log, model)`` moves
        of trace ``i``; a model entry of ``None`` marks a silent move.
    c : number
        Constant added to the denominator ``p + n + c`` of the three ratios.
    alpha : number
        A deviation is only predicted when ``xb > alpha * xd``.
    split : float
        Test share handed to ``train_test_split``.

    Returns
    -------
    None. The metrics are written to
    ``<cwd>/Genga/<basename of file>_Genga_<split>.xlsx``.

    Raises
    ------
    ValueError
        If a synchronous move does not match the event it should consume (the
        backward walk over the alignment would otherwise never terminate).

    Notes
    -----
    Relies on the notebook-level global ``file`` for the dataset name and
    silences all warnings globally (as the original implementation did).
    """
    import warnings
    from feature_engine.discretisation import DecisionTreeDiscretiser

    warnings.simplefilter('ignore')
    xt, z = os.path.split(file)  # `file` is a global set elsewhere in the notebook
    is_bpi12 = z in ('aligned_traces_12A.pkl', 'aligned_traces_12O.pkl', 'aligned_traces_12AO.pkl')
    n_traces = len(log)

    def _ratio(numerator, denominator):
        # Undefined ratios become NaN (exported as an empty cell) instead of raising.
        return numerator / denominator if denominator else float('nan')

    # ---- 1. collect deviation types and the deviations of every trace ----------
    dev = []            # all deviation labels in order of first appearance
    known_dev = set()   # same content as `dev`, for O(1) membership tests
    trace_devs = {}     # trace index -> labels of its deviating moves
    for i in range(n_traces):
        trace_devs[i] = []
        for move in aligned_traces[i]['alignment']:
            if move[1] is None or move[0] == move[1]:
                continue  # we do not care for silent or synchronous moves
            label = str(move)
            trace_devs[i].append(label)
            if label not in known_dev:
                known_dev.add(label)
                dev.append(label)
    print(dev)

    # trace-level labels: 1 if the deviation occurs anywhere in the trace
    dev_df = pd.DataFrame(data=0, columns=dev, index=range(n_traces))
    for i, labels in trace_devs.items():
        for label in labels:
            dev_df.at[i, label] = 1  # .at instead of chained indexing (copy-on-write safe)

    event_order = {k: [event['concept:name'] for event in trace] for k, trace in enumerate(log)}
    event_count = {k: len(names) for k, names in event_order.items()}
    max_ev = max(event_count.values(), default=0)  # maximum trace length

    # ---- 2. prefix-level labels --------------------------------------------------
    # y_cum_test[q] starts as the trace-level labels of all traces with >= q events.
    y_cum_test = {}
    for ev in range(1, max_ev + 1):
        too_short = [t for t in range(n_traces) if event_count[t] < ev]
        y_cum_test[ev] = dev_df.copy().drop(too_short)

    # Walk every alignment backwards; each deviating move clears its label from
    # the prefix at which it occurs onwards. Only prefixes up to the length of the
    # trace are touched because longer prefixes do not contain the trace.
    for i in range(n_traces):
        alignment = aligned_traces[i]['alignment']
        j = len(alignment) - 1  # position in the alignment, starting at the end
        m = event_count[i]      # position in the event sequence, starting at the end
        while j >= 0:
            move = alignment[j]
            if move[1] is None:  # silent move: consumes no event
                j -= 1
            elif move[0] == move[1]:  # synchronous move: consumes one event
                if event_order[i][m - 1] != move[0]:
                    raise ValueError(
                        'Alignment of trace ' + str(i) + ' does not match its events at move '
                        + str(j) + ': ' + str(move))
                j -= 1
                m -= 1
            elif event_order[i][m - 1] == move[0]:  # log move: deviation at event m
                for q in range(m, event_count[i] + 1):
                    y_cum_test[q].at[i, str(move)] = 0
                j -= 1
                m -= 1
            else:  # model move: deviation between event m and m + 1
                for q in range(m + 1, event_count[i] + 1):
                    y_cum_test[q].at[i, str(move)] = 0
                j -= 1
    ### y_cum_test holds information whether deviation happened after prefix length

    x_train_idx, x_test_idx, y_train_idx, y_test_idx = train_test_split(
        range(n_traces), range(n_traces), test_size=split, random_state=0)
    train_traces = set(x_train_idx)

    # object dtype: cells hold ints, floats or error messages
    metrics = pd.DataFrame(data=0, columns=dev + ['NoDev' + d for d in dev],
                           index=['Precision', 'Recall', 'ROC_AUC'], dtype=object)

    # ---- 3. annotate events with elapsed minutes and activity history ----------
    for j, trace in enumerate(log):
        for i, event in enumerate(trace):
            if i == 0:
                event['duration'] = 0
                event['trace'] = event['concept:name']
            else:
                event['duration'] = (event['time:timestamp'] - trace[0]['time:timestamp']).total_seconds() / 60
                event['trace'] = trace[i - 1]['trace'] + ', ' + event['concept:name']
        trace.attributes['duration'] = trace[event_count[j] - 1]['duration']

    # The frame only depends on the attributes written above, so it is built once.
    tree_df = pm4py.convert_to_dataframe(log)
    if is_bpi12:
        ref_clean_dat = tree_df.filter(items=['case:AMOUNT_REQ', 'duration'], axis=1)
        ref_clean_dat = ref_clean_dat.rename(columns={'case:AMOUNT_REQ': 'AMOUNT_REQ'})
    else:
        ref_clean_dat = tree_df.filter(items=['duration'], axis=1)

    # NOTE(review): x_train_idx holds trace indices while ref_clean_dat is addressed
    # by `runner + i` below, i.e. it looks like one row per event - confirm that
    # selecting rows by trace index is intended here. Behaviour kept unchanged.
    X_tree = ref_clean_dat.loc[x_train_idx]

    # ---- 4. one counting model per deviation type -------------------------------
    for d in dev:
        y_tree = dev_df.loc[x_train_idx, d]

        disc = DecisionTreeDiscretiser(regression=False)
        disc.fit(X_tree, y_tree)
        try:
            X_preprosessed = disc.transform(ref_clean_dat.copy())
        except Exception as er:
            # best effort, as before: report and continue with the unbinned values
            print(er)
            X_preprosessed = ref_clean_dat

        # state of an event = activity history + binned feature value(s)
        runner = 0  # row offset of the first event of the current trace
        for j, trace in enumerate(log):
            for i, event in enumerate(trace):
                parts = [event['trace']]
                if is_bpi12:
                    parts.append(str(X_preprosessed['AMOUNT_REQ'][runner + i]))
                parts.append(str(X_preprosessed['duration'][runner + i]))
                event['genga'] = '_'.join(parts)
            runner += event_count[j]

        # p-counts (visits by deviating traces) and total visits, training traces only
        deviating_visits = {}
        total_visits = {}
        for j, trace in enumerate(log):
            if j not in train_traces:
                continue
            trace_label = int(dev_df.at[j, d])
            for event in trace:
                state = event['genga']
                deviating_visits[state] = deviating_visits.get(state, 0) + trace_label
                total_visits[state] = total_visits.get(state, 0) + 1

        # one test sample per (prefix length, test trace that is long enough)
        y_true = []
        y_pred = []
        for prefix in range(1, max_ev + 1):
            for t in y_test_idx:
                if event_count[t] < prefix:
                    continue
                y_true.append(int(y_cum_test[prefix].at[t, d]))

                state = log[t][prefix - 1]['genga']
                p = deviating_visits.get(state, 0)
                n = total_visits.get(state, 0) - p
                total = p + n + c
                predicted = 0
                if total != 0:  # 0/0 previously produced NaN, i.e. no prediction
                    xb = p / total
                    xd = n / total
                    xu = c / total
                    if (xu < xb or xu < xd) and xb > alpha * xd:
                        predicted = 1
                y_pred.append(predicted)

        # labels=[0, 1] keeps the matrix 2x2 even if only one class is present
        CM = sklearn.metrics.confusion_matrix(y_true, y_pred, labels=[0, 1])
        tn, fp, fn, tp = (int(v) for v in CM.ravel())

        metrics.at['Precision', d] = _ratio(tp, tp + fp)
        metrics.at['Recall', d] = _ratio(tp, tp + fn)
        try:
            metrics.at['ROC_AUC', d] = sklearn.metrics.roc_auc_score(y_true, y_pred, average='macro')
        except Exception as er:
            # store the message, not the exception object, so the sheet stays exportable
            metrics.at['ROC_AUC', d] = str(er)
        metrics.at['Precision', 'NoDev' + d] = _ratio(tn, tn + fn)
        metrics.at['Recall', 'NoDev' + d] = _ratio(tn, tn + fp)

    # ---- 5. export ---------------------------------------------------------------
    path = (os.getcwd() + '/Genga')
    os.makedirs(path, exist_ok=True)
    with pd.ExcelWriter(
            path + '/' + z + '_Genga_' + str(round(split, 2)) + '.xlsx',
            engine="xlsxwriter") as writer:
        metrics.to_excel(writer, sheet_name=('Metrics'))

In [None]:
def classify_cat(log, ref_log, aligned_traces, split=1/3, early_stop=True):
    """Train one CatBoost classifier per deviation type and export its metrics.

    Every non-synchronous, non-silent alignment move is treated as a deviation
    type. Traces are encoded with ``complex_index_encoding`` for every prefix
    length, the encodings of all prefix lengths are stacked into one sample
    matrix, and for each deviation type a binary classifier predicts the
    prefix-level label. Precision, recall and ROC AUC are stored for the
    deviation and for its complement (``'NoDev' + deviation``).

    Parameters
    ----------
    log : event log (iterable of traces, with ``len``)
        Encoded in place by ``complex_index_encoding(log, prefix)``.
    ref_log : event log
        Encoded with prefix length 4000; its one-hot columns define the feature
        columns every prefix encoding is padded to.
    aligned_traces : sequence
        ``aligned_traces[i]['alignment']`` is the list of ``(log, model)`` moves
        of trace ``i``; a model entry of ``None`` marks a silent move.
    split : float
        Test share handed to ``train_test_split``.
    early_stop : bool
        Currently only part of the output file name (see NOTE(review) below).

    Returns
    -------
    None. Sheets ``Distribution`` and ``Metrics`` are written to
    ``<cwd>/CatBoost/<basename of file>_MC_catoost_ES_<early_stop>_<split>.xlsx``.

    Raises
    ------
    ValueError
        If a synchronous move does not match the event it should consume (the
        backward walk over the alignment would otherwise never terminate).

    Notes
    -----
    Relies on the notebook-level global ``file`` and on the notebook function
    ``complex_index_encoding``; silences all warnings globally (as before).
    """
    import warnings
    from catboost import CatBoostClassifier

    warnings.simplefilter('ignore')
    xt, z = os.path.split(file)  # `file` is a global set elsewhere in the notebook
    n_traces = len(log)

    bpi12_files = ('aligned_traces_12A.pkl', 'aligned_traces_12O.pkl', 'aligned_traces_12AO.pkl')
    # identifier-like case attributes that are removed per dataset
    id_columns = {
        'aligned_traces_20int.pkl': ['Permit travel permit number', 'DeclarationNumber', 'travel permit number',
                                     'id', 'Permit ID', 'Permit id'],
        'aligned_traces_20dom.pkl': ['DeclarationNumber', 'id'],
        'aligned_traces_20prep.pkl': ['RfpNumber', 'Rfp_id', 'Permit travel permit number', 'Permit id'],
        'aligned_traces_20RfP.pkl': ['RfpNumber', 'Rfp_id'],
    }

    def _ratio(numerator, denominator):
        # Undefined ratios become NaN (exported as an empty cell) instead of raising.
        return numerator / denominator if denominator else float('nan')

    def _case_features(event_log):
        # One row per case with its one-hot encoded, dataset-specifically cleaned case attributes.
        frame = pm4py.convert_to_dataframe(event_log)
        frame = frame.drop_duplicates(subset=['case:concept:name'])
        frame = frame.filter(like='case:', axis=1)
        frame = frame.drop('case:concept:name', axis=1)
        frame.columns = frame.columns.str.replace('case:', '')
        frame = frame.reset_index()
        frame = frame.drop('index', axis=1)
        if z in bpi12_files:
            frame['AMOUNT_REQ'] = pd.to_numeric(frame['AMOUNT_REQ'])
            frame = frame.drop('REG_DATE', axis=1)
        elif z in id_columns:
            frame = frame.drop(id_columns[z], axis=1)
        else:
            frame = frame.copy()
        return pd.get_dummies(frame)

    #### get information whether deviation happened after prefix length in DF
    dev = []            # all deviation labels in order of first appearance
    known_dev = set()   # same content as `dev`, for O(1) membership tests
    trace_devs = {}     # trace index -> labels of its deviating moves
    for i in range(n_traces):
        trace_devs[i] = []
        for move in aligned_traces[i]['alignment']:
            if move[1] is None or move[0] == move[1]:
                continue  # we do not care for silent or synchronous moves
            label = str(move)
            trace_devs[i].append(label)
            if label not in known_dev:
                known_dev.add(label)
                dev.append(label)
    print(len(dev), dev)

    # trace-level labels: 1 if the deviation occurs anywhere in the trace
    dev_df = pd.DataFrame(data=0, columns=dev, index=range(n_traces))
    for i, labels in trace_devs.items():
        for label in labels:
            dev_df.at[i, label] = 1  # .at instead of chained indexing (copy-on-write safe)

    event_order = {k: [event['concept:name'] for event in trace] for k, trace in enumerate(log)}
    event_count = {k: len(names) for k, names in event_order.items()}
    max_ev = max(event_count.values(), default=0)  # maximum trace length

    print(dev_df.sum())

    # y_cum_test[q] starts as the trace-level labels of all traces with >= q events.
    y_cum_test = {}
    for ev in range(1, max_ev + 1):
        too_short = [t for t in range(n_traces) if event_count[t] < ev]
        y_cum_test[ev] = dev_df.copy().drop(too_short)

    # Walk every alignment backwards; each deviating move clears its label from
    # the prefix at which it occurs onwards. Only prefixes up to the length of the
    # trace are touched because longer prefixes do not contain the trace.
    for i in range(n_traces):
        alignment = aligned_traces[i]['alignment']
        j = len(alignment) - 1  # position in the alignment, starting at the end
        m = event_count[i]      # position in the event sequence, starting at the end
        while j >= 0:
            move = alignment[j]
            if move[1] is None:  # silent move: consumes no event
                j -= 1
            elif move[0] == move[1]:  # synchronous move: consumes one event
                if event_order[i][m - 1] != move[0]:
                    raise ValueError(
                        'Alignment of trace ' + str(i) + ' does not match its events at move '
                        + str(j) + ': ' + str(move))
                j -= 1
                m -= 1
            elif event_order[i][m - 1] == move[0]:  # log move: deviation at event m
                for q in range(m, event_count[i] + 1):
                    y_cum_test[q].at[i, str(move)] = 0
                j -= 1
                m -= 1
            else:  # model move: deviation between event m and m + 1
                for q in range(m + 1, event_count[i] + 1):
                    y_cum_test[q].at[i, str(move)] = 0
                j -= 1
    ### y_cum_test holds information whether deviation happened after prefix length

    ## ref_log will have all attributes that will be the columns for X_test and X_train
    ref_log = complex_index_encoding(ref_log, 4000)
    ref_enc_dat = _case_features(ref_log)
    ref_columns = list(ref_enc_dat.columns)
    ref_column_set = set(ref_columns)

    # prepare X for all prefix lengths
    X_cum = {}
    for prefix in range(1, max_ev + 1):
        complex_index_encoding(log, prefix)
        enc_dat = _case_features(log)
        # Pad columns that only exist in the reference encoding with 0 and put
        # the shared columns into the reference order: the matrices of all prefix
        # lengths are stacked by position further down, so every prefix must use
        # the same column order.
        extra_columns = [col for col in enc_dat.columns if col not in ref_column_set]
        enc_dat = enc_dat.reindex(columns=ref_columns + extra_columns, fill_value=0)
        imp = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=0)
        enc_dat = pd.DataFrame(data=imp.fit_transform(enc_dat), columns=enc_dat.columns)

        too_short = [t for t in range(n_traces) if event_count[t] < prefix]
        X_cum[prefix] = enc_dat.drop(too_short)

    # object dtype: cells hold ints, floats or error messages
    metrics = pd.DataFrame(data=0, columns=dev + ['NoDev' + d for d in dev],
                           index=['Precision', 'Recall', 'ROC_AUC'], dtype=object)

    x_train_idx, x_test_idx, y_train_idx, y_test_idx = train_test_split(
        range(n_traces), range(n_traces), test_size=split, random_state=0)

    dev_distribution = pd.DataFrame(data=0, index=['Training', 'Test'], columns=dev)
    for d in dev:
        dev_distribution.at['Training', d] = dev_df.loc[x_train_idx, d].sum()
        dev_distribution.at['Test', d] = dev_df.loc[x_test_idx, d].sum()

    # Hold out the validation traces ONCE. The split used to be re-applied to its
    # own result inside the deviation loop, so every further deviation was
    # trained on only 80 % of the traces of the previous one.
    fit_idx, val_idx, _, _ = train_test_split(x_train_idx, x_train_idx, test_size=0.2, random_state=0)

    # rows (trace indices) per prefix length and partition; identical for all deviations
    rows = {}
    for prefix in range(1, max_ev + 1):
        too_short = {t for t in range(n_traces) if event_count[t] < prefix}
        rows[prefix] = {
            'train': list(set(fit_idx) - too_short),
            'val': list(set(val_idx) - too_short),
            'test': list(set(x_test_idx) - too_short),
        }

    def _stack(frames, part):
        # combine the data of all prefix lengths into one array
        return np.concatenate(
            [frames[prefix].loc[rows[prefix][part]].to_numpy().astype(float)
             for prefix in range(1, max_ev + 1)], axis=0)

    if dev:
        # the features do not depend on the deviation, so they are stacked once
        X_train = _stack(X_cum, 'train')
        X_val = _stack(X_cum, 'val')
        X_test = _stack(X_cum, 'test')

    for d in dev:
        # two label columns per prefix: the deviation and its complement
        Y_cum_dev = {}
        for prefix in range(1, max_ev + 1):
            prefix_labels = y_cum_test[prefix][[d]].copy()
            prefix_labels['NoDev'] = 1 - prefix_labels[d]
            Y_cum_dev[prefix] = prefix_labels
            if prefix == 1:
                print(prefix_labels.columns)

        print('index length ', len(x_train_idx), len(x_test_idx), len(y_train_idx), len(y_test_idx))
        for prefix in range(1, max_ev + 1):
            n_test_rows = len(rows[prefix]['test'])
            n_train_rows = len(rows[prefix]['train'])
            print('subset length ', prefix, n_test_rows, n_train_rows, n_test_rows, n_train_rows)

        y_train = _stack(Y_cum_dev, 'train')
        y_val = _stack(Y_cum_dev, 'val')
        y_test = _stack(Y_cum_dev, 'test')
        print(d, len(X_train), len(y_train), len(X_val), len(y_val), len(X_test), len(y_test))

        cat_y_train = y_train[:, 0]  # column 0 is the deviation label

        # NOTE(review): `early_stop` and the validation data (X_val / y_val) are not
        # handed to fit(); without an eval_set `early_stopping_rounds` presumably
        # never triggers - confirm against the CatBoost documentation before
        # wiring it in. Training behaviour is left unchanged here.
        model = CatBoostClassifier(verbose=False, random_state=0, scale_pos_weight=16, early_stopping_rounds=10)
        try:
            model.fit(X_train, cat_y_train)
            y_pred = model.predict(X_test)
            y_pred_list = np.stack((y_pred, 1 - y_pred), axis=1).tolist()

            # CM[0] belongs to the deviation column, CM[1] to the 'NoDev' column
            CM = sklearn.metrics.multilabel_confusion_matrix(y_test, y_pred_list)
            tn, fp, fn, tp = (int(v) for v in CM[0].ravel())
            nd_tn, nd_fp, nd_fn, nd_tp = (int(v) for v in CM[1].ravel())

            metrics.at['Precision', d] = _ratio(tp, tp + fp)
            metrics.at['Recall', d] = _ratio(tp, tp + fn)
            try:
                metrics.at['ROC_AUC', d] = sklearn.metrics.roc_auc_score(y_test, y_pred_list, average='weighted')
            except Exception as er:
                # store the message, not the exception object, so the sheet stays exportable
                metrics.at['ROC_AUC', d] = str(er)
            metrics.at['Precision', 'NoDev' + d] = _ratio(nd_tp, nd_tp + nd_fp)
            metrics.at['Recall', 'NoDev' + d] = _ratio(nd_tp, nd_tp + nd_fn)

            print(CM)
        except Exception as er:
            # a failed deviation is reported in its column, the others are kept
            metrics[d] = str(er)

    path = (os.getcwd() + '/CatBoost')
    os.makedirs(path, exist_ok=True)
    with pd.ExcelWriter(
            path + '/' + z + '_MC_catoost_ES_' + str(early_stop) + '_' + str(round(split, 2)) + '.xlsx',
            engine="xlsxwriter") as writer:
        dev_distribution.to_excel(writer, sheet_name=('Distribution'))
        metrics.to_excel(writer, sheet_name=('Metrics'))

In [None]:
def _xgb_deviation_moves(alignment):
    """Return the string form of every deviating move of one alignment.

    Each move is indexed as ``move[0]`` (log side) and ``move[1]`` (model side). Moves whose
    model side is None (silent moves) or whose two sides are equal (synchronous moves) are
    not deviations and are skipped.
    """
    return [str(move) for move in alignment if not (move[1] is None or move[0] == move[1])]


def _xgb_case_attribute_frame(event_log, dataset_name):
    """Return one row of case-level attributes per case of `event_log`.

    Columns are the ``case:`` attributes without the ``case:`` prefix and without the case
    name; identifier-like columns are dropped depending on `dataset_name`.
    """
    # Identifier-like columns that are removed for the known data sets.
    dropped_columns = {
        'aligned_traces_12A.pkl': ['REG_DATE'],
        'aligned_traces_12O.pkl': ['REG_DATE'],
        'aligned_traces_12AO.pkl': ['REG_DATE'],
        'aligned_traces_20int.pkl': ['Permit travel permit number', 'DeclarationNumber', 'travel permit number',
                                     'id', 'Permit ID', 'Permit id'],
        'aligned_traces_20dom.pkl': ['DeclarationNumber', 'id'],
        'aligned_traces_20prep.pkl': ['RfpNumber', 'Rfp_id', 'Permit travel permit number', 'Permit id'],
        'aligned_traces_20RfP.pkl': ['RfpNumber', 'Rfp_id'],
    }
    frame = pm4py.convert_to_dataframe(event_log)
    frame = frame.drop_duplicates(subset=['case:concept:name'])  # keep one row per case
    frame = frame.filter(like='case:', axis=1).drop('case:concept:name', axis=1)
    frame.columns = frame.columns.str.replace('case:', '')
    frame = frame.reset_index(drop=True)  # row k now belongs to the k-th case
    if dataset_name in ('aligned_traces_12A.pkl', 'aligned_traces_12O.pkl', 'aligned_traces_12AO.pkl'):
        frame['AMOUNT_REQ'] = pd.to_numeric(frame['AMOUNT_REQ'])
    return frame.drop(dropped_columns.get(dataset_name, []), axis=1)


def _xgb_stack_prefixes(X_cum, y_cum_test, deviation, trace_ids, short_traces):
    """Stack features and two-column labels of `trace_ids` over all prefix lengths.

    Returns ``(X, y)`` as float arrays; column 0 of ``y`` is the label of `deviation`,
    column 1 its complement ('NoDev'). Traces shorter than a prefix length contribute no
    row for that prefix.
    """
    X_blocks, y_blocks = [], []
    for prefix in sorted(X_cum):
        rows = list(set(trace_ids) - short_traces[prefix])
        labels = y_cum_test[prefix].loc[rows, [deviation]].copy()
        labels['NoDev'] = 1 - labels[deviation]
        X_blocks.append(X_cum[prefix].loc[rows].to_numpy().astype(float))
        y_blocks.append(labels.to_numpy().astype(float))
    return np.concatenate(X_blocks, axis=0), np.concatenate(y_blocks, axis=0)


def classify_xgb(log, ref_log, aligned_traces, split=1/3, early_stop=True):
    """Train and evaluate one XGBoost classifier per deviation type and write the results to Excel.

    For every prefix length a label frame is built that tells, per trace and deviation,
    whether the deviation still occurs after that prefix. Case attributes are encoded per
    prefix with ``complex_index_encoding`` (defined elsewhere in the notebook), stacked over
    all prefix lengths and used to fit an ``XGBClassifier`` per deviation.

    Parameters
    ----------
    log : event log whose traces are classified; it is passed to ``complex_index_encoding``
        once per prefix length.
    ref_log : event log encoded with prefix length 4000; its encoded columns define the
        feature columns.
    aligned_traces : sequence with one entry per trace of `log`; ``entry['alignment']`` is the
        list of moves.
    split : fraction of traces held out for testing.
    early_stop : if True, training stops after 10 rounds without improvement on the
        validation set; if False, no early stopping is used.

    Returns
    -------
    None. A workbook with the sheets 'Distribution' and 'Metrics' is written.

    Notes
    -----
    Reads the notebook-level global ``file``; its base name selects the data-set specific
    columns and is part of the output file name.
    """
    import inspect
    import warnings
    warnings.simplefilter('ignore')
    _, z = os.path.split(file)

    n_traces = len(log)

    #### collect every deviation (log move or model move) that occurs in the alignments
    deviations_per_trace = [_xgb_deviation_moves(aligned_traces[t]['alignment']) for t in range(n_traces)]
    dev = list(dict.fromkeys(move for moves in deviations_per_trace for move in moves))  # first-seen order
    print(len(dev), dev)

    event_order = {t: [event['concept:name'] for event in trace] for t, trace in enumerate(log)}
    event_count = {t: len(names) for t, names in event_order.items()}
    max_ev = max(event_count.values(), default=0)  # maximum trace length

    # traces that are too short to have a prefix of the given length
    short_traces = {ev: {t for t in range(n_traces) if event_count[t] < ev} for ev in range(1, max_ev + 1)}

    # trace-level information whether a deviation happened (index = trace, columns = deviation)
    dev_df = pd.DataFrame(data=0, columns=dev, index=range(n_traces))
    for t, moves in enumerate(deviations_per_trace):
        for move in moves:
            dev_df.at[t, move] = 1
    print(dev_df.sum())

    # y_cum_test[ev]: labels of all traces that have at least ev events
    y_cum_test = {ev: dev_df.drop(index=sorted(short_traces[ev])) for ev in range(1, max_ev + 1)}

    # Walk every alignment backwards and reset a label to 0 for all prefixes at which the
    # deviation has already happened (it cannot be predicted any more from there on).
    for t in range(n_traces):
        alignment = aligned_traces[t]['alignment']
        j = len(alignment) - 1  # iterator over moves in alignment, starting at the end
        m = event_count[t]  # iterator over event sequence, starting at the end
        while j >= 0:
            move = alignment[j]
            if move[1] is None:  # silent move: only step back in the alignment
                j -= 1
            elif move[0] == move[1]:  # synchronous move: step back in alignment and trace
                if event_order[t][m - 1] != move[0]:
                    # The original loop never advanced in this situation and hung forever.
                    raise ValueError(f'alignment of trace {t} does not match its event sequence at move {j}')
                j -= 1
                m -= 1
            elif event_order[t][m - 1] == move[0]:  # log move at event m
                # Rows only exist up to the trace length, so larger prefixes need no update.
                for q in range(m, event_count[t] + 1):
                    y_cum_test[q].at[t, str(move)] = 0
                j -= 1
                m -= 1
            else:  # model move between event m and m + 1
                for q in range(m + 1, event_count[t] + 1):
                    y_cum_test[q].at[t, str(move)] = 0
                j -= 1

    ### y_cum_test holds information whether deviation happened after prefix length

    ## ref_log will have all attributes that will be the columns for X_test and X_train
    ref_log = complex_index_encoding(ref_log, 4000)
    ref_enc_dat = pd.get_dummies(_xgb_case_attribute_frame(ref_log, z))

    # prepare X for all prefix lengths
    X_cum = {}
    for prefix in range(1, max_ev + 1):
        complex_index_encoding(log, prefix)  # return value unused; `log` itself is read afterwards
        enc_dat = pd.get_dummies(_xgb_case_attribute_frame(log, z))
        # Use exactly the reference columns in the reference order: the blocks of all prefix
        # lengths are stacked positionally below, so every block needs the same layout.
        # Columns missing for this prefix are padded with 0.
        enc_dat = enc_dat.reindex(columns=ref_enc_dat.columns, fill_value=0)
        imp = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=0)
        enc_dat = pd.DataFrame(data=imp.fit_transform(enc_dat), columns=enc_dat.columns)
        X_cum[prefix] = enc_dat.drop(index=sorted(short_traces[prefix]))

    # Split on trace level. The validation split is done once, outside the deviation loop;
    # splitting inside the loop shrank the training set by 20 % for every further deviation.
    train_idx, test_idx = train_test_split(range(n_traces), test_size=split, random_state=0)
    fit_idx, val_idx = train_test_split(train_idx, test_size=0.2, random_state=0)
    print('index length ', len(fit_idx), len(val_idx), len(test_idx))

    # 'Training' counts all non-test traces (fit + validation)
    dev_distribution = pd.DataFrame(data=0, index=['Training', 'Test'], columns=dev)
    for d in dev:
        dev_distribution.at['Training', d] = dev_df.loc[train_idx, d].sum()
        dev_distribution.at['Test', d] = dev_df.loc[test_idx, d].sum()

    # object dtype: cells hold numbers, or the exception raised while computing them
    metrics = pd.DataFrame(data=0, columns=dev + ['NoDev' + d for d in dev],
                           index=['Precision', 'Recall', 'ROC_AUC'], dtype=object)

    # Depending on the XGBoost version, early stopping is configured in fit() or on the estimator.
    stop_in_fit = 'early_stopping_rounds' in inspect.signature(xgb.XGBClassifier.fit).parameters

    # NOTE(review): folder and file name still say CatBoost although this is the XGBoost
    # run; kept unchanged so existing outputs are found - confirm whether this is intended.
    path = (os.getcwd() + '/CatBoost')
    out_file = path + '/' + z + '_MC_catoost_ES_' + str(early_stop) + '_' + str(round(split, 2)) + '.xlsx'

    # The context manager closes the workbook even if training raises.
    with pd.ExcelWriter(out_file, engine="xlsxwriter") as writer:
        dev_distribution.to_excel(writer, sheet_name=('Distribution'))

        for d in dev:
            X_train, y_train = _xgb_stack_prefixes(X_cum, y_cum_test, d, fit_idx, short_traces)
            X_val, y_val = _xgb_stack_prefixes(X_cum, y_cum_test, d, val_idx, short_traces)
            X_test, y_test = _xgb_stack_prefixes(X_cum, y_cum_test, d, test_idx, short_traces)
            print(d, len(X_train), len(y_train), len(X_val), len(y_val), len(X_test), len(y_test))

            # column 0 is the deviation label, column 1 its complement
            xgb_y_train = y_train[:, 0]
            xgb_y_val = y_val[:, 0]
            bst = xgb.XGBClassifier(max_depth=16, scale_pos_weight=16)
            fit_kwargs = {}
            if early_stop:
                if stop_in_fit:
                    fit_kwargs['early_stopping_rounds'] = 10
                else:
                    bst.set_params(early_stopping_rounds=10)

            try:
                bst.fit(X_train, xgb_y_train, eval_set=[(X_train, xgb_y_train), (X_val, xgb_y_val)], **fit_kwargs)
                y_preds = bst.predict(X_test)
                y_preds = np.stack((y_preds, 1 - y_preds), axis=1)
                y_pred_list = y_preds.tolist()

                # CM[0] belongs to the deviation column, CM[1] to the 'NoDev' column
                CM = sklearn.metrics.multilabel_confusion_matrix(y_test, y_pred_list)

                metrics.at['Precision', d] = CM[0][1][1] / (CM[0][1][1] + CM[0][0][1])
                metrics.at['Recall', d] = CM[0][1][1] / (CM[0][1][1] + CM[0][1][0])
                try:
                    metrics.at['ROC_AUC', d] = sklearn.metrics.roc_auc_score(y_test, y_pred_list,
                                                                             average='weighted')
                except Exception as er:
                    metrics.at['ROC_AUC', d] = er
                metrics.at['Precision', 'NoDev' + d] = CM[1][1][1] / (CM[1][1][1] + CM[1][0][1])
                metrics.at['Recall', 'NoDev' + d] = CM[1][1][1] / (CM[1][1][1] + CM[1][1][0])

                print(CM)
            except Exception as er:
                # best effort: record the failure for this deviation and continue with the next
                metrics[d] = er

        metrics.to_excel(writer, sheet_name=('Metrics'))

In [None]:
### Suffix Prediction: load the predicted suffixes that are passed to suffix_prediction_deviations.
# ask_for_path is called with a fixed index (25), so the file is chosen by its position in the
# directory listing of the input folder instead of interactively; adjust the index to your folder.
# NOTE: `file` is a notebook-level global. suffix_prediction_deviations reads it to derive the
# data-set name and the output file name, so it must point to the suffix pickle when that
# function is called.
file= ask_for_path(REL_INPUT_PATH,25)# adjust to your path
# pickle.load can execute arbitrary code - only load suffix files from a trusted source.
with open(file, 'rb') as f:
    suffixes=pickle.load(f)
def _sfx_deviation_moves(alignment):
    """Return the string form of every deviating move of one alignment.

    Moves whose model side (``move[1]``) is None (silent moves) or whose two sides are equal
    (synchronous moves) are skipped.
    """
    return [str(move) for move in alignment if not (move[1] is None or move[0] == move[1])]


def suffix_prediction_deviations(log, aligned_traces, net, initial_marking, final_marking, suffixes, split=1 / 3):
    """Evaluate deviations found in predicted suffixes against the deviations of the real traces.

    The predicted suffixes are converted to an event log (one case per trace id and ``IDX``),
    aligned with the Petri net, and every deviation found is marked for the prefix length
    and trace it belongs to. These predictions are compared with the labels derived from
    `aligned_traces` on the test part of a trace-level split.

    Parameters
    ----------
    log : event log with the real traces.
    aligned_traces : sequence with one entry per trace of `log`; ``entry['alignment']`` is the
        list of moves.
    net, initial_marking, final_marking : Petri net used to align the predicted suffixes.
    suffixes : DataFrame with at least the columns 'case:concept:name' and 'IDX'. It is not
        modified; all changes are applied to a copy.
    split : fraction of traces used as test set (same split parameters as the classifiers).

    Returns
    -------
    None. A workbook with the sheet 'Metrics' is written to the 'Suffix_Prediction' folder.

    Notes
    -----
    Reads the notebook-level global ``file``; its base name selects the case-id prefix and is
    part of the output file name.
    """
    import warnings
    warnings.simplefilter('ignore')
    _, z = os.path.split(file)

    # text that is put in front of the case ids of the known suffix files
    case_id_prefix = {
        'MPPN_BPIC_2020_request_deviation_detection_suffixe.pkl': 'request for payment ',
        'MPPN_BPIC_2020_international_deviation_detection_suffixe.pkl': 'declaration ',
        'MPPN_BPIC_2020_prepaid_deviation_detection_suffixe.pkl': 'request for payment ',
        'MPPN_BPIC_2020_domestic_deviation_detection_suffixe.pkl': 'declaration ',
    }

    # rename() returns a new frame, so the caller's frame is never modified. The original
    # rewrote the case ids in place, which prefixed them twice when the function was re-run.
    suffixes = suffixes.rename(columns={'case:concept:name': 'case:trace_ID', 'IDX': 'case:IDX'})
    if z in case_id_prefix:
        suffixes['case:trace_ID'] = case_id_prefix[z] + suffixes['case:trace_ID'].astype(str)
    suffixes['case:concept:name'] = suffixes[["case:trace_ID", "case:IDX"]].astype(str).apply("_".join, axis=1)

    # Prefix length of every row: restarts at 1 when the trace id changes between two
    # neighbouring rows and grows by 1 whenever IDX changes within the same trace.
    trace_ids = suffixes['case:trace_ID'].to_numpy()
    idx_values = suffixes['case:IDX'].to_numpy()
    prefix_lengths = [1] * len(suffixes)
    for row in range(1, len(suffixes)):
        if trace_ids[row - 1] == trace_ids[row]:
            step = 0 if idx_values[row - 1] == idx_values[row] else 1
            prefix_lengths[row] = prefix_lengths[row - 1] + step
    suffixes['case:prefix'] = prefix_lengths

    predicted_log = pm4py.convert_to_event_log(suffixes)
    aligned_predictions = pm4py.conformance_diagnostics_alignments(predicted_log, net, initial_marking, final_marking)

    n_traces = len(log)

    # every deviation that occurs in the real traces
    deviations_per_trace = [_sfx_deviation_moves(aligned_traces[t]['alignment']) for t in range(n_traces)]
    dev = list(dict.fromkeys(move for moves in deviations_per_trace for move in moves))  # first-seen order
    print(dev)

    event_order = {t: [event['concept:name'] for event in trace] for t, trace in enumerate(log)}
    event_count = {t: len(names) for t, names in event_order.items()}
    max_ev = max(event_count.values(), default=0)  # maximum trace length

    # trace-level information whether a deviation happened, plus the case id of every trace
    dev_df = pd.DataFrame(data=0, columns=dev, index=range(n_traces))
    for t, moves in enumerate(deviations_per_trace):
        for move in moves:
            dev_df.at[t, move] = 1
    dev_df['trace_ID'] = [trace.attributes['concept:name'] for trace in log]

    # y_cum_test[ev]: labels per prefix length, initialised with the trace-level labels
    y_cum_test = {ev: dev_df.copy() for ev in range(1, max_ev + 1)}

    # Walk every alignment backwards and reset a label to 0 for all prefixes at which the
    # deviation has already happened.
    for t in range(n_traces):
        alignment = aligned_traces[t]['alignment']
        j = len(alignment) - 1  # iterator over moves in alignment, starting at the end
        m = event_count[t]  # iterator over event sequence, starting at the end
        while j >= 0:
            move = alignment[j]
            if move[1] is None:  # silent move: only step back in the alignment
                j -= 1
            elif move[0] == move[1]:  # synchronous move: step back in alignment and trace
                if event_order[t][m - 1] != move[0]:
                    # The original loop never advanced in this situation and hung forever.
                    raise ValueError(f'alignment of trace {t} does not match its event sequence at move {j}')
                j -= 1
                m -= 1
            elif event_order[t][m - 1] == move[0]:  # log move at event m
                for q in range(m, max_ev + 1):
                    y_cum_test[q].at[t, str(move)] = 0
                j -= 1
                m -= 1
            else:  # model move between event m and m + 1
                for q in range(m + 1, max_ev + 1):
                    y_cum_test[q].at[t, str(move)] = 0
                j -= 1
    ### y_cum_test holds information whether deviation happened after prefix length

    train_idx, test_idx = train_test_split(range(n_traces), test_size=split, random_state=0)
    caseIDs_test = [log[t].attributes['concept:name'] for t in test_idx]
    key_caseID_testIDX = {log[t].attributes['concept:name']: t for t in test_idx}
    caseIDs_train = [log[t].attributes['concept:name'] for t in train_idx]

    # predictions and labels are both indexed by case id and restricted to the test traces
    y_cum_pred = {ev: pd.DataFrame(data=0, columns=dev, index=caseIDs_test) for ev in range(1, max_ev + 1)}
    for ev in range(1, max_ev + 1):
        y_cum_test[ev] = y_cum_test[ev].set_index('trace_ID').drop(index=caseIDs_train)

    # mark every known deviation that shows up in the alignment of a predicted suffix
    known_deviations = set(dev)
    for t, aligned in enumerate(aligned_predictions):
        attributes = predicted_log[t].attributes
        for move in aligned['alignment']:
            key = str(move)
            if key in known_deviations:
                predictions = y_cum_pred[attributes['prefix']]
                # Suffixes of traces outside the test set have no row and are ignored.
                if attributes['trace_ID'] in predictions.index:
                    predictions.at[attributes['trace_ID'], key] = 1

    for ev in range(1, max_ev + 1):
        # drop all traces that do not go until prefix length; sort so rows of both frames line up
        too_short = [case_id for case_id in caseIDs_test if event_count[key_caseID_testIDX[case_id]] < ev]
        y_cum_test[ev] = y_cum_test[ev].drop(too_short).sort_index()
        y_cum_pred[ev] = y_cum_pred[ev].drop(too_short).sort_index()

    # Stack all prefix lengths. Always arrays, also for max_ev == 1, so the column slicing
    # for ROC AUC below works.
    y_pred_list = np.concatenate([y_cum_pred[ev].to_numpy() for ev in range(1, max_ev + 1)], axis=0)
    y_test = np.concatenate([y_cum_test[ev].to_numpy() for ev in range(1, max_ev + 1)], axis=0)
    print(len(y_pred_list))
    CM = sklearn.metrics.multilabel_confusion_matrix(y_test, y_pred_list)

    # object dtype: cells hold numbers, or the exception raised while computing them
    metrics = pd.DataFrame(data=0, columns=dev + ['NoDev' + d for d in dev],
                           index=['Precision', 'Recall', 'ROC_AUC'], dtype=object)

    for i, d in enumerate(dev):
        # CM[i] is [[TN, FP], [FN, TP]] for deviation i
        metrics.at['Precision', d] = CM[i][1][1] / (CM[i][1][1] + CM[i][0][1])
        metrics.at['Recall', d] = CM[i][1][1] / (CM[i][1][1] + CM[i][1][0])
        try:
            metrics.at['ROC_AUC', d] = sklearn.metrics.roc_auc_score(y_test[:, i], y_pred_list[:, i],
                                                                     average='macro')
        except Exception as er:
            metrics.at['ROC_AUC', d] = er
        # the negative class ("no deviation") is scored from the same matrix
        metrics.at['Precision', 'NoDev' + d] = CM[i][0][0] / (CM[i][0][0] + CM[i][1][0])
        metrics.at['Recall', 'NoDev' + d] = CM[i][0][0] / (CM[i][0][0] + CM[i][0][1])

    path = (os.getcwd() + '/Suffix_Prediction')
    out_file = path + '/' + z + '_suffix_pred_' + str(round(split, 2)) + '.xlsx'

    # The context manager closes the workbook even if writing fails.
    with pd.ExcelWriter(out_file, engine="xlsxwriter") as writer:
        metrics.to_excel(writer, sheet_name=('Metrics'))

In [None]:
# Load the predicted suffixes that are passed to suffix_prediction_patterns.
# The file is chosen by its position (14) in the directory listing of the input folder;
# adjust the index to your folder.
# NOTE: this overwrites the notebook-level global `file`, which suffix_prediction_patterns
# reads to derive the data-set name.
file= ask_for_path(REL_INPUT_PATH,14)# adjust to your path
# pickle.load can execute arbitrary code - only load suffix files from a trusted source.
with open(file, 'rb') as f:
    suffixes=pickle.load(f)
def suffix_prediction_patterns(log, aligned_traces, net, initial_marking, final_marking, suffixes,split = 1 / 3, relevance_ths = 0.5):
    """Evaluate deviations found in predicted suffixes, per deviation and per deviation pattern.

    The predicted suffixes are aligned against the process model; every non-conforming
    alignment move found in a prediction counts as a predicted deviation for the
    (trace, prefix) pair it belongs to. These predictions are compared with the labels
    derived from ``aligned_traces`` on the test split, and two Excel files are written:

    * ``<cwd>/Suffix_Prediction/<z>_suffix_pred_<split>.xlsx`` - metrics per deviation
    * ``BPDP_combinations/<z>_suffix.xlsx`` - metrics per deviation pattern (combination)

    where ``z`` is the base name of the notebook-level ``file`` path.

    Parameters
    ----------
    log : event log
        Iterable/indexable collection of traces; each trace exposes
        ``attributes['concept:name']`` and yields events with ``'concept:name'``.
    aligned_traces : sequence of dict
        ``aligned_traces[i]['alignment']`` is the list of ``(log_label, model_label)``
        moves for ``log[i]``.
    net, initial_marking, final_marking
        Forwarded unchanged to ``pm4py.conformance_diagnostics_alignments``.
    suffixes : pandas.DataFrame
        Predicted suffix events with at least the columns ``'case:concept:name'`` and
        ``'IDX'``. The frame is copied, the caller's object is left untouched.
        Rows are presumably ordered by trace and then by ``IDX`` - TODO confirm.
    split : float
        Test fraction passed to ``train_test_split`` (``random_state=0``).
    relevance_ths : float
        Minimum pairwise correlation (computed on the training traces) for deviations
        to be grouped into one pattern.

    Returns
    -------
    None
    """
    import warnings
    warnings.simplefilter('ignore')
    xt, z = os.path.split(file)  # `file` is the notebook-level path of the loaded suffix pickle

    def is_deviation(move):
        # Moves without a model label and moves whose log and model label agree are conformant.
        return not (move[1] is None or move[0] == move[1])

    def ratio(numerator, denominator):
        # 0/0 is reported as NaN (what numpy division yields) instead of raising.
        return numerator / denominator if denominator != 0 else float('nan')

    # ------------------------------------------------------------------
    # 1. Normalise the predicted suffixes and turn them into an event log
    # ------------------------------------------------------------------
    case_name_prefix = {
        'MPPN_BPIC_2020_request_deviation_detection_suffixe.pkl': 'request for payment ',
        'MPPN_BPIC_2020_international_deviation_detection_suffixe.pkl': 'declaration ',
        'MPPN_BPIC_2020_prepaid_deviation_detection_suffixe.pkl': 'request for payment ',
        'MPPN_BPIC_2020_domestic_deviation_detection_suffixe.pkl': 'declaration ',
    }
    # Work on a copy: mutating the caller's frame made a second call prepend the prefix twice.
    suffixes = suffixes.copy()
    if z in case_name_prefix:
        suffixes['case:concept:name'] = case_name_prefix[z] + suffixes['case:concept:name'].astype(str)
    suffixes = suffixes.rename(columns={'case:concept:name': 'case:trace_ID', 'IDX': 'case:IDX'})
    # One pm4py case per (trace, sample) pair.
    suffixes['case:concept:name'] = suffixes[["case:trace_ID", "case:IDX"]].astype(str).apply("_".join, axis=1)

    # Running prefix number: restarts at 1 for a new trace, increases whenever IDX changes
    # within the same trace.
    trace_ids = suffixes['case:trace_ID'].tolist()
    sample_ids = suffixes['case:IDX'].tolist()
    prefix_no = [1] * len(suffixes)
    for row in range(1, len(suffixes)):
        if trace_ids[row - 1] == trace_ids[row]:
            same_sample = sample_ids[row - 1] == sample_ids[row]
            prefix_no[row] = prefix_no[row - 1] + (0 if same_sample else 1)
    suffixes['case:prefix'] = prefix_no

    predicted_log = pm4py.convert_to_event_log(suffixes)
    aligned_predictions = pm4py.conformance_diagnostics_alignments(predicted_log, net, initial_marking, final_marking)

    # ------------------------------------------------------------------
    # 2. Ground truth: deviations per trace and per prefix length
    # ------------------------------------------------------------------
    dev = []  # stores all deviations that happened (string form of the alignment move)
    for i, trace in enumerate(log):
        for move in aligned_traces[i]['alignment']:
            if is_deviation(move) and str(move) not in dev:
                dev.append(str(move))

    # Data Frame that stores whether a deviation happened for each trace on trace level
    dev_df = pd.DataFrame(data=0, columns=dev, index=range(len(log)))
    event_order = {}  # dict with event sequences for each trace
    event_count = {}  # dict with trace length for each trace
    trace_names = []  # case id of each trace, positionally aligned with `log`
    max_ev = 0  # will be maximum trace length
    for k, trace in enumerate(log):
        event_order[k] = [event['concept:name'] for event in trace]
        event_count[k] = len(event_order[k])
        max_ev = max(max_ev, event_count[k])
        trace_names.append(trace.attributes['concept:name'])

    for i, trace in enumerate(log):
        for move in aligned_traces[i]['alignment']:
            if is_deviation(move):
                dev_df.at[i, str(move)] = 1
    dev_df['trace_ID'] = trace_names

    prefix_lengths = range(1, max_ev + 1)

    # y_cum_test[ev]: label frame per prefix length, initialised with the trace-level labels
    y_cum_test = {ev: dev_df.copy() for ev in prefix_lengths}

    # Walk every alignment backwards together with the event sequence and clear the label
    # for all prefixes at/after the position where the deviation has already happened.
    for i, trace in enumerate(log):
        alignment = aligned_traces[i]['alignment']
        j = len(alignment) - 1  # iterator over moves in alignment, starting at the end
        m = len(event_order[i])  # iterator over event sequence, starting at the end
        while j >= 0:
            move = alignment[j]
            if move[1] is None:  # silent move: only step back in the alignment
                j -= 1
            elif move[0] == move[1]:  # synchronous move: step back in alignment and events
                if event_order[i][m - 1] != move[0]:
                    # The original loop would spin forever here.
                    raise ValueError(
                        'alignment of trace ' + str(i) + ' is inconsistent with its event sequence')
                j -= 1
                m -= 1
            elif event_order[i][m - 1] == move[0]:  # log move detected
                for q in range(m, max_ev + 1):
                    # deviation happened at event m, so not after prefixes >= m
                    y_cum_test[q].at[i, str(move)] = 0
                j -= 1
                m -= 1
            elif m == max_ev:
                j -= 1
            else:  # model move detected
                for q in range(m + 1, max_ev + 1):
                    # deviation happened between m and m+1, so not after prefixes > m
                    y_cum_test[q].at[i, str(move)] = 0
                j -= 1
    ### y_cum_test holds information whether deviation happened after prefix length

    # ------------------------------------------------------------------
    # 3. Deviation patterns: groups of deviations correlated on the training traces
    # ------------------------------------------------------------------
    x_train_idx, x_test_idx, y_train_idx, y_test_idx = train_test_split(range(len(log)), range(len(log)), test_size=split,
                                                                        random_state=0)
    trainin_dev_df = dev_df.loc[x_train_idx]
    # Restrict to the deviation columns: `trace_ID` holds case ids and is no feature.
    corrMatrix = trainin_dev_df[dev].corr()
    corrMatrix.loc[:, :] = np.tril(corrMatrix, k=-1)  # keep each pair once (strict lower triangle)

    already_in = set()
    max_combs_l = []
    for col in corrMatrix:
        perfect_corr = corrMatrix[col][corrMatrix[col] >= relevance_ths].index.tolist()
        if perfect_corr and col not in already_in:
            already_in.update(set(perfect_corr))
            perfect_corr.append(col)
            max_combs_l.append(perfect_corr)

    # Keep only patterns whose member deviations all occur in the test traces.
    # A new list is built because removing while iterating skipped the following element.
    test_dev_counts = dev_df.loc[x_test_idx, dev].sum()
    kept_combs = []
    for comb in max_combs_l:
        if any(test_dev_counts[member] == 0 for member in comb):
            print(comb)
        else:
            kept_combs.append(comb)
    max_combs_l = kept_combs
    max_combs = {str(comb): comb for comb in max_combs_l}

    def merge_into_combinations(frame, prefix, trace_length):
        # Add one column per pattern; a row that shows every member deviation gets the
        # pattern set to 1 and the member deviations reset to 0.
        merged = frame.copy(deep=True)
        for comb_name, members in max_combs.items():
            merged[comb_name] = 0
            for row in list(merged.index):
                if trace_length(row) < prefix:
                    continue
                if all(merged.at[row, member] == 1 for member in members):
                    for member in members:
                        merged.at[row, member] = 0
                    merged.at[row, comb_name] = 1
        return merged

    columns_needed = list(max_combs.keys())
    columns_needed.append('trace_ID')
    y_cum_test_o_combs = {}
    for prefix in prefix_lengths:
        merged = merge_into_combinations(y_cum_test[prefix], prefix, lambda row: event_count[row])
        y_cum_test_o_combs[prefix] = merged[columns_needed]

    # ------------------------------------------------------------------
    # 4. Predicted deviations per (test trace, prefix)
    # ------------------------------------------------------------------
    caseIDs_test = [trace_names[idx] for idx in x_test_idx]
    key_caseID_testIDX = {trace_names[idx]: idx for idx in x_test_idx}
    caseIDs_train = [trace_names[idx] for idx in x_train_idx]

    y_cum_pred = {ev: pd.DataFrame(data=0, columns=dev, index=caseIDs_test) for ev in prefix_lengths}
    for ev in prefix_lengths:
        y_cum_test[ev] = y_cum_test[ev].set_index('trace_ID').drop(index=caseIDs_train)
        y_cum_test_o_combs[ev] = y_cum_test_o_combs[ev].set_index('trace_ID').drop(index=caseIDs_train)

    known_devs = set(dev)
    for i, alignment in enumerate(aligned_predictions):
        attributes = predicted_log[i].attributes
        for move in alignment['alignment']:
            label = str(move)
            if label in known_devs:
                frame = y_cum_pred[attributes['prefix']]
                # Predictions for traces outside the test split are ignored.
                if attributes['trace_ID'] in frame.index:
                    frame.at[attributes['trace_ID'], label] = 1

    y_cum_pred_o_combs = {}
    for prefix in prefix_lengths:
        merged = merge_into_combinations(
            y_cum_pred[prefix], prefix, lambda case_id: event_count[key_caseID_testIDX[case_id]])
        y_cum_pred_o_combs[prefix] = merged[list(max_combs.keys())]

    # ------------------------------------------------------------------
    # 5. Metrics
    # ------------------------------------------------------------------
    def trim_and_sort(frames):
        # Drop all trace labels that do not go until the prefix length, then align row order.
        for ev in prefix_lengths:
            drop_idx = [case_id for case_id in caseIDs_test
                        if event_count[key_caseID_testIDX[case_id]] < ev]
            frames[ev] = frames[ev].drop(drop_idx).sort_index()

    def stack_prefixes(frames):
        # One row per (trace, prefix), all prefix lengths below each other.
        return np.concatenate([frames[ev].to_numpy() for ev in prefix_lengths], axis=0)

    def fill_metrics(frame, labels, CM, y_true, y_pred, with_support):
        for i, label in enumerate(labels):
            (tn, fp), (fn, tp) = CM[i]
            frame.loc['Precision', label] = ratio(tp, tp + fp)
            frame.loc['Recall', label] = ratio(tp, tp + fn)
            if with_support:
                frame.loc['Support', label] = tp + fn
            try:
                roc_auc = sklearn.metrics.roc_auc_score(y_true[:, i], y_pred[:, i], average='macro')
            except Exception as er:
                # Keep the message as text: an exception object cannot be written to Excel.
                roc_auc = str(er)
            frame.loc['ROC_AUC', label] = roc_auc
            frame.loc['Precision', 'NoDev' + label] = ratio(tn, tn + fn)
            frame.loc['Recall', 'NoDev' + label] = ratio(tn, tn + fp)
            if with_support:
                frame.loc['Support', 'NoDev' + label] = tn + fp

    # --- per single deviation ---
    trim_and_sort(y_cum_test)
    trim_and_sort(y_cum_pred)
    y_pred_list = stack_prefixes(y_cum_pred)
    y_test = stack_prefixes(y_cum_test)
    print(len(y_pred_list))
    CM = sklearn.metrics.multilabel_confusion_matrix(y_test, y_pred_list)

    # object dtype: cells hold ints, floats or an error text
    metrics = pd.DataFrame(data=0, columns=dev + ['NoDev' + d for d in dev],
                           index=['Precision', 'Recall', 'ROC_AUC'], dtype=object)
    fill_metrics(metrics, dev, CM, y_test, y_pred_list, with_support=False)

    path = (os.getcwd() + '/Suffix_Prediction')
    os.makedirs(path, exist_ok=True)
    with pd.ExcelWriter(
            path + '/' + z + '_suffix_pred_' + str(round(split, 2)) + '.xlsx',
            engine="xlsxwriter") as writer:
        metrics.to_excel(writer, sheet_name=('Metrics'))

    # --- per deviation pattern ---
    trim_and_sort(y_cum_pred_o_combs)
    trim_and_sort(y_cum_test_o_combs)
    unique_rel_combs = list(max_combs.keys())
    metrics_comb = pd.DataFrame(data=0, columns=unique_rel_combs + ['NoDev' + c for c in unique_rel_combs],
                                index=['Precision', 'Recall', 'Support', 'ROC_AUC'], dtype=object)
    if unique_rel_combs:  # nothing to score when no pattern survived the filtering
        y_pred_list = stack_prefixes(y_cum_pred_o_combs)
        y_test = stack_prefixes(y_cum_test_o_combs)
        CM = sklearn.metrics.multilabel_confusion_matrix(y_test, y_pred_list)
        print(CM)
        fill_metrics(metrics_comb, unique_rel_combs, CM, y_test, y_pred_list, with_support=True)

    os.makedirs('BPDP_combinations', exist_ok=True)
    with pd.ExcelWriter('BPDP_combinations/' + z + '_suffix.xlsx', engine="xlsxwriter") as writer:
        metrics_comb.to_excel(writer, sheet_name=('Metrics'))

In [None]:
# Run the pattern-level evaluation of the predicted suffixes with the default split and threshold.
suffix_prediction_patterns(log, aligned_traces, net, initial_marking, final_marking, suffixes)

In [None]:
# Pick entry 7 of the listing of REL_INPUT_PATH (non-interactive selection) and load the pickle into `cm`.
# NOTE(review): pickle.load can execute arbitrary code; only open pickles from a trusted source.
REL_INPUT_PATH = "/../mppn_cms/" # here lie the event logs (.csv), the to-be model (.bpmn) and the already aligned traces (.pkl)
file= ask_for_path(REL_INPUT_PATH,7)# adjust to your path
with open(file, 'rb') as f:
    cm=pickle.load(f)
def get_metrics_IDP_MPPN(cm):
    """Derive precision, recall, support and ROC AUC from stored results and export them to Excel.

    Parameters
    ----------
    cm : dict
        Maps a label ``d`` (str) to a dict with the entries

        * ``'conf_matrix'`` - indexable as ``[k][row][col]``; ``k = 1`` is reported under
          ``d`` and ``k = 0`` under ``'NoDev' + d``. For each ``k`` the cell ``[1][1]`` is
          used as true positives, ``[0][1]`` as false positives and ``[1][0]`` as false
          negatives.
        * ``'targets'`` and ``'predictions'`` - passed to ``sklearn.metrics.roc_auc_score``.

    Returns
    -------
    None
        The metrics are written to ``<cwd>/MPPN2End/<z>_MPPN2End.xlsx`` where ``z`` is the
        base name of the notebook-level ``file`` path.
    """
    def ratio(numerator, denominator):
        # 0/0 is reported as NaN (what numpy division yields) instead of raising.
        return numerator / denominator if denominator != 0 else float('nan')

    labels = list(cm.keys())
    # All columns are created up front with object dtype: cells hold ints, floats or an
    # error text, and are written with .loc (chained `frame[col][row] = v` is not reliable).
    metrics = pd.DataFrame(data=0, columns=labels + ['NoDev' + d for d in labels],
                           index=['Precision', 'Recall', 'Support', 'ROC_AUC'], dtype=object)

    for d in labels:
        result = cm[d]
        for column, matrix in (('NoDev' + d, result['conf_matrix'][0]), (d, result['conf_matrix'][1])):
            tp, fp, fn = matrix[1][1], matrix[0][1], matrix[1][0]
            metrics.loc['Precision', column] = ratio(tp, tp + fp)
            metrics.loc['Recall', column] = ratio(tp, tp + fn)
            metrics.loc['Support', column] = tp + fn
        try:
            roc_auc = sklearn.metrics.roc_auc_score(result['targets'], result['predictions'], average='weighted')
        except Exception as er:
            # Keep the message as text: an exception object cannot be written to Excel.
            roc_auc = str(er)
        metrics.loc['ROC_AUC', d] = roc_auc

    path = (os.getcwd() + '/MPPN2End')
    os.makedirs(path, exist_ok=True)
    xt, z = os.path.split(file)  # `file` is the notebook-level path of the loaded pickle

    # The context manager closes the workbook even if the export fails.
    with pd.ExcelWriter(
            path + '/' + z + '_MPPN2End.xlsx',
            engine="xlsxwriter") as writer:
        metrics.to_excel(writer, sheet_name=('Metrics'))
# Compute the metrics for the loaded `cm` and export them to Excel.
get_metrics_IDP_MPPN(cm)

In [None]:
# Display the complete loaded results object.
cm

In [None]:
# Display the entry stored under the fourth key of `cm`.
cm[list(cm.keys())[3]]

In [None]:
# Manual check of the recall that get_metrics_IDP_MPPN reports for the fourth key of `cm`:
# [1][1][1] / ([1][1][1] + [1][1][0]). Numerator and denominator must come from the same
# entry (the denominator previously read the first key's matrix).
cm[list(cm.keys())[3]]['conf_matrix'][1][1][1] / (
                    cm[list(cm.keys())[3]]['conf_matrix'][1][1][1] + cm[list(cm.keys())[3]]['conf_matrix'][1][1][0])

In [None]:
##### The comparisons to other design choices start here

In [None]:
def IDP_no_imbalance(log, ref_log, aligned_traces, split=1/3, early_stop=True):
    """Train one two-output classifier per deviation type without any imbalance handling.

    For every deviation found in ``aligned_traces`` a fresh ``BinaryClassificationIndiv``
    network is trained with ``nn.CrossEntropyLoss`` on the targets ``[deviation, NoDev]``
    over the encoded prefixes of all lengths, and evaluated on the test split. Precision,
    recall and ROC AUC per deviation are written to
    ``<cwd>/BPDP_Classifier/<z>_MC_CrossEntropyLoss_NoImbalance_<early_stop>_<split>.xlsx``
    where ``z`` is the base name of the notebook-level ``file`` path.

    Relies on helpers defined elsewhere in the notebook: ``complex_index_encoding``,
    ``BinaryClassificationIndiv``, ``TrainData``, ``TestData``, ``EarlyStopping`` and
    ``binary_acc``.

    Parameters
    ----------
    log : event log
        Traces to learn from; re-encoded in place by ``complex_index_encoding`` once per
        prefix length.
    ref_log : event log
        Encoded once with prefix length 4000; its one-hot columns define the feature set.
    aligned_traces : sequence of dict
        ``aligned_traces[i]['alignment']`` is the list of ``(log_label, model_label)``
        moves for ``log[i]``.
    split : float
        Test fraction passed to ``train_test_split`` (``random_state=0``); 20 % of the
        remaining training traces become the validation set.
    early_stop : bool
        ``True``: up to 300 epochs, stopped by ``EarlyStopping`` on the validation loss.
        ``False``: exactly 30 epochs.

    Returns
    -------
    None
    """
    import warnings
    warnings.simplefilter('ignore')
    xt, z = os.path.split(file)  # `file` is the notebook-level path of the loaded alignments

    def is_deviation(move):
        # Moves without a model label and moves whose log and model label agree are conformant.
        return not (move[1] is None or move[0] == move[1])

    def ratio(numerator, denominator):
        # 0/0 is reported as NaN (what numpy division yields) instead of raising.
        return numerator / denominator if denominator != 0 else float('nan')

    # ------------------------------------------------------------------
    # 1. Labels: which deviation still lies ahead after each prefix length
    # ------------------------------------------------------------------
    dev = []  # stores all deviations that happened (string form of the alignment move)
    for i, trace in enumerate(log):
        for move in aligned_traces[i]['alignment']:
            if is_deviation(move) and str(move) not in dev:
                dev.append(str(move))
    print(len(dev), dev)

    # Data Frame that stores whether a deviation happened for each trace on trace level
    dev_df = pd.DataFrame(data=0, columns=dev, index=range(len(log)))
    event_order = {}  # dict with event sequences for each trace
    event_count = {}  # dict with trace length for each trace
    max_ev = 0  # will be maximum trace length
    for k, trace in enumerate(log):
        event_order[k] = [event['concept:name'] for event in trace]
        event_count[k] = len(event_order[k])
        max_ev = max(max_ev, event_count[k])

    for i, trace in enumerate(log):
        for move in aligned_traces[i]['alignment']:
            if is_deviation(move):
                dev_df.at[i, str(move)] = 1

    print(dev_df.sum())

    prefix_lengths = range(1, max_ev + 1)
    # Traces that do not reach a given prefix length carry no sample for it.
    too_short = {ev: [t for t in range(len(log)) if event_count[t] < ev] for ev in prefix_lengths}
    y_cum_test = {ev: dev_df.drop(too_short[ev]) for ev in prefix_lengths}

    # Walk every alignment backwards together with the event sequence and clear the label
    # for all prefixes at/after the position where the deviation has already happened.
    # Only prefixes up to the trace length are touched: longer ones hold no row for it.
    for i, trace in enumerate(log):
        alignment = aligned_traces[i]['alignment']
        j = len(alignment) - 1  # iterator over moves in alignment, starting at the end
        m = len(event_order[i])  # iterator over event sequence, starting at the end
        while j >= 0:
            move = alignment[j]
            if move[1] is None:  # silent move: only step back in the alignment
                j -= 1
            elif move[0] == move[1]:  # synchronous move: step back in alignment and events
                if event_order[i][m - 1] != move[0]:
                    # The original loop would spin forever here.
                    raise ValueError(
                        'alignment of trace ' + str(i) + ' is inconsistent with its event sequence')
                j -= 1
                m -= 1
            elif event_order[i][m - 1] == move[0]:  # log move detected
                for q in range(m, event_count[i] + 1):
                    y_cum_test[q].at[i, str(move)] = 0
                j -= 1
                m -= 1
            elif m == max_ev:
                j -= 1
            else:  # model move detected
                for q in range(m + 1, event_count[i] + 1):
                    y_cum_test[q].at[i, str(move)] = 0
                j -= 1
    ### y_cum_test holds information whether deviation happened after prefix length

    # ------------------------------------------------------------------
    # 2. Features: one-hot encoded case attributes per prefix length
    # ------------------------------------------------------------------
    loan_application_files = ('aligned_traces_12A.pkl', 'aligned_traces_12O.pkl', 'aligned_traces_12AO.pkl')
    identifier_columns = {  # columns dropped before encoding, per data set
        'aligned_traces_20int.pkl': ['Permit travel permit number','DeclarationNumber','travel permit number','id','Permit ID', 'Permit id'],
        'aligned_traces_20dom.pkl': ['DeclarationNumber','id'],
        'aligned_traces_20prep.pkl': ['RfpNumber','Rfp_id','Permit travel permit number','Permit id'],
        'aligned_traces_20RfP.pkl': ['RfpNumber','Rfp_id'],
    }

    def encode_case_attributes(event_log):
        # One row per case holding its `case:` attributes, one-hot encoded.
        frame = pm4py.convert_to_dataframe(event_log)
        frame = frame.drop_duplicates(subset=['case:concept:name'])
        frame = frame.filter(like='case:', axis=1)
        frame = frame.drop('case:concept:name', axis=1)
        frame.columns = frame.columns.str.replace('case:', '')
        frame = frame.reset_index()
        raw_dat = frame.drop('index', axis=1)
        if z in loan_application_files:
            raw_dat['AMOUNT_REQ'] = pd.to_numeric(raw_dat['AMOUNT_REQ'])
            clean_dat = raw_dat.drop('REG_DATE', axis=1)
        elif z in identifier_columns:
            clean_dat = raw_dat.drop(identifier_columns[z], axis=1)
        else:
            clean_dat = raw_dat.copy()
        return pd.get_dummies(clean_dat)

    ## ref_log will have all attributes that will be the columns for X_test and X_train
    ref_log = complex_index_encoding(ref_log, 4000)
    ref_enc_dat = encode_case_attributes(ref_log)

    EPOCHS = 30  # used without early stopping
    EPOCHS_EARLY_STOP = 300  # upper bound when early stopping is active
    BATCH_SIZE = 128
    LEARNING_RATE = 0.0001

    X_cum = {}
    for prefix in prefix_lengths:
        complex_index_encoding(log, prefix)
        enc_dat = encode_case_attributes(log)
        for key in ref_enc_dat.columns:
            if not key in enc_dat.columns:
                enc_dat[key] = 0  # pad all prefixes to maximum lengths with 0
        # NOTE(review): padded columns are appended at the end, so the column ORDER may
        # differ between prefix lengths before the arrays are stacked - confirm, and
        # consider enc_dat.reindex(columns=ref_enc_dat.columns, fill_value=0).
        imp = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=0)
        enc_dat = pd.DataFrame(data=imp.fit_transform(enc_dat), columns=enc_dat.columns)
        X_cum[prefix] = enc_dat.drop(too_short[prefix])

    # ------------------------------------------------------------------
    # 3. Train / validation / test split on trace level (identical for every deviation)
    # ------------------------------------------------------------------
    x_train_idx, x_test_idx, y_train_idx, y_test_idx = train_test_split(range(len(log)), range(len(log)), test_size=split, random_state=0)
    print('index length ', len(x_train_idx),len(x_test_idx),len(y_train_idx),len(y_test_idx))
    x_train_idx, x_val_idx, y_train_idx, y_val_idx = train_test_split(x_train_idx, x_train_idx, test_size=0.2, random_state=0)

    rows = {}  # rows[prefix][part] -> trace indices of that part that reach the prefix length
    for prefix in prefix_lengths:
        short = set(too_short[prefix])
        rows[prefix] = {
            'test': list(set(y_test_idx) - short),
            'train': list(set(y_train_idx) - short),
            'val': list(set(y_val_idx) - short),
        }
        print('subset length ', prefix, len(rows[prefix]['test']), len(rows[prefix]['train']),
              len(rows[prefix]['test']), len(rows[prefix]['train']))

    def stack(frames, part):
        # combine the data of all prefixes into one array
        return np.concatenate(
            [frames[prefix].loc[rows[prefix][part]].to_numpy().astype(float) for prefix in prefix_lengths],
            axis=0)

    # NOTE(review): the scaler is re-fitted on the test and validation data instead of
    # re-using the training statistics; kept as in the original - confirm this is intended.
    scaler = StandardScaler()
    X_test = scaler.fit_transform(stack(X_cum, 'test'))
    X_train = scaler.fit_transform(stack(X_cum, 'train'))
    X_val = scaler.fit_transform(stack(X_cum, 'val'))

    # object dtype: cells hold ints, floats or an error text
    metrics = pd.DataFrame(data=0, columns=dev + ['NoDev' + d for d in dev],
                           index=['Precision', 'Recall', 'ROC_AUC'], dtype=object)

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    # ------------------------------------------------------------------
    # 4. One classifier per deviation
    # ------------------------------------------------------------------
    for d in dev:
        Y_cum_dev = {}
        for prefix in prefix_lengths:
            targets = y_cum_test[prefix][[d]].copy()
            targets['NoDev'] = 1 - targets[d]
            Y_cum_dev[prefix] = targets
            if prefix == 1:
                print(targets.columns)

        y_test = stack(Y_cum_dev, 'test')
        y_train = stack(Y_cum_dev, 'train')
        y_val = stack(Y_cum_dev, 'val')
        print(d, len(X_train),len(y_train),len(X_val),len(y_val),len(X_test),len(y_test))
        print('split done')

        model = BinaryClassificationIndiv(no_columns=ref_enc_dat.shape[1])
        model.to(device)

        criterion = nn.CrossEntropyLoss()
        optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

        train_data = TrainData(torch.FloatTensor(X_train),
                            torch.FloatTensor(y_train))
        train_loader = DataLoader(dataset=train_data, batch_size=BATCH_SIZE, shuffle=False)

        if not early_stop:
            print(d, 'training start no ES')
            model.train()
            for e in range(1, EPOCHS+1):
                epoch_loss = 0
                epoch_acc = 0
                for X_batch, y_batch in train_loader:
                    X_batch, y_batch = X_batch.to(device), y_batch.to(device)
                    optimizer.zero_grad()

                    y_pred = model(X_batch)

                    # Logits and targets are passed as (batch, 2). Inserting a dimension
                    # at position 1 made CrossEntropyLoss see a single class, so the
                    # loss was always 0 and nothing was learned.
                    loss = criterion(y_pred, y_batch)
                    acc = binary_acc(y_pred, y_batch)/len(y_pred[0])

                    loss.backward()
                    optimizer.step()

                    epoch_loss += loss.item()
                    epoch_acc += acc.item()

                print(f'Epoch {e+0:03}: | Loss: {epoch_loss/len(train_loader):.5f} | Acc: {epoch_acc/len(train_loader):.3f} |', d)

        else:
            print('training start with ES')
            # Validation data must live on the same device as the model.
            X_val_tensor = torch.FloatTensor(X_val).to(device)
            y_val_tensor = torch.FloatTensor(y_val).to(device)

            es = EarlyStopping()
            done = False

            epoch = 0
            while epoch<EPOCHS_EARLY_STOP and not done:
                epoch += 1
                steps = list(enumerate(train_loader))
                pbar = tqdm.tqdm(steps)
                model.train()
                epoch_acc = 0
                epoch_loss=0
                for step, (x_batch, y_batch) in pbar:
                    x_batch, y_batch = x_batch.to(device), y_batch.to(device)
                    optimizer.zero_grad()
                    y_batch_pred = model(x_batch)

                    loss = criterion(y_batch_pred, y_batch)
                    acc = binary_acc(y_batch_pred, y_batch)/len(y_batch_pred[0])

                    loss.backward()
                    optimizer.step()
                    epoch_acc += acc.item()
                    epoch_loss += loss.item()

                    if step == len(steps)-1:
                        # end of epoch: validation loss decides about early stopping
                        model.eval()
                        with torch.no_grad():
                            pred = model(X_val_tensor)
                            vloss = criterion(pred, y_val_tensor)
                        if es(model,vloss): done = True
                        pbar.set_description(f"Epoch: {epoch}, tloss: {epoch_loss/len(train_loader)}, Acc: {epoch_acc/len(train_loader):.3f}, vloss: {vloss:>7f}, EStop:[{es.status}]")
                    else:
                        pbar.set_description(f"Epoch: {epoch}, tloss {epoch_loss/len(train_loader):}, Acc: {epoch_acc/len(train_loader):.3f}")

        model.eval()
        test_data = TestData(torch.FloatTensor(X_test))
        test_loader = DataLoader(dataset=test_data, batch_size=1)

        y_pred_list = []
        with torch.no_grad():
            for X_batch in test_loader:
                X_batch = X_batch.to(device)
                # dim=1 is what the implicit-dim softmax picks for 2-D input
                y_test_pred = torch.nn.functional.softmax(model(X_batch), dim=1)
                y_pred_tag = torch.round(y_test_pred)
                y_pred_list.append(y_pred_tag.cpu().numpy())

        y_pred_list = [a.squeeze().tolist() for a in y_pred_list]

        CM = sklearn.metrics.multilabel_confusion_matrix(y_test, y_pred_list)

        # CM[0] belongs to the deviation column, CM[1] to the NoDev column
        metrics.loc['Precision', d] = ratio(CM[0][1][1], CM[0][1][1]+CM[0][0][1])
        metrics.loc['Recall', d] = ratio(CM[0][1][1], CM[0][1][1]+CM[0][1][0])
        try:
            roc_auc = sklearn.metrics.roc_auc_score(y_test, y_pred_list, average='weighted')
        except Exception as er:
            # Keep the message as text: an exception object cannot be written to Excel.
            roc_auc = str(er)
        metrics.loc['ROC_AUC', d] = roc_auc
        metrics.loc['Precision', 'NoDev'+d] = ratio(CM[1][1][1], CM[1][1][1]+CM[1][0][1])
        metrics.loc['Recall', 'NoDev'+d] = ratio(CM[1][1][1], CM[1][1][1]+CM[1][1][0])
        print(CM)

    # ------------------------------------------------------------------
    # 5. Export (the workbook is opened only now, so a failed run leaves no open writer)
    # ------------------------------------------------------------------
    path=(os.getcwd()+'/BPDP_Classifier')
    os.makedirs(path, exist_ok=True)
    with pd.ExcelWriter(path+'/'+z+'_MC_CrossEntropyLoss_NoImbalance_'+str(early_stop)+'_'+str(round(split,2))+'.xlsx', engine="xlsxwriter") as writer:
        metrics.to_excel(writer, sheet_name=('Metrics'))

In [None]:
# we define our FFN
class BinaryClassification(nn.Module):
    """Feed-forward network that emits one raw logit per deviation type.

    Architecture: ``no_columns -> 2048 -> 2048 -> 1024 -> no_devs``, with a
    LeakyReLU followed by a LayerNorm after each hidden layer and dropout
    (p=0.1) in front of the output layer.

    Args:
        no_columns: Width of the input feature vector.
        no_devs: Number of output logits.

    The forward pass returns logits; no sigmoid is applied inside the model.
    """

    def __init__(self, no_columns, no_devs):
        super().__init__()
        wide, narrow = 2048, 1024

        # Sub-modules are registered in a fixed order so that parameter
        # initialisation and ``state_dict`` key order stay stable.
        self.layer_1 = nn.Linear(no_columns, wide)
        self.activation1 = nn.LeakyReLU()
        self.layer_2 = nn.Linear(wide, wide)
        self.activation2 = nn.LeakyReLU()
        self.layer_3 = nn.Linear(wide, narrow)
        self.activation3 = nn.LeakyReLU()
        self.layer_out = nn.Linear(narrow, no_devs)

        self.dropout = nn.Dropout(p=0.1)
        # Despite the attribute names these are LayerNorm modules.
        self.batchnorm1 = nn.LayerNorm(wide)
        self.batchnorm2 = nn.LayerNorm(narrow)
        # Not used in forward(); kept so the attribute stays available.
        self.Sigmoid = nn.Sigmoid()

    def forward(self, inputs):
        """Map a batch of feature vectors to a batch of per-deviation logits."""
        # NOTE(review): the same LayerNorm instance (batchnorm1) normalises the
        # output of both 2048-wide layers, so its affine parameters are shared
        # - confirm this is intended.
        hidden = self.batchnorm1(self.activation1(self.layer_1(inputs)))
        hidden = self.batchnorm1(self.activation2(self.layer_2(hidden)))
        hidden = self.batchnorm2(self.activation3(self.layer_3(hidden)))
        return self.layer_out(self.dropout(hidden))

In [None]:
def _cibe_case_features(event_log, dataset_file):
    """Return the one-hot encoded case-level attribute table of ``event_log``.

    The log is converted to a DataFrame, reduced to one row per case, and only
    the ``case:`` columns are kept (prefix stripped, case id dropped).
    Dataset-specific identifier columns are removed depending on
    ``dataset_file`` before ``pd.get_dummies`` is applied.
    """
    frame = pm4py.convert_to_dataframe(event_log)
    frame = frame.drop_duplicates(subset=['case:concept:name'])
    frame = frame.filter(like='case:', axis=1)
    frame = frame.drop('case:concept:name', axis=1)
    frame.columns = frame.columns.str.replace('case:', '')
    frame = frame.reset_index()
    raw_dat = frame.drop('index', axis=1)

    # dataset-specific preparation (i.e., redundant attributes, conversion to numeric)
    if dataset_file in ('aligned_traces_12A.pkl', 'aligned_traces_12O.pkl', 'aligned_traces_12AO.pkl'):
        raw_dat['AMOUNT_REQ'] = pd.to_numeric(raw_dat['AMOUNT_REQ'])
        clean_dat = raw_dat.drop('REG_DATE', axis=1)
    else:
        redundant_columns = {
            'aligned_traces_20int.pkl': ['Permit travel permit number', 'DeclarationNumber', 'travel permit number', 'id', 'Permit ID', 'Permit id'],
            'aligned_traces_20dom.pkl': ['DeclarationNumber', 'id'],
            'aligned_traces_20prep.pkl': ['RfpNumber', 'Rfp_id', 'Permit travel permit number', 'Permit id'],
            'aligned_traces_20RfP.pkl': ['RfpNumber', 'Rfp_id'],
        }.get(dataset_file)
        if redundant_columns is None:
            clean_dat = raw_dat.copy()
        else:
            clean_dat = raw_dat.drop(redundant_columns, axis=1)
    return pd.get_dummies(clean_dat)


def IDP_collective_CIBE(log, ref_log, aligned_traces, u_sample=True, split=1/3, early_stop=True):
    """Train one multi-label network over all prefix lengths and export its metrics.

    Steps:
      1. Collect every deviating alignment move (neither silent nor
         synchronous) as a deviation type.
      2. For every prefix length build a label table that says, per trace,
         which deviation types are still marked as upcoming after that prefix.
      3. Encode every prefix with ``complex_index_encoding`` and one-hot
         encode the case attributes, aligned to the columns of the encoded
         ``ref_log``.
      4. Split traces into train/test (optionally undersample the training
         traces with ``OneSidedSelection``), split off 20 % of the training
         traces for validation, and stack the rows of all prefix lengths.
      5. Train ``BinaryClassification`` with ``MultiLabelSoftMarginLoss`` and
         write per-deviation precision/recall/ROC AUC to an Excel workbook.

    Args:
        log: Event log; iterated trace by trace, each event is read through
            ``event['concept:name']``. It is passed to
            ``complex_index_encoding(log, prefix)`` whose return value is
            ignored, so that helper is presumably expected to modify ``log``
            in place - confirm against its definition.
        ref_log: Log passed to ``complex_index_encoding(ref_log, 4000)``; the
            encoded result defines the feature columns.
        aligned_traces: Indexable by trace position; each entry provides
            ``['alignment']``, a sequence of ``(log_label, model_label)`` moves.
        u_sample: If true, undersample the training traces.
        split: Test fraction handed to ``train_test_split``.
        early_stop: If true, train with ``EarlyStopping`` on the validation
            loss; otherwise train for the fixed number of epochs.

    Returns:
        None. The metrics table is written to
        ``<cwd>/BPDP_Classifier/<name>_BPDP_single_classifier_testcounts<early_stop>_<split>.xlsx``.

    Raises:
        ValueError: If no deviation is found, or if a synchronous alignment
            move does not match the event sequence of its trace.

    Notes:
        Relies on names defined elsewhere in the notebook: the global ``file``
        (its base name selects the dataset-specific preprocessing and the
        output file name), ``complex_index_encoding``, ``TrainData``,
        ``TestData``, ``binary_acc`` and ``EarlyStopping``. Warnings are
        silenced globally via ``warnings.simplefilter('ignore')``.
    """
    import warnings
    warnings.simplefilter('ignore')

    # `file` is a notebook-level global; only its base name is used here.
    z = os.path.split(file)[1]

    EPOCHS = 50
    BATCH_SIZE = 128
    LEARNING_RATE = 0.001

    n_traces = len(log)

    def is_deviation(move):
        # silent moves (no model label) and synchronous moves are not deviations
        return not (move[1] is None or move[0] == move[1])

    #### collect the deviations of every trace; `dev` keeps first-seen order
    dev = []  # stores all deviations that happened
    known_devs = set()
    trace_devs = []  # deviations per trace, in alignment order
    for i in range(n_traces):
        found = [str(move) for move in aligned_traces[i]['alignment'] if is_deviation(move)]
        trace_devs.append(found)
        for name in found:
            if name not in known_devs:
                known_devs.add(name)
                dev.append(name)
    if not dev:
        raise ValueError('No deviations found in aligned_traces; nothing to predict.')

    # Data Frame that stores whether a deviation happened for each trace on trace level
    dev_df = pd.DataFrame(data=0, columns=dev, index=range(n_traces))
    for i, found in enumerate(trace_devs):
        for name in found:
            dev_df.at[i, name] = 1  # explicit indexer instead of chained assignment

    event_order = {k: [event['concept:name'] for event in trace] for k, trace in enumerate(log)}  # event sequences
    event_count = {k: len(sequence) for k, sequence in event_order.items()}  # trace lengths
    max_ev = max(event_count.values(), default=0)  # maximum trace length

    # traces that are shorter than a given prefix length have no row for that prefix
    too_short = {
        prefix: [t for t in range(n_traces) if event_count[t] < prefix]
        for prefix in range(1, max_ev + 1)
    }

    # keys are prefix lengths, entries are Data Frames with index = trace and columns = deviation
    y_cum_test = {prefix: dev_df.copy().drop(too_short[prefix]) for prefix in range(1, max_ev + 1)}

    # Walk every alignment backwards and clear the label for all prefixes at or
    # after the position of a deviation.
    # NOTE(review): when the same deviation occurs more than once in a trace,
    # an earlier occurrence also clears the prefixes that lie before the later
    # occurrence - confirm this labelling is intended.
    for i in range(n_traces):
        alignment = aligned_traces[i]['alignment']
        last_prefix = event_count[i]  # trace i only has label rows up to its own length
        j = len(alignment) - 1  # iterator over moves in alignment, starting at the end
        m = len(event_order[i])  # iterator over event sequence, starting at the end
        while j >= 0:
            log_label, model_label = alignment[j][0], alignment[j][1]
            at_event = m > 0 and event_order[i][m - 1] == log_label
            if model_label is None:  # silent move: only step back in the alignment
                j -= 1
            elif log_label == model_label:  # synchronous move
                if not at_event:
                    # the original loop would never terminate in this situation
                    raise ValueError(
                        'Alignment of trace ' + str(i) + ' does not match its event sequence at move ' + str(j))
                j -= 1
                m -= 1
            elif at_event:  # log move detected
                for q in range(m, last_prefix + 1):
                    y_cum_test[q].at[i, str(alignment[j])] = 0
                j -= 1
                m -= 1
            else:  # model move detected (between event m and m+1)
                for q in range(m + 1, last_prefix + 1):
                    y_cum_test[q].at[i, str(alignment[j])] = 0
                j -= 1
    ### y_cum_test holds information whether deviation happened after prefix length

    ## ref_log will have all attributes that will be the columns for X_test and X_train
    ref_log = complex_index_encoding(ref_log, 4000)  # maximum-length encoding, used as the column template
    ref_enc_dat = _cibe_case_features(ref_log, z)

    # object dtype so that counts, ratios and error texts can share one table
    metrics = pd.DataFrame(
        data=0,
        columns=dev + ['NoDev' + d for d in dev],
        index=['Precision', 'Recall', 'Support', 'ROC_AUC', 'LenTrain', 'LenTrain_beforeUS_0',
               'LenTrain_beforeUS_1', 'LenTrain_afterUS_0', 'LenTrain_afterUS_1'],
        dtype=object)

    # prepare X for all prefix lengths
    X_cum = {}
    for prefix in range(1, max_ev + 1):
        complex_index_encoding(log, prefix)
        enc_dat = _cibe_case_features(log, z)
        # Align to the reference columns (missing ones padded with 0) so that
        # every prefix length yields the same column order before stacking.
        enc_dat = enc_dat.reindex(columns=ref_enc_dat.columns, fill_value=0)
        imp = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=0)
        enc_dat = pd.DataFrame(data=imp.fit_transform(enc_dat), columns=enc_dat.columns)
        X_cum[prefix] = enc_dat.drop(too_short[prefix])

    # class weights derived from the label imbalance on trace level
    positives = {label: int((dev_df[label] == 1).sum()) for label in dev}
    negatives = {label: int((dev_df[label] == 0).sum()) for label in dev}
    max_label = max(max(positives.values()), max(negatives.values()))
    pir = {label: max(positives[label], negatives[label]) / positives[label] for label in dev}
    base_weight = mean(pir.values()) ** (1 / (4 * math.e))
    positive_weights = {label: base_weight + np.log(max_label / positives[label]) for label in dev}

    x_train_idx, x_test_idx, y_train_idx, y_test_idx = train_test_split(
        range(n_traces), range(n_traces), test_size=split, random_state=0)

    train_pos_before = dev_df.loc[x_train_idx].sum()
    for d in dev:
        metrics.loc['LenTrain_beforeUS_1', d] = int(train_pos_before[d])
        metrics.loc['LenTrain_beforeUS_0', d] = len(x_train_idx) - int(train_pos_before[d])

    if u_sample:
        imb_ref_enc_dat = ref_enc_dat.copy()
        # NOTE(review): the row id is part of the matrix handed to the sampler,
        # so it also enters the neighbour distances - confirm this is intended.
        imb_ref_enc_dat['ind'] = np.arange(len(imb_ref_enc_dat))

        # 1 for traces with at least one deviation, 0 otherwise
        imb_traces = pd.DataFrame({'Dev': (dev_df.sum(axis=1) > 0).astype(int)})
        imb_traces = imb_traces.drop(x_test_idx)
        imb_ref_enc_dat = imb_ref_enc_dat.drop(x_test_idx)
        imp = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=0)
        imb_ref_enc_dat = pd.DataFrame(data=imp.fit_transform(imb_ref_enc_dat), columns=imb_ref_enc_dat.columns)
        oss = OneSidedSelection(random_state=0, n_seeds_S=250, n_neighbors=7)
        X_resampled, _ = oss.fit_resample(imb_ref_enc_dat, imb_traces)
        # the imputer returns floats; cast the ids back to integer labels
        x_train_idx = [int(v) for v in X_resampled['ind']]
        y_train_idx = list(x_train_idx)

    devq_df = dev_df.loc[x_train_idx]
    print('index length ', len(x_train_idx), len(x_test_idx), len(y_train_idx), len(y_test_idx))
    train_pos_after = devq_df.sum()
    for d in dev:
        metrics.loc['LenTrain', d] = len(x_train_idx)
        metrics.loc['LenTrain_afterUS_1', d] = int(train_pos_after[d])
        metrics.loc['LenTrain_afterUS_0', d] = len(x_train_idx) - int(train_pos_after[d])

    # validation set for early stopping
    x_train_idx, x_val_idx, y_train_idx, y_val_idx = train_test_split(
        x_train_idx, x_train_idx, test_size=0.2, random_state=0)

    # combine the rows of all prefix lengths into one array per subset
    parts = {name: [] for name in ('X_train', 'y_train', 'X_val', 'y_val', 'X_test', 'y_test')}
    for prefix in range(1, max_ev + 1):
        absent = set(too_short[prefix])
        test_ids = list(set(y_test_idx) - absent)
        train_ids = list(set(y_train_idx) - absent)
        val_ids = list(set(y_val_idx) - absent)

        x_te = X_cum[prefix].loc[test_ids].to_numpy().astype(float)
        x_tr = X_cum[prefix].loc[train_ids].to_numpy().astype(float)
        x_va = X_cum[prefix].loc[val_ids].to_numpy().astype(float)
        y_te = y_cum_test[prefix].loc[test_ids].to_numpy().astype(float)
        y_va = y_cum_test[prefix].loc[val_ids].to_numpy().astype(float)
        y_tr = y_cum_test[prefix].loc[train_ids].to_numpy().astype(float)
        print('subset length ', prefix, len(x_te), len(x_tr), len(y_te), len(y_tr))

        parts['X_train'].append(x_tr)
        parts['y_train'].append(y_tr)
        parts['X_val'].append(x_va)
        parts['y_val'].append(y_va)
        parts['X_test'].append(x_te)
        parts['y_test'].append(y_te)

    X_train = np.concatenate(parts['X_train'], axis=0)
    y_train = np.concatenate(parts['y_train'], axis=0)
    X_val = np.concatenate(parts['X_val'], axis=0)
    y_val = np.concatenate(parts['y_val'], axis=0)
    X_test = np.concatenate(parts['X_test'], axis=0)
    y_test = np.concatenate(parts['y_test'], axis=0)
    print('stacked length ', len(X_train), len(y_train), len(X_val), len(y_val), len(X_test), len(y_test))

    print('split done')
    # Fit the scaler on the training rows only and reuse its statistics for
    # validation and test rows, so all subsets share one feature scale.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    X_val = scaler.transform(X_val)

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    model = BinaryClassification(no_columns=ref_enc_dat.shape[1], no_devs=len(dev))
    model.to(device)
    # the weight tensor has to live on the same device as the logits
    criterion = nn.MultiLabelSoftMarginLoss(
        weight=torch.FloatTensor(list(positive_weights.values())).to(device))
    optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

    if not early_stop:
        print('training start no ES')
    else:
        print('training start with ES')
    model.train()
    train_data = TrainData(torch.FloatTensor(X_train),
                           torch.FloatTensor(y_train))
    train_loader = DataLoader(dataset=train_data, batch_size=BATCH_SIZE, shuffle=False)

    if not early_stop:
        for e in range(1, EPOCHS + 1):
            epoch_loss = 0
            epoch_acc = 0
            for X_batch, y_batch in train_loader:
                X_batch, y_batch = X_batch.to(device), y_batch.to(device)
                optimizer.zero_grad()

                y_pred = model(X_batch)

                loss = criterion(y_pred, y_batch)
                acc = binary_acc(y_pred, y_batch) / len(y_pred[0])

                loss.backward()
                optimizer.step()

                epoch_loss += loss.item()
                epoch_acc += acc.item()

            print(
                f'Epoch {e + 0:03}: | Loss: {epoch_loss / len(train_loader):.5f} | Acc: {epoch_acc / len(train_loader):.3f} |',
                max_ev)

    else:
        X_val = torch.FloatTensor(X_val).to(device)
        y_val_tensor = torch.FloatTensor(y_val).to(device)

        es = EarlyStopping()
        done = False

        epoch = 0
        while epoch < EPOCHS and not done:
            epoch += 1
            steps = list(enumerate(train_loader))
            pbar = tqdm.tqdm(steps)
            model.train()
            epoch_acc = 0
            epoch_loss = 0
            for i, (x_batch, y_batch) in pbar:
                x_batch, y_batch = x_batch.to(device), y_batch.to(device)
                optimizer.zero_grad()
                y_batch_pred = model(x_batch)

                loss = criterion(y_batch_pred, y_batch)
                acc = binary_acc(y_batch_pred, y_batch) / len(y_batch_pred[0])

                loss.backward()
                optimizer.step()
                epoch_acc += acc.item()
                epoch_loss += loss.item()

                if i == len(steps) - 1:
                    # validate once per epoch, after the last training batch
                    model.eval()
                    with torch.no_grad():  # no autograd graph needed for validation
                        pred = model(X_val)
                        vloss = criterion(pred, y_val_tensor)
                    if es(model, vloss):
                        done = True
                    pbar.set_description(
                        f"Epoch: {epoch}, tloss: {epoch_loss / len(train_loader)}, Acc: {epoch_acc / len(train_loader):.3f}, vloss: {vloss:>7f}, EStop:[{es.status}]")
                else:
                    pbar.set_description(
                        f"Epoch: {epoch}, tloss {epoch_loss / len(train_loader):}, Acc: {epoch_acc / len(train_loader):.3f}")

    model.eval()
    test_data = TestData(torch.FloatTensor(X_test))
    test_loader = DataLoader(dataset=test_data, batch_size=1)

    y_pred_list = []

    with torch.no_grad():
        for X_batch in test_loader:
            X_batch = X_batch.to(device)
            y_test_pred = model(X_batch)
            # logits -> probabilities -> hard 0/1 decision per deviation
            y_pred_tag = torch.round(torch.sigmoid_(y_test_pred))
            y_pred_list.append(y_pred_tag.cpu().numpy())

    y_pred_list = [a.squeeze().tolist() for a in y_pred_list]

    CM = sklearn.metrics.multilabel_confusion_matrix(y_test, y_pred_list)

    for i, d in enumerate(dev):
        # confusion matrix layout per label: [[tn, fp], [fn, tp]]
        tn, fp, fn, tp = CM[i][0][0], CM[i][0][1], CM[i][1][0], CM[i][1][1]
        metrics.loc['Precision', d] = tp / (tp + fp)
        metrics.loc['Recall', d] = tp / (tp + fn)
        try:
            # NOTE(review): the score is computed on rounded predictions, not
            # on probabilities - confirm this is intended.
            metrics.loc['ROC_AUC', d] = sklearn.metrics.roc_auc_score(
                y_test[:, i], np.array(y_pred_list)[:, i], average='macro')
        except Exception as er:
            # keep the reason in the table (as text) instead of aborting the export
            metrics.loc['ROC_AUC', d] = str(er)
        metrics.loc['Precision', 'NoDev' + d] = tn / (tn + fn)
        metrics.loc['Recall', 'NoDev' + d] = tn / (tn + fp)

    path = (os.getcwd() + '/BPDP_Classifier')
    os.makedirs(path, exist_ok=True)  # the export must not fail on a missing folder

    with pd.ExcelWriter(
            path + '/' + z + '_BPDP_single_classifier_testcounts' + str(early_stop) + '_' + str(round(split, 2)) + '.xlsx',
            engine="xlsxwriter") as writer:
        metrics.to_excel(writer, sheet_name=('Metrics'))

In [None]:
# Run the collective classifier with undersampling, a 1/3 test split and early stopping.
# `log`, `ref_log` and `aligned_traces` are not defined in this cell; they must
# already exist in the kernel (presumably loaded by earlier cells - verify on a fresh run).
IDP_collective_CIBE(log, ref_log, aligned_traces, u_sample=True, split=1/3, early_stop=True)