In [1]:
import os
os.environ["TF_FORCE_GPU_ALLOW_GROWTH "] = "true"

import numpy as np
import pandas as pd

from deepalign import Dataset
from deepalign import fs
from deepalign.alignments import ALIGNERS

from IPython.display import display
pd.set_option('display.max_columns', 999)
pd.set_option('display.max_rows', 999)

## Setup

Let us first define a `get_model` helper method that retrieves an `Aligner` model from disk and the corresponding `Dataset` instance.

In [2]:
list(set([f.name.replace('_forward', '').replace('_backward', '') 
                      for f in fs.get_model_files()]))

['small-0.3-1_confnet00_20200907-110056.758934',
 'small-0.3-1_hm',
 'small-0.3-1_optimal',
 'small-0.3-1_confnet10_20200907-110231.150459',
 'small-0.3-1_im']

In [3]:
def get_model(aligner, dataset_name, case_attributes=False, event_attributes=False):
    ea = ca = 0
    
    if aligner == 'confnet':
        ea = int(event_attributes)
        ca = int(case_attributes)
        model_name = f'{dataset_name}_{aligner}{ea}{ca}'
    else:
        model_name = f'{dataset_name}_{aligner}'

    print(model_name)
    dataset = Dataset(dataset_name, use_case_attributes=ca, use_event_attributes=ea)
    models = list(set([f.name.replace('_forward', '').replace('_backward', '') 
                      for f in fs.get_model_files()
                      if model_name in f.name]))
    
    if aligner == 'confnet':
        aligner = ALIGNERS[aligner](dataset, use_case_attributes=ca, use_event_attributes=ea)
    else:
        aligner = ALIGNERS[aligner]()
    aligner.load(str(fs.MODEL_DIR / models[0]))
                
    return aligner, dataset

## Computing alignments with DeepAlign

We will use the `paper-0.3-3` event log as our main example here.

In [4]:
dataset_name = 'small-0.3-1'

The possible aligner key strings are: `confnet` (DeepAlign), `optimal` (Reference Model), `hm` (Heuristics Miner), and `im` (Inductive Miner).

In [5]:
ALIGNERS.keys()

dict_keys(['alpha', 'alphaplus', 'confnet', 'dfg', 'hm', 'im', 'optimal', 'sm'])

Let us load a ConfNet first.

In [6]:
confnet, dataset = get_model('confnet', dataset_name, False, True)

small-0.3-1_confnet10


Now, we can run the DeepAlign algorithm with `confnet.align`. We can control the number of beams with the parameter `k`, the maximum number of steps with `steps`, the maximum deletion size for one step with `delete_max`, and a hot start mode with `hot_start`. The hot start will use the BINet anomaly detection method to automatically finish all beams where no anomaly was found.

In [7]:
alignments, corrected_cases, costs = confnet.align(dataset, k=5, steps=10, delete_max=3, hot_start=True)

Step 1 → 4.478997230529785s (25000, 26) finished=3754
Step 2 ← 1.7230033874511719s (25000, 26) finished=3754
Step 3 → 1.5639970302581787s (25000, 26) finished=3867
Step 4 ← 1.5219953060150146s (25000, 26) finished=4339
Step 5 → 0.927001953125s (25000, 26) finished=4604
Step 6 ← 0.5185084342956543s (25000, 26) finished=4652
Step 7 → 0.6150002479553223s (25000, 26) finished=4678
Step 8 ← 0.46599912643432617s (25000, 26) finished=4688
Step 9 → 0.6630041599273682s (25000, 26) finished=4705
Step 10 ← 0.4069952964782715s (25000, 26) finished=4717


Okay, done. Now let us take a look at the found alignments. Fist, we define a display method for alignments.

In [8]:
def display_alignment(alignment, decode=None):
    a = alignment[alignment != -1]
    a = a.reshape(2, a.shape[0] // 2)
    if decode is not None:
        a = decode(a)
    df = pd.DataFrame(a, index=['Log', 'Model'])
    df = df.style.hide_index()
    return display(df)

We will need a decode method for the integer encoded event sequences of the `Dataset`. We can create a `decode` method from the `sklearn.preprocessing.LabelEncoder` instances saved in `dataset.encoders`.

In [9]:
decode = dict((k, v) for k, v in enumerate(dataset.encoders['name'].classes_))
decode[-1] = decode[0]  # Padding
decode[0] = '»'
decode = np.vectorize(decode.get)

Now, we can display alignments from the `alignments` object. `alignments` is a NumPy array with shape `(num_cases, k, 2, max_sequence_len)`.

In [10]:
alignments.shape

(5000, 5, 2, 26)

To display the top-1 alignment for the first case we can use the following.

In [11]:
display_alignment(alignments[0][0], decode=decode)

0,1,2,3,4,5,6,7,8,9,10,11
▶,Activity A,Activity B,Activity C,Activity D,Activity E,Activity G,Activity H,Activity I,Activity J,Activity F,■
▶,Activity A,Activity B,Activity C,Activity D,Activity E,Activity G,Activity H,Activity I,Activity J,Activity F,■


Seems like this was not an anomalous case. Let us check.

In [12]:
dataset.text_labels[0]

'Normal'

Remember, the different anomaly types are as follows.

In [13]:
set(dataset.text_labels)

{'Attribute', 'Early', 'Insert', 'Late', 'Normal', 'Rework', 'SkipSequence'}

Let us take a look at a *Skip* anomaly. This is also the example given in the Evaluation section of the paper.

In [14]:
skip_anomalies = np.where(dataset.text_labels == 'SkipSequence')[0]
display_alignment(alignments[skip_anomalies[0]][0], decode=decode)

0,1,2,3,4,5,6,7,8
▶,Activity A,Activity B,Activity C,»,»,Activity T,Activity F,■
▶,Activity A,Activity B,Activity C,Activity D,Activity E,Activity T,Activity F,■


We can do the same for all anomaly types.

In [15]:
for anomaly_type in set(dataset.text_labels):
    index = np.where(dataset.text_labels == anomaly_type)[0][0] # Select the first one
    print(anomaly_type)
    display_alignment(alignments[index][0], decode=decode)

Normal
Rework
Attribute
SkipSequence
Early
Late
Insert


0,1,2,3,4,5,6,7,8,9,10,11
▶,Activity A,Activity B,Activity C,Activity D,Activity E,Activity G,Activity H,Activity I,Activity J,Activity F,■
▶,Activity A,Activity B,Activity C,Activity D,Activity E,Activity G,Activity H,Activity I,Activity J,Activity F,■


0,1,2,3,4,5,6,7,8,9,10,11,12,13
▶,Activity A,Activity B,Activity C,Activity D,Activity B,Activity C,Activity E,Activity K,Activity R,Activity S,Activity L,Activity F,■
▶,Activity A,Activity B,Activity C,Activity D,»,»,Activity E,Activity K,Activity R,Activity S,Activity L,Activity F,■


0,1,2,3,4,5,6,7,8,9,10,11
▶,Activity A,Activity B,Activity C,Activity D,Activity E,Activity G,Activity H,Activity I,Activity J,Activity F,■
▶,Activity A,Activity B,Activity C,Activity D,Activity E,Activity G,Activity H,Activity I,Activity J,Activity F,■


0,1,2,3,4,5,6,7,8
▶,Activity A,Activity B,Activity C,»,»,Activity T,Activity F,■
▶,Activity A,Activity B,Activity C,Activity D,Activity E,Activity T,Activity F,■


0,1,2,3,4,5,6,7,8,9,10,11,12
▶,Activity A,Activity B,»,Activity D,Activity E,Activity C,Activity K,Activity M,Activity N,Activity L,Activity F,■
▶,Activity A,Activity B,Activity C,Activity D,Activity E,»,Activity K,Activity M,Activity N,Activity L,Activity F,■


0,1,2,3,4,5,6,7,8,9,10,11,12
▶,»,Activity B,Activity C,Activity D,Activity E,Activity K,Activity A,Activity O,Activity P,Activity L,Activity F,■
▶,Activity A,Activity B,Activity C,Activity D,Activity E,Activity K,»,Activity O,Activity P,Activity L,Activity F,■


0,1,2,3,4,5,6,7,8,9,10
▶,Random activity 18,Activity A,Activity B,Activity C,Activity D,Random activity 11,Activity E,Activity T,Activity F,■
▶,»,Activity A,Activity B,Activity C,Activity D,»,Activity E,Activity T,Activity F,■


## Computing Alignments with the Heuristics Miner

Now, let us take a look at the Heuristics Miner using the shorthand `'hm'`.

In [16]:
heuristics_miner, dataset = get_model('hm', dataset_name)

small-0.3-1_hm


In [17]:
alignments, x, costs = heuristics_miner.align(dataset)

  return np.array(self.cases)[indices]
100%|██████████| 659/659 [00:12<00:00, 52.67it/s]


Let us look at the same examples as for the DeepAlign algorithm. We can see that the Heuristics Miner does not always produce the correct alignments.

In [18]:
for anomaly_type in set(dataset.text_labels):
    index = np.where(dataset.text_labels == anomaly_type)[0][0] # Select the first one
    print(anomaly_type)
    display_alignment(alignments[index][0], decode=decode)

Normal
Rework
Attribute
SkipSequence
Early
Late
Insert


0,1,2,3,4,5,6,7,8,9,10,11
▶,Activity A,Activity B,Activity C,Activity D,Activity E,Activity G,Activity H,Activity I,Activity J,Activity F,■
▶,Activity A,Activity B,Activity C,Activity D,Activity E,Activity G,Activity H,Activity I,Activity J,Activity F,■


0,1,2,3,4,5,6,7,8,9,10,11,12,13
▶,Activity A,Activity B,Activity C,Activity D,Activity B,Activity C,Activity E,Activity K,Activity R,Activity S,Activity L,Activity F,■
▶,Activity A,Activity B,Activity C,Activity D,»,»,Activity E,Activity K,Activity R,Activity S,Activity L,Activity F,■


0,1,2,3,4,5,6,7,8,9,10,11
▶,Activity A,Activity B,Activity C,Activity D,Activity E,Activity G,Activity H,Activity I,Activity J,Activity F,■
▶,Activity A,Activity B,Activity C,Activity D,Activity E,Activity G,Activity H,Activity I,Activity J,Activity F,■


0,1,2,3,4,5,6
▶,Activity A,Activity B,Activity C,Activity T,Activity F,■
▶,Activity A,Activity B,Activity C,»,»,■


0,1,2,3,4,5,6,7,8,9,10,11,12
▶,Activity A,Activity B,»,Activity D,Activity E,Activity C,Activity K,Activity M,Activity N,Activity L,Activity F,■
▶,Activity A,Activity B,Activity C,Activity D,Activity E,»,Activity K,Activity M,Activity N,Activity L,Activity F,■


0,1,2,3,4,5,6,7,8,9,10,11
▶,Activity B,Activity C,Activity D,Activity E,Activity K,Activity A,Activity O,Activity P,Activity L,Activity F,■
▶,Activity B,Activity C,Activity D,Activity E,Activity K,»,Activity O,Activity P,Activity L,Activity F,■


0,1,2,3,4,5,6,7,8,9,10
▶,Random activity 18,Activity A,Activity B,Activity C,Activity D,Random activity 11,Activity E,Activity T,Activity F,■
▶,»,Activity A,Activity B,Activity C,Activity D,»,Activity E,Activity T,Activity F,■


## Ground Truth Alignments

We can obtain the ground truth alignments from the `dataset` with the following method.

In [19]:
ground_truth_alignments, ground_truth_costs = dataset.alignments

Let us check the ground truth for the example cases from above.

In [20]:
for anomaly_type in set(dataset.text_labels):
    index = np.where(dataset.text_labels == anomaly_type)[0][0] # Select the first one
    print(anomaly_type)
    display_alignment(ground_truth_alignments[index], decode=decode)

Normal
Rework
Attribute
SkipSequence
Early
Late
Insert


0,1,2,3,4,5,6,7,8,9,10,11
▶,Activity A,Activity B,Activity C,Activity D,Activity E,Activity G,Activity H,Activity I,Activity J,Activity F,■
▶,Activity A,Activity B,Activity C,Activity D,Activity E,Activity G,Activity H,Activity I,Activity J,Activity F,■


0,1,2,3,4,5,6,7,8,9,10,11,12,13
▶,Activity A,Activity B,Activity C,Activity D,Activity B,Activity C,Activity E,Activity K,Activity R,Activity S,Activity L,Activity F,■
▶,Activity A,Activity B,Activity C,Activity D,»,»,Activity E,Activity K,Activity R,Activity S,Activity L,Activity F,■


0,1,2,3,4,5,6,7,8,9,10,11
▶,Activity A,Activity B,Activity C,Activity D,Activity E,Activity G,Activity H,Activity I,Activity J,Activity F,■
▶,Activity A,Activity B,Activity C,Activity D,Activity E,Activity G,Activity H,Activity I,Activity J,Activity F,■


0,1,2,3,4,5,6,7,8
▶,Activity A,Activity B,Activity C,»,»,Activity T,Activity F,■
▶,Activity A,Activity B,Activity C,Activity D,Activity E,Activity T,Activity F,■


0,1,2,3,4,5,6,7,8,9,10,11,12
▶,Activity A,Activity B,»,Activity D,Activity E,Activity C,Activity K,Activity M,Activity N,Activity L,Activity F,■
▶,Activity A,Activity B,Activity C,Activity D,Activity E,»,Activity K,Activity M,Activity N,Activity L,Activity F,■


0,1,2,3,4,5,6,7,8,9,10,11,12
▶,»,Activity B,Activity C,Activity D,Activity E,Activity K,Activity A,Activity O,Activity P,Activity L,Activity F,■
▶,Activity A,Activity B,Activity C,Activity D,Activity E,Activity K,»,Activity O,Activity P,Activity L,Activity F,■


0,1,2,3,4,5,6,7,8,9,10
▶,Random activity 18,Activity A,Activity B,Activity C,Activity D,Random activity 11,Activity E,Activity T,Activity F,■
▶,»,Activity A,Activity B,Activity C,Activity D,»,Activity E,Activity T,Activity F,■


By comparing the ground truth to the alignments from the algorithms, we can evaluate the accuracy. This will be shown in the next notebook.

## Alignments from Nothing with ConfNet (Examples from Sec. 5)

As described in the paper, we can also compute alignments from an empty case using the DeepAlign algorithm. To demonstrate this let us load a ConfNet that uses only the case attributes.

In [21]:
confnet, dataset = get_model('confnet', dataset_name, True, False)

small-0.3-1_confnet01


IndexError: list index out of range

In the dataset, we have two case attributes, `decision` and `topic`, based on which the resulting sequence will differ.

In [None]:
dataset.attribute_keys

In [None]:
print(dataset.encoders['[Case]_decision'].classes_.tolist())
print(dataset.encoders['[Case]_topic'].classes_.tolist())

Let us create an emtpy case sequence using the start and end symbols and the encode functionality `transform` of the `dataset.encoders`. 

In [None]:
def get_empty_case(decision, topic):
    return [
        dataset.encoders['name'].transform(['▶', '■'])[None, :],  # We have to add one dimension here
        dataset.encoders['[Case]_decision'].transform([decision]),
        dataset.encoders['[Case]_topic'].transform([topic]),
    ]

Now, we can use this method to create empty sequences, while setting the case attributes as we wish.

In [None]:
empty_case = get_empty_case('Accept', 'Engineering')
empty_case

Let us see what the DeepAlign algorithm does. We have to run it for some more steps to create enough events for a complete sequence.

In [None]:
alignments, corrected_cases, costs = confnet.align(empty_case, k=5, steps=50)

In [None]:
display_alignment(alignments[0][0], decode=decode)

Interestingly, DeepAlign creates the correct case given the two case attributes. *Minor Revision* only occurs for `decision == 'Accept'` and `decision == 'Weak accept'`. Similarly, *Develop Method* is related to `topic == 'Engineering'`.

Let us see what happens if we change this around.

In [None]:
empty_case = get_empty_case('Accept', 'Theory')

In [None]:
alignments, corrected_cases, costs = confnet.align(empty_case, k=5, steps=50)

In [None]:
display_alignment(alignments[0][0], decode=decode)

Now, it correctly generates the *Develop Hypothesis* activity.

What happens, if we change the `decision` to something else? Let us find out.

In [None]:
empty_case = get_empty_case('Weak reject', 'Theory')

In [None]:
alignments, corrected_cases, costs = confnet.align(empty_case, k=5, steps=50)

In [None]:
display_alignment(alignments[0][0], decode=decode)

The *Minor Revision* is gone.