In [1]:
# add parent dir to sys path for import of modules
import os
import sys
parentdir = os.path.abspath(os.path.join(os.path.abspath(''), os.pardir))
sys.path.insert(0, parentdir) 

# Key Word Approach
variables with prefix ``doc_`` contain data from the dataset
variables with prefix ``o_`` contain data from own computations

In [172]:
import itertools

from petreader.RelationsExtraction import RelationsExtraction
from petreader.TokenClassification import TokenClassification
from petreader import labels
from petreader.labels import *

In [2]:
relations_dataset = RelationsExtraction()
token_dataset = TokenClassification()

Reusing dataset pet (C:\Users\janek\.cache\huggingface\datasets\patriziobellan___pet\relations-extraction\1.0.1\38434e2af57af533c400c8975f37e43c08bb77739085a3c026a862b2efb668d2)




 _______ _     _ _______       _____  _______ _______      ______  _______ _______ _______ _______ _______ _______
    |    |_____| |______      |_____] |______    |         |     \ |_____|    |    |_____| |______ |______    |   
    |    |     | |______      |       |______    |         |_____/ |     |    |    |     | ______| |______    |   
                                                                                                                  
Discover more at: [https://pdi.fbk.eu/pet-dataset/]



  0%|          | 0/1 [00:00<?, ?it/s]

Reusing dataset pet (C:\Users\janek\.cache\huggingface\datasets\patriziobellan___pet\token-classification\1.0.1\38434e2af57af533c400c8975f37e43c08bb77739085a3c026a862b2efb668d2)




 _______ _     _ _______       _____  _______ _______      ______  _______ _______ _______ _______ _______ _______
    |    |_____| |______      |_____] |______    |         |     \ |_____|    |    |_____| |______ |______    |   
    |    |     | |______      |       |______    |         |_____/ |     |    |    |     | ______| |______    |   
                                                                                                                  
Discover more at: [https://pdi.fbk.eu/pet-dataset/]



  0%|          | 0/1 [00:00<?, ?it/s]

## 1 Prepare Document Data

In [76]:
doc_names = token_dataset.GetDocumentNames()
doc_names.sort(key=lambda name: (int(name[4]), name[5]))

print(doc_names)

['doc-1.3', 'doc-1.4', 'doc-1.2', 'doc-1.1', 'doc-10.11', 'doc-10.2', 'doc-10.8', 'doc-10.10', 'doc-10.7', 'doc-10.6', 'doc-10.12', 'doc-10.4', 'doc-10.1', 'doc-10.5', 'doc-10.3', 'doc-10.13', 'doc-10.14', 'doc-10.9', 'doc-2.2', 'doc-2.1', 'doc-3.6', 'doc-3.7', 'doc-3.2', 'doc-3.5', 'doc-3.3', 'doc-3.1', 'doc-3.8', 'doc-4.1', 'doc-5.4', 'doc-5.2', 'doc-5.1', 'doc-5.3', 'doc-6.3', 'doc-6.4', 'doc-6.1', 'doc-6.2', 'doc-7.1', 'doc-8.3', 'doc-8.1', 'doc-8.2', 'doc-9.3', 'doc-9.4', 'doc-9.2', 'doc-9.5', 'doc-9.1']


### 1.1 Read Example Doc

In [244]:
doc_name = "doc-9.5"  # "doc-1.2"  # doc-3.2
doc_number = relations_dataset.GetDocumentNumber(doc_name)
print(f"  {doc_name}  ".center(50, '*'))
doc_text = token_dataset.GetDocumentText(doc_name)
print(doc_text)

*******************  doc-9.5  ********************
After the Expense Report is received , a new account must be created if the employee does not already have one .
The report is then reviewed for automatic approval .
Amounts under $200 are automatically approved , whereas amounts equal to or over $200 require approval of the supervisor .
In case of rejection , the employee must receive a rejection notice by email .
Otherwise , the reimbursement goes to the employees direct deposit bank account .
If the request is not completed in 7 days , then the employee must receive an approval in progress email .
If the request is not finished within 30 days , then the process is stopped and the employee receives an email cancellation notice and must re-submit the expense report .


In [245]:
doc_activities = token_dataset.GetDocumentActivities(doc_name)
print(" activities and NER labels (per sentences) ".center(50, '*'))
print(doc_activities)
doc_sentence_ner_labels = relations_dataset.GetSentencesWithIdsAndNerTagLabels(doc_number)
print(doc_sentence_ner_labels[:4])
doc_relations = relations_dataset.GetRelations(doc_number)
doc_flow_relations, doc_same_gateway_relations = doc_relations[labels.FLOW], doc_relations[labels.SAME_GATEWAY]

print(" same gateway relations ".center(50, '*'))
for same_gateway_relation in doc_same_gateway_relations:
    for key, value in same_gateway_relation.items():
        print(f"{key}: {value}")
    print()

*** activities and NER labels (per sentences) ****
[[['received'], ['created']], [['reviewed']], [['approved'], ['require']], [['receive']], [['goes']], [['receive']], [['receives'], ['re-submit']]]
[[('After', 0, 'O'), ('the', 1, 'B-Activity Data'), ('Expense', 2, 'I-Activity Data'), ('Report', 3, 'I-Activity Data'), ('is', 4, 'O'), ('received', 5, 'B-Activity'), (',', 6, 'O'), ('a', 7, 'B-Activity Data'), ('new', 8, 'I-Activity Data'), ('account', 9, 'I-Activity Data'), ('must', 10, 'O'), ('be', 11, 'O'), ('created', 12, 'B-Activity'), ('if', 13, 'B-XOR Gateway'), ('the', 14, 'B-Condition Specification'), ('employee', 15, 'I-Condition Specification'), ('does', 16, 'I-Condition Specification'), ('not', 17, 'I-Condition Specification'), ('already', 18, 'I-Condition Specification'), ('have', 19, 'I-Condition Specification'), ('one', 20, 'I-Condition Specification'), ('.', 21, 'O')], [('The', 0, 'B-Activity Data'), ('report', 1, 'I-Activity Data'), ('is', 2, 'O'), ('then', 3, 'O'), ('rev

### 1.2 Preprocess sentences

In [246]:
num_sentences = len(doc_activities) # activities is 2 dim list (one per sentence)
print(num_sentences)
doc_sentences_raw = [sentence.strip() for sentence in doc_text.split(".") if sentence.strip() != ""]
for i, s in enumerate(doc_sentences_raw):
    print(i, s)
assert num_sentences == len(doc_sentences_raw)  # check if number of extracted sentences == from dataset 

7
0 After the Expense Report is received , a new account must be created if the employee does not already have one
1 The report is then reviewed for automatic approval
2 Amounts under $200 are automatically approved , whereas amounts equal to or over $200 require approval of the supervisor
3 In case of rejection , the employee must receive a rejection notice by email
4 Otherwise , the reimbursement goes to the employees direct deposit bank account
5 If the request is not completed in 7 days , then the employee must receive an approval in progress email
6 If the request is not finished within 30 days , then the process is stopped and the employee receives an email cancellation notice and must re-submit the expense report


### 1.3 Filter Gateways Tokens

In [247]:
def filter_ner_labels(sentence_ner_labels, target_label):
    return [[token for token in s_list if target_label in token[2]]
                        for s_list in sentence_ner_labels]

doc_xor_gateway = filter_ner_labels(doc_sentence_ner_labels, labels.XOR_GATEWAY)
doc_and_gateway = filter_ner_labels(doc_sentence_ner_labels, labels.AND_GATEWAY)
print(doc_xor_gateway)
print(doc_and_gateway)

[[('if', 13, 'B-XOR Gateway')], [], [('whereas', 7, 'B-XOR Gateway')], [('In', 0, 'B-XOR Gateway'), ('case', 1, 'I-XOR Gateway'), ('of', 2, 'I-XOR Gateway')], [('Otherwise', 0, 'B-XOR Gateway')], [('If', 0, 'B-XOR Gateway')], [('If', 0, 'B-XOR Gateway')]]
[[], [], [], [], [], [], []]


### 1.4 Filter Sequence Flows

In [248]:
def filter_flow_relations(flow_relations, entity_type_list):
    """
    filter list of flow relations (single dictionaries) for source or target entity type = given entity tyoe
    """
    return [flow_relation for flow_relation in flow_relations if flow_relation[labels.SOURCE_ENTITY_TYPE] in entity_type_list
                                                               or flow_relation[labels.TARGET_ENTITY_TYPE] in entity_type_list]

doc_flow_relations_xor = filter_flow_relations(doc_flow_relations, [labels.XOR_GATEWAY, labels.CONDITION_SPECIFICATION])
doc_flow_relations_and = filter_flow_relations(doc_flow_relations, [labels.AND_GATEWAY])
doc_flow_relations_gateways = filter_flow_relations(doc_flow_relations, [labels.XOR_GATEWAY, labels.AND_GATEWAY, labels.CONDITION_SPECIFICATION])
print(f"Flow relations involving XOR gateways {len(doc_flow_relations_xor)}; "\
      f"AND gateways {len(doc_flow_relations_and)}; overall gateways {len(doc_flow_relations_gateways)}; overall {len(doc_flow_relations)}")

Flow relations involving XOR gateways 18; AND gateways 0; overall gateways 18; overall 21


### 1.5 Enrich activities with token index

In [249]:
doc_activity_tokens = []
for i, (tokens, activities) in enumerate(zip(doc_sentence_ner_labels, doc_activities)):
    sentence_activity_tokens = []
    # note: activity is a list because it could consist of more words (neglect here)
    for activity in activities:
        activity_token_triple = [token_triple for token_triple in tokens if token_triple[0] == activity[0]][0]
        sentence_activity_tokens.append((activity, activity_token_triple[1]))
    doc_activity_tokens.append(sentence_activity_tokens)

## 2 Extract Gateways

### 2.1 Key Word List
#### A) take words from all existing gateways in PET dataset as gold list for detection

In [92]:
def get_gateway_key_words(dataset_gateway_list):
    flattened = list(itertools.chain(*dataset_gateway_list))
    phrases = [" ".join(g).lower() for g in flattened]  # join phrases together if multiple words
    unique = list(set(phrases))
    unique.sort()
    return unique

xor_key_words_gold = get_gateway_key_words(token_dataset.GetXORGateways())
and_key_words_gold = get_gateway_key_words(token_dataset.GetANDGateways())

print(f"XOR gold ({len(xor_key_words_gold)})", xor_key_words_gold)
print(f"AND gold ({len(and_key_words_gold)})", and_key_words_gold)

XOR gold (15) ['either', 'for', 'for each patient for which', 'for the case', 'if', 'in case', 'in case of', 'in the case of', 'it can also happen that', 'or', 'otherwise', 'should', 'sometimes', 'under certain circumstances', 'whereas']
AND gold (6) ['at the same time', 'in the meantime', 'meantime', 'two concurrent activities are triggered', 'whereas', 'while']


#### B) Curated List from Literature

In [253]:
# Ferreira et al. 2017
with open('data/keywords/literature_xor.txt') as f:
    xor_key_words_literature = f.read().splitlines()
    xor_key_words_literature.sort()

with open('data/keywords/literature_and.txt') as f:
    and_key_words_literature = f.read().splitlines()
    and_key_words_literature.sort()   

print(f"XOR literature ({len(xor_key_words_literature)})", xor_key_words_literature)
print(f"AND literature ({len(and_key_words_literature)})", and_key_words_literature)

XOR literature (14) ['either', 'if', 'if not', 'in case', 'in case of', 'only', 'only if', 'or', 'otherwise', 'till', 'unless', 'until', 'when', 'whether']
AND literature (11) ['at the same time', 'concurrently', 'in addition to', 'in parallel', 'in parallel with this', 'in the meantime', 'meantime', 'meanwhile', 'simultaneously', 'whereas', 'while']


In [10]:
for s in doc_sentences_raw:
    print(s)

A small company manufactures customized bicycles
Whenever the sales department receives an order , a new process instance is created
A member of the sales department can then reject or accept the order for a customized bike
In the former case , the process instance is finished
In the latter case , the storehouse and the engineering department are informed
The storehouse immediately processes the part list of the order and checks the required quantity of each part
If the part is available in-house , it is reserved
If it is not available , it is back-ordered
This procedure is repeated for each item on the part list
In the meantime , the engineering department prepares everything for the assembling of the ordered bicycle
If the storehouse has successfully reserved or back-ordered every item of the part list and the preparation activity has finished , the engineering department assembles the bicycle
Afterwards , the sales department ships the bicycle to the customer and finishes the proces

### 2.2 Extraction Algorithm

In [212]:
def extract_gateways(sentence_list, key_words, target_gateway_label):
    """
    extracts gateways in a key-word-based manner given a document structured in a list of sentences
    if two phrases would match to a token (e.g. 'in the meantime' and 'meantime'), the longer phrase is extracted
    target_gateway_label: str, must be 'XOR Gateway' or 'AND Gateway'
    
    result list is two dimensional -> list of tuples (word, position in sentence, tag) for each sentence
    this produces the same structure as sentences and their NER labels are annotated in PET dataset
    """
    gateways = []
    benchmark_gateways = []
    # sort key words descending by length of words in phrase
    key_words.sort(key=lambda key_word_phrase: len(key_word_phrase.split(" ")), reverse=True)

    # 1) extract gateways
    for s_idx, sentence in enumerate(sentence_list):
        # print(f" SENTENCE {s_idx} ".center(50, '-'))
        # print(sentence_list[s_idx])
        sentence_gateways = []
        sentence_to_search = f" {sentence.lower()} "  # lowercase and wrap with spaces for search of key words
        tokens = sentence.split(" ")
        tokens_lower = sentence.lower().split(" ")
        tokens_already_matched_with_key_phrase = []

        # iterate over key phrases
        for key_phrase in key_words:
            key_phrase_to_search = f" {key_phrase} "

            # if key phrase is in sentence, search index and extract
            if key_phrase_to_search in sentence_to_search:
                key_phrase_tokens = key_phrase.split(" ")
                
                # check key phrase for every token
                for t_idx, token in enumerate(tokens_lower):
                    candidate = True
                    # iterate over key phrase tokens in case of multiple world phrase
                    for key_phrase_token_idx, key_phrase_token in enumerate(key_phrase_tokens):
                        # check if token is not part of key phrase or token is already matched with another phrase
                        # if yes, stop processing candidate
                        if not tokens_lower[t_idx + key_phrase_token_idx] == key_phrase_token or \
                            t_idx + key_phrase_token_idx in tokens_already_matched_with_key_phrase:
                            candidate = False
                            break
                    
                    # add tokens to result only if all tokens are matched and not already part of a longer phrase
                    if candidate:
                        for i, key_phrase_token in enumerate(key_phrase_tokens):
                            prefix = "B" if i == 0 else "I"
                            # append tuples with extract information as in PET and process information about gateways to filter later
                            sentence_gateways.append((tokens[t_idx + i], t_idx + i, f"{prefix}-{labels.XOR_GATEWAY}"))
                            tokens_already_matched_with_key_phrase.append(t_idx + i)
                            benchmark_gateways.append([tokens[t_idx + i] for i, x in enumerate(key_phrase_tokens)])

        sentence_gateways.sort(key=lambda gateway_triple: gateway_triple[1])
        gateways.append(sentence_gateways)

    return gateways, benchmark_gateways

In [254]:
# available key word lists: xor_key_words_gold, and_key_words_gold, xor_key_words_literature, and_key_words_literature
o_xor_gateways, o_xor_gateways_benchmark = extract_gateways(doc_sentences_raw, xor_key_words_literature, labels.XOR_GATEWAY)
o_and_gateways, o_and_gateways_benchmark = extract_gateways(doc_sentences_raw, and_key_words_literature, labels.AND_GATEWAY)

print("XOR GATEWAYS")
for idx, sentence_gateways in enumerate(o_xor_gateways):
    print(idx, sentence_gateways)
print("\nAND GATEWAYS")
for idx, sentence_gateways in enumerate(o_and_gateways):
    print(idx, sentence_gateways)

XOR GATEWAYS
0 [('if', 13, 'B-XOR Gateway')]
1 []
2 [('or', 11, 'B-XOR Gateway')]
3 [('In', 0, 'B-XOR Gateway'), ('case', 1, 'I-XOR Gateway'), ('of', 2, 'I-XOR Gateway')]
4 [('Otherwise', 0, 'B-XOR Gateway')]
5 [('If', 0, 'B-XOR Gateway')]
6 [('If', 0, 'B-XOR Gateway')]

AND GATEWAYS
0 []
1 []
2 [('whereas', 7, 'B-XOR Gateway')]
3 []
4 []
5 []
6 []


## 3 Extract Control Flows

### 3.1 Helper Methods

In [13]:
def get_flow_relation_representation(sentence_idx, token_idx, entity_type, entity, source=True):
    if source:
        return {
            labels.SOURCE_SENTENCE_ID: sentence_idx,
            labels.SOURCE_HEAD_TOKEN_ID: token_idx,
            labels.SOURCE_ENTITY_TYPE: entity_type,
            labels.SOURCE_ENTITY: entity
        }
    else:
        return {
            labels.TARGET_SENTENCE_ID: sentence_idx,
            labels.TARGET_HEAD_TOKEN_ID: token_idx,
            labels.TARGET_ENTITY_TYPE: entity_type,
            labels.TARGET_ENTITY: entity
        }
        

def merge_source_target_dicts(source_dict, target_dict):
    return {**source_dict, **target_dict}

### 3.2 Involving AND gateways

In [229]:
# 1) METHODS FOR EXTRACTING THE PREVIOUS (INCL. SECOND PREVIOUS) AND NEXT ACTIVITY

def get_previous_activity(sentence_idx, token_idx, doc_activity_tokens, skip_first=False, one_already_found=False):
    """
    search recursive for the second last previous activity from a start point defined by sentence_idx and token_idx
    sentence_idx: sentence index where to start the search
    token_idx: token index where to stat the search
    doc_activity_tokens: list of activity lists (describes whole document)
    skip_first: True if searching for the second previous activity, False (default) when searching for the previous activity
    one_already_found: flag if one activity was already found and skipped for return in course of search for the second previous
    
    returns: triple of (sentence idx, token_idx, token)
    """
    # search for activities left to the token in target sentence if token is given else in the whole
    if token_idx is not None:
        previous_activities_sentence = [a_t for a_t in doc_activity_tokens[sentence_idx] if a_t[1] < token_idx]
    else:
        previous_activities_sentence = [a_t for a_t in doc_activity_tokens[sentence_idx]]
    
    if previous_activities_sentence:
        # return when just searching the first last activity OR when one was already found before
        previous_activity = previous_activities_sentence[-1]
        # A) base case: activity found
        if not skip_first or one_already_found:
            return (sentence_idx, previous_activity[1], previous_activity[0])
        # B) recursive case: continue search for second previous activity at index of previous activity
        else:
            return get_previous_activity(sentence_idx, previous_activity[1], doc_activity_tokens, one_already_found=True)
    # B) recursive case: continue search for previous activity in previous sentence
    else:
        next_sentence_idx = sentence_idx - 1
        # no sentences any more to search
        if next_sentence_idx == -1:
            return None
        # otherwise search recursively the previous sentence
        else:
            return get_previous_activity(next_sentence_idx, None, doc_activity_tokens, 
                                         skip_first=skip_first, one_already_found=one_already_found)

def get_following_activity(sentence_idx, token_idx, doc_activity_tokens, skip_first=False, one_already_found=False):
    """
    search recursive for the next following activity from a start point defined by sentence_idx and token_idx
    sentence_idx: sentence index where to start the search
    token_idx: token index where to stat the search
    doc_activity_tokens: list of activity lists (describes whole document)
    
    returns: triple of (sentence idx, token_idx, token)
    """
    # search for activities right to the token in target sentence if token is given else in the whole
    if token_idx is not None:
        following_activities_sentence = [a_t for a_t in doc_activity_tokens[sentence_idx] if a_t[1] > token_idx]
    else:
        following_activities_sentence = [a_t for a_t in doc_activity_tokens[sentence_idx]]

    # if activities were found, take the first one
    if following_activities_sentence:
        # return when just searching the first following activity OR when one was already found before
        following_activity = following_activities_sentence[0]
        # 1a) base case: activity found
        if not skip_first or one_already_found:
            return (sentence_idx, following_activity[1], following_activity[0])
        # 2a) recursive case: continue search for second following activity at index of following activity
        else:
            return get_following_activity(sentence_idx, following_activity[1], doc_activity_tokens,
                                          one_already_found=True)

    else:
        next_sentence_idx = sentence_idx + 1
        # 1b) base case: no sentences any more to search
        if next_sentence_idx == len(doc_activity_tokens):
            return None
        # 2b) recursive case: continue search for following activity in following sentence
        else:
            return get_following_activity(next_sentence_idx, None, doc_activity_tokens,
                                                skip_first=skip_first, one_already_found=one_already_found)

        
# 2) EXTRACT RELATIONS
def _extract_concurrent_flows(sentences, doc_activity_tokens, own_gateways):
    """
    extract flow relations for already found AND gateways following the logic:
    + for every gateway, to extract parallel branches, add relation to next activity after and before, because
      thats the pattern how AND key phrases are usually used (oriented by rules of Ferreira et al. 2017)
    + for each case, check over borders if not found in same sentence
    + to extract the flow relation that points to the gateway merge point, take the second before
    + Assumption: only one parallel gateway per sentence

    sentences: list of sentences (used only for debugging)
    doc_activity_tokens: list of activity tokens (word, idx) for each sentence
    own_gateways: list of own extracted gateway for each sentence
    
    return: list of flow relations in source/target dict representation
    """
    relations = []
    
    for s_idx, (sentence, activity_tokens, gateways) in enumerate(zip(sentences, doc_activity_tokens, own_gateways)):
        if gateways:
            # assume only one gateway
            gateway_lead_token = gateways[0]
            gateway_entity = [g[0] for g in gateways]
            
            # 1) Find related activities (previous and following are concurrent activities; second previous the one
            # before the gateway; second following the one after the gateway)
            pa = get_previous_activity(s_idx, gateway_lead_token[1], doc_activity_tokens)
            ppa = get_previous_activity(s_idx, gateway_lead_token[1], doc_activity_tokens, skip_first=True)
            fa = get_following_activity(s_idx, gateway_lead_token[1], doc_activity_tokens)
            ffa = get_following_activity(s_idx, gateway_lead_token[1], doc_activity_tokens, skip_first=True)
            
            # 2) Get representations for flow object dictionaries
            g_source = get_flow_relation_representation(s_idx, gateway_lead_token[1], AND_GATEWAY, gateway_entity,
                                                  source=True)
            g_target = get_flow_relation_representation(s_idx, gateway_lead_token[1], AND_GATEWAY, gateway_entity,
                                                  source=False)
            if pa: # could be None if at document start
                pa_target = get_flow_relation_representation(pa[0], pa[1], ACTIVITY, pa[2], source=False)
                pa_source = get_flow_relation_representation(pa[0], pa[1], ACTIVITY, pa[2], source=True)
            if ppa: # could be None if at document start
                ppa_source = get_flow_relation_representation(ppa[0], ppa[1], ACTIVITY, ppa[2], source=True)
            if fa: # could be None if at document end
                fa_target = get_flow_relation_representation(fa[0], fa[1], ACTIVITY, fa[2], source=False)
                fa_source = get_flow_relation_representation(fa[0], fa[1], ACTIVITY, fa[2], source=True)
            if ffa: # could be None if at document end
                ffa_target = get_flow_relation_representation(ffa[0], ffa[1], ACTIVITY, ffa[2], source=False)

            # 3) Create relations
            # a) flow to gateway: second previous -> gateway
            if ppa: # could be None if at document start
                relations.append(merge_source_target_dicts(ppa_source, g_target))
            # b) split into concurrent gateway branches: gateway -> previous; gateway -> following
            # following two None checks (probably) wont never be False, but for safety included
            if pa: # could be None if at document start
                relations.append(merge_source_target_dicts(g_source, pa_target))
            if fa:  # could be None if at document end
                relations.append(merge_source_target_dicts(g_source, fa_target))
            # c) merge branches together: previous -> second following; following -> second following
            if ffa: # could be None if at document end
                relations.append(merge_source_target_dicts(pa_source, ffa_target))
                relations.append(merge_source_target_dicts(fa_source, ffa_target))

    return relations

o_flow_relations_and = _extract_concurrent_flows(doc_sentences_raw, doc_activity_tokens, o_and_gateways)

In [230]:
for i, flow_relation in enumerate(o_flow_relations_and):
    for key, value in flow_relation.items():
        print(f"{key}: {value}")
    print()

In [166]:
# control:
for i, flow_relation in enumerate(doc_flow_relations_and):
    for key, value in flow_relation.items():
        print(f"{key}: {value}")
    print()

source-head-sentence-ID: 1
source-head-word-ID: 9
source-entity-type: Activity
source-entity: ['initiated']
target-head-sentence-ID: 2
target-head-word-ID: 15
target-entity-type: AND Gateway
target-entity: ['meantime']

source-head-sentence-ID: 1
source-head-word-ID: 17
source-entity-type: Activity
source-entity: ['tracked']
target-head-sentence-ID: 2
target-head-word-ID: 15
target-entity-type: AND Gateway
target-entity: ['meantime']

source-head-sentence-ID: 2
source-head-word-ID: 15
source-entity-type: AND Gateway
source-entity: ['meantime']
target-head-sentence-ID: 2
target-head-word-ID: 9
target-entity-type: Activity
target-entity: ['handed']

source-head-sentence-ID: 2
source-head-word-ID: 15
source-entity-type: AND Gateway
source-entity: ['meantime']
target-head-sentence-ID: 2
target-head-word-ID: 20
target-entity-type: Activity
target-entity: ['distributed']



### 3.2 Involving XOR gateways

#### Input B): Extracted Gateways:

In [195]:
for i, sentence_gateways in enumerate(o_xor_gateways):
    print(i, sentence_gateways)

0 [('for', 23, 'B-XOR Gateway')]
1 [('If', 0, 'B-XOR Gateway'), ('otherwise', 11, 'B-XOR Gateway')]
2 []
3 []


#### Input A): Activities:

In [89]:
for i, s in enumerate(doc_sentences_raw):
    print(i, s)

0 Each morning , the files which have yet to be processed need to be checked , to make sure they are in order for the court hearing that day
1 If some files are missing , a search is initiated , otherwise the files can be physically tracked to the intended location
2 Once all the files are ready , these are handed to the Associate , and meantime the Judgeis Lawlist is distributed to the relevant people
3 Afterwards , the directions hearings are conducted


In [168]:
for i, sentence_activities in enumerate(doc_activities):
    print(i, sentence_activities)

0 [['checked']]
1 [['initiated'], ['tracked']]
2 [['handed'], ['distributed']]
3 [['conducted']]


#### Gold Data: Flow Relations that involve AND Gateways

In [91]:
print(len(doc_flow_relations))
for i, flow_relation in enumerate(doc_flow_relations):
    for key, value in flow_relation.items():
        print(f"{key}: {value}")
    print()

10
source-head-sentence-ID: 0
source-head-word-ID: 14
source-entity-type: Activity
source-entity: ['checked']
target-head-sentence-ID: 1
target-head-word-ID: 0
target-entity-type: XOR Gateway
target-entity: ['If']

source-head-sentence-ID: 1
source-head-word-ID: 0
source-entity-type: XOR Gateway
source-entity: ['If']
target-head-sentence-ID: 1
target-head-word-ID: 1
target-entity-type: Condition Specification
target-entity: ['some', 'files', 'are', 'missing']

source-head-sentence-ID: 1
source-head-word-ID: 1
source-entity-type: Condition Specification
source-entity: ['some', 'files', 'are', 'missing']
target-head-sentence-ID: 1
target-head-word-ID: 9
target-entity-type: Activity
target-entity: ['initiated']

source-head-sentence-ID: 1
source-head-word-ID: 9
source-entity-type: Activity
source-entity: ['initiated']
target-head-sentence-ID: 2
target-head-word-ID: 15
target-entity-type: AND Gateway
target-entity: ['meantime']

source-head-sentence-ID: 1
source-head-word-ID: 11
source-ent

In [156]:
for i, flow_relation in enumerate(doc_flow_relations_xor): # (doc_flow_relations_xor):
    for key, value in flow_relation.items():
        print(f"{key}: {value}")
    print()

source-head-sentence-ID: 0
source-head-word-ID: 14
source-entity-type: Activity
source-entity: ['hands', 'out']
target-head-sentence-ID: 1
target-head-word-ID: 0
target-entity-type: XOR Gateway
target-entity: ['If']

source-head-sentence-ID: 1
source-head-word-ID: 0
source-entity-type: XOR Gateway
source-entity: ['If']
target-head-sentence-ID: 1
target-head-word-ID: 1
target-entity-type: Condition Specification
target-entity: ['the', 'customer', 'decides', 'that', 'the', 'costs', 'are', 'acceptable']

source-head-sentence-ID: 1
source-head-word-ID: 1
source-entity-type: Condition Specification
source-entity: ['the', 'customer', 'decides', 'that', 'the', 'costs', 'are', 'acceptable']
target-head-sentence-ID: 3
target-head-word-ID: 11
target-entity-type: AND Gateway
target-entity: ['whereas']

source-head-sentence-ID: 1
source-head-word-ID: 14
source-entity-type: XOR Gateway
source-entity: ['otherwise']
target-head-sentence-ID: 1
target-head-word-ID: 16
target-entity-type: Activity
targe

In [104]:
for i, sentence_gateways in enumerate(o_xor_gateways):
    print(i, sentence_gateways)

0 [('for', 23, 'B-XOR Gateway')]
1 [('If', 0, 'B-XOR Gateway'), ('otherwise', 11, 'B-XOR Gateway')]
2 []
3 []


In [255]:
for i, s in enumerate(doc_sentences_raw):
    print(i, s)

0 After the Expense Report is received , a new account must be created if the employee does not already have one
1 The report is then reviewed for automatic approval
2 Amounts under $200 are automatically approved , whereas amounts equal to or over $200 require approval of the supervisor
3 In case of rejection , the employee must receive a rejection notice by email
4 Otherwise , the reimbursement goes to the employees direct deposit bank account
5 If the request is not completed in 7 days , then the employee must receive an approval in progress email
6 If the request is not finished within 30 days , then the process is stopped and the employee receives an email cancellation notice and must re-submit the expense report


In [262]:
contradictory_gateways = [(['if'], ['otherwise']), (['if'], ['else']), (['if'], ['if']), (['in', 'case', 'of'], ['otherwise'])]

def _extract_exclusive_flows(doc_activity_tokens, extracted_gateways):
    sequence_flows = []
    same_gateway_relations = []
    
    # helper method only for this method
    def preprocess_gateways(extracted_gateways):
        """
        flatten gateways but keep sentence index; merge multiple gateway tokens into one gateway
        :param extracted_gateways: gateways in PET format
        :return: flattened gateway list filled with (sentence_idx, start_token_idx, ['Word', 'List'], ['word', 'list'])
        """
        gateways = []
        for sentence_idx, sentence_gateways in enumerate(extracted_gateways):
            sentence_gateways_already_included = []
            for i, gateway in enumerate(sentence_gateways):
                if gateway not in sentence_gateways_already_included:
                    gateway_tokens = [gateway[0]]
                    start_token_idx = gateway[1]
                    # append further tokens of same gateway ('I-' marked)
                    I_index = i+1
                    while I_index < len(sentence_gateways) and sentence_gateways[I_index][2].startswith('I-'):
                        gateway_tokens.append(sentence_gateways[I_index][0])
                        sentence_gateways_already_included.append(sentence_gateways[I_index])
                        I_index += 1
                    gateway_tokens_lower = [t.lower() for t in gateway_tokens]
                    gateways.append((sentence_idx, start_token_idx, gateway_tokens, gateway_tokens_lower))    
        return gateways
    gateways = preprocess_gateways(extracted_gateways)
    gateways_involved = []  # list for gateways already involved into sequence flows
    
    # RULE 1): check for every pair of following gateways if it fits to a gateway constellation with contradictory key words
    # gateways must be in range of X (default 1) sentences, otherwise they would be seen as seperate ones
    # Signal word of first gateway must be at the beginning of a sentence
    for i in range(len(gateways)-1):
        g1, g2 = gateways[i], gateways[i+1]
        
        # if sentence distances is larger than threshold, reject possible pair
        if abs(g2[0] - g1[0]) > 1:
            continue
        for pattern_gateway_1, pattern_gateway_2 in contradictory_gateways:
            if g1[3] == pattern_gateway_1 and g2[3] == pattern_gateway_2 and g1[1] == 0:
                gateways_involved.append(g1)
                gateways_involved.append(g2)

                # A) find related activities
                pa_g1 = get_previous_activity(g1[0], g1[1], doc_activity_tokens)
                fa_g1 = get_following_activity(g1[0], g1[1], doc_activity_tokens)
                fa_g2 = get_following_activity(g2[0], g2[1], doc_activity_tokens)
                # check if following activities of g1 and g2 are the same -> if yes, the first branch is without activity
                if fa_g1 == fa_g2:
                    fa_g1 = 'empty branch'
                ffa_g2 = get_following_activity(g2[0], g2[1], doc_activity_tokens, skip_first=True)

                # B) get dictionary representations
                g1_source = get_flow_relation_representation(g1[0], g1[1], XOR_GATEWAY, g1[2], source=True)
                g1_target = get_flow_relation_representation(g1[0], g1[1], XOR_GATEWAY, g1[2], source=False)
                g2_source = get_flow_relation_representation(g2[0], g2[1], XOR_GATEWAY, g2[2], source=True)
                g2_target = get_flow_relation_representation(g2[0], g2[1], XOR_GATEWAY, g2[2], source=False)
                if pa_g1: # could be None if at document start
                    pa_g1_source = get_flow_relation_representation(pa_g1[0], pa_g1[1], ACTIVITY, pa_g1[2], source=True)
                if fa_g1 != 'empty branch' and fa_g1: # could be set in A) manually to None or at document end
                    fa_g1_source = get_flow_relation_representation(fa_g1[0], fa_g1[1], ACTIVITY, fa_g1[2], source=True)
                    fa_g1_target = get_flow_relation_representation(fa_g1[0], fa_g1[1], ACTIVITY, fa_g1[2], source=False)
                if fa_g2: # could be None if at document end
                    fa_g2_source = get_flow_relation_representation(fa_g2[0], fa_g2[1], ACTIVITY, fa_g2[2], source=True)
                    fa_g2_target = get_flow_relation_representation(fa_g2[0], fa_g2[1], ACTIVITY, fa_g2[2], source=False)
                if ffa_g2: # could be None if at document end
                    ffa_g2_target = get_flow_relation_representation(ffa_g2[0], ffa_g2[1], ACTIVITY, ffa_g2[2], source=False)


                # C.1) connect elements to sequence flows
                # a) previous activity to first gateway -> split point (if not None because of document start)
                if pa_g1:
                    sequence_flows.append(merge_source_target_dicts(pa_g1_source, g1_target))
                # b) gateway 1 to following activity and following activity to activity after gateway (second following of g2)
                # if None because of empty branch then directly there
                if fa_g1: # could be None if at document end
                    sequence_flows.append(merge_source_target_dicts(g1_source, fa_g1_target))
                    if ffa_g2: # could be None if at document end
                        sequence_flows.append(merge_source_target_dicts(fa_g1_source, ffa_g2_target))
                elif fa_g1 != 'empty branch' and ffa_g2: # could be None if at document end
                    sequence_flows.append(merge_source_target_dicts(g1_source, ffa_g2_target))
                # c) gateway 2 to following activity and following activity to activity after gateway (second following of g2)
                if fa_g2: # could be None if at document end
                    sequence_flows.append(merge_source_target_dicts(g2_source, fa_g2_target))
                if ffa_g2: # could be None if at document end
                    sequence_flows.append(merge_source_target_dicts(fa_g2_source, ffa_g2_target))

                # C.2) same gateway flows
                same_gateway_relations.append(merge_source_target_dicts(g1_source, g2_target))
    
    # RULE 2): exclusive actions of common pattern "... <activity> ... or ... <activity> ..."
    for g in gateways:
        if g not in gateways_involved and g[3] == ['or']:
            # A) find related activities
            pa = get_previous_activity(g[0], g[1], doc_activity_tokens)
            ppa = get_previous_activity(g[0], g[1], doc_activity_tokens, skip_first=True)
            fa = get_following_activity(g[0], g[1], doc_activity_tokens)
            ffa = get_following_activity(g[0], g[1], doc_activity_tokens, skip_first=True)
            
            if pa and fa:  # check if existence because of document end/start
                if pa[0] == g[0] and fa[0] == g[0]:  # check if in same sentence
                    
                    # B) get dict representations
                    g_source = get_flow_relation_representation(g[0], g[1], XOR_GATEWAY, g[2], source=True)
                    g_target = get_flow_relation_representation(g[0], g[1], XOR_GATEWAY, g[2], source=False)
                    if pa: # could be None if at document start
                        pa_source = get_flow_relation_representation(pa[0], pa[1], ACTIVITY, pa[2], source=True)
                        pa_target = get_flow_relation_representation(pa[0], pa[1], ACTIVITY, pa[2], source=False)
                    if fa: # could be None if at document end
                        fa_source = get_flow_relation_representation(fa[0], fa[1], ACTIVITY, fa[2], source=True)
                        fa_target = get_flow_relation_representation(fa[0], fa[1], ACTIVITY, fa[2], source=False)
                    if ppa: # could be None if at document start
                        ppa_source = get_flow_relation_representation(ppa[0], ppa[1], ACTIVITY, ppa[2], source=True)
                    if ffa: # could be None if at document end
                        ffa_target = get_flow_relation_representation(ffa[0], ffa[1], ACTIVITY, ffa[2], source=False)
                    
                    if pa is None or fa is None:
                        continue  # if not two surrounding activities are given, do not wire anything (maybe drop gateway again)
                        
                    # C) connect elements to sequence flows
                    # a) second previous activity to gateway -> split point (if not None because of document start)
                    if ppa:
                        sequence_flows.append(merge_source_target_dicts(ppa_source, g_target))
                    # b) gateway to following activity and previous activity -> exclusive branches
                    sequence_flows.append(merge_source_target_dicts(g_source, pa_target))
                    sequence_flows.append(merge_source_target_dicts(g_source, fa_target))
                    # c) exclusive activities to second following activity of gateway -> merge point
                    if ffa:  # if not None because of document end
                        sequence_flows.append(merge_source_target_dicts(pa_source, ffa_target))
                        sequence_flows.append(merge_source_target_dicts(fa_source, ffa_target))
                    
                    gateways_involved.append(g)
                    
    # RULE 3): single-branch gateways: the gateway is related to an activity in the same sentence (order is arbitrary)
    # Assumptiosn: multi-branch gateways are already recognized by rule 1 before; only one activity for the gateway
    for g in gateways:
        if g not in gateways_involved and g[3] != ['or']:
            # A) find related activities
            pa = get_previous_activity(g[0], g[1], doc_activity_tokens)
            ppa = get_previous_activity(g[0], g[1], doc_activity_tokens, skip_first=True)
            fa = get_following_activity(g[0], g[1], doc_activity_tokens)
            ffa = get_following_activity(g[0], g[1], doc_activity_tokens, skip_first=True)
            
            # B) check if activity is before or after the gateway (assumption: both is not included)
            if fa[0] == g[0]:
                case = 'activity after gateway'
            elif pa[0] == g[0]:
                case = 'activity before gateway'
            else:
                continue  # if no activity in same sentence, do not wire anything (maybe drop gateway again)
            gateways_involved.append(g)
            
            # C) get dict representations
            g_source = get_flow_relation_representation(g[0], g[1], XOR_GATEWAY, g[2], source=True)
            g_target = get_flow_relation_representation(g[0], g[1], XOR_GATEWAY, g[2], source=False)
            if pa: # could be None if at document start
                pa_source = get_flow_relation_representation(pa[0], pa[1], ACTIVITY, pa[2], source=True)
                pa_target = get_flow_relation_representation(pa[0], pa[1], ACTIVITY, pa[2], source=False)
            if fa: # could be None if at document end
                fa_source = get_flow_relation_representation(fa[0], fa[1], ACTIVITY, fa[2], source=True)
                fa_target = get_flow_relation_representation(fa[0], fa[1], ACTIVITY, fa[2], source=False)
            if ppa: # could be None if at document start
                ppa_source = get_flow_relation_representation(ppa[0], ppa[1], ACTIVITY, ppa[2], source=True)
            if ffa: # could be None if at document end
                ffa_target = get_flow_relation_representation(ffa[0], ffa[1], ACTIVITY, ffa[2], source=False)
            
            # D) connect elements to sequence flows
            if case == 'activity after gateway':
                # 1) previous activity to gateway -> split point
                if pa:  # could be None if at document start
                    sequence_flows.append(merge_source_target_dicts(pa_source, g_target))
                # 2) gateway to following activity -> exclusive branch
                sequence_flows.append(merge_source_target_dicts(g_source, fa_target))
                # 3) exclusive activity and gateway to second following activity of gateway -> merge point
                if ffa: # could be None if at document end
                    sequence_flows.append(merge_source_target_dicts(g_source, ffa_target))
                    sequence_flows.append(merge_source_target_dicts(fa_target, ffa_target))
                
            elif case == 'activity before gateway':
                # 1) second previous activity to gateway -> split point
                if ppa: # could be None if at document start
                    sequence_flows.append(merge_source_target_dicts(ppa_source, g_target))
                # 2) gateway to previous activity -> exclusive branch
                sequence_flows.append(merge_source_target_dicts(g_source, pa_target))
                # 3) exclusive activity and gateway to following activity of gateway -> merge point
                if fa: # could be None if at document end
                    sequence_flows.append(merge_source_target_dicts(g_source, fa_target))
                    sequence_flows.append(merge_source_target_dicts(pa_source, fa_target))
                    
    sequence_flows.sort(key=lambda flow: flow['source-head-sentence-ID'])
    return sequence_flows, same_gateway_relations



print("own extracted XOR gateways:", o_xor_gateways)
print("-"*30)
o_flow_relations_xor, o_same_gateway_xor = _extract_exclusive_flows(doc_activity_tokens, o_xor_gateways)
print("-"*30)
for i, flow_relation in enumerate(o_flow_relations_xor):
    for key, value in flow_relation.items():
        print(f"{key}: {value}")
    print()
    
print("-"*30)
    
for i, flow_relation in enumerate(o_same_gateway_xor):
    for key, value in flow_relation.items():
        print(f"{key}: {value}")
    print()

own extracted XOR gateways: [[('if', 13, 'B-XOR Gateway')], [], [('or', 11, 'B-XOR Gateway')], [('In', 0, 'B-XOR Gateway'), ('case', 1, 'I-XOR Gateway'), ('of', 2, 'I-XOR Gateway')], [('Otherwise', 0, 'B-XOR Gateway')], [('If', 0, 'B-XOR Gateway')], [('If', 0, 'B-XOR Gateway')]]
------------------------------
------------------------------
source-head-sentence-ID: 0
source-head-word-ID: 5
source-entity-type: Activity
source-entity: ['received']
target-head-sentence-ID: 0
target-head-word-ID: 13
target-entity-type: XOR Gateway
target-entity: ['if']

source-head-sentence-ID: 0
source-head-word-ID: 13
source-entity-type: XOR Gateway
source-entity: ['if']
target-head-sentence-ID: 0
target-head-word-ID: 12
target-entity-type: Activity
target-entity: ['created']

source-head-sentence-ID: 0
source-head-word-ID: 13
source-entity-type: XOR Gateway
source-entity: ['if']
target-head-sentence-ID: 1
target-head-word-ID: 4
target-entity-type: Activity
target-entity: ['reviewed']

source-head-sentenc

## 3.3 Involving Remaining Gold Activities

In [22]:
# doc_activities = token_dataset.GetDocumentActivities(doc_name)
for sentence_activities in doc_activities:
    print(sentence_activities)

# flatten activities to a list of document activities
activities_flattened = [activitiy for sentence_activities in doc_activities for activitiy in sentence_activities]
print(activities_flattened)

[]
[['receives']]
[['reject'], ['accept']]
[]
[['informed']]
[['processes'], ['checks']]
[['reserved']]
[['back-ordered']]
[]
[['prepares']]
[['assembles']]
[['ships']]
[['receives'], ['reject'], ['accept'], ['informed'], ['processes'], ['checks'], ['reserved'], ['back-ordered'], ['prepares'], ['assembles'], ['ships']]


In [37]:
def create_activity_flows(doc_activity_tokens):
    activities_flattened = [(i, activitiy) for i, sentence_activities in enumerate(doc_activity_tokens) 
                            for activitiy in sentence_activities]
    flow_relations = []
    for i in range(len(activities_flattened) - 1):
        s_idx_1, a1 = activities_flattened[i]
        s_idx_2, a2 = activities_flattened[i+1]
        if True:
            flow_relations.append({labels.SOURCE_ENTITY: a1[0], labels.TARGET_ENTITY: a2[0]})
        else:
            a1 = get_flow_relation_representation(s_idx_1, a1[1], labels.ACTIVITY, a1[0], source=True)
            a2 = get_flow_relation_representation(s_idx_2, a2[1], labels.ACTIVITY, a2[0], source=False)
            flow_relations.append(merge_source_target_dicts(a1, a2))
    return flow_relations


gold_activity_flows = create_activity_flows(doc_activity_tokens)
for i, flow_relation in enumerate(gold_activity_flows):
    for key, value in flow_relation.items():
        print(f"{i} | {key}: {value}")
    print()

0 | source-entity: ['receives']
0 | target-entity: ['reject']

1 | source-entity: ['reject']
1 | target-entity: ['accept']

2 | source-entity: ['accept']
2 | target-entity: ['informed']

3 | source-entity: ['informed']
3 | target-entity: ['processes']

4 | source-entity: ['processes']
4 | target-entity: ['checks']

5 | source-entity: ['checks']
5 | target-entity: ['reserved']

6 | source-entity: ['reserved']
6 | target-entity: ['back-ordered']

7 | source-entity: ['back-ordered']
7 | target-entity: ['prepares']

8 | source-entity: ['prepares']
8 | target-entity: ['assembles']

9 | source-entity: ['assembles']
9 | target-entity: ['ships']



## 4 Evaluate Extraction