# Baseline
variables with prefix ``doc_`` contain data from the dataset
variables with prefix ``o_`` contain data from own computations

In [4]:
import itertools

from petreader.RelationsExtraction import RelationsExtraction
from petreader.TokenClassification import TokenClassification
from petreader.labels import FLOW, SAME_GATEWAY, AND_GATEWAY, XOR_GATEWAY

In [5]:
relations_dataset = RelationsExtraction()
token_dataset = TokenClassification()

Reusing dataset pet (C:\Users\janek\.cache\huggingface\datasets\patriziobellan___pet\relations-extraction\1.0.1\38434e2af57af533c400c8975f37e43c08bb77739085a3c026a862b2efb668d2)




 _______ _     _ _______       _____  _______ _______      ______  _______ _______ _______ _______ _______ _______
    |    |_____| |______      |_____] |______    |         |     \ |_____|    |    |_____| |______ |______    |   
    |    |     | |______      |       |______    |         |_____/ |     |    |    |     | ______| |______    |   
                                                                                                                  
Discover more at: [https://pdi.fbk.eu/pet-dataset/]



  0%|          | 0/1 [00:00<?, ?it/s]

Reusing dataset pet (C:\Users\janek\.cache\huggingface\datasets\patriziobellan___pet\token-classification\1.0.1\38434e2af57af533c400c8975f37e43c08bb77739085a3c026a862b2efb668d2)




 _______ _     _ _______       _____  _______ _______      ______  _______ _______ _______ _______ _______ _______
    |    |_____| |______      |_____] |______    |         |     \ |_____|    |    |_____| |______ |______    |   
    |    |     | |______      |       |______    |         |_____/ |     |    |    |     | ______| |______    |   
                                                                                                                  
Discover more at: [https://pdi.fbk.eu/pet-dataset/]



  0%|          | 0/1 [00:00<?, ?it/s]

## 1 Prepare Data

### 1.1 Read Example Doc

In [6]:
doc_id = 0
doc_name = token_dataset.GetDocumentName(doc_id)
print(f"  {doc_name}  ".center(50, '*'))
doc_text = relations_dataset.GetDocument(doc_id)
print(doc_text)
doc_activities = token_dataset.GetDocumentActivities(doc_name)

print(" activities and NER labels (per sentences) ".center(50, '*'))
print(doc_activities)
doc_sentence_ner_labels = relations_dataset.GetSentencesWithIdsAndNerTagLabels(doc_id)
print(doc_sentence_ner_labels[:4])
doc_relations = relations_dataset.GetRelations(doc_id)
doc_flow_relations, doc_same_gateway_relations = doc_relations[FLOW], doc_relations[SAME_GATEWAY]

print(" same gateway relations ".center(50, '*'))
for same_gateway_relation in doc_same_gateway_relations:
    for key, value in same_gateway_relation.items():
        print(f"{key}: {value}")
    print()

*******************  doc-1.1  ********************
A small company manufactures customized bicycles . Whenever the sales department receives an order , a new process instance is created . A member of the sales department can then reject or accept the order for a customized bike . In the former case , the process instance is finished . In the latter case , the storehouse and the engineering department are informed . The storehouse immediately processes the part list of the order and checks the required quantity of each part . If the part is available in-house , it is reserved . If it is not available , it is back-ordered . This procedure is repeated for each item on the part list . In the meantime , the engineering department prepares everything for the assembling of the ordered bicycle . If the storehouse has successfully reserved or back-ordered every item of the part list and the preparation activity has finished , the engineering department assembles the bicycle . Afterwards , the s

### 1.2 Preprocess sentences

In [8]:
num_sentences = len(doc_activities) # activities is 2 dim list (one per sentence)
print(num_sentences)
doc_sentences_raw = [sentence.strip() for sentence in doc_text.split(".") if sentence.strip() != ""]
for s in doc_sentences_raw:
    print(s)
assert num_sentences == len(doc_sentences_raw)  # check if number of extracted sentences == from dataset 

12
A small company manufactures customized bicycles
Whenever the sales department receives an order , a new process instance is created
A member of the sales department can then reject or accept the order for a customized bike
In the former case , the process instance is finished
In the latter case , the storehouse and the engineering department are informed
The storehouse immediately processes the part list of the order and checks the required quantity of each part
If the part is available in-house , it is reserved
If it is not available , it is back-ordered
This procedure is repeated for each item on the part list
In the meantime , the engineering department prepares everything for the assembling of the ordered bicycle
If the storehouse has successfully reserved or back-ordered every item of the part list and the preparation activity has finished , the engineering department assembles the bicycle
Afterwards , the sales department ships the bicycle to the customer and finishes the pro

### 1.3 Filter Tokens for Gateways

In [18]:
def filter_ner_labels(sentence_ner_labels, target_label):
    return [[token for token in s_list if target_label in token[2]]
                        for s_list in sentence_ner_labels]
doc_xor_gateway = filter_ner_labels(doc_sentence_ner_labels, XOR_GATEWAY)
doc_and_gateway = filter_ner_labels(doc_sentence_ner_labels, AND_GATEWAY)
print(doc_xor_gateway)
print(doc_and_gateway)

[[], [], [('or', 9, 'B-XOR Gateway')], [], [], [], [('If', 0, 'B-XOR Gateway')], [('If', 0, 'B-XOR Gateway')], [], [], [('If', 0, 'B-XOR Gateway')], []]
[[], [], [], [], [], [], [], [], [], [('In', 0, 'B-AND Gateway'), ('the', 1, 'I-AND Gateway'), ('meantime', 2, 'I-AND Gateway')], [], []]


### 1.X Filter Flow Relations? TODO
see question doc

### 1.4 Key Word List
#### A) take words from all existing gateways as gold list for detection

In [10]:
def get_gateway_key_words(dataset_gateway_list):
    flattened = list(itertools.chain(*dataset_gateway_list))
    phrases = [" ".join(g).lower() for g in flattened]  # join phrases together if multiple words
    unique = list(set(phrases))
    unique.sort()
    return unique

xor_key_words_gold = get_gateway_key_words(token_dataset.GetXORGateways())
and_key_words_gold = get_gateway_key_words(token_dataset.GetANDGateways())

print(f"XOR gold ({len(xor_key_words_gold)})", xor_key_words_gold)
print(f"AND gold ({len(and_key_words_gold)})", and_key_words_gold)

XOR gold (15) ['either', 'for', 'for each patient for which', 'for the case', 'if', 'in case', 'in case of', 'in the case of', 'it can also happen that', 'or', 'otherwise', 'should', 'sometimes', 'under certain circumstances', 'whereas']
AND gold (6) ['at the same time', 'in the meantime', 'meantime', 'two concurrent activities are triggered', 'whereas', 'while']


#### B) Curated List from Literature

In [11]:
# Ferreira et al. 2017
xor_key_words_literature = ['if', 'whether', 'if not', 'or', 'in case', 'in case of', 'otherwise', 'either', 'only', 'till', 'until', 'unless', 'when', 'only if']
xor_key_words_literature.sort()
and_key_words_literature = ['while', 'meanwhile', 'in parallel', 'concurrently', 'meantime', 'in the meantime', 'in parallel with this', 'in addition to', 'simultaneously', 'at the same time', 'whereas']
and_key_words_literature.sort()
print(f"XOR literature ({len(xor_key_words_literature)})", xor_key_words_literature)
print(f"AND literature ({len(and_key_words_literature)})", and_key_words_literature)

XOR literature (14) ['either', 'if', 'if not', 'in case', 'in case of', 'only', 'only if', 'or', 'otherwise', 'till', 'unless', 'until', 'when', 'whether']
AND literature (11) ['at the same time', 'concurrently', 'in addition to', 'in parallel', 'in parallel with this', 'in the meantime', 'meantime', 'meanwhile', 'simultaneously', 'whereas', 'while']


## 2 Extract Gateways

In [13]:
for s in doc_sentences_raw:
    print(s)

A small company manufactures customized bicycles
Whenever the sales department receives an order , a new process instance is created
A member of the sales department can then reject or accept the order for a customized bike
In the former case , the process instance is finished
In the latter case , the storehouse and the engineering department are informed
The storehouse immediately processes the part list of the order and checks the required quantity of each part
If the part is available in-house , it is reserved
If it is not available , it is back-ordered
This procedure is repeated for each item on the part list
In the meantime , the engineering department prepares everything for the assembling of the ordered bicycle
If the storehouse has successfully reserved or back-ordered every item of the part list and the preparation activity has finished , the engineering department assembles the bicycle
Afterwards , the sales department ships the bicycle to the customer and finishes the proces

In [61]:
def extract_gateways(sentence_list, key_words, target_gateway_label):
    """
    extracts gateways in a key-word-based manner given a document structured in a list of sentences
    if two phrases would match to a token (e.g. 'in the meantime' and 'meantime'), the longer phrase is extracted
    target_gateway_label: str, must be 'XOR Gateway' or 'AND Gateway'
    
    result list is two dimensional -> list of tuples (word, position in sentence, tag) for each sentence
    this produces the same structure as sentences and their NER labels are annotated in PET dataset
    """
    gateways = []
    # sort key words descending by length of words in phrase
    key_words.sort(key=lambda key_word_phrase: len(key_word_phrase.split(" ")), reverse=True)

    # 1) extract gateways
    for s_idx, sentence in enumerate(sentence_list):
        # print(f" SENTENCE {s_idx} ".center(50, '-'))
        # print(sentence_list[s_idx])
        sentence_gateways = []
        sentence_to_search = f" {sentence.lower()} "  # lowercase and wrap with spaces for search of key words
        tokens = sentence.split(" ")
        tokens_lower = sentence.lower().split(" ")
        tokens_already_matched_with_key_phrase = []

        # iterate over key phrases
        for key_phrase in key_words:
            key_phrase_to_search = f" {key_phrase} "

            # if key phrase is in sentence, search index and extract
            if key_phrase_to_search in sentence_to_search:
                key_phrase_tokens = key_phrase.split(" ")
                
                # check key phrase for every token
                for t_idx, token in enumerate(tokens_lower):
                    candidate = True
                    # iterate over key phrase tokens in case of multiple world phrase
                    for key_phrase_token_idx, key_phrase_token in enumerate(key_phrase_tokens):
                        # check if token is not part of key phrase or token is already matched with another phrase
                        # if yes, stop processing candidate
                        if not tokens_lower[t_idx + key_phrase_token_idx] == key_phrase_token or \
                            t_idx + key_phrase_token_idx in tokens_already_matched_with_key_phrase:
                            candidate = False
                            break
                    
                    # add tokens to result only if all tokens are matched and not already part of a longer phrase
                    if candidate:
                        for i, key_phrase_token in enumerate(key_phrase_tokens):
                            prefix = "B" if i == 0 else "I"
                            # append tuples with extract information as in PET and process information about gateways to filter later
                            sentence_gateways.append((tokens[t_idx + i], t_idx + i, f"{prefix}-{XOR_GATEWAY}"))
                            tokens_already_matched_with_key_phrase.append(t_idx + i)

        sentence_gateways.sort(key=lambda gateway_triple: gateway_triple[1])
        gateways.append(sentence_gateways)

    return gateways

In [62]:
# available key word lists: xor_key_words_gold, and_key_words_gold, xor_key_words_literature, and_key_words_literature
o_xor_gateways = extract_gateways(doc_sentences_raw, xor_key_words_gold, XOR_GATEWAY)
o_and_gateways = extract_gateways(doc_sentences_raw, and_key_words_gold, AND_GATEWAY)

print("XOR GATEWAYS")
for idx, sentence_gateways in enumerate(o_xor_gateways):
    print(idx, sentence_gateways)
print("\nAND GATEWAYS")
for idx, sentence_gateways in enumerate(o_and_gateways):
    print(idx, sentence_gateways)

XOR GATEWAYS
0 []
1 []
2 [('or', 9, 'B-XOR Gateway'), ('for', 13, 'B-XOR Gateway')]
3 []
4 []
5 []
6 [('If', 0, 'B-XOR Gateway')]
7 [('If', 0, 'B-XOR Gateway')]
8 [('for', 4, 'B-XOR Gateway')]
9 [('for', 9, 'B-XOR Gateway')]
10 [('If', 0, 'B-XOR Gateway'), ('or', 6, 'B-XOR Gateway')]
11 []

AND GATEWAYS
0 []
1 []
2 []
3 []
4 []
5 []
6 []
7 []
8 []
9 [('In', 0, 'B-XOR Gateway'), ('the', 1, 'I-XOR Gateway'), ('meantime', 2, 'I-XOR Gateway')]
10 []
11 []


## 3 Evaluate Extraction