In [1]:
# add parent dir to sys path for import of modules
import json
import os
import sys
# Find the project root dir by walking upwards from the current working directory
# until a directory containing a README.md is found.
# os.getcwd() returns the path as str; str(os.getcwdb()) would produce the bytes
# repr "b'...'" instead of a usable path.
start_dir = os.getcwd()
parent_dir = start_dir
while not os.path.exists(os.path.join(parent_dir, "README.md")):
    next_dir = os.path.abspath(os.path.join(parent_dir, os.pardir))
    if next_dir == parent_dir:
        # Reached the filesystem root without finding a README.md: stop instead of
        # looping forever and fall back to the directory the search started from.
        print(f"WARNING: no README.md found above {start_dir}; using it as project root")
        parent_dir = start_dir
        break
    parent_dir = next_dir
sys.path.insert(0, parent_dir)

In [36]:
import logging
import itertools

from petreader.labels import *

from PetReader import pet_reader


# Logger named after this extraction approach.
# NOTE(review): the functions visible in this notebook section report via print()
# and do not call this logger.
logger = logging.getLogger('Keywords Same Gateway Filtered Approach')

In [22]:
# relation type labels stored in the generated activity relation tuples
DF = 'directly_following'
EXCLUSIVE = 'exclusive'
CONCURRENT = 'concurrent'

# dictionary keys for the two endpoints of a transformed relation
SOURCE = 'source'
TARGET = 'target'

In [170]:
def transform_relations(relations):
    """
    convert raw relation records into dicts with a SOURCE and a TARGET element
    each element is a tuple (sentence id, head token id, entity, entity type)
    :param relations: iterable of records indexable by the *_SENTENCE_ID, *_HEAD_TOKEN_ID,
                      *_ENTITY and *_ENTITY_TYPE label keys
    :return: list of {SOURCE: tuple, TARGET: tuple} dicts in input order
    """
    transformed = []
    for rel in relations:
        source_element = (rel[SOURCE_SENTENCE_ID], rel[SOURCE_HEAD_TOKEN_ID],
                          rel[SOURCE_ENTITY], rel[SOURCE_ENTITY_TYPE])
        target_element = (rel[TARGET_SENTENCE_ID], rel[TARGET_HEAD_TOKEN_ID],
                          rel[TARGET_ENTITY], rel[TARGET_ENTITY_TYPE])
        transformed.append({SOURCE: source_element, TARGET: target_element})
    return transformed

def get_linked_entities(gateway, flow_relations):
    """
    return the targets of all flows that start at the given gateway (in flow order)
    """
    linked_targets = []
    for flow in flow_relations:
        if flow[SOURCE] == gateway:
            linked_targets.append(flow[TARGET])
    return linked_targets

def get_linked_entities_via_condition(gateway, flow_relations):
    """
    return the elements that are linked to the gateway via a condition specification,
    i.e. for every flow gateway -> condition the target of the first flow condition -> x
    raises IndexError if a condition specification has no outgoing flow
    """
    linked_via_condition = []
    for flow in flow_relations:
        if flow[SOURCE] == gateway and flow[TARGET][3] == CONDITION_SPECIFICATION:
            successors = [nxt[TARGET] for nxt in flow_relations if nxt[SOURCE] == flow[TARGET]]
            linked_via_condition.append(successors[0])
    return linked_via_condition

def get_sg_gateways(gateway, sg_relations):
    """
    search for gateways that are related to the given gateway via a same gateway relation
    search is conducted recursively to support multi branch gateways (>2 branches)
    :param gateway: element tuple of the gateway to start from
    :param sg_relations: list of same gateway relations ({SOURCE: element, TARGET: element})
    :return: list of (transitively) related gateways in depth-first order, without the
             start gateway itself and without duplicates
    """
    results = []

    def _collect(current):
        # follow every same gateway relation that starts at the current gateway
        for sg in sg_relations:
            if sg[SOURCE] == current:
                linked = sg[TARGET]
                # skip already visited gateways so cyclic relations cannot recurse forever
                # (list membership because elements contain lists and are not hashable)
                if linked == gateway or linked in results:
                    continue
                results.append(linked)
                _collect(linked)

    _collect(gateway)
    return results

def get_following_flows_by_text_structure(element, flow_relations):
    """
    return all flows whose source is located at or after the given element in the text
    (later sentence, or same sentence with an equal or higher token index)
    """
    following = []
    for flow in flow_relations:
        origin = flow[SOURCE]
        if origin[0] > element[0]:
            following.append(flow)
        elif origin[0] == element[0] and origin[1] >= element[1]:
            following.append(flow)
    return following

def get_previous_flows_by_text_structure(element, flow_relations):
    """
    return all flows whose source is located at or before the given element in the text
    (earlier sentence, or same sentence with an equal or lower token index)
    """
    previous = []
    for flow in flow_relations:
        origin = flow[SOURCE]
        if origin[0] < element[0]:
            previous.append(flow)
        elif origin[0] == element[0] and origin[1] <= element[1]:
            previous.append(flow)
    return previous

def find_next_merge_point(element, flow_relations):
    """
    find the next merge point, i.e. the first flow target that is reached a second time
    when scanning the flows located at or after the element in the text
    :param element: element tuple (usually a gateway) to start the search from
    :param flow_relations: list of flows ({SOURCE: element, TARGET: element})
    :return: the merge point element or None if no target is hit twice
    """
    candidate_flows = get_following_flows_by_text_structure(element, flow_relations)

    # also consider flows following the directly linked elements, because the order in
    # the text can differ from the process structure (e.g. doc-1.1 parallel gateway)
    for linked_entity in get_linked_entities(element, flow_relations):
        candidate_flows.extend(get_following_flows_by_text_structure(linked_entity, flow_relations))

    # keep only the first occurrence of every flow (order preserving)
    unique_flows = []
    for flow in candidate_flows:
        if flow in unique_flows:
            continue
        unique_flows.append(flow)

    print(len(unique_flows))
    seen_targets = []
    for flow in unique_flows:
        if element[3] == AND_GATEWAY:
            print("DEBUG", flow[TARGET])
        if flow[TARGET] not in seen_targets:
            seen_targets.append(flow[TARGET])
            continue
        # target was already reached by an earlier flow -> merge point
        return flow[TARGET]

    # raise Exception(f"no merge point found for {element}")  # TODO: change to GatewayExtractionException
    print("WARNING: no merge point found")
    return None

def get_activities_until_merge_point(element, next_merge, flow_relations):
    """
    follow the chain of flows starting at the given element and collect every target that
    is located before the given merge point in the text
    :param element: element tuple the chain starts at (not part of the result)
    :param next_merge: element tuple of the merge point (exclusive upper bound)
    :param flow_relations: list of flows ({SOURCE: element, TARGET: element})
    :return: list of chained elements between element and merge point
    """
    chain = []
    current = element
    for flow in get_following_flows_by_text_structure(element, flow_relations):
        if flow[SOURCE] == current:
            target = flow[TARGET]
            if target[0] < next_merge[0] or (target[0] == next_merge[0] and target[1] < next_merge[1]):
                chain.append(target)
                # continue the chain from the element just added
                current = target
    return chain

def get_last_activities(flow, flow_relations):
    """
    search for last (transitively) linked activities (recursively) before current flow
    :param flow: flow to start reversed search for
    :param flow_relations: set of flows
    :return: list of transitive connected activities; empty if the source of the flow has
             no preceding flow (e.g. a gateway at the very start of a document)
    """
    last_element = flow[SOURCE]
    relevant_flows = get_previous_flows_by_text_structure(last_element, flow_relations)

    # A single pass is sufficient: nothing changes between repeated passes, so retrying
    # until a result shows up would loop forever when there is no predecessor at all.
    last_activities = []
    for source_flow in relevant_flows:
        if source_flow[TARGET] == last_element:
            # a) base case -> activity found
            if source_flow[SOURCE][3] == ACTIVITY:
                last_activities.append(source_flow[SOURCE])
            # b) recursive case -> continue search from flow before
            #    (a flow from the element to itself is skipped to avoid endless recursion)
            elif source_flow[SOURCE] != last_element:
                last_activities.extend(get_last_activities(source_flow, relevant_flows))

    return last_activities


def data_generation(doc_names, whole_branch_pairs=True):
    """
    derive pairwise activity relations (directly following / exclusive / concurrent) from
    the flow and same gateway relations of the documents provided by pet_reader
    progress and intermediate results are printed to stdout
    :param doc_names: names of the documents to process; a falsy value (e.g. None or [])
                      processes every document in pet_reader.document_names
    :param whole_branch_pairs: if True (and a merge point was found for the gateway), all
                               elements of a branch up to the merge point are paired with
                               the elements of the other branches; otherwise only the first
                               element of each branch is used
    :return: duplicate free list of tuples (doc_name, element1, element2, type, comment),
             sorted by sentence id and token id of element1; elements are the tuples
             created by transform_relations, type is DF, EXCLUSIVE or CONCURRENT
    :raises Exception: if a flow into a gateway starts at an element that is neither an
                       activity, a condition specification nor a gateway
    """
    
    # data format ? -> (doc_name, (a1), (a2), type, comment)
    # split/merge points are represented as directly follow relations 
    relations = []
    
    for i, doc_name in enumerate(pet_reader.document_names):
        
#         if doc_name == 'doc-1.1':
#             continue
            
        # restrict processing to the requested documents (no restriction if doc_names is falsy)
        if doc_names and doc_name not in doc_names:
            continue
            
        # 1) Search for relations using gateways
        doc_relations = pet_reader.relations_dataset.GetRelations(pet_reader.get_document_number(doc_name))
        flow_relations = transform_relations(doc_relations[FLOW])
        same_gateway_relations = transform_relations(doc_relations[SAME_GATEWAY])
        
        print(" FLOW RELATIONS ".center(100, '-'))
        
        # NOTE: this loop reuses the name i of the outer document loop (the outer value is not used)
        for i, f in enumerate(flow_relations):
            print("\n")
            print(i, f[SOURCE], f[TARGET])
            
            # a) DIRECTLY FOLLOWING RELATIONS
            if f[SOURCE][3] == f[TARGET][3] == ACTIVITY:
                relations.append((doc_name, f[SOURCE], f[TARGET], DF, "normal df"))
                
            # b) RELATIONS INVOLVING GATEWAYS
            if f[TARGET][3] in [XOR_GATEWAY, AND_GATEWAY]:
                
                # extract source activity of current flow for linking pairing with following activities of gateway (f[TARGET]) 
                if f[SOURCE][3] == ACTIVITY:
                    source_activities = [f[SOURCE]]
                # if gateways are nested/referring each other -> lookup previous last normal activity recursively
                elif f[SOURCE][3] in [CONDITION_SPECIFICATION, XOR_GATEWAY, AND_GATEWAY]:
                    source_activities = get_last_activities(f, flow_relations)
                    print(f"Nested gateway - transitive last activities: {source_activities}")
                else:
                    raise Exception("Other flow combination!")
                    
                
                gateway = f[TARGET]
                # may be None if no merge point is found (find_next_merge_point then prints a warning)
                gateway_merge_point = find_next_merge_point(gateway, flow_relations)
                print(f"Gateway {gateway} - merge point: {gateway_merge_point}")
                
                # create flows from possible multiple incomes to current gateway (only in case of directly nested gateways)
                # to possible multiple outcomes (normal for gateways)
                
                # extract activities to which the gateway refers

                # - 1) in case of direct entity (activity or further gateway) link without condition and same gateway
                # cases: exclusive 'or' gateways || parallel gateways
                directly_linked_entities = get_linked_entities(gateway, flow_relations)
                # add relations of activities before to (directly linked) gateway activities via DF
                for e in directly_linked_entities:
                    if e[3] == ACTIVITY:
                        for source_activity in source_activities:
                            relations.append((doc_name, source_activity, e, DF, "g -> a")) 
                # add exclusive/concurrent relations between (multiple) activities of branches
                # first create list of activities for each branch
                # (a branch is only extended up to the merge point if whole_branch_pairs is set and a merge point exists)
                activity_branches = [[e] + (get_activities_until_merge_point(e, gateway_merge_point, flow_relations) if whole_branch_pairs and gateway_merge_point else [])
                                     for e in directly_linked_entities]
                # second create connections between all activities of each pair of branches 
                for branchA, branchB in itertools.combinations(activity_branches, 2):
                    for e1, e2 in itertools.product(*[branchA, branchB]):
                        if e1[3] == ACTIVITY and e2[3] == ACTIVITY:  # omit gateways or condition specs
                            relations.append((doc_name, e1, e2, EXCLUSIVE if gateway[3] == XOR_GATEWAY else CONCURRENT, "branches"))



                # - 2) in case of indirect link via condition specification and same gateway relations
                # collects the first element of every branch found via conditions and same gateways
                gateway_branches_entities_directly_linked = []
                condition_spec_linked = get_linked_entities_via_condition(gateway, flow_relations)
                for e in condition_spec_linked:
                    print(e)
                    # NOTE(review): get_linked_entities_via_condition as visible in this file returns
                    # flow targets (tuples) or raises, so the else branch looks unreachable - confirm
                    if e:
                        if e[3] == ACTIVITY:
                            for source_activity in source_activities:
                                relations.append((doc_name, source_activity, e, DF, "g -> cond -> a")) 
                            gateway_branches_entities_directly_linked.append(e)
                    else:
                        print("&&&&&&&& ERROR")

                # detect same gateways and repeat procedure for them
                sg_gateways = get_sg_gateways(gateway, same_gateway_relations)
                for sg_gateway in sg_gateways:
                    print("same gateway", sg_gateway)
                    # directly linked
                    sg_linked_entities = get_linked_entities(sg_gateway, flow_relations)
                    print(sg_linked_entities)
                    for e in sg_linked_entities:
                        if e[3] == ACTIVITY:
                            for source_activity in source_activities:
                                relations.append((doc_name, source_activity, e, DF, "g -> sg -> a"))
                            gateway_branches_entities_directly_linked.append(e)
                    # linked via condition
                    sg_gateway_condition_spec_linked = get_linked_entities_via_condition(sg_gateway, flow_relations)
                    for e in sg_gateway_condition_spec_linked:
                        if e[3] == ACTIVITY:
                            for source_activity in source_activities:
                                relations.append((doc_name, source_activity, e, DF, "g -> sg -> cond -> a"))
                            gateway_branches_entities_directly_linked.append(e)

                # add exclusive/concurrent relations between (multiple) activities of branches
                # first create list of activities for each branch
                activity_branches = [[a] + (get_activities_until_merge_point(a, gateway_merge_point, flow_relations) if whole_branch_pairs and gateway_merge_point else [])
                                     for a in gateway_branches_entities_directly_linked]
                # second create connections between all activities of each pair of branches 
                for branchA, branchB in itertools.combinations(activity_branches, 2):
                    for e1, e2 in itertools.product(*[branchA, branchB]):
                        if e1[3] == ACTIVITY and e2[3] == ACTIVITY:  # omit gateways or condition specs
                            relations.append((doc_name, e1, e2, EXCLUSIVE if gateway[3] == XOR_GATEWAY else CONCURRENT, "branches"))

                # TODO: cases where gateway is at the start of the document
                # TODO: one branch gateways

    # filter duplicates & sort
    # (membership test on a list because the relation tuples contain lists and are not hashable)
    relations_final = []
    for r in relations:
        if r not in relations_final:
            relations_final.append(r)
    # order by position of the first element: sentence id, then token id
    relations_final.sort(key=lambda r: (r[1][0], r[1][1]))

    return relations_final
            


    
# generate the activity relations of a single document and list them one per line
activity_relations = data_generation(doc_names=['doc-1.2'], whole_branch_pairs=True)
print(" RESULTS ".center(100, '-'))
for relation in activity_relations:
    print(relation)
print(f"relations: {len(activity_relations)}")
# check again: doc-1.2; doc-6.4

------------------------------------------ FLOW RELATIONS ------------------------------------------


0 (0, 2, ['brings', 'in'], 'Activity') (0, 10, ['checks'], 'Activity')


1 (0, 10, ['checks'], 'Activity') (0, 14, ['hands', 'out'], 'Activity')


2 (0, 14, ['hands', 'out'], 'Activity') (1, 0, ['If'], 'XOR Gateway')
12
Gateway (1, 0, ['If'], 'XOR Gateway') - merge point: (4, 11, ['tested'], 'Activity')
(3, 11, ['whereas'], 'AND Gateway')
same gateway (1, 14, ['otherwise'], 'XOR Gateway')
[(1, 16, ['takes'], 'Activity')]


3 (1, 0, ['If'], 'XOR Gateway') (1, 1, ['the', 'customer', 'decides', 'that', 'the', 'costs', 'are', 'acceptable'], 'Condition Specification')


4 (1, 1, ['the', 'customer', 'decides', 'that', 'the', 'costs', 'are', 'acceptable'], 'Condition Specification') (3, 11, ['whereas'], 'AND Gateway')
Nested gateway - transitive last activities: [(0, 14, ['hands', 'out'], 'Activity')]
9
DEBUG (3, 5, ['check'], 'Activity')
DEBUG (3, 15, ['checks'], 'Activity')
DEBUG (3, 17, [