In [1]:
# add parent dir to sys path for import of modules
import json
import os
import sys
# Find the project root: the closest ancestor of the current working directory
# (including the working directory itself) that contains a README.md, and put
# it first on sys.path so that project modules can be imported.
# os.getcwd() is used on purpose: str(os.getcwdb()) yields the repr of a bytes
# object ("b'/path'"), which is not a usable path.
parent_dir = os.getcwd()
while not os.path.exists(os.path.join(parent_dir, "README.md")):
    next_dir = os.path.abspath(os.path.join(parent_dir, os.pardir))
    if next_dir == parent_dir:
        # filesystem root reached without finding README.md:
        # fall back to the working directory instead of looping forever
        parent_dir = os.getcwd()
        break
    parent_dir = next_dir
sys.path.insert(0, parent_dir)

In [36]:
import logging
import itertools

from petreader.labels import *

from PetReader import pet_reader


# named logger of this notebook
# NOTE(review): the functions visible in this part of the notebook report via print() and do not use it
logger = logging.getLogger('Keywords Same Gateway Filtered Approach')

In [275]:
# relation types assigned to the generated activity pairs
DF = 'directly_following'
EXCLUSIVE = 'exclusive'
CONCURRENT = 'concurrent'

# dictionary keys of a transformed relation (see transform_relations)
SOURCE = 'source'
TARGET = 'target'

# documents that are always skipped by data_generation
doc_black_list = ['doc-6.4']

In [278]:
def transform_relations(relations):
    """
    convert relation records into a compact source/target representation
    :param relations: iterable of relation records that can be indexed with the petreader label keys
    :return: list of dicts with the keys SOURCE and TARGET; each value is a tuple
             (sentence id, head token id, entity, entity type)
    """
    transformed = []
    for relation in relations:
        source_element = (relation[SOURCE_SENTENCE_ID], relation[SOURCE_HEAD_TOKEN_ID],
                          relation[SOURCE_ENTITY], relation[SOURCE_ENTITY_TYPE])
        target_element = (relation[TARGET_SENTENCE_ID], relation[TARGET_HEAD_TOKEN_ID],
                          relation[TARGET_ENTITY], relation[TARGET_ENTITY_TYPE])
        transformed.append({SOURCE: source_element, TARGET: target_element})
    return transformed

def unique_ordered_flows(flows):
    """
    remove duplicate flows (first occurrence wins) and order the rest by the position of their source
    :param flows: list of flows (dicts with SOURCE and TARGET)
    :return: new list of unique flows, sorted by the first two entries of the source
    """
    def source_position(flow):
        return flow[SOURCE][0], flow[SOURCE][1]

    # flows are dicts (not hashable), hence the linear membership test instead of a set
    distinct = []
    for flow in flows:
        if flow in distinct:
            continue
        distinct.append(flow)
    return sorted(distinct, key=source_position)
    

def get_linked_entities(gateway, flow_relations):
    """
    collect the targets of all flows that start at the given gateway
    :param gateway: element whose outgoing flows are searched
    :param flow_relations: list of flows (dicts with SOURCE and TARGET)
    :return: list of target elements in the order of flow_relations
    """
    linked = []
    for flow in flow_relations:
        if flow[SOURCE] == gateway:
            linked.append(flow[TARGET])
    return linked

def get_linked_entities_via_condition(gateway, flow_relations):
    """
    collect the elements that follow the condition specifications of a gateway
    for every flow gateway -> condition specification, the target of the first flow
    leaving that condition specification is returned
    condition specifications without any outgoing flow are skipped (previously an IndexError)
    :param gateway: element whose outgoing flows are searched
    :param flow_relations: list of flows (dicts with SOURCE and TARGET)
    :return: list of elements, one per condition specification that has an outgoing flow
    """
    linked = []
    for flow in flow_relations:
        if flow[SOURCE] != gateway or flow[TARGET][3] != CONDITION_SPECIFICATION:
            continue
        condition = flow[TARGET]
        successors = [f[TARGET] for f in flow_relations if f[SOURCE] == condition]
        if successors:
            # only the first successor of a condition is considered
            linked.append(successors[0])
    return linked

def get_sg_gateways(gateway, sg_relations):
    """
    search for gateways that are related to the given gateway via a same gateway relation
    search is conducted recursively (depth first) to support multi branch gateways (>2 branches)
    every gateway is reported once and the given gateway itself is never part of the result,
    so cyclic same gateway relations do not lead to endless recursion
    :param gateway: gateway to start the search from
    :param sg_relations: list of same gateway relations (dicts with SOURCE and TARGET)
    :return: list of (transitively) related gateways
    """
    def _collect(current, visited):
        # elements contain lists and are therefore not hashable -> visited is a list, not a set
        found = []
        for sg in sg_relations:
            if sg[SOURCE] != current:
                continue
            related = sg[TARGET]
            if related in visited:
                continue
            visited.append(related)
            found.append(related)
            found.extend(_collect(related, visited))
        return found

    return _collect(gateway, [gateway])

def get_following_flows_by_text_structure(element, flow_relations):
    """
    select the flows whose source is located at or after the given element in the text
    position is compared by the first entry (sentence) and then by the second entry (token) of the elements
    :param element: reference element
    :param flow_relations: list of flows (dicts with SOURCE and TARGET)
    :return: list of matching flows in the order of flow_relations
    """
    following = []
    for flow in flow_relations:
        source = flow[SOURCE]
        if source[0] > element[0]:
            # source is in a later sentence
            following.append(flow)
        elif source[0] == element[0] and source[1] >= element[1]:
            # same sentence, same or later token
            following.append(flow)
    return following

def get_following_flows(element, flow_relations):
    """
    collect the flows that follow the given element, unique and ordered by source position
    :param element: reference element
    :param flow_relations: list of flows (dicts with SOURCE and TARGET)
    :return: list of unique flows, see unique_ordered_flows
    """
    # flows following by text structure are the starting point
    collected = list(get_following_flows_by_text_structure(element, flow_relations))

    # check for other links to the element before the element itself
    # NOTE(review): flow[SOURCE] equals element here, so this adds the very same flows again
    # (removed by unique_ordered_flows) - possibly flow[TARGET] was intended, to be confirmed
    for flow in flow_relations:
        if flow[SOURCE] != element:
            continue
        collected += get_following_flows_by_text_structure(flow[SOURCE], flow_relations)

    return unique_ordered_flows(collected)

def get_previous_flows_by_text_structure(element, flow_relations):
    """
    select the flows whose source is located at or before the given element in the text
    position is compared by the first entry (sentence) and then by the second entry (token) of the elements
    :param element: reference element
    :param flow_relations: list of flows (dicts with SOURCE and TARGET)
    :return: list of matching flows in the order of flow_relations
    """
    previous = []
    for flow in flow_relations:
        source = flow[SOURCE]
        if source[0] < element[0]:
            # source is in an earlier sentence
            previous.append(flow)
        elif source[0] == element[0] and source[1] <= element[1]:
            # same sentence, same or earlier token
            previous.append(flow)
    return previous

def get_number_incoming_flows(element, flow_relations):
    """
    count the flows that have the given element as target
    """
    return sum(1 for flow in flow_relations if flow[TARGET] == element)

def get_number_outgoing_flows(element, flow_relations):
    """
    count the flows that have the given element as source
    """
    return sum(1 for flow in flow_relations if flow[SOURCE] == element)

def find_next_merge_point(element, flow_relations):
    """
    find the next flow target that is reached by multiple flows (i.e. merge point) and that
    closes the gateway opened by the given element

    the flows at/after the element (and at/after its directly linked entities) are scanned in
    text order while counting unclosed gateways: the counter starts at 1 for the given element,
    is increased for every XOR/AND gateway target with exactly one incoming flow and decreased
    whenever a target is seen for the second time; the target that brings the counter to 0 is returned
    progress and a warning (if nothing is found) are printed
    :param element: element (gateway in the visible caller) to search the merge point for
    :param flow_relations: list of flows (dicts with SOURCE and TARGET)
    :return: merge point element or None if no merge point was found
    """
    relevant_flows = get_following_flows_by_text_structure(element, flow_relations)
    
    # add flows ongoing from directly linked activities because structure in text can be different than process structure (e.g. doc-1.1 parallel gateway)
    directy_linked_entities = get_linked_entities(element, flow_relations)
    for e in directy_linked_entities:
        relevant_flows.extend(get_following_flows_by_text_structure(e, flow_relations))
    
    # filter for unique flows & order
    relevant_flows = unique_ordered_flows(relevant_flows)
    
    # targets seen so far; a target seen a second time has a second incoming flow among the relevant flows
    next_targets = []
    # the given element itself counts as the first unclosed gateway
    unclosed_gateways = 1
    for f in relevant_flows:
        # another gateway opened that has to be closed first
        # check for incoming flows == 1 because with > 1 gateway is merge point as well
        if f[TARGET][3] in [XOR_GATEWAY, AND_GATEWAY] and get_number_incoming_flows(f[TARGET], flow_relations) == 1:
            unclosed_gateways += 1
            print(f"... opened additionally {f[TARGET]}")
        # disabled experiment: close a gateway when the process ends in the middle (works only for doc-6.4)
#         if get_number_outgoing_flows(f[TARGET], flow_relations) == 0: (works only for doc-6.4)
#             unclosed_gateways -= 1
#             print(f"... closed one because of process end in the middle of the process {f[TARGET]}")
        if f[TARGET] in next_targets:
            # one closing found
            unclosed_gateways -= 1
            print(f"... closed {f[TARGET]}")
            # check if all opened gateways are closed
            if unclosed_gateways == 0:
                return f[TARGET]
        else:
            next_targets.append(f[TARGET])
    print("WARNING: no merge point found")
    return None

def get_activities_until_merge_point(element, next_merge, flow_relations):
    """
    return all elements between given element and next given merge point based on flow relations/connections
    if merge point is None, return all elements until the end
    the given element itself is not part of the result; every element is returned only once
    (in order of first discovery)
    :param element: element to start from
    :param next_merge: merge point element or None
    :param flow_relations: list of flows (dicts with SOURCE and TARGET)
    :return: list of unique elements reachable from element that are located before the merge point
    """
    def _is_before_merge(candidate):
        # without a merge point every target is accepted
        if not next_merge:
            return True
        return (candidate[0] < next_merge[0]
                or (candidate[0] == next_merge[0] and candidate[1] < next_merge[1]))

    def _extend_reachable(reached):
        # one pass over all flows: add the target of each flow whose source is already recorded
        # (targets added during the pass are taken into account for the following flows)
        for f in flow_relations:
            if f[SOURCE] in reached and _is_before_merge(f[TARGET]):
                reached.append(f[TARGET])

    # iterate twice because semantical structure does not always follow textual structure
    # -> in the first run not all elements are captured
    # duplicates are created on the way and filtered at the end
    activities_between = [element]
    _extend_reachable(activities_between)
    # remove start element
    activities_between = activities_between[1:]
    _extend_reachable(activities_between)

    # make unique again (elements contain lists -> not hashable, so no set)
    activities_between_u = []
    for a in activities_between:
        if a not in activities_between_u:
            activities_between_u.append(a)

    return activities_between_u

def get_last_activities(flow, flow_relations):
    """
    search for last (transitively) linked activities (recursively) before current flow
    only flows located at or before the source of the given flow (by text structure) are considered
    :param flow: flow to start reversed search for
    :param flow_relations: set of flows
    :return: list of transitive connected activities; empty if no activity precedes the flow
             (e.g. gateway at the start of the document)
    """
    relevant_flows = get_previous_flows_by_text_structure(flow[SOURCE], flow_relations)
    last_element = flow[SOURCE]

    # a single pass is sufficient: nothing changes between repetitions, so repeating the search
    # until a result exists (as done before) could only loop forever when there is no result
    last_activities = []
    for source_flow in relevant_flows:
        if source_flow[TARGET] != last_element:
            continue
        # a) base case -> activity found
        if source_flow[SOURCE][3] == ACTIVITY:
            last_activities.append(source_flow[SOURCE])
        # b) recursive case -> continue search from flow before
        else:
            last_activities.extend(get_last_activities(source_flow, relevant_flows))

    return last_activities


def data_generation(doc_names, whole_branch_pairs=True):
    """
    derive pairwise activity relations (directly following / exclusive / concurrent) from the
    flow and same gateway relations of the documents provided by pet_reader
    intermediate results are printed
    :param doc_names: names of the documents to process; a falsy value (e.g. empty list) selects all
                      documents; documents in doc_black_list are always skipped
    :param whole_branch_pairs: if True, all elements of a branch until the merge point of the gateway
                               are paired with the elements of the other branches; if False only the
                               first element of each branch
    :return: list of unique tuples (doc_name, a1, a2, type, comment) without pairs of identical
             elements, sorted by the first two entries (sentence id, head token id) of a1 and a2
    :raises Exception: if a flow into a gateway starts at an element type that is not handled
    """
    
    # data format -> (doc_name, (a1), (a2), type, comment)
    # split/merge points are represented as directly follow relations 
    relations = []
    
    # NOTE(review): the index i of this loop is not used and is re-assigned by the inner loops below
    for i, doc_name in enumerate(pet_reader.document_names):
        
#         if doc_name == 'doc-1.1':
#             continue
            
        if (doc_names and doc_name not in doc_names) or doc_name in doc_black_list:
            continue
            
        # 1) Search for relations using gateways
        doc_relations = pet_reader.relations_dataset.GetRelations(pet_reader.get_document_number(doc_name))
        flow_relations = transform_relations(doc_relations[FLOW])
        same_gateway_relations = transform_relations(doc_relations[SAME_GATEWAY])
        
        print(" FLOW RELATIONS ".center(100, '-'))
        
        for i, f in enumerate(flow_relations):
            print("\n")
            print(i, f[SOURCE], f[TARGET])
            
            # a) DIRECTLY FOLLOWING RELATIONS
            if f[SOURCE][3] == f[TARGET][3] == ACTIVITY:
                relations.append((doc_name, f[SOURCE], f[TARGET], DF, "normal df"))
                
            # b) RELATIONS INVOLVING GATEWAYS
            if f[TARGET][3] in [XOR_GATEWAY, AND_GATEWAY]:
                
                # extract source activity of current flow for pairing with following activities of gateway (f[TARGET]) 
                if f[SOURCE][3] == ACTIVITY:
                    source_activities = [f[SOURCE]]
                # if gateways are nested/referring each other -> lookup previous last normal activity recursively
                elif f[SOURCE][3] in [CONDITION_SPECIFICATION, XOR_GATEWAY, AND_GATEWAY]:
                    source_activities = get_last_activities(f, flow_relations)
                    print(f"Nested gateway - transitive last activities: {source_activities}")
                else:
                    raise Exception("Other flow combination!")
                    
                
                gateway = f[TARGET]
                gateway_merge_point = find_next_merge_point(gateway, flow_relations)
                print(f"Gateway {gateway} - merge point: {gateway_merge_point}")
                
                # create flows from possible multiple incomes to current gateway (only in case of directly nested gateways)
                # to possible multiple outcomes (normal for gateways)
                
                # extract activities to which the gateway refers

                
                # - 1) in case of direct entity (activity or further gateway) link without condition and same gateway
                # cases: exclusive 'or' gateways || parallel gateways
                directly_linked_entities = get_linked_entities(gateway, flow_relations)
                # NOTE(review): the following bare expression has no effect inside a function (leftover of cell inspection)
                directly_linked_entities
                # add relations of activities before to (directly linked) gateway activities via DF
                for e in directly_linked_entities:
                    if e[3] == ACTIVITY:
                        for source_activity in source_activities:
                            relations.append((doc_name, source_activity, e, DF, "g -> a"))
                # add exclusive/concurrent relations between (multiple) activities of branches
                # first create list of activities for each branch
                activity_branches1 = ([[e] + (get_activities_until_merge_point(e, gateway_merge_point, flow_relations) if whole_branch_pairs else [])
                                         for e in directly_linked_entities])

                # second create connections between all activities of each pair of branches 
                if activity_branches1:
                    print("-- Simple Branches (direct links)")
                    for i, b in enumerate(activity_branches1):
                        print('-', i, b)
                    
                    for branchA, branchB in itertools.combinations(activity_branches1, 2):
                        for e1, e2 in itertools.product(*[branchA, branchB]):
                            if e1[3] == ACTIVITY and e2[3] == ACTIVITY:  # omit gateways or condition specs
                                relations.append((doc_name, e1, e2, EXCLUSIVE if gateway[3] == XOR_GATEWAY else CONCURRENT, "branches"))



                # - 2) in case of indirect link via condition specification
                gateway_branches_entities_directly_linked = []
                condition_spec_linked = get_linked_entities_via_condition(gateway, flow_relations)
                for e in condition_spec_linked:
                    if e[3] == ACTIVITY:
                        for source_activity in source_activities:
                            relations.append((doc_name, source_activity, e, DF, "g -> cond -> a")) 
                        gateway_branches_entities_directly_linked.append(e)
                    # not activity is linked, but other (gateway, cond) from which following activities will be included as well
                    else:
                        gateway_branches_entities_directly_linked.append(e)
                        
                # add exclusive/concurrent relations between (multiple) activities of branches
                # first create list of activities for each branch
                activity_branches2 = ([[e] + (get_activities_until_merge_point(e, gateway_merge_point, flow_relations) if whole_branch_pairs else [])
                                         for e in gateway_branches_entities_directly_linked])
                if activity_branches2:
                    print("-- Condition Branches (indirect links)")
                    for i, b in enumerate(activity_branches2):
                        print('-', i, b)

                    # second create connections between all activities of each pair of branches 
                    for branchA, branchB in itertools.combinations(activity_branches2, 2):
                        for e1, e2 in itertools.product(*[branchA, branchB]):
                            if e1[3] == ACTIVITY and e2[3] == ACTIVITY:  # omit gateways or condition specs
                                relations.append((doc_name, e1, e2, EXCLUSIVE if gateway[3] == XOR_GATEWAY else CONCURRENT, "branches"))

                # -3) detect same gateways and repeat procedure for them
                # entities linked to the same gateways (directly or via condition) are collected in sg_entities_linked
                sg_entities_linked = []
                sg_gateways = get_sg_gateways(gateway, same_gateway_relations)
                for sg_gateway in sg_gateways:
                    # directly linked
                    sg_linked_entities = get_linked_entities(sg_gateway, flow_relations)
                    print("same gateway", sg_gateway, "linked entities:", sg_linked_entities)
                    for e in sg_linked_entities:
                        if e[3] == ACTIVITY:
                            for source_activity in source_activities:
                                relations.append((doc_name, source_activity, e, DF, "g -> sg -> a"))
#                             gateway_branches_entities_directly_linked.append(e)
                            sg_entities_linked.append(e)
                        # not activity is linked, but other (gateway, cond) from which following activities will be included as well
                        else:
#                             gateway_branches_entities_directly_linked.append(e)
                            sg_entities_linked.append(e)
                    # linked via condition
                    sg_gateway_condition_spec_linked = get_linked_entities_via_condition(sg_gateway, flow_relations)
                    for e in sg_gateway_condition_spec_linked:
                        if e[3] == ACTIVITY:
                            for source_activity in source_activities:
                                relations.append((doc_name, source_activity, e, DF, "g -> sg -> cond -> a"))
#                             gateway_branches_entities_directly_linked.append(e)
                            sg_entities_linked.append(e)
                        # not activity is linked, but other (gateway, cond) from which following activities will be included as well
                        else:
#                             gateway_branches_entities_directly_linked.append(e)
                            sg_entities_linked.append(e)

               

                # add exclusive/concurrent relations of same gateway relations
                sg_activity_branches = ([[e] + (get_activities_until_merge_point(e, gateway_merge_point, flow_relations) if whole_branch_pairs else [])
                                         for e in sg_entities_linked])
                # the direct branches of step 1 are combined with the same gateway branches; pairs among the
                # direct branches are therefore generated a second time and removed by the duplicate filter below
                # NOTE(review): the condition branches of step 2 (activity_branches2) are not part of this combination - confirm intended
                sg_activity_branches = activity_branches1 + sg_activity_branches
                if sg_activity_branches:
                    print("-- (All) Same Gateway Branches")
                    for i, b in enumerate(sg_activity_branches):
                        print('-', i, b)
                    
                    # second create connections between all activities of each pair of branches 
                    for branchA, branchB in itertools.combinations(sg_activity_branches, 2):
                        for e1, e2 in itertools.product(*[branchA, branchB]):
                            if e1[3] == ACTIVITY and e2[3] == ACTIVITY:  # omit gateways or condition specs
                                relations.append((doc_name, e1, e2, EXCLUSIVE if gateway[3] == XOR_GATEWAY else CONCURRENT, "branches"))
                                
                # TODO: cases where gateway is at the start of the document
                # TODO: one branch gateways

    # filter duplicates & sort
    # (relations contain lists and are not hashable -> linear membership test; pairs of identical elements are dropped)
    relations_final = []
    for r in relations:
        if r not in relations_final and r[1] != r[2]:
            relations_final.append(r)
    relations_final.sort(key=lambda r: (r[1][0], r[1][1], r[2][0], r[2][1]))

    return relations_final
            


    
# run the relation extraction for a single document and list the resulting relations
activity_relations = data_generation(['doc-1.2'], whole_branch_pairs=True)
print(" RESULTS ".center(100, '-'))
for relation in activity_relations:
    print(relation)
print("relations:", len(activity_relations))
# check again:

------------------------------------------ FLOW RELATIONS ------------------------------------------


0 (0, 2, ['brings', 'in'], 'Activity') (0, 10, ['checks'], 'Activity')


1 (0, 10, ['checks'], 'Activity') (0, 14, ['hands', 'out'], 'Activity')


2 (0, 14, ['hands', 'out'], 'Activity') (1, 0, ['If'], 'XOR Gateway')
... opened additionally (3, 11, ['whereas'], 'AND Gateway')
... closed (4, 11, ['tested'], 'Activity')
... opened additionally (5, 0, ['If'], 'XOR Gateway')
Gateway (1, 0, ['If'], 'XOR Gateway') - merge point: None
-- Simple Branches (direct links)
- 0 [(1, 1, ['the', 'customer', 'decides', 'that', 'the', 'costs', 'are', 'acceptable'], 'Condition Specification'), (3, 11, ['whereas'], 'AND Gateway'), (3, 5, ['check'], 'Activity'), (3, 15, ['checks'], 'Activity'), (3, 17, ['configures'], 'Activity'), (4, 11, ['tested'], 'Activity'), (5, 0, ['If'], 'XOR Gateway'), (5, 1, ['an', 'error', 'is', 'detected'], 'Condition Specification'), (5, 10, ['executed'], 'Activity'), (3, 7, 