In [1]:
# add parent dir to sys path for import of modules
import json
import os
import sys
# find the project root dir: walk upwards from the current working directory
# until a directory that contains README.md is found
# (os.getcwd() instead of str(os.getcwdb()): the latter yields the literal text "b'...'", not a path)
parent_dir = os.getcwd()
while not os.path.exists(os.path.join(parent_dir, "README.md")):
    next_dir = os.path.abspath(os.path.join(parent_dir, os.pardir))
    if next_dir == parent_dir:
        # filesystem root reached without finding README.md -> fail instead of looping forever
        raise FileNotFoundError("Could not locate project root: no README.md found in any parent directory")
    parent_dir = next_dir
sys.path.insert(0, parent_dir)

In [3]:
import logging
import itertools

from petreader.labels import *

from PetReader import pet_reader
from utils import GatewayExtractionException


# module-level logger of this notebook (not referenced by the code visible in this section)
logger = logging.getLogger('Data Generation: Activity Relations')

In [4]:
# relation types emitted by data_generation (4th element of each result tuple)
DF = 'directly_following'
EXCLUSIVE = 'exclusive'
CONCURRENT = 'concurrent'

# entity type of the artificial start node that enrich_doc_start_flow prepends
DOC_START = 'Document Start'

# dictionary keys of a transformed relation (see transform_relations)
SOURCE = 'source'
TARGET = 'target'

# documents that data_generation skips
doc_black_list = ['doc-6.4']

In [31]:
def transform_relations(relations):
    """
    convert raw relation records into dicts of source/target entity tuples
    :param relations: iterable of records indexable by the *_SENTENCE_ID, *_HEAD_TOKEN_ID, *_ENTITY
                      and *_ENTITY_TYPE label keys
    :return: list of {SOURCE: (sentence id, head token id, entity, entity type), TARGET: (...)} dicts
    """
    def as_entity(record, sentence_key, token_key, entity_key, type_key):
        return record[sentence_key], record[token_key], record[entity_key], record[type_key]

    transformed = []
    for record in relations:
        transformed.append({
            SOURCE: as_entity(record, SOURCE_SENTENCE_ID, SOURCE_HEAD_TOKEN_ID, SOURCE_ENTITY, SOURCE_ENTITY_TYPE),
            TARGET: as_entity(record, TARGET_SENTENCE_ID, TARGET_HEAD_TOKEN_ID, TARGET_ENTITY, TARGET_ENTITY_TYPE),
        })
    return transformed

def unique_ordered_flows(flows):
    """
    drop duplicate flows (first occurrence wins) and order the rest by the position of their source
    :param flows: list of flow dicts
    :return: new list sorted by (sentence id, token id) of the flow source
    """
    def source_position(flow):
        return flow[SOURCE][0], flow[SOURCE][1]

    distinct = []
    for flow in flows:
        if flow in distinct:
            continue
        distinct.append(flow)
    # sorted() is stable like list.sort(), so flows with equal position keep their relative order
    return sorted(distinct, key=source_position)

def get_linked_entities(gateway, flow_relations):
    """return the targets of all flows that start at the given gateway, in order of the flows"""
    linked = []
    for flow in flow_relations:
        if flow[SOURCE] == gateway:
            linked.append(flow[TARGET])
    return linked

def get_linked_entities_via_condition(gateway, flow_relations):
    """
    return entities that are linked to the gateway indirectly via a condition specification
    (gateway -> condition specification -> entity)
    for each condition only the target of its first outgoing flow is returned;
    conditions without any outgoing flow are skipped (previously this raised an IndexError)
    :param gateway: gateway entity tuple
    :param flow_relations: list of flow dicts
    :return: list of entity tuples, in order of the gateway -> condition flows
    """
    linked = []
    for flow in flow_relations:
        if flow[SOURCE] != gateway or flow[TARGET][3] != CONDITION_SPECIFICATION:
            continue
        condition = flow[TARGET]
        # target of the first flow that leaves the condition specification, if any
        successor = next((f[TARGET] for f in flow_relations if f[SOURCE] == condition), None)
        if successor is not None:
            linked.append(successor)
    return linked

def get_sg_gateways(gateway, sg_relations):
    """
    collect the gateways connected to the given gateway through same gateway relations
    the chain is followed recursively so that gateways with more than two branches are covered
    :return: list of gateway entities, each followed by the gateways reachable from it
    """
    chained = []
    direct_targets = (relation[TARGET] for relation in sg_relations if relation[SOURCE] == gateway)
    for target in direct_targets:
        chained.append(target)
        chained += get_sg_gateways(target, sg_relations)
    return chained

def get_following_flows_by_text_structure(element, flow_relations):
    """
    return the flows whose source is located at or after the given element in the text
    position is compared by sentence id first and token id second
    """
    def starts_at_or_after(flow):
        if flow[SOURCE][0] > element[0]:
            return True
        return flow[SOURCE][0] == element[0] and flow[SOURCE][1] >= element[1]

    return list(filter(starts_at_or_after, flow_relations))

def get_previous_flows_by_text_structure(element, flow_relations):
    """
    return the flows whose source is located at or before the given element in the text
    position is compared by sentence id first and token id second
    """
    def starts_at_or_before(flow):
        if flow[SOURCE][0] < element[0]:
            return True
        return flow[SOURCE][0] == element[0] and flow[SOURCE][1] <= element[1]

    return list(filter(starts_at_or_before, flow_relations))

def get_number_incoming_flows(element, flow_relations):
    """count the flows that end at the given element"""
    return sum(1 for flow in flow_relations if flow[TARGET] == element)

def get_number_outgoing_flows(element, flow_relations):
    """count the flows that start at the given element"""
    return sum(1 for flow in flow_relations if flow[SOURCE] == element)

def get_merge_point_search_flows(element, flow_relations):
    """
    collect the flows in which a merge point for the given element is searched
    :return: unique flows ordered by source position (see unique_ordered_flows)
    """
    # base: flows located at or after the element in the text
    textual_flows = get_following_flows_by_text_structure(element, flow_relations)

    # order in text can differ from process logic order
    # -> also take the flows located at or after each target of the base flows
    extra_flows = [later_flow
                   for flow in textual_flows
                   for later_flow in get_following_flows_by_text_structure(flow[TARGET], flow_relations)]

    # structure in text can be different than process structure (e.g. doc-1.1 parallel gateway)
    # -> add the flows located at or after the entities directly linked to the element
    for linked_entity in get_linked_entities(element, flow_relations):
        extra_flows.extend(get_following_flows_by_text_structure(linked_entity, flow_relations))

    return unique_ordered_flows(textual_flows + extra_flows)

def find_next_merge_point(element, flow_relations):
    """
    find the next merge point for the given element: the first flow target that is reached a second
    time after every additionally opened gateway has been closed by such a repeated target as well
    every opened/closed gateway is printed; a warning is printed if no merge point is found
    :param element: entity to start the search from
    :param flow_relations: list of flow dicts
    :return: merge point entity or None
    """
    seen_targets = []
    open_gateways = 1  # starts at one open gateway (presumably the given element itself)
    for flow in get_merge_point_search_flows(element, flow_relations):
        target = flow[TARGET]

        # another gateway opened that has to be closed first
        # only gateways with exactly one incoming flow count; with more they are merge points as well
        opens_gateway = (target[3] in [XOR_GATEWAY, AND_GATEWAY]
                         and get_number_incoming_flows(target, flow_relations) == 1)
        if opens_gateway:
            open_gateways += 1
            print(f"... opened additionally {target}")

        # note from the original code: additionally closing a gateway at targets without outgoing
        # flows (process end in the middle of the process) was tried but works only for doc-6.4

        if target not in seen_targets:
            seen_targets.append(target)
            continue

        # target reached a second time -> one closing found
        open_gateways -= 1
        print(f"... closed {target}")
        if open_gateways == 0:
            # all opened gateways are closed
            return target

    print("WARNING: no merge point found")
    return None

def get_following_flows(element, flow_relations):
    """
    return the unique flows whose source is located at or after the given element in the text,
    ordered by source position
    :param element: entity tuple to start from
    :param flow_relations: list of flow dicts
    :return: list of flow dicts
    """
    # The former extra loop collected the flows following f[SOURCE] for every flow with
    # f[SOURCE] == element, i.e. exactly the same flows again, which the de-duplication then
    # removed. It is dropped here; the result is unchanged.
    # NOTE(review): if the intent was to also follow flows leading INTO the element
    # (f[TARGET] == element), that was never implemented - confirm.
    return unique_ordered_flows(get_following_flows_by_text_structure(element, flow_relations))

def get_activities_until_merge_point(element, next_merge, flow_relations):
    """
    return all entities between given element and next given merge point based on flow relations/connections
    if merge point is None, return all entities until the end
    :param element: entity tuple to start from
    :param next_merge: merge point entity tuple or None; only targets located before it in the text
                       (sentence id, then token id) are followed
    :param flow_relations: list of flow dicts
    :return: list of unique entities in order of discovery; the start element is dropped from the front
             (it reappears only if a followed flow leads back to it)
    """
    def is_before_merge(entity):
        if not next_merge:
            return True
        return entity[0] < next_merge[0] or (entity[0] == next_merge[0] and entity[1] < next_merge[1])

    reached = [element]

    def collect_targets():
        # add the target of every flow whose source is already recorded and whose target is before the merge
        for flow in flow_relations:
            if flow[SOURCE] in reached and is_before_merge(flow[TARGET]):
                reached.append(flow[TARGET])

    # iterate twice because semantic structure does not always follow textual structure
    # -> in the first run not all entities are captured; the second run creates duplicates
    collect_targets()
    # remove start element
    reached = reached[1:]
    collect_targets()

    # filter the duplicates and return the unique list
    # (previously the unique list was built but the list with duplicates was returned)
    unique_reached = []
    for entity in reached:
        if entity not in unique_reached:
            unique_reached.append(entity)
    return unique_reached

def get_last_activities(flow, flow_relations):
    """
    search for last (transitively) linked activities (recursively) before current flow
    only flows whose source is located at or before the source of the given flow are considered
    :param flow: flow to start reversed search for
    :param flow_relations: set of flows
    :return: list of transitive connected activities; empty if no activity precedes the flow
             (e.g. the chain leads back to the document start) - previously this case never terminated
    """
    last_element = flow[SOURCE]
    relevant_flows = get_previous_flows_by_text_structure(last_element, flow_relations)

    # A single pass is sufficient: the former `while not last_activities` loop never changed its
    # inputs, so it either finished after one pass or repeated the same empty search forever.
    last_activities = []
    for source_flow in relevant_flows:
        if source_flow[TARGET] != last_element:
            continue
        # a) base case -> activity found
        if source_flow[SOURCE][3] == ACTIVITY:
            last_activities.append(source_flow[SOURCE])
        # b) recursive case -> continue search from flow before
        else:
            last_activities.extend(get_last_activities(source_flow, relevant_flows))

    return last_activities

def enrich_doc_start_flow(flow_relations):
    """
    prepend an artificial flow from a document start node to the source of the first flow
    :param flow_relations: list of flow dicts
    :return: new list starting with the document start flow; empty list if there are no flows
             (previously an empty input raised an IndexError)
    """
    if not flow_relations:
        # no first flow to link the document start to
        return []
    new_first_flow = {SOURCE: (-1, -1, None, DOC_START), TARGET: flow_relations[0][SOURCE]}
    return [new_first_flow] + flow_relations

def filter_merge_point(merge_point, entity_list):
    """return a new list with all entities of entity_list that differ from the merge point"""
    kept = []
    for entity in entity_list:
        if entity != merge_point:
            kept.append(entity)
    return kept

def filter_cond_spec(entity_list):
    """return a new list with all entities of entity_list that are no condition specifications"""
    kept = []
    for entity in entity_list:
        if entity[3] != CONDITION_SPECIFICATION:
            kept.append(entity)
    return kept


def create_branch_relations(doc_name, flow_relations, gateway, merge_point, branch_start_entities):
    """
    create relations between the activities of every pair of branches of a gateway
    the entities of every branch are printed
    :param doc_name: document name stored in each relation
    :param flow_relations: list of flow dicts used to follow each branch
    :param gateway: gateway entity; an XOR gateway yields exclusive relations, anything else concurrent ones
    :param merge_point: merge point entity or None; it is never used as a branch start
    :param branch_start_entities: entities each branch starts with
    :return: list of relations in tuple format (doc_name, entity 1, entity 2, type, "branches")
    """
    # first create the list of entities for each branch
    # the merge point is excluded because it is not part of exclusive/concurrent relations
    branches = []
    for start_entity in filter_merge_point(merge_point, branch_start_entities):
        branches.append([start_entity] + get_activities_until_merge_point(start_entity, merge_point, flow_relations))

    for index, branch in enumerate(branches):
        print(f"Branch {index}: {branch}")

    # second connect all activities of each pair of branches
    relations = []
    for first_branch, second_branch in itertools.combinations(branches, 2):
        for left in first_branch:
            for right in second_branch:
                # omit gateways or condition specs
                if left[3] != ACTIVITY or right[3] != ACTIVITY:
                    continue
                relation_type = EXCLUSIVE if gateway[3] == XOR_GATEWAY else CONCURRENT
                relations.append((doc_name, left, right, relation_type, "branches"))

    return relations


def data_generation(doc_names, whole_branch_pairs=True):
    """
    generate pairwise activity relations (directly following / exclusive / concurrent) per document
    iterates over pet_reader.document_names and processes every document that is not in doc_black_list
    and, if doc_names is non-empty, is contained in doc_names
    prints the processed flows and intermediate findings to stdout
    :param doc_names: names of documents to process; a falsy value (e.g. empty list) selects all documents
    :param whole_branch_pairs: NOTE(review): never read in this function - confirm whether it can be dropped
    :return: list of unique relation tuples (doc_name, source entity, target entity, relation type, comment)
             without self relations, sorted by sentence/token position of source and then target
    :raises GatewayExtractionException: if a flow into a gateway starts at an unexpected entity type
    """
    
    # data format -> (doc_name, (a1), (a2), type, comment)
    # split/merge points are represented as directly follow relations 
    relations = []
    
    for i, doc_name in enumerate(pet_reader.document_names):

        # skip documents that were not requested or that are black listed
        if (doc_names and doc_name not in doc_names) or doc_name in doc_black_list:
            continue
            
        # 1) Search for relations using gateways
        doc_relations = pet_reader.relations_dataset.GetRelations(pet_reader.get_document_number(doc_name))
        flow_relations = transform_relations(doc_relations[FLOW])
        flow_relations = enrich_doc_start_flow(flow_relations)
        same_gateway_relations = transform_relations(doc_relations[SAME_GATEWAY])
        
        # special case: remove last flow for doc-9.5 because this is a whole process loop
        if doc_name == 'doc-9.5':
            flow_relations = flow_relations[:-1]
        
        print(" FLOW RELATIONS ".center(100, '-'))
        
        # NOTE(review): reuses the name i of the outer document loop (the outer i is not read anywhere)
        for i, f in enumerate(flow_relations):
            print("\n")
            print(i, f[SOURCE], f[TARGET])
            
            # a) DIRECTLY FOLLOWING RELATIONS
            if f[SOURCE][3] == f[TARGET][3] == ACTIVITY:
                relations.append((doc_name, f[SOURCE], f[TARGET], DF, "normal df"))
                
            # b) RELATIONS INVOLVING GATEWAYS
            if f[TARGET][3] in [XOR_GATEWAY, AND_GATEWAY]:
                
                # extract source activity of current flow for pairing with following activities of gateway (f[TARGET])
                if f[SOURCE][3] == ACTIVITY:
                    source_activities = [f[SOURCE]]
                # if gateways are nested/referring each other -> lookup previous last normal activity recursively
                elif f[SOURCE][3] in [CONDITION_SPECIFICATION, XOR_GATEWAY, AND_GATEWAY]:
                    source_activities = get_last_activities(f, flow_relations)
                    print(f"Nested gateway - transitive last activities: {source_activities}")
                # if gateway is at document start, no flows from connected activities of gateway to any source activities can be created
                elif f[SOURCE][3] == DOC_START:
                    source_activities = []
                else:
                    raise GatewayExtractionException("Other flow combination!")
                    
                
                gateway = f[TARGET]
                gateway_merge_point = find_next_merge_point(gateway, flow_relations)
                branch_start_entities = []
                print(f"Gateway {gateway} - merge point: {gateway_merge_point}")
                
                # create flows from possible multiple incomes to current gateway (only in case of directly nested gateways)
                # to possible multiple outcomes (normal for gateways)
                
                # extract activities to which the gateway refers

                
                # - 1) in case of direct entity (activity or further gateway) link without condition and same gateway
                # cases: exclusive 'or' gateways || parallel gateways
                directly_linked_entities = get_linked_entities(gateway, flow_relations)
                # NOTE(review): the following bare expression statement has no effect
                directly_linked_entities
                # add relations of activities before to (directly linked) gateway activities via DF
                for e in directly_linked_entities:
                    if e[3] == ACTIVITY:
                        for source_activity in source_activities:
                            relations.append((doc_name, source_activity, e, DF, "g -> a"))
                # merge point and condition specifications do not start a branch
                directly_linked_entities_filtered = filter_merge_point(gateway_merge_point, directly_linked_entities)
                directly_linked_entities_filtered = filter_cond_spec(directly_linked_entities_filtered)
                branch_start_entities.extend(directly_linked_entities_filtered)

                # - 2) in case of indirect link via condition specification or same gateway relations
                gateway_branches_entities_directly_linked = []
                condition_spec_linked = get_linked_entities_via_condition(gateway, flow_relations)
                for e in condition_spec_linked:
                    if e[3] == ACTIVITY:
                        for source_activity in source_activities:
                            relations.append((doc_name, source_activity, e, DF, "g -> cond -> a")) 
                        gateway_branches_entities_directly_linked.append(e)
                    # not activity is linked, but other (gateway, cond) from which following activities will be included as well
                    else:
                        gateway_branches_entities_directly_linked.append(e)
                branch_start_entities.extend(gateway_branches_entities_directly_linked)

                # - 3) detect same gateways and repeat procedure for them
                sg_entities_linked = []
                sg_gateways = get_sg_gateways(gateway, same_gateway_relations)
                for sg_gateway in sg_gateways:
                    # directly linked
                    sg_linked_entities = get_linked_entities(sg_gateway, flow_relations)
                    print("same gateway", sg_gateway, "linked entities:", sg_linked_entities)
                    for e in sg_linked_entities:
                        if e[3] == ACTIVITY:
                            for source_activity in source_activities:
                                relations.append((doc_name, source_activity, e, DF, "g -> sg -> a"))
                            sg_entities_linked.append(e)
                        # not activity is linked, but other gateway from which following activities will be included as well
                        elif e[3] in [XOR_GATEWAY, AND_GATEWAY]:
                            sg_entities_linked.append(e)
                    # linked via condition
                    sg_gateway_condition_spec_linked = get_linked_entities_via_condition(sg_gateway, flow_relations)
                    for e in sg_gateway_condition_spec_linked:
                        if e[3] == ACTIVITY:
                            for source_activity in source_activities:
                                relations.append((doc_name, source_activity, e, DF, "g -> sg -> cond -> a"))
                            sg_entities_linked.append(e)
                        # not activity is linked, but other (gateway, cond) from which following activities will be included as well
                        else:
                            sg_entities_linked.append(e)
                branch_start_entities.extend(sg_entities_linked)
                
                # Create relations between activities of all branches
                relations.extend(create_branch_relations(doc_name, flow_relations, gateway, gateway_merge_point,
                                        branch_start_entities))

    # filter duplicates & sort
    # relations from an entity to itself (r[1] == r[2]) are dropped as well
    relations_final = []
    for r in relations:
        if r not in relations_final and r[1] != r[2]:
            relations_final.append(r)
    # order by sentence/token position of the first entity, then of the second entity
    relations_final.sort(key=lambda r: (r[1][0], r[1][1], r[2][0], r[2][1]))

    return relations_final
            


    
# run the generation for document 'doc-1.1' only and print every resulting relation plus their count
activity_relations = data_generation(['doc-1.1'], whole_branch_pairs=True)
print(" RESULTS ".center(100, '-'))
for relation in activity_relations:
    print(relation)
print("relations:", len(activity_relations))
# check again:

------------------------------------------ FLOW RELATIONS ------------------------------------------


0 (-1, -1, None, 'Document Start') (1, 4, ['receives'], 'Activity')


1 (1, 4, ['receives'], 'Activity') (2, 9, ['or'], 'XOR Gateway')
... opened additionally (9, 0, ['In', 'the', 'meantime'], 'AND Gateway')
... opened additionally (6, 0, ['If'], 'XOR Gateway')
... closed (10, 0, ['If'], 'XOR Gateway')
... closed (10, 0, ['If'], 'XOR Gateway')
Gateway (2, 9, ['or'], 'XOR Gateway') - merge point: None
Branch 0: [(2, 8, ['reject'], 'Activity')]
Branch 1: [(2, 10, ['accept'], 'Activity'), (4, 12, ['informed'], 'Activity'), (9, 0, ['In', 'the', 'meantime'], 'AND Gateway'), (9, 7, ['prepares'], 'Activity'), (5, 3, ['processes'], 'Activity'), (10, 0, ['If'], 'XOR Gateway'), (10, 1, ['the', 'storehouse', 'has', 'successfully', 'reserved', 'or', 'back-ordered', 'every', 'item', 'of', 'the', 'part', 'list'], 'Condition Specification'), (10, 24, ['assembles'], 'Activity'), (11, 5, ['ships'], 'A