In [1]:
#!/usr/bin/env python3

# add parent dir to sys path for import of modules
import json
import os
import sys
# find recursively the project root dir
parent_dir = str(os.getcwdb())
while not os.path.exists(os.path.join(parent_dir, "README.md")):
    parent_dir = os.path.abspath(os.path.join(parent_dir, os.pardir))
sys.path.insert(0, parent_dir)

In [2]:
import logging
import os.path
from typing import List, Tuple, Dict
import argparse

from petreader.labels import *
import transformers
import tensorflow as tf
# fix for exception "Attempting to perform BLAS operation using StreamExecutor without BLAS support"
config = tf.compat.v1.ConfigProto(gpu_options=tf.compat.v1.GPUOptions(per_process_gpu_memory_fraction=0.8))
config.gpu_options.allow_growth = True
session = tf.compat.v1.Session(config=config)
tf.compat.v1.keras.backend.set_session(session)

from token_approaches.SameGatewayClassifier import SameGatewayClassifier
from token_approaches.same_gateway_data_preparation import preprocess_gateway_pair
from Ensemble import Ensemble
from token_approaches.KeywordsApproach import KeywordsApproach
from PetReader import pet_reader
from utils import config, set_seeds, NumpyEncoder
from labels import *

logger = logging.getLogger('Keywords Same Gateway Filtered Approach')




INFO:Utilities:Loaded config: {'general-seed': 42, 'keywords-filtered-approach': {'bert-model-name': 'distilbert-base-uncased', 'label-set': 'all', 'label-number': 9, 'other-labels-weight': 0.1, 'num-labels': 9}, 'same-gateway-classifier': {'context_label_length': 350}, 'synonym-samples-start-number': 500}
INFO:PetReader:Reload pet_reader from C:\Users\janek\Development\Git\master-thesis\data/other/pet_reader.pkl
INFO:Data Augmentation:Reload synonym_samples from C:\Users\janek\Development\Git\master-thesis\data/other/synonym_samples.pkl
INFO:Data Augmentation:Reload synonym_samples from C:\Users\janek\Development\Git\master-thesis\data/other/synonym_samples.pkl


In [3]:
parser = argparse.ArgumentParser()
# Standard params
parser.add_argument("--batch_size", default=8, type=int, help="Batch size.")
parser.add_argument("--epochs", default=1, type=int, help="Number of epochs.")
parser.add_argument("--seed_general", default=42, type=int, help="Random seed.")
parser.add_argument("--ensemble", default=True, type=bool, help="Use ensemble learning with config.json seeds.")
parser.add_argument("--seeds_ensemble", default="0-1", type=str, help="Random seed range to use for ensembles")
# routine params
parser.add_argument("--routine", default="cv", type=str, help="Simple split training 'sp', cross validation 'cv' or "
                                                              "full training without validation 'ft'.")
parser.add_argument("--folds", default=2, type=int, help="Number of folds in cross validation routine.")
parser.add_argument("--store_weights", default=False, type=bool, help="Flag if best weights should be stored.")
# Data params
parser.add_argument("--gateway", default=XOR_GATEWAY, type=str, help="Type of gateway to classify")
parser.add_argument("--use_synonyms", default=False, type=str, help="Include synonym samples.")
parser.add_argument("--context_size", default=1, type=int, help="Number of sentences around to include in text.")
parser.add_argument("--mode", default=CONTEXT_NGRAM, type=str, help="How to include gateway information.")
parser.add_argument("--n_gram", default=1, type=int, help="Number of tokens to include for gateway in CONCAT mode.")
parser.add_argument("--activity_masking", default=NOT, type=str, help="How to include activity data.")
# Architecture params
parser.add_argument("--dropout", default=0.2, type=float, help="Dropout rate.")
parser.add_argument("--hidden_layer", default="32", type=str, help="Hidden layer sizes sep. by '-'")
parser.add_argument("--learning_rate", default=2e-5, type=float, help="Learning rate.")
parser.add_argument("--warmup", default=0, type=int, help="Number of warmup steps.")

args = parser.parse_args([] if "__file__" not in globals() else None)

In [15]:
class SameGatewayClassifier(tf.keras.Model):
    """
    binary classification model to classify if two gateways belong to the same gateway construct
    """
    def __init__(self, args: argparse.Namespace, bert_model, train_size: int = None):
        self.args = args

        # A) ARCHITECTURE
        inputs = {
            "input_ids": tf.keras.layers.Input(shape=[None], dtype=tf.int32),
            "attention_mask": tf.keras.layers.Input(shape=[None], dtype=tf.int32),
            "indexes": tf.keras.layers.Input(shape=[2], dtype=tf.int32),
            "context_labels": tf.keras.layers.Input(shape=[config[SAME_GATEWAY_CLASSIFIER][CONTEXT_LABEL_LENGTH]],
                                                    dtype=tf.int32),
        }

        if not bert_model:
            bert_model = transformers.TFAutoModel.from_pretrained(config[KEYWORDS_FILTERED_APPROACH][BERT_MODEL_NAME])
        # includes one dense layer with linear activation function
        bert_output = bert_model({"input_ids": inputs["input_ids"],
                                  "attention_mask": inputs["attention_mask"]}).last_hidden_state
        # extract cls token for every sample
        cls_token = bert_output[:, 0]
        dropout1 = tf.keras.layers.Dropout(args.dropout)(cls_token)

        # for only textual modes add immediately output layers
        if args.mode == CONTEXT_NGRAM or args.mode == N_GRAM:
            predictions = tf.keras.layers.Dense(1, activation=tf.nn.sigmoid)(dropout1)

        # for modes that include more features, combine them with hidden layer(s) with BERT output
        elif args.mode in [CONTEXT_INDEX, CONTEXT_LABELS_NGRAM, CONTEXT_TEXT_AND_LABELS_NGRAM]:
            if args.mode == CONTEXT_INDEX:
                additional_information = inputs["indexes"]
            elif args.mode in [CONTEXT_LABELS_NGRAM, CONTEXT_TEXT_AND_LABELS_NGRAM]:
                additional_information = inputs["context_labels"]
            additional_information = tf.cast(additional_information, tf.float32)
            hidden = tf.keras.layers.Concatenate()([dropout1, additional_information])
            for hidden_layer_size in args.hidden_layer.split("-"):
                hidden = tf.keras.layers.Dense(int(hidden_layer_size), activation=tf.nn.relu)(hidden)
                hidden = tf.keras.layers.Dropout(args.dropout)(hidden)
            predictions = tf.keras.layers.Dense(1, activation=tf.nn.sigmoid)(hidden)

        else:
            raise ValueError(f"mode must be {N_GRAM}, {CONTEXT_INDEX}, {CONTEXT_NGRAM}, {CONTEXT_LABELS_NGRAM} or"
                             f" {CONTEXT_TEXT_AND_LABELS_NGRAM}")

        super().__init__(inputs=inputs, outputs=predictions)

        # B) COMPILE (only needed when training is intended)
        optimizer, lr_schedule = transformers.create_optimizer(
            init_lr=args.learning_rate,
            num_train_steps=(train_size // args.batch_size) * args.epochs,
            weight_decay_rate=0.01,
            num_warmup_steps=args.warmup,
        )

        self.compile(optimizer=optimizer,
                     loss=tf.keras.losses.BinaryCrossentropy(),
                     metrics=[tf.keras.metrics.BinaryAccuracy(),
                              tf.keras.metrics.Precision(name="precision"), tf.keras.metrics.Recall(name="recall")])

        # self.summary()

    def classify_pair(self, doc_name, g1, g2) -> bool:
        """
        create predictions for given data
        :param doc_name: document where gateways belong to
        :param g1: first gateway of pair to evaluate
        :param g2: second gateway of pair to evaluate
        :return: true or false (threshold 0.5 because of binary classification head)
        """
        # preprocess data
        tokens, indexes, context_labels = preprocess_gateway_pair(self.args, doc_name, g1, g2)
        inputs = {
            "input_ids": tokens["input_ids"],
            "attention_mask": tokens["attention_mask"],
            "indexes": indexes,
            "context_labels": context_labels
        }
        p = super().predict(inputs)[0][0]
        result = p > 0.5
        print(p, type(p), result)
        logger.info(f"Custom predict in {doc_name}: {g1} - {g2} -> {result}")
        return result

sgc = SameGatewayClassifier(args, None, train_size=70)

Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertModel: ['vocab_transform', 'activation_13', 'vocab_projector', 'vocab_layer_norm']
- This IS expected if you are initializing TFDistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFDistilBertModel were initialized from the model checkpoint at distilbert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


In [16]:
class KeywordsSGCApproach(KeywordsApproach):
    """
    extend KeywordsApproach by evaluating same gateway relations with model
    """

    def __init__(self, approach_name: str = None, keywords: str = LITERATURE, contradictory_keywords: str = GOLD,
                 same_xor_gateway_threshold: int = 1, multiple_branches_allowed: bool = False,
                 output_format: str = BENCHMARK, output_folder: str = None,
                 xor_rule_c: bool = True, xor_rule_or: bool = True, xor_rule_op: bool = True,
                 # class / ensemble specific params:
                 ensemble_path: str = None, seed_limit: int = None):
        """
        creates new instance of the same gateway relation classification approach
        ---- super class params ----
        :param approach_name: description of approach to use in result folder name; if not set use key word variant
        :param keywords: flag/variant which keywords to use; available: literature, gold, own
        :param same_xor_gateway_threshold: threshold to recognize subsequent (contradictory xor) gateways as same
        :param output_format: output format of extracted element and flows; available: benchmark, pet
        :param output_folder: name of output folder; if none -> create based on approach name
        :param xor_rule_c: flag if rule for detection of contradictory gateways should be applied
        :param xor_rule_or: flag if rule for detection of 'or' gateways should be applied
        :param xor_rule_op: flag if rule for detection of one branch (optional branches) should be applied
        -- class / ensemble params ---
        :param ensemble_path: path of ensemble model to restore weights from;
                              if None, a random initialized model will be used
        :param seed_limit: limit of seeds to reload from the ensemble (in case of OOM errors)
        """
        super().__init__(approach_name=approach_name, keywords=keywords, contradictory_keywords=contradictory_keywords,
                         same_xor_gateway_threshold=same_xor_gateway_threshold,
                         multiple_branches_allowed=multiple_branches_allowed, output_format=output_format,
                         output_folder=output_folder,
                         xor_rule_c=xor_rule_c, xor_rule_or=xor_rule_or, xor_rule_op=xor_rule_op)
        # self.same_gateway_classifier = Ensemble(args=None, model_class=SameGatewayClassifier, ensemble_path=ensemble_path, seed_limit=seed_limit)
        self.same_gateway_classifier = sgc
        set_seeds(config[SEED], "Reset after initialization of SameGatewayClassifierEnsemble")

    def _extract_exclusive_flows(self, doc_activity_tokens: List[List[Tuple[str, int]]],
                                 extracted_gateways: List[List[Tuple[str, int, str]]],
                                 doc_name: str = None) -> Tuple[List[Dict], List[Dict]]:
        """
        extracts sequence flows surrounding exclusive gateways based on rules TODO describe rules
        :param doc_activity_tokens: list of activity tokens (word, idx) for each sentence
        :param extracted_gateways: list of own extracted gateway for each sentence
        :param doc_name: doc_name (used by override of KeywordsSGCApproach)
        :return: list of flow relations as source/target dicts; list of same gateway relations as source/target dicts
        """
        sequence_flows = []
        same_gateway_relations = []

        gateways = self._preprocess_extracted_gateways(extracted_gateways, XOR_GATEWAY)
        gateways_involved = []  # list for gateways already involved into sequence flows
        gateways_involved_contradictory = []  # list for gateways already involved into a contradictory gateway pair

        # RULE 1): check for every pair of following gateways if it fits to a gateway constellation with
        # contradictory key words. Gateways must be in range of same_xor_gateway_threshold sentences, otherwise they
        # would be seen as separate ones
        print(gateways)
        if self.xor_rule_c:
            for g1, g2 in self.extract_same_gateway_pairs_old(doc_name, gateways, gateways_involved, gateways_involved_contradictory):
                gateways_involved.append(g1[ELEMENT])
                gateways_involved.append(g2[ELEMENT])
                gateways_involved_contradictory.append(g1[ELEMENT])
                gateways_involved_contradictory.append(g2[ELEMENT])

                # A) find related activities
                _, pa_g1, fa_g1, _ = self._get_surrounding_activities(g1, doc_activity_tokens)
                _, _, fa_g2, ffa_g2 = self._get_surrounding_activities(g2, doc_activity_tokens)

                # B.1) connect elements to sequence flows
                # check if fol. activities of g1 and g2 are equal -> if yes, the first branch is without activity
                empty_branch = fa_g1[ELEMENT] == fa_g2[ELEMENT]
                # 1) previous activity to first gateway -> split point (if not None because of document start)
                if pa_g1[ELEMENT]:
                    sequence_flows.append(self._merge_flow(pa_g1, g1))
                # 2) gateway 1 to following activity and following activity to activity after gateway (second
                # following of g2) -> merge point
                # if None because of empty branch then directly there
                if not empty_branch and fa_g1[ELEMENT]:  # could be None if at document end
                    sequence_flows.append(self._merge_flow(g1, fa_g1))
                    if ffa_g2[ELEMENT]:  # could be None if at document end
                        sequence_flows.append(self._merge_flow(fa_g1, ffa_g2))
                elif empty_branch and ffa_g2[ELEMENT]:  # could be None if at document end
                    sequence_flows.append(self._merge_flow(g1, ffa_g2))
                # 3) gateway 2 to following activity and following activity to activity after gateway (second
                # following of g2) -> merge point
                if fa_g2[ELEMENT]:  # could be None if at document end
                    sequence_flows.append(self._merge_flow(g2, fa_g2))
                if ffa_g2[ELEMENT]:  # could be None if at document end
                    sequence_flows.append(self._merge_flow(fa_g2, ffa_g2))

                # B.2) same gateway flows
                same_gateway_relations.append(self._merge_flow(g1, g2))

                # log gateway frame for later usage in flow merging of whole document
                closing = fa_g2 if fa_g2[ELEMENT] else g2
                self._log_gateway_frame(g1[ELEMENT][0], g1[ELEMENT][1], g1,
                                        closing[ELEMENT][0], closing[ELEMENT][1], closing)

        return sequence_flows, same_gateway_relations
    
    def extract_same_gateway_pairs_old(self, doc_name, gateways, gateways_involved, gateways_involved_contradictory):
        print("OLD APPROACH")
        same_gateway_pairs = []
        for i in range(len(gateways) - 1):
            g1, g2 = gateways[i], gateways[i + 1]
            # if sentence distances is larger than threshold, reject possible pair
            if abs(g2[ELEMENT][0] - g1[ELEMENT][0]) > self.same_xor_gateway_threshold:
                continue
            # check for every pair of following gateways if it fits to a gateway pair of contradictory key words
            # and check that first gateway is at the beginning of a sentence
            # and check if gateways already matched another pair; possible because of partly same phrase
            for pattern_gateway_1, pattern_gateway_2 in self._contradictory_gateways:
                if g1[ELEMENT][3] == pattern_gateway_1 and g2[ELEMENT][3] == pattern_gateway_2 \
                        and g1[ELEMENT][1] == 0 \
                        and ((g1[ELEMENT] not in gateways_involved_contradictory
                              and g2[ELEMENT] not in gateways_involved_contradictory)
                             or self.multiple_branches_allowed):
                    same_gateway_pairs.append((g1, g2))
        for g1, g2 in same_gateway_pairs:
            print(g1[ELEMENT], ",", g2[ELEMENT])
        return same_gateway_pairs
    
    def extract_same_gateway_pairs_new(self, doc_name, gateways, gateways_involved, gateways_involved_contradictory):
        print("NEW APPROACH")
        same_gateway_pairs = []
        for i in range(len(gateways) - 1):
            g1, g2 = gateways[i], gateways[i + 1]
            if self.same_gateway_classifier.classify_pair(doc_name, g1[ELEMENT], g2[ELEMENT]):
                same_gateway_pairs.append((g1, g2))
        for g1, g2 in same_gateway_pairs:
            print(g1[ELEMENT], ",", g2[ELEMENT])
        return same_gateway_pairs

    
logging.basicConfig(level=logging.INFO)
set_seeds(config[SEED], "Set first seed")
keyword_filtered_approach = KeywordsSGCApproach(approach_name='key_words_literature_sg_classified_{model_params}',
                                                 # three cases to evaluate with filter model
                                                 keywords=LITERATURE,
                                                 # keywords=CUSTOM,
                                                 # keywords=CUSTOM, contradictory_keywords=GOLD, same_xor_gateway_threshold=3, multiple_branches_allowed=True, seed_limit=15,
                                                 ensemble_path="?")
if False:
    keyword_filtered_approach.evaluate_documents(evaluate_token_cls=True, evaluate_relation_extraction=True)

if True:
    doc_name = 'doc-1.1'
    xor_gateways, and_gateways, doc_flows, same_gateway_relations = keyword_filtered_approach.process_document(doc_name)

    print(" Concurrent gateways ".center(50, '-'))
    for gateway in and_gateways:
        print(gateway)

    print(" Exclusive gateways ".center(50, '-'))
    for gateway in xor_gateways:
        print(gateway)
    
    print(" Same gateway relations ".center(50, '-'))
    for sg in same_gateway_relations:
        print(sg)
        
if True:
    test_gateways = [{'element': (2, 9, ['or'], ['or']), 'source': {'source-head-sentence-ID': 2, 'source-head-word-ID': 9, 'source-entity-type': 'XOR Gateway', 'source-entity': ['or']}, 'target': {'target-head-sentence-ID': 2, 'target-head-word-ID': 9, 'target-entity-type': 'XOR Gateway', 'target-entity': ['or']}}, {'element': (6, 0, ['If'], ['if']), 'source': {'source-head-sentence-ID': 6, 'source-head-word-ID': 0, 'source-entity-type': 'XOR Gateway', 'source-entity': ['If']}, 'target': {'target-head-sentence-ID': 6, 'target-head-word-ID': 0, 'target-entity-type': 'XOR Gateway', 'target-entity': ['If']}}, {'element': (7, 0, ['If'], ['if']), 'source': {'source-head-sentence-ID': 7, 'source-head-word-ID': 0, 'source-entity-type': 'XOR Gateway', 'source-entity': ['If']}, 'target': {'target-head-sentence-ID': 7, 'target-head-word-ID': 0, 'target-entity-type': 'XOR Gateway', 'target-entity': ['If']}}, {'element': (10, 0, ['If'], ['if']), 'source': {'source-head-sentence-ID': 10, 'source-head-word-ID': 0, 'source-entity-type': 'XOR Gateway', 'source-entity': ['If']}, 'target': {'target-head-sentence-ID': 10, 'target-head-word-ID': 0, 'target-entity-type': 'XOR Gateway', 'target-entity': ['If']}}, {'element': (10, 6, ['or'], ['or']), 'source': {'source-head-sentence-ID': 10, 'source-head-word-ID': 6, 'source-entity-type': 'XOR Gateway', 'source-entity': ['or']}, 'target': {'target-head-sentence-ID': 10, 'target-head-word-ID': 6, 'target-entity-type': 'XOR Gateway', 'target-entity': ['or']}}]
    keyword_filtered_approach.extract_same_gateway_pairs_new('doc-1.1', test_gateways, [], [])

INFO:Utilities:Set seeds to 42 (caller: Set first seed)
INFO:Utilities:Load keywords 'literature' ...
INFO:Utilities:Loaded 15 XOR and 11 AND keywords (literature)
INFO:Utilities:Used XOR keywords: ['either', 'else', 'if', 'if not', 'in case', 'in case of', 'only', 'only if', 'or', 'otherwise', 'till', 'unless', 'until', 'when', 'whether']
INFO:Utilities:Used AND keywords: ['at the same time', 'concurrently', 'in addition to', 'in parallel', 'in parallel with this', 'in the meantime', 'meantime', 'meanwhile', 'simultaneously', 'whereas', 'while']
INFO:Utilities:Loaded 14 gold pairs of contradictory keywords
INFO:Utilities:Set seeds to 42 (caller: Reset after initialization of SameGatewayClassifierEnsemble)
INFO:Keyword Approach:10 gateway flows
INFO:Keyword Approach:10 gold activity flows
INFO:Keyword Approach:15 doc flows


[{'element': (2, 9, ['or'], ['or']), 'source': {'source-head-sentence-ID': 2, 'source-head-word-ID': 9, 'source-entity-type': 'XOR Gateway', 'source-entity': ['or']}, 'target': {'target-head-sentence-ID': 2, 'target-head-word-ID': 9, 'target-entity-type': 'XOR Gateway', 'target-entity': ['or']}}, {'element': (6, 0, ['If'], ['if']), 'source': {'source-head-sentence-ID': 6, 'source-head-word-ID': 0, 'source-entity-type': 'XOR Gateway', 'source-entity': ['If']}, 'target': {'target-head-sentence-ID': 6, 'target-head-word-ID': 0, 'target-entity-type': 'XOR Gateway', 'target-entity': ['If']}}, {'element': (7, 0, ['If'], ['if']), 'source': {'source-head-sentence-ID': 7, 'source-head-word-ID': 0, 'source-entity-type': 'XOR Gateway', 'source-entity': ['If']}, 'target': {'target-head-sentence-ID': 7, 'target-head-word-ID': 0, 'target-entity-type': 'XOR Gateway', 'target-entity': ['If']}}, {'element': (10, 0, ['If'], ['if']), 'source': {'source-head-sentence-ID': 10, 'source-head-word-ID': 0, 'so

INFO:Keywords Same Gateway Filtered Approach:Custom predict in doc-1.1: (2, 9, ['or'], ['or']) - (6, 0, ['If'], ['if']) -> True


0.5473748 <class 'numpy.float32'> True


INFO:Keywords Same Gateway Filtered Approach:Custom predict in doc-1.1: (6, 0, ['If'], ['if']) - (7, 0, ['If'], ['if']) -> True


0.5678432 <class 'numpy.float32'> True


INFO:Keywords Same Gateway Filtered Approach:Custom predict in doc-1.1: (7, 0, ['If'], ['if']) - (10, 0, ['If'], ['if']) -> True


0.5629023 <class 'numpy.float32'> True


INFO:Keywords Same Gateway Filtered Approach:Custom predict in doc-1.1: (10, 0, ['If'], ['if']) - (10, 6, ['or'], ['or']) -> True


0.5770768 <class 'numpy.float32'> True
(2, 9, ['or'], ['or']) , (6, 0, ['If'], ['if'])
(6, 0, ['If'], ['if']) , (7, 0, ['If'], ['if'])
(7, 0, ['If'], ['if']) , (10, 0, ['If'], ['if'])
(10, 0, ['If'], ['if']) , (10, 6, ['or'], ['or'])
