In [1]:
# add parent dir to sys path for import of modules
import os
import sys
parentdir = os.path.abspath(os.path.join(os.path.abspath(''), os.pardir))
sys.path.insert(0, parentdir) 

In [9]:
import itertools
import tensorflow as tf
import tensorflow_addons as tfa
import transformers
from transformers import BatchEncoding
import argparse
from typing import Tuple

from sklearn.model_selection import KFold
from petreader.labels import *

from labels import *
from utils import config, generate_args_logdir
from PetReader import pet_reader
from token_approaches.SameGatewayClassifier import SameGatewayClassifier
from token_approaches.token_data_augmentation import get_synonym_samples

In [3]:
seed=42
tf.random.set_seed(seed)
tf.keras.utils.set_random_seed(seed)

In [4]:
_tokenizer = transformers.AutoTokenizer.from_pretrained(config[KEYWORDS_FILTERED_APPROACH][BERT_MODEL_NAME])
assert isinstance(_tokenizer, transformers.PreTrainedTokenizerFast)

In [99]:
def _create_dataset(input_ids: tf.Tensor, attention_masks: tf.Tensor, indexes: tf.Tensor, labels: tf.Tensor)\
        -> tf.data.Dataset:
    return tf.data.Dataset.from_tensor_slices(({'input_ids': input_ids, 'attention_mask': attention_masks, "indexes": indexes},
                                               labels))


def _shuffle_tokenization_data(input_ids: tf.Tensor, attention_masks: tf.Tensor, indexes: tf.Tensor, labels: tf.Tensor) \
                        -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor, tf.Tensor]:
    """
    shuffle tensors of tokenized data; seed for shuffling is seed_general from args
    :return: data tensors in same format but shuffled
    """
    indices = tf.range(start=0, limit=input_ids.shape[0], dtype=tf.int32)
    shuffled_indices = tf.random.shuffle(indices)
    input_ids = tf.gather(input_ids, shuffled_indices)
    attention_masks = tf.gather(attention_masks, shuffled_indices)
    indexes = tf.gather(indexes, shuffled_indices)
    labels = tf.gather(labels, shuffled_indices)
    return input_ids, attention_masks, indexes, labels


def _preprocess_gateway_pairs(gateway_type: str, context_sentences: int = 1, mode: str = CONCAT, n_gram: int = 1,
                              use_synonyms: bool = False) -> Tuple[BatchEncoding, tf.Tensor, tf.Tensor]:
    """
    extract and preprocess gateway pairs
    :param gateway_type: type of gateway to extract data for (XOR_GATEWAY or AND_GATEWAY)
    :param context_sentences: context size = number of sentences before and after first and second gateway to include
    :param mode: flag how to include gateway information (by concatenating n_grams of gateways to text or by indexes)
    :param n_gram: n of n_grams to include from gateways in CONCAT mode
    :param use_synonyms: flag if synonym samples should be included;
    :return: tokens as batch encoding, list of index pairs, list of labels
    """
    # reload from cache if already exists
#     cache_path = os.path.join(ROOT_DIR,
#                               f"data/other/same_gateway_data_{gateway_type}_{context_sentences}_{mode}_{n_gram}")
#     if os.path.exists(cache_path):
#         tokens, indexes, labels = load_pickle(cache_path)
#         logger.info("Reloaded same gateway data from cache")
#         return tokens, indexes, labels

    if use_synonyms:
        synonym_samples = get_synonym_samples()
        synonyms_of_original_samples = get_synonyms_of_original_samples()

    # lists to store results
    texts = []  # context texts
    n_gram_tuples = []  # tuples of gateway n_grams (only necessary for mode=CONCAT)
    indexes = []  # index of gateway tokens in samples -> tuple
    labels = []  # labels (0 or 1)

    # A) GENERATE DATA
    for i, doc_name in enumerate(pet_reader.document_names):
        
        if doc_name != 'doc-2.1':
            continue

        if i % 5 == 0:
            print(f"processed {i} documents")

        # 1) Prepare token data
        text = pet_reader.get_doc_text(doc_name)
        sample_ids = pet_reader.get_doc_sample_ids(doc_name)
        doc_tokens = [list(zip(
            [sample_id for i in range(len(pet_reader.token_dataset.GetTokens(sample_id)))],
            [s_i for i in range(len(pet_reader.token_dataset.GetTokens(sample_id)))],
            [i for i in range(len(pet_reader.token_dataset.GetTokens(sample_id)))],
            pet_reader.token_dataset.GetTokens(sample_id),
            pet_reader.token_dataset.GetNerTagLabels(sample_id))
        ) for s_i, sample_id in enumerate(sample_ids)]
        doc_tokens_flattened = list(itertools.chain(*doc_tokens))
        doc_tokens_flattened = [(i,) + token_tuple for i, token_tuple in enumerate(doc_tokens_flattened)]
        # token represented as tuple: (doc token index, sample id, sentence id, token id, token, ner-tag)

        # 2) Identify gateway pairs
        # filter for B- tokens, because I-s do not mark a new gateway of interest
        gateway_tokens = [token_tuple for token_tuple in doc_tokens_flattened if token_tuple[5] == f"B-{gateway_type}"]
        gateway_pairs = [(gateway_tokens[i], gateway_tokens[i + 1]) for i in range(len(gateway_tokens) - 1)]

        # check if gateways are related
        same_gateway_relations = pet_reader.get_doc_relations(doc_name)[SAME_GATEWAY]
        pair_labels = []  # list of labels if gateway are related (1) or not (0)
        # check if for pair of two subsequent gateways exists a same gateway relation
        for g1, g2 in gateway_pairs:
            same_gateway_found = False
            for same_gateway_relation in same_gateway_relations:
                if not same_gateway_found \
                        and g1[2] == same_gateway_relation[SOURCE_SENTENCE_ID] \
                        and g1[2] == same_gateway_relation[SOURCE_HEAD_TOKEN_ID] \
                        and g2[2] == same_gateway_relation[TARGET_SENTENCE_ID] \
                        and g2[2] == same_gateway_relation[TARGET_HEAD_TOKEN_ID]:
                    pair_labels.append(1)
                    same_gateway_found = True
            if not same_gateway_found:
                pair_labels.append(0)

        # TODO: SYNONYM USAGE
        # important variables so far: doc_tokens_flattened, gateway_pairs
        # approach: multiply data so far if synonyms are involved in document
        # concrete steps:
        # - DONE record before the in the document involved sample IDs
        # - DONE check if synonyms exist for one of the samples
        # - DONE "multiply" document sample by all combinations of synonyms

        # 3) prepare sample data
        def get_token(token_tuple, gateways_sample_infos):
                """
                returns the textual token of the given token tuple considering the different possible samples (normal or synonyms)
                """
                if not gateways_sample_infos:
                    return token_tuple[4]
                
                (g1_sample_id, g1_sample_id_original), (g2_sample_id, g2_sample_id_original) = gateways_sample_infos
                
                # check if both gateways are in same sentence and token is in the sentence
                if g1_sample_id_original == g2_sample_id_original and token_tuple[1] == g1_sample_id_original:
                    
                    # prefer higher id to favor synonym samples (but all will be used once)
                    sample_id_to_choose = max(g1_sample_id, g2_sample_id)
#                      # if sample is original sample, take normal token
#                     if g1_sample_id == g1_sample_id_original:
#                         print("Take token from original sentence", g1_sample_id, g2_sample_id, max(g1_sample_id, g2_sample_id))
#                         return token_tuple[4]
#                     # if not, take token at the same index from synonym sample
#                     else:
                    if sample_id_to_choose >= 500:
                        print("Take token from syn sentence", g1_sample_id, g2_sample_id, max(g1_sample_id, g2_sample_id))
                        return synonym_samples[sample_id_to_choose]['tokens'][token_tuple[3]]
                    else:
                        return token_tuple[4]
                
                # if token is in sentence of first gateway
                elif token_tuple[1] == g1_sample_id_original:
                    
                    # if sample is original sample, take normal token
                    if g1_sample_id == g1_sample_id_original:
                        return token_tuple[4]
                    # if not, take token at the same index from synonym sample
                    else:
                        return synonym_samples[g1_sample_id]['tokens'][token_tuple[3]]
                
                # if token is in sentence of second gateway
                elif token_tuple[1] == g2_sample_id_original:
                    # if sample is original sample, take normal token
                    if g2_sample_id == g2_sample_id_original:
                        return token_tuple[4]
                    # if not, take token at the same index from synonym sample
                    else:
                        return synonym_samples[g2_sample_id]['tokens'][token_tuple[3]]
                    
                # if token is not in scope of gateway sentences but context -> return normal token
                else:
                    return token_tuple[4]
        
        def get_n_gram(token, gateways_sample_infos=None):
            return ' '.join([get_token(token_tuple, gateways_sample_infos)
                             for token_tuple in doc_tokens_flattened[max(token[0] - n_gram, 0):
                                                                     min(token[0] + n_gram + 1, len(doc_tokens_flattened))]])

        for (g1, g2), label in zip(gateway_pairs, pair_labels):
            # Tokens/Text
            num_s = context_sentences
            sentences_in_scope = list(range(g1[2] - num_s if (g1[2] - num_s) > 0 else 0,
                                            g2[2] + num_s + 1 if (g2[2] + num_s + 1) < len(sample_ids) else len(
                                                sample_ids)))
            if not use_synonyms:
                # Tokens/Text
                text_in_scope = ' '.join([token[4] for token in doc_tokens_flattened
                                          if token[2] in sentences_in_scope])
                texts.append((text_in_scope))
                if mode == CONCAT:
                    n_gram_tuples.append((get_n_gram(g1), get_n_gram(g2)))

                # Indexes
                indexes.append((g1[0], g2[0]))

                # Label
                labels.append(label)
                
            else:
                print()
                print("++++++++++++++++++++ New pair", g1, g2)
                print(sentences_in_scope)

                print("Synonyms of sample id for g1:", synonyms_of_original_samples[g1[1]])
                print("Synonyms of sample id for g2:", synonyms_of_original_samples[g2[1]])
                # create lists of tuple of sample ids (normal and synonyms) for each gateway and original sample id
                g1_sample_ids = [(sample_id, g1[1]) for sample_id in [g1[1]] + synonyms_of_original_samples[g1[1]]]
                g2_sample_ids = [(sample_id, g2[1]) for sample_id in [g2[1]] + synonyms_of_original_samples[g2[1]]]
                print(g1_sample_ids)
                print(g2_sample_ids)
                print(list(itertools.product(*[g1_sample_ids, g2_sample_ids])))

                if g1[1] == g2[1]:
                    print("AAA", [(g1[1], g1[1])])
                    print("AAA", [(g1[1], g1[1])] + [(s, g1[1]) for s in synonyms_of_original_samples[g1[1]]])
                    gateway_sample_combinations = itertools.product(*[[(g1[1], g1[1])],
                                                                      [(g1[1], g1[1])] + [(s, g1[1]) for s in synonyms_of_original_samples[g1[1]]]])
                else:
                    g1_sample_ids = [(sample_id, g1[1]) for sample_id in [g1[1]] + synonyms_of_original_samples[g1[1]]]
                    g2_sample_ids = [(sample_id, g2[1]) for sample_id in [g2[1]] + synonyms_of_original_samples[g2[1]]]
                    gateway_sample_combinations = itertools.product(*[g1_sample_ids, g2_sample_ids])

                # iterate over pairs of gateway sentences (multiple possible if synonyms are used)
                for gateways_sample_infos in gateway_sample_combinations:
                    #print(gateways_sample_infos)
                    text_in_scope = ' '.join([get_token(token, gateways_sample_infos) for token in doc_tokens_flattened
                                              if token[2] in sentences_in_scope])

                    texts.append(text_in_scope)
                    if mode == CONCAT:
                        n_gram_tuples.append((get_n_gram(g1, gateways_sample_infos), get_n_gram(g2, gateways_sample_infos)))


                    print(text_in_scope)
                    print(get_n_gram(g1, gateways_sample_infos), '||', get_n_gram(g2, gateways_sample_infos))
                    print('\n' + 100 * '-' + '\n')


                    # Indexes
                    indexes.append((g1[0], g2[0]))

                    # Label
                    labels.append(label)
                
        print(len(labels))
    return

    # B) TOKENIZE TEXT
    if mode == INDEX:
        tokens = _tokenizer(texts, padding=True, return_tensors='tf')
    elif mode == CONCAT:
        # tokenize text & pairs seperately, because it is not possible to concat triple
        text_tokens = _tokenizer(texts, padding=True, return_tensors='tf')
        n_gram_tokens = _tokenizer(n_gram_tuples, padding=True, return_tensors="tf")
        # concat manually after (cut the CLS token of the second pair / n_grams)
        concatted_input_ids = tf.concat([text_tokens["input_ids"], n_gram_tokens["input_ids"][:, 1:]], axis=1)
        concatted_attention_masks = tf.concat([text_tokens["attention_mask"], n_gram_tokens["attention_mask"][:, 1:]],
                                              axis=1)
        tokens = transformers.BatchEncoding(
            {"input_ids": concatted_input_ids, "attention_mask": concatted_attention_masks})
    else:
        raise ValueError(f"mode must be {INDEX} or {CONCAT}")
        
        
_preprocess_gateway_pairs(XOR_GATEWAY, context_sentences=1, mode=CONCAT, n_gram=1, use_synonyms=True)        

INFO:Data Augmentation:Reload synonym_samples from C:\Users\janek\Development\Git\master-thesis\data/other/synonym_samples.pkl
INFO:Data Augmentation:Reload synonym_samples from C:\Users\janek\Development\Git\master-thesis\data/other/synonym_samples.pkl


processed 25 documents

++++++++++++++++++++ New pair (117, 136, 6, 0, 'In', 'B-XOR Gateway') (144, 137, 7, 0, 'In', 'B-XOR Gateway')
[5, 6, 7, 8]
Synonyms of sample id for g1: []
Synonyms of sample id for g2: [802, 803, 804]
[(136, 136)]
[(137, 137), (802, 137), (803, 137), (804, 137)]
[((136, 136), (137, 137)), ((136, 136), (802, 137)), ((136, 136), (803, 137)), ((136, 136), (804, 137))]
An electronic service then determines the significance of the customer based on information that has been collected during the history of the contractual relationship . In case the customer is premium , the process will link to an extra problem fix process ( this process will not be detailed here ) . In case the customer is of certain significance which would affect the counter measures previously decided upon , the process goes back to re-prioritize these measures otherwise the process continues . Taking together the information ( i.e . contract commitment data + prioritized actions ) a detailed pro

In [33]:

def get_synonyms_of_original_samples():
    synonyms = get_synonym_samples()
    synonyms_of_original_samples = {}  # dict with {original sample id: list of synonym ids}
    # record synonyms of sample ids
    for synonym_id, synonym_dict in synonyms.items():
        if synonym_dict['original_sample_number'] in synonyms_of_original_samples:
            synonyms_of_original_samples[synonym_dict['original_sample_number']].append(synonym_id)
        else:
            synonyms_of_original_samples[synonym_dict['original_sample_number']] = [synonym_id]
    # add empty lists for samples without synonyms
    for sample_id in pet_reader.token_dataset.GetRandomizedSampleNumbers():
        if sample_id not in synonyms_of_original_samples:
            synonyms_of_original_samples[sample_id] = []
    return synonyms_of_original_samples

synonyms_of_original_samples = get_synonyms_of_original_samples()
synonyms = get_synonym_samples()
print(str(synonyms_of_original_samples)[:200])

INFO:Data Augmentation:Reload synonym_samples from C:\Users\janek\Development\Git\master-thesis\data/other/synonym_samples.pkl
INFO:Data Augmentation:Reload synonym_samples from C:\Users\janek\Development\Git\master-thesis\data/other/synonym_samples.pkl


{399: [500, 501, 502, 503, 504, 505, 506, 507], 42: [508], 66: [509, 510, 511, 512, 513, 514, 515, 516], 371: [517, 518, 519], 7: [520, 521, 522, 523, 524, 525, 526, 527], 113: [528], 54: [529], 43: [


In [6]:
def create_same_gateway_cls_dataset_cv(args: argparse.Namespace = None, shuffle: bool = True, batch_size: int = None,
                                       context_sentences: int = 1, gateway_type: str = XOR_GATEWAY,
                                       mode: str = CONCAT, n_gram: int = 1):
    tokens, indexes, labels = _preprocess_gateway_pairs(context_sentences=context_sentences, gateway_type=gateway_type,
                                                        mode=mode, n_gram=n_gram)
    input_ids, attention_masks = tokens['input_ids'], tokens['attention_mask']
    if shuffle:
        input_ids, attention_masks, indexes, labels = _shuffle_tokenization_data(input_ids, attention_masks, indexes, labels)
    
    # Define the K-fold Cross Validator
    kfold = KFold(n_splits=5)

    # create folds
    folded_datasets = []
    for train, test in kfold.split(input_ids):
        train_tf_dataset = _create_dataset(tf.gather(input_ids, train),
                                           tf.gather(attention_masks, train),
                                           tf.gather(indexes, train),
                                           tf.gather(labels, train))
        dev_tf_dataset = _create_dataset(tf.gather(input_ids, test),
                                         tf.gather(attention_masks, test),
                                         tf.gather(indexes, test),
                                         tf.gather(labels, test))
        if batch_size:
            train_tf_dataset = train_tf_dataset.batch(batch_size)
            dev_tf_dataset = dev_tf_dataset.batch(batch_size)
        folded_datasets.append((train_tf_dataset, dev_tf_dataset))
    
    return folded_datasets
    
folded_datasets = create_same_gateway_cls_dataset_cv(batch_size=8, context_sentences=1, gateway_type=XOR_GATEWAY, 
                                                     mode=CONCAT, n_gram=1)

for train, dev in folded_datasets:
    print(len(train), len(dev))


processed 0 documents
processed 5 documents
processed 10 documents
processed 15 documents
processed 20 documents
processed 25 documents
processed 30 documents
processed 35 documents
processed 40 documents
8 3
9 2
9 2
9 2
9 2


In [102]:
def create_same_gateway_cls_dataset_full(args: argparse.Namespace = None, shuffle: bool = True, batch_size: int = None,
                                       context_sentences: int = 1, gateway_type: str = XOR_GATEWAY,
                                       mode: str = CONCAT, n_gram: int = 1):
    tokens, indexes, labels = _preprocess_gateway_pairs(context_sentences=context_sentences, gateway_type=gateway_type,
                                                        mode=mode, n_gram=n_gram)
    input_ids, attention_masks = tokens['input_ids'], tokens['attention_mask']
    if shuffle:
        input_ids, attention_masks, indexes, labels = _shuffle_tokenization_data(input_ids, attention_masks, indexes, labels)
    dataset = _create_dataset(input_ids, attention_masks, indexes, labels)
    
    if batch_size:
        dataset = dataset.batch(batch_size)
    return dataset


datasets_full = create_same_gateway_cls_dataset_full(batch_size=None, context_sentences=1, gateway_type=XOR_GATEWAY, 
                                                     mode=INDEX, n_gram=1)
print(len(datasets_full))

processed 0 documents
processed 5 documents
processed 10 documents
processed 15 documents
processed 20 documents
processed 25 documents
processed 30 documents
processed 35 documents
processed 40 documents
81


## Label Distribution

In [107]:
from collections import Counter
labels = [x[1].numpy() for x in datasets_full]
Counter(labels)

Counter({0: 41, 1: 40})

In [24]:
from collections import Counter
print(type(Counter))

<class 'type'>


## Dummy Training

In [11]:
parser = argparse.ArgumentParser()
# Standard params
parser.add_argument("--batch_size", default=8, type=int, help="Batch size.")
parser.add_argument("--epochs", default=1, type=int, help="Number of epochs.")
parser.add_argument("--seed_general", default=42, type=int, help="Random seed.")
# routine params
parser.add_argument("--routine", default="cv", type=str, help="Simple split training 'sp', cross validation 'cv' or "
                                                              "full training without validation 'ft'.")
parser.add_argument("--folds", default=2, type=int, help="Number of folds in cross validation routine.")
parser.add_argument("--store_weights", default=False, type=bool, help="Flag if best weights should be stored.")
# Architecture / data params
parser.add_argument("--context_size", default=1, type=int, help="Number of sentences around to include in text.")
parser.add_argument("--mode", default=CONCAT, type=str, help="How to include gateway information.")
parser.add_argument("--n_gram", default=1, type=int, help="Number of tokens to include for gateway in CONCAT mode.")

args = parser.parse_args([] if "__file__" not in globals() else None)
args.logdir = generate_args_logdir(args)

In [12]:
train_dataset, dev_dataset = folded_datasets[0][0], folded_datasets[0][1]
model = SameGatewayClassifier(args, bert_model=None, mode=CONCAT, train_size=len(train_dataset))
history = model.fit(train_dataset, epochs=3, validation_data=dev_dataset)

Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertModel: ['vocab_transform', 'vocab_layer_norm', 'activation_13', 'vocab_projector']
- This IS expected if you are initializing TFDistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFDistilBertModel were initialized from the model checkpoint at distilbert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


Model: "same_gateway_classifier"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_2 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 input_1 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 tf_distil_bert_model (TFDistil  TFBaseModelOutput(l  66362880   ['input_2[0][0]',                
 BertModel)                     ast_hidden_state=(N               'input_1[0][0]']                
                                one, None, 768),                                                  
                                 hidden_states=None                         

In [22]:
model = SameGatewayClassifier(args, bert_model=None, mode=CONCAT, train_size=len(train_dataset))

optimizer, lr_schedule = transformers.create_optimizer(
    init_lr=2e-5,
    num_train_steps=(len(train_dataset) // args.batch_size) * args.epochs,
    weight_decay_rate=0.01,
    num_warmup_steps=0,
)

model.compile(optimizer=optimizer,
             loss=tf.keras.losses.BinaryCrossentropy(),
             metrics=[tf.keras.metrics.BinaryAccuracy(), tf.keras.metrics.Precision(name="precision"), 
                      tf.keras.metrics.Recall(name="recall")])  # , tfa.metrics.F1Score(num_classes=1)
    
history = model.fit(train_dataset, epochs=3, validation_data=dev_dataset)

Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertModel: ['vocab_transform', 'vocab_layer_norm', 'activation_13', 'vocab_projector']
- This IS expected if you are initializing TFDistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFDistilBertModel were initialized from the model checkpoint at distilbert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


Model: "same_gateway_classifier_9"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_29 (InputLayer)          [(None, None)]       0           []                               
                                                                                                  
 input_28 (InputLayer)          [(None, None)]       0           []                               
                                                                                                  
 tf_distil_bert_model_9 (TFDist  TFBaseModelOutput(l  66362880   ['input_29[0][0]',               
 ilBertModel)                   ast_hidden_state=(N               'input_28[0][0]']               
                                one, None, 768),                                                  
                                 hidden_states=None                       