In [1]:
# add parent dir to sys path for import of modules
import os
import sys

# find recursively the project root dir
parent_dir = str(os.getcwdb())
while not os.path.exists(os.path.join(parent_dir, "README.md")):
    parent_dir = os.path.abspath(os.path.join(parent_dir, os.pardir))
sys.path.insert(0, parent_dir)

In [75]:
import itertools
import logging
import argparse
import random
from typing import Tuple, List, Dict

import tensorflow as tf
import transformers
from sklearn.model_selection import KFold
from petreader.labels import *
from transformers import BatchEncoding

from labels import *
from utils import config, ROOT_DIR, load_pickle, save_as_pickle, set_seeds
from PetReader import pet_reader
from token_approaches.token_data_augmentation import get_synonym_samples, get_synonyms_of_original_samples

In [3]:
_tokenizer = transformers.AutoTokenizer.from_pretrained(config[KEYWORDS_FILTERED_APPROACH][BERT_MODEL_NAME])
assert isinstance(_tokenizer, transformers.PreTrainedTokenizerFast)

In [27]:
# load synonym data
synonym_samples = get_synonym_samples()
synonyms_of_original_samples = get_synonyms_of_original_samples()

INFO:Data Augmentation:Reload synonym_samples from C:\Users\janek\Development\Git\master-thesis\data/other/synonym_samples.pkl
INFO:Data Augmentation:Reload synonym_samples from C:\Users\janek\Development\Git\master-thesis\data/other/synonym_samples.pkl


## Same Gateway

In [31]:
logger = logging.getLogger('Data Preparation [Same Gateway CLS]')
logging.basicConfig(level=logging.INFO)

parser = argparse.ArgumentParser()
parser.add_argument("--batch_size", default=1, type=int, help="Batch size.")
parser.add_argument("--gateway", default=XOR_GATEWAY, type=str, help="Type of gateway to classify")
parser.add_argument("--use_synonyms", default=True, type=str, help="Include synonym samples.")
parser.add_argument("--activity_masking", default=MULTI_MASK, type=str, help="How to include activity data.")
parser.add_argument("--context_size", default=1, type=int, help="Number of sentences around to include in text.")
parser.add_argument("--mode", default=CONTEXT_TEXT_AND_LABELS_NGRAM, type=str, help="How to include gateway information.")
parser.add_argument("--n_gram", default=1, type=int, help="Number of tokens to include for gateway in CONCAT mode.")

args_sg = parser.parse_args([] if "__file__" not in globals() else None)

In [35]:
def _get_doc_tokens_flattened(doc_name: str) -> Tuple[List[List], List[int]]:
    """
    extract, enrich and flatten tokens of given document
    :param doc_name: doc_name
    :return:
        - list of tuples -> (doc token index, sample id, sentence id, token id, token, ner-tag, #I-tokens)
        - list of sample_ids
    """
    sample_ids = pet_reader.get_doc_sample_ids(doc_name)
    doc_tokens = [list(zip(
        [sample_id for i in range(len(pet_reader.token_dataset.GetTokens(sample_id)))],
        [s_i for i in range(len(pet_reader.token_dataset.GetTokens(sample_id)))],
        [i for i in range(len(pet_reader.token_dataset.GetTokens(sample_id)))],
        pet_reader.token_dataset.GetTokens(sample_id),
        pet_reader.token_dataset.GetNerTagLabels(sample_id))
    ) for s_i, sample_id in enumerate(sample_ids)]
    doc_tokens_flattened = list(itertools.chain(*doc_tokens))
    doc_tokens_flattened = [[i] + list(token_tuple) for i, token_tuple in enumerate(doc_tokens_flattened)]
    
    def get_following_i_tokens(token_index):
        """
        append number of following I- tokens in case of B- token for usage when computing n_grams
        :param token_index: token index
        :return: list of following I- tokens
        """
        following_i_tokens = []
        for token in doc_tokens_flattened[token_index + 1:]:
            if token[5].startswith("I-"):
                following_i_tokens.append(token)
            else:
                break
        return following_i_tokens

    doc_tokens_flattened = [doc_token + [len(get_following_i_tokens(doc_token[0]))]
                            for doc_token in doc_tokens_flattened]
    return doc_tokens_flattened, sample_ids
    
    
def _get_textual_token(token_tuple, gateways_sample_infos):
    """
    returns the textual token of the given token tuple considering the different possible samples (normal or synonyms)
    :param token_tuple: token tuple
    :param gateways_sample_infos: infos about which samples are used for surrounding gateways
    :returns: token
    """
    if not gateways_sample_infos:
        return token_tuple[4]

    (g1_sample_id, g1_sample_id_original), (g2_sample_id, g2_sample_id_original) = gateways_sample_infos

    # check if both gateways are in same sentence and token is in the sentence
    if g1_sample_id_original == g2_sample_id_original and token_tuple[1] == g1_sample_id_original:

        # prefer higher id to favor synonym samples (but all will be used once)
        sample_id_to_choose = max(g1_sample_id, g2_sample_id)
        if sample_id_to_choose >= config[SYNONYM_SAMPLES_START_NUMBER]:
            return synonym_samples[sample_id_to_choose]['tokens'][token_tuple[3]]
        else:
            return token_tuple[4]

    # if token is in sentence of first gateway
    elif token_tuple[1] == g1_sample_id_original:

        # if sample is original sample, take normal token
        if g1_sample_id == g1_sample_id_original:
            return token_tuple[4]
        # if not, take token at the same index from synonym sample
        else:
            return synonym_samples[g1_sample_id]['tokens'][token_tuple[3]]

    # if token is in sentence of second gateway
    elif token_tuple[1] == g2_sample_id_original:
        # if sample is original sample, take normal token
        if g2_sample_id == g2_sample_id_original:
            return token_tuple[4]
        # if not, take token at the same index from synonym sample
        else:
            return synonym_samples[g2_sample_id]['tokens'][token_tuple[3]]

    # if token is not in scope of gateway sentences but context -> return normal token
    else:
        return token_tuple[4]


def _get_n_gram(token, n_gram, doc_tokens_flattened, gateways_sample_infos=None):
    """
    create n gram of a given token
    for gateway elements that consist of multiple tokens, include I- tokens as well by adding token[6] to range
    :param token: token tuple
    :param gateways_sample_infos: infos about which samples are used for surrounding gateways
    :return: textual n-gram
    """
    return ' '.join([_get_textual_token(token_tuple, gateways_sample_infos)
                     for token_tuple in doc_tokens_flattened[max(token[0] - n_gram, 0):
                                                             min(token[0] + n_gram + token[6] + 1,
                                                                 len(doc_tokens_flattened))]])


def _tokenize_textual_features(mode, texts, n_gram_tuples) -> transformers.BatchEncoding:
    """
    create a tokenization with different inputs based on passed mode
    :param mode: architecture variant / mode
    :param texts: texts
    :param n_gram_tuples: n gram tuples
    :return: encoded tokens
    """
    if mode == N_GRAM or mode == CONTEXT_LABELS_NGRAM:
        tokens = _tokenizer(n_gram_tuples, padding=True, return_tensors="tf")
    elif mode == CONTEXT_INDEX:
        tokens = _tokenizer(texts, padding=True, return_tensors='tf')
    elif mode == CONTEXT_NGRAM or mode == CONTEXT_TEXT_AND_LABELS_NGRAM:
        # tokenize text & pairs separately, because it is not possible to concat triple
        text_tokens = _tokenizer(texts, padding=True, return_tensors='tf')
        n_gram_tokens = _tokenizer(n_gram_tuples, padding=True, return_tensors="tf")
        # concat manually after (cut the CLS token of the second pair / n_grams)
        concatted_input_ids = tf.concat([text_tokens["input_ids"], n_gram_tokens["input_ids"][:, 1:]], axis=1)
        concatted_attention_masks = tf.concat([text_tokens["attention_mask"], n_gram_tokens["attention_mask"][:, 1:]],
                                              axis=1)
        tokens = transformers.BatchEncoding(
            {"input_ids": concatted_input_ids, "attention_mask": concatted_attention_masks})
    else:
        raise ValueError(f"mode must be {N_GRAM}, {CONTEXT_INDEX}, {CONTEXT_NGRAM}, {CONTEXT_LABELS_NGRAM} or"
                         f" {CONTEXT_TEXT_AND_LABELS_NGRAM}")

    return tokens


def _pad_context_labels(context_labels: List[int]) -> List[int]:
    """
    pad context labels to static maximum length from config (necessary for passing to dense prediction layer)
    :param context_labels: list of context labels unpadded
    :return: list of context labels padded
    """
    # pad context labels to same fixed length (pad with 0, label for activities = 1, label for other tokens = 2
    max_context = config[SAME_GATEWAY_CLASSIFIER][CONTEXT_LABEL_LENGTH]
    context_labels_padded = [row[:max_context] + [SGC_CONTEXT_LABEL_PADDING for i in range(max_context - len(row))]
                             for row in context_labels]
    return context_labels_padded


def _mask_activities(doc_tokens_flattened: List[List], masking_strategy: str) -> List[List]:
    """
    mask activities with "dummy", most common activity or most common activities (if multiple in one sentence)
    :param doc_tokens_flattened: list of tokens of a document
    :param masking_strategy: how activities should be masked
    :return: list of tokens with masked texts
    """
    found_activities = 0
    for token in doc_tokens_flattened:
        if token[5].endswith(ACTIVITY):
            if masking_strategy == DUMMY:
                masked = 'activity'
            elif masking_strategy == SINGLE_MASK:
                masked = pet_reader.most_common_activities[0]
            elif masking_strategy == MULTI_MASK:
                masked = pet_reader.most_common_activities[found_activities]
            found_activities += 1
            token[4] = masked
    return doc_tokens_flattened

    
def _get_gateway_pairs(gateway_type: str, doc_names: List[str] = []) -> List[Tuple]:
    pairs = []
    for i, doc_name in enumerate(pet_reader.document_names):

        if doc_names and (doc_name not in doc_names):
            continue
            
        if i % 5 == 0:
            print(f"processed {i} documents")

        # 1) Prepare token data
        doc_tokens_flattened, sample_ids = _get_doc_tokens_flattened(doc_name)

        # 2) Identify gateway pairs
        # filter for B- tokens, because I-s do not mark a new gateway of interest
        gateway_tokens = [token_tuple for token_tuple in doc_tokens_flattened if token_tuple[5] == f"B-{gateway_type}"]
        gateway_pairs = [(gateway_tokens[i], gateway_tokens[i + 1]) for i in range(len(gateway_tokens) - 1)]
        
        same_gateway_relations = pet_reader.get_doc_relations(doc_name)[SAME_GATEWAY]
        label = None  # if gateway are related (1) or not (0)
        # check if for pair of two subsequent gateways exists a same gateway relation
        for g1, g2 in gateway_pairs:
            same_gateway_found = False
            for same_gateway_relation in same_gateway_relations:
                if not same_gateway_found \
                        and g1[2] == same_gateway_relation[SOURCE_SENTENCE_ID] \
                        and g1[3] == same_gateway_relation[SOURCE_HEAD_TOKEN_ID] \
                        and g2[2] == same_gateway_relation[TARGET_SENTENCE_ID] \
                        and g2[3] == same_gateway_relation[TARGET_HEAD_TOKEN_ID]:
                    label = 1
                    same_gateway_found = True
            if not same_gateway_found:
                label = 0
        
        for pair in gateway_pairs:
            pairs.append((doc_name, pair, label))
    return pairs


def _create_dataset_sg(input_ids: tf.Tensor, attention_masks: tf.Tensor, indexes: tf.Tensor, context_labels: tf.Tensor,
                       labels: tf.Tensor) -> tf.data.Dataset:
    return tf.data.Dataset.from_tensor_slices(
        ({'input_ids': input_ids, 'attention_mask': attention_masks, "indexes": indexes,
          "context_labels": context_labels}, labels))


def _generate_data_sg(gateway_type, args, pairs=None):
    
    if not pairs:
        pairs = _get_gateway_pairs(gateway_type)
        random.shuffle(pairs)

    
    # reload from cache if already exists
    param_string = "reworked_" + '_'.join([str(p) for p in [gateway_type, args.use_synonyms, args.activity_masking, 
                                                            args.mode, args.context_size, args.n_gram]])
    cache_path = os.path.join(ROOT_DIR, f"data/other/data_cache/same_gateway/same_gateway_data_{param_string}")

    
    # create datasets for k fold cross validation
    folded_datasets = []
        
    kfold = KFold(n_splits=2)
    for i, (train, dev) in enumerate(kfold.split(pairs)):
        train_pairs = [p for i, p in enumerate(pairs) if i in train]
        dev_pairs = [p for i, p in enumerate(pairs) if i in dev]
        logger.info(f"Fold {i} -> {len(train_pairs)} / {len(dev_pairs)}")
        
        cache_path_train, cache_path_dev = f"{cache_path}__fold{i}_train", f"{cache_path}__fold{i}_dev"
        train_tf_dataset = _prepare_dataset_sg(cache_path_train, train_pairs, args.mode, args.use_synonyms, 
                                               args.activity_masking, args.context_size, args.n_gram)
        dev_tf_dataset = _prepare_dataset_sg(cache_path_dev, dev_pairs, args.mode, False, 
                                             args.activity_masking, args.context_size, args.n_gram)
        
        if args.batch_size:
            train_tf_dataset = train_tf_dataset.batch(args.batch_size)
            dev_tf_dataset = dev_tf_dataset.batch(args.batch_size)
        
        folded_datasets.append((train_tf_dataset, dev_tf_dataset))
        
    return folded_datasets
        
        
def _prepare_dataset_sg(cache_path, pairs, mode, use_synonyms=False, activity_masking=NOT, context_sentences=1, n_gram=0):
    
    # reload from cache if already exists
    if os.path.exists(cache_path):
        tokens, indexes, context_labels, labels = load_pickle(cache_path)
        logger.info("Reloaded same gateway data from cache")
        results = (tokens, indexes, context_labels, labels)
    
    else:
    
        # lists to store results
        texts = []  # context texts
        n_gram_tuples = []  # tuples of gateway n_grams (only necessary for mode=context_n_gram)
        indexes = []  # index of gateway tokens in samples -> tuple
        context_labels = []  # list of context token labels
        labels = []  # labels (0 or 1)   

        for i, (doc_name, (g1, g2), label) in enumerate(pairs[:2]):
            doc_tokens_flattened, sample_ids = _get_doc_tokens_flattened(doc_name)
            
            if activity_masking in [DUMMY, SINGLE_MASK, MULTI_MASK]:
                doc_tokens_flattened = _mask_activities(doc_tokens_flattened, activity_masking)
            
            if i % 15 == 0:
                logger.info(f"Process pair {i} of {len(pairs)}")

            # Tokens/Text
            num_s = context_sentences
            sentences_in_scope = list(range(g1[2] - num_s if (g1[2] - num_s) > 0 else 0,
                                            g2[2] + num_s + 1 if (g2[2] + num_s + 1) < len(sample_ids) else len(
                                                sample_ids)))

            def append_not_token_data():
                """
                appending indexes, context_labels and labels of g1/g2 sample to dataset wide lists
                defined for reuse because of normal and synonym mode
                """
                # Indexes
                indexes.append((g1[0], g2[0]))
                # Context token labels
                context_labels.append([SGC_CONTEXT_LABEL_ACTIVITY if token[5] == ACTIVITY
                                       else SGC_CONTEXT_LABEL_OTHER for token in doc_tokens_flattened
                                       if token[2] in sentences_in_scope])
                # Label
                labels.append(label)

            if not use_synonyms:
                # Tokens/Text
                text_in_scope = ' '.join([token[4] for token in doc_tokens_flattened
                                          if token[2] in sentences_in_scope])
                texts.append(text_in_scope)
                if mode in [N_GRAM, CONTEXT_NGRAM, CONTEXT_LABELS_NGRAM, CONTEXT_TEXT_AND_LABELS_NGRAM]:
                    n_gram_tuples.append((_get_n_gram(g1, n_gram, doc_tokens_flattened),
                                          _get_n_gram(g2, n_gram, doc_tokens_flattened)))

                append_not_token_data()

            else:
                # create cartesian product between different samples of sentences that include gateways
                # use for each gateway the sentence itself and optional synonyms
                if g1[1] == g2[1]:
                    gateway_sample_combinations = itertools.product(*[
                        [(g1[1], g1[1])],
                        [(g1[1], g1[1])] + [(s, g1[1]) for s in synonyms_of_original_samples[g1[1]]]])
                else:
                    g1_sample_ids = [(sample_id, g1[1]) for sample_id in [g1[1]] + synonyms_of_original_samples[g1[1]]]
                    g2_sample_ids = [(sample_id, g2[1]) for sample_id in [g2[1]] + synonyms_of_original_samples[g2[1]]]
                    gateway_sample_combinations = itertools.product(*[g1_sample_ids, g2_sample_ids])

                # iterate over pairs of gateway sentences (multiple possible if synonyms are used)
                for gateways_sample_infos in gateway_sample_combinations:
                    text_in_scope = ' '.join([_get_textual_token(token, gateways_sample_infos)
                                              for token in doc_tokens_flattened if token[2] in sentences_in_scope])

                    texts.append(text_in_scope)
                    if mode in [N_GRAM, CONTEXT_NGRAM, CONTEXT_LABELS_NGRAM, CONTEXT_TEXT_AND_LABELS_NGRAM]:
                        n_gram_tuples.append(
                            (_get_n_gram(g1, n_gram, doc_tokens_flattened, gateways_sample_infos),
                             _get_n_gram(g2, n_gram, doc_tokens_flattened, gateways_sample_infos)))

                    append_not_token_data()
                    
        results = (_tokenize_textual_features(mode, texts, n_gram_tuples),
                   tf.constant(indexes),
                   tf.constant(_pad_context_labels(context_labels)),
                   tf.constant(labels))

        # save in cache
        save_as_pickle(results, cache_path)

    return _create_dataset_sg(results[0]["input_ids"], results[0]["attention_mask"], results[1], 
                           results[2], results[3])
    
folded_datasets_sg = _generate_data_sg(XOR_GATEWAY, args_sg, pairs=pairs)
for i, (train, dev) in enumerate(folded_datasets_sg):
    print(f"Fold {i}: train {len(train)} / dev {len(dev)}")

INFO:Data Preparation [Same Gateway CLS]:Fold 0 -> 40 / 41
INFO:Data Preparation [Same Gateway CLS]:Process pair 0 of 40
INFO:Data Preparation [Same Gateway CLS]:Process pair 0 of 41
INFO:Data Preparation [Same Gateway CLS]:Fold 1 -> 41 / 40
INFO:Data Preparation [Same Gateway CLS]:Process pair 0 of 41
INFO:Data Preparation [Same Gateway CLS]:Process pair 0 of 40


Fold 0: train 13 / dev 2
Fold 1: train 37 / dev 2


In [7]:
pairs = _get_gateway_pairs(XOR_GATEWAY)

processed 0 documents
processed 5 documents
processed 10 documents
processed 15 documents
processed 20 documents
processed 25 documents
processed 30 documents
processed 35 documents
processed 40 documents


In [8]:
set_seeds(42)
random.shuffle(pairs)

INFO:Utilities:Set seeds to 42 (caller: None)


## Token Classification

In [54]:
parser = argparse.ArgumentParser()

parser.add_argument("--labels", default=ALL, type=str, help="Label set to use.")
parser.add_argument("--other_labels_weight", default=0.1, type=float, help="Sample weight for non gateway tokens.")
parser.add_argument("--sampling_strategy", default=NORMAL, type=str, help="How to sample samples.")
parser.add_argument("--use_synonyms", default=False, type=str, help="Include synonym samples.")
parser.add_argument("--activity_masking", default=NOT, type=str, help="How to include activity data.")

args_tc = parser.parse_args([] if "__file__" not in globals() else None)

In [89]:
# SAMPLING STRATEGIES -> provide list of sample IDs to use

def _get_sample_ids(strategy: str = None) -> List[int]:
    """
    unified method to get list of samples to include in a dataset; which samples is controlled by strategy parameter
    use use_synonyms=True only with "normal" and "only gateway" strategy
    :param strategy: strategy which samples to include
    :param use_synonyms: flag if synonym samples should be included;
                         WARNING: True will change up/down sampling logic -> DO NOT USE TOGETHER
    :return: list of sample numbers
    """
    all_sample_ids = pet_reader.token_dataset.GetRandomizedSampleNumbers()

    # modify all_sample_ids list based on sampling strategy
    if strategy == NORMAL or strategy is None:
        return all_sample_ids
    elif strategy == UP_SAMPLING:
        return _up_sample_gateway_samples(all_sample_ids)
    elif strategy == DOWN_SAMPLING:
        return _down_sample_other_samples(all_sample_ids)
    elif strategy == ONLY_GATEWAYS:
        return _only_gateway_samples(all_sample_ids)
    else:
        raise ValueError(f"{strategy} is not a valid sampling strategy")

def _up_sample_gateway_samples(all_sample_ids: List[int]) -> List[int]:
    """
    create a (shuffled) list of samples where gateway samples get upsampled to number of samples without gateway
    :return: list of sample ids
    """
    gateway_samples = _only_gateway_samples()
    without_gateway_samples = list(set(all_sample_ids) - set(gateway_samples))

    # sample samples with gateway until number of samples without gateway is reached
    upsampled_gateway_samples = []
    i = 0
    while len(upsampled_gateway_samples) < len(without_gateway_samples):
        upsampled_gateway_samples.append(gateway_samples[i])
        i += 1
        i %= len(gateway_samples)

    up_sampled_samples = without_gateway_samples + upsampled_gateway_samples
    random.seed(CURRENT_USED_SEED)
    random.shuffle(up_sampled_samples)
    return up_sampled_samples


def _down_sample_other_samples(all_sample_ids: List[int]) -> List[int]:
    """
    create a (shuffled) list of samples where samples without gateway get down sampled to the number of samples with
    gateway
    :return: list of sample ids
    """
    gateway_samples = _only_gateway_samples()
    without_gateway_samples = list(set(all_sample_ids) - set(gateway_samples))
    # not all samples without gateway will be included -> shuffle to sample random ones
    random.seed(CURRENT_USED_SEED)
    random.shuffle(without_gateway_samples)

    # sample samples without gateway until number of samples with gateway is reached
    down_sampled_without_gateway_samples = []
    i = 0
    while len(down_sampled_without_gateway_samples) < len(gateway_samples):
        down_sampled_without_gateway_samples.append(without_gateway_samples[i])
        i += 1

    down_sampled_samples = gateway_samples + down_sampled_without_gateway_samples
    random.seed(CURRENT_USED_SEED)
    random.shuffle(down_sampled_samples)
    return down_sampled_samples


def _only_gateway_samples(all_sample_ids: List[int]) -> List[int]:
    """
    return filtered list of samples ids that contain at least one gateway token
    :param use_synonyms: flag if synonym samples should be included
    """
    only_gateway_samples = [s for s in pet_reader.token_dataset.GetRandomizedSampleNumbers()
                            if f"B-{XOR_GATEWAY}" in pet_reader.token_dataset.GetSampleDictWithNerLabels(s)["ner-tags"]
                            or f"B-{AND_GATEWAY}" in pet_reader.token_dataset.GetSampleDictWithNerLabels(s)["ner-tags"]]
    return only_gateway_samples


def _create_dataset(input_ids: tf.Tensor, attention_masks: tf.Tensor, labels: tf.Tensor, sample_weights: tf.Tensor)\
        -> tf.data.Dataset:
    return tf.data.Dataset.from_tensor_slices(({'input_ids': input_ids, 'attention_mask': attention_masks},
                                               labels,
                                               sample_weights))


def _mask_activities(sample_dicts: List[Dict], masking_strategy: str) -> List[Dict]:
    """
    mask activities with "dummy", most common activity or most common activities (if multiple in one sentence)
    :param sample_dicts: list of samples represented as dictionaries (including tokens and ner-tags)
    :param masking_strategy: how activities should be asked
    :return: list of sample dictionaries with masked tokens
    """
    for dictionary in sample_dicts:
        found_activities = 0
        masked_tokens = []
        for token, tag in zip(dictionary["tokens"], dictionary["ner-tags"]):
            if tag.endswith(ACTIVITY):
                if masking_strategy == DUMMY:
                    token = 'activity'
                elif masking_strategy == SINGLE_MASK:
                    token = pet_reader.most_common_activities[0]
                elif masking_strategy == MULTI_MASK:
                    token = pet_reader.most_common_activities[found_activities]
                found_activities += 1
            masked_tokens.append(token)
        dictionary["tokens"] = masked_tokens
    return sample_dicts


def _prepare_data_tc(sample_numbers: List[int], use_synonyms: bool = False, other_labels_weight: float = 0.1, 
                     label_set: str = 'filtered', activity_masking: str = None) \
        -> Tuple[BatchEncoding, tf.Tensor, tf.Tensor, List[List[int]]]:
    """
    create token classification samples from whole PET dataset -> samples (tokens) and their labels and weights for
    usage in a tensorflow dataset
    include either samples from sample_numbers list OR sample samples with sampling_strategy
    :param sample_numbers: list of concrete sample numbers
    :param use_synonyms: flag if synonym samples should be included;
    :param other_labels_weight: sample weight to assign samples with tokens != gateway tokens
    :param label_set: flag if to use all labels ('all') or only gateway labels and one rest label ('filtered')
    :param activity_masking: flag how to use activity data in tokenization
    :return: tokens, labels & weights as tensors, original word ids (2-dim integer list)
    """

    # 1) prepare sample data
    sample_dicts = []
    if use_synonyms:
        # synonym_samples = get_synonym_samples()
        pass
    for sample_number in sample_numbers:
        # in case sample is normal sample
        if sample_number < config[SYNONYM_SAMPLES_START_NUMBER]:
            sample_dicts.append(pet_reader.token_dataset.GetSampleDictWithNerLabels(sample_number))
        # in case sample is synonym sample
        else:
            sample_dicts.append(synonym_samples[sample_number])

    # apply optional activity masking
    if activity_masking in [SINGLE_MASK, MULTI_MASK]:
        sample_dicts = _mask_activities(sample_dicts, activity_masking)

    sample_sentences = [sample_dict['tokens'] for sample_dict in sample_dicts]

    # 2) transform tokens tags into IDs classification
    dataset_tokens = _tokenizer(sample_sentences, is_split_into_words=True, padding=True, return_tensors='tf')
    max_sentence_length = dataset_tokens['input_ids'].shape[1]

    # 3) transform NER token tags into labels for classification
    dataset_labels = []
    dataset_sample_weights = []
    dataset_word_ids = []
    for i, sample_dict in enumerate(sample_dicts):
        # tokenize again every single sample to get access to .word_ids()
        tokenization = _tokenizer(sample_dict['tokens'], is_split_into_words=True,
                                  padding='max_length', max_length=max_sentence_length, return_tensors='tf')
        sample_tokens = _tokenizer.convert_ids_to_tokens(tokenization['input_ids'][0])

        sample_labels = []
        sample_sample_weights = []
        # word index necessary, because one token in PET could be splitted into multiple tokens with tokenizer
        # multiple tokens have all the same word_id -> allows retrieval of the same one NER label from PET tokens
        for token, word_index in zip(sample_tokens, tokenization.word_ids()):
            # set special class for special tokens
            if token in ['[CLS]', '[SEP]', '[PAD]']:
                sample_labels.append(TC_LABEL_OUT_OF_SCOPE)
                sample_sample_weights.append(TC_WEIGHTS_BERT_TOKENS)
            else:
                token_tag = sample_dict['ner-tags'][word_index]
                # XOR
                if token_tag.endswith(XOR_GATEWAY):
                    sample_labels.append(TC_LABEL_XOR)  # 2
                    sample_sample_weights.append(TC_WEIGHTS_GATEWAY_LABELS)
                # AND
                elif token_tag.endswith(AND_GATEWAY):
                    sample_labels.append(TC_LABEL_AND)  # 3
                    sample_sample_weights.append(TC_WEIGHTS_GATEWAY_LABELS)
                else:
                    if label_set == 'filtered':
                        sample_labels.append(TC_LABEL_OTHER)
                        sample_sample_weights.append(other_labels_weight)
                    else:
                        sample_sample_weights.append(other_labels_weight)
                        if token_tag.endswith("O"):
                            sample_labels.append(TC_LABEL_OTHER)
                        elif token_tag.endswith(ACTIVITY):
                            sample_labels.append(TC_LABEL_ACTIVITY)
                        elif token_tag.endswith(ACTIVITY_DATA):
                            sample_labels.append(TC_LABEL_ACTIVITY_DATA)
                        elif token_tag.endswith(ACTOR):
                            sample_labels.append(TC_LABEL_ACTOR)
                        elif token_tag.endswith(FURTHER_SPECIFICATION):
                            sample_labels.append(TC_LABEL_FURTHER_SPECIFICATION)
                        elif token_tag.endswith(CONDITION_SPECIFICATION):
                            sample_labels.append(TC_LABEL_CONDITION_SPECIFICATION)
                        else:
                            raise ValueError("Unexpected token tag:", token_tag)

        dataset_sample_weights.append(sample_sample_weights)
        dataset_labels.append(sample_labels)
        dataset_word_ids.append(tokenization.word_ids())

    dataset_labels = tf.constant(dataset_labels)
    dataset_sample_weights = tf.constant(dataset_sample_weights)
    return dataset_tokens, dataset_labels, dataset_sample_weights, dataset_word_ids


def create_token_cls_dataset_full(args: argparse.Namespace) -> tf.data.Dataset:
    """
    create one training dataset of the whole data without separating a dev set
    :param args: args namespace
    :return: one tensorflow dataset
    """
    logger.info(f"Create full token classification dataset (batch_size={args.batch_size})")
    
    # load samples to include in dataset
    sample_ids = _get_sample_ids(strategy=args.sampling_strategy)
    random.shuffle(sample_ids)
    logger.info(f"Generate token data with params: sampling_strategy={args.sampling_strategy} - use_synonyms={args.use_synonyms}"
                    f" - labels={args.labels} - other_labels_weight={args.other_labels_weight}")
    logger.info(f"Basis are {len(sample_ids)} samples from strategy '{args.sampling_strategy}'")
    
    # include synonyms in samples
    samples_number_old = len(sample_ids)
    if args.use_synonyms:
        synonym_samples = [synonyms for original_sample_id, synonyms in synonyms_of_original_samples.items()
                           if original_sample_id in sample_ids]
        synonym_samples_flattened = [item for sublist in synonym_samples for item in sublist]
        sample_ids += synonym_samples_flattened
        random.shuffle(sample_ids)

    logger.info(f"Final Dataset -> {len(sample_ids)}{f' ({samples_number_old} without syn.)' if args.use_synonyms else ''}")            

    # create data based on number of samples and transform to tf dataset
    tokens, labels, sample_weights, _ = _prepare_data_tc(
        sample_numbers=sample_ids,
        use_synonyms=args.use_synonyms,
        other_labels_weight=args.other_labels_weight,
        label_set=args.labels,
        activity_masking=args.activity_masking
    )
    
    # create and batch tf dataset
    tf_dataset = _create_dataset(tokens["input_ids"], tokens["attention_mask"], labels, sample_weights)
    if args.batch_size:
        tf_dataset = tf_dataset.batch(args.batch_size)        
        
    return tf_dataset


def create_token_cls_dataset_cv(args: argparse.Namespace) -> List[Tuple[tf.data.Dataset, tf.data.Dataset]]:
    """
    create the dataset for token classification with huggingface transformers bert like models
    split into kfolds splits to use for cross validation
    :param args: args namespace
    :return: list of tuples (train, dev) as tf.data.Dataset objects
    """
    logger.info(f"Create token classification cv dataset (folds={args.folds} - batch_size={args.batch_size})")
    # load samples to include in dataset
    sample_ids = _get_sample_ids(strategy=args.sampling_strategy)
    random.shuffle(sample_ids)
    logger.info(f"Generate token data with params: sampling_strategy={args.sampling_strategy} - use_synonyms={args.use_synonyms}"
                    f" - labels={args.labels} - other_labels_weight={args.other_labels_weight}")
    logger.info(f"Basis are {len(sample_ids)} samples from strategy '{args.sampling_strategy}'")
    
    # create datasets for k fold cross validation
    folded_datasets = []
        
    kfold = KFold(n_splits=5)
    for i, (train, dev) in enumerate(kfold.split(sample_ids)):
        
        train_samples = [p for j, p in enumerate(sample_ids) if j in train]
        dev_samples = [p for j, p in enumerate(sample_ids) if j in dev]
        
        # include synonyms in train samples
        train_samples_number_old = len(train_samples)
        if args.use_synonyms:
            train_synonym_samples = [synonyms for original_sample_id, synonyms in synonyms_of_original_samples.items()
                                     if original_sample_id in train_samples]
            train_synonym_samples_flattened = [item for sublist in train_synonym_samples for item in sublist]
            train_samples += train_synonym_samples_flattened
            random.shuffle(train_samples)
        
        logger.info(f"Fold {i} -> {len(train_samples)}{f' ({train_samples_number_old} without syn.)' if args.use_synonyms else ''}"
                            f"/ {len(dev_samples)}")            
        
        # create train data based on number of samples and transform to tf dataset
        tokens, labels, sample_weights, _ = _prepare_data_tc(
            sample_numbers=train_samples,
            use_synonyms=args.use_synonyms,
            other_labels_weight=args.other_labels_weight,
            label_set=args.labels,
            activity_masking=args.activity_masking
        )
        train_tf_dataset = _create_dataset(tokens["input_ids"], tokens["attention_mask"], labels, sample_weights)
        
        # create dev data based on number of samples and transform to tf dataset
        tokens, labels, sample_weights, _ = _prepare_data_tc(
            sample_numbers=dev_samples,
            use_synonyms=False,
            other_labels_weight=args.other_labels_weight,
            label_set=args.labels,
            activity_masking=args.activity_masking
        )
        dev_tf_dataset = _create_dataset(tokens["input_ids"], tokens["attention_mask"], labels, sample_weights)
        
        # batch both datasets
        if args.batch_size:
            train_tf_dataset = train_tf_dataset.batch(args.batch_size)
            dev_tf_dataset = dev_tf_dataset.batch(args.batch_size)
        
        folded_datasets.append((train_tf_dataset, dev_tf_dataset))
        
    return folded_datasets


parser = argparse.ArgumentParser()

parser.add_argument("--batch_size", default=8, type=int, help="Batch size.")
parser.add_argument("--folds", default=2, type=int, help="Number of folds in cross validation routine.")
parser.add_argument("--labels", default=ALL, type=str, help="Label set to use.")
parser.add_argument("--other_labels_weight", default=0.1, type=float, help="Sample weight for non gateway tokens.")
parser.add_argument("--sampling_strategy", default=NORMAL, type=str, help="How to sample samples.")
parser.add_argument("--use_synonyms", default=True, type=str, help="Include synonym samples.")
parser.add_argument("--activity_masking", default=NOT, type=str, help="How to include activity data.")

args_tc = parser.parse_args([] if "__file__" not in globals() else None)

In [91]:
folded_datasets_tc = create_token_cls_dataset_cv(args_tc)
for i, (train, dev) in enumerate(folded_datasets_tc):
    print(f"Fold {i} (batched): train {len(train)} / dev {len(dev)}")

INFO:Data Preparation [Same Gateway CLS]:Create token classification cv dataset (folds=2 - batch_size=8)
INFO:Data Preparation [Same Gateway CLS]:Generate token data with params: sampling_strategy=normal - use_synonyms=True - labels=all - other_labels_weight=0.1
INFO:Data Preparation [Same Gateway CLS]:Basis are 417 samples from strategy 'normal'
INFO:Data Preparation [Same Gateway CLS]:Fold 0 -> 824 (333 without syn.)/ 84
INFO:Data Preparation [Same Gateway CLS]:Fold 1 -> 791 (333 without syn.)/ 84
INFO:Data Preparation [Same Gateway CLS]:Fold 2 -> 753 (334 without syn.)/ 83
INFO:Data Preparation [Same Gateway CLS]:Fold 3 -> 721 (334 without syn.)/ 83
INFO:Data Preparation [Same Gateway CLS]:Fold 4 -> 771 (334 without syn.)/ 83


Fold 0 (batched): train 103 / dev 11
Fold 1 (batched): train 99 / dev 11
Fold 2 (batched): train 95 / dev 11
Fold 3 (batched): train 91 / dev 11
Fold 4 (batched): train 97 / dev 11


In [90]:
full_dataset_tc = create_token_cls_dataset_full(args_tc)
print(f"Full dataset size (batched): {len(full_dataset_tc)}")

INFO:Data Preparation [Same Gateway CLS]:Create full token classification dataset (batch_size=8)
INFO:Data Preparation [Same Gateway CLS]:Generate token data with params: sampling_strategy=normal - use_synonyms=True - labels=all - other_labels_weight=0.1
INFO:Data Preparation [Same Gateway CLS]:Basis are 417 samples from strategy 'normal'
INFO:Data Preparation [Same Gateway CLS]:Final Dataset -> 965 (417 without syn.)


Full dataset size: 121
