#### For error "IOPub message rate exceeded"
- in detail:IOPub message rate exceeded. The notebook server will temporarily stop sending output to the client in order to avoid crashing it. To change this limit, set the config variable `--NotebookApp.iopub_msg_rate_limit`.
- solution: jupyter notebook --NotebookApp.iopub_data_rate_limit=1.0e10

In [1]:
import pandas as pd
import logging
import tensorflow as tf
import mlflow
import random
from pathlib import Path

import json
import matplotlib.pyplot as plt
import dataclass_cli
import dataclasses

import numpy as np
from tqdm import tqdm
from typing import Any, Dict, Tuple, List, Generator, Set
from bs4 import BeautifulSoup

In [2]:
admission_file: Path = Path('data/ADMISSIONS.csv')
admission_df = pd.read_csv(admission_file)
admission_df.head()

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,ADMITTIME,DISCHTIME,DEATHTIME,ADMISSION_TYPE,ADMISSION_LOCATION,DISCHARGE_LOCATION,INSURANCE,LANGUAGE,RELIGION,MARITAL_STATUS,ETHNICITY,EDREGTIME,EDOUTTIME,DIAGNOSIS,HOSPITAL_EXPIRE_FLAG,HAS_CHARTEVENTS_DATA
0,21,22,165315,2196-04-09 12:26:00,2196-04-10 15:54:00,,EMERGENCY,EMERGENCY ROOM ADMIT,DISC-TRAN CANCER/CHLDRN H,Private,,UNOBTAINABLE,MARRIED,WHITE,2196-04-09 10:06:00,2196-04-09 13:24:00,BENZODIAZEPINE OVERDOSE,0,1
1,22,23,152223,2153-09-03 07:15:00,2153-09-08 19:10:00,,ELECTIVE,PHYS REFERRAL/NORMAL DELI,HOME HEALTH CARE,Medicare,,CATHOLIC,MARRIED,WHITE,,,CORONARY ARTERY DISEASE\CORONARY ARTERY BYPASS...,0,1
2,23,23,124321,2157-10-18 19:34:00,2157-10-25 14:00:00,,EMERGENCY,TRANSFER FROM HOSP/EXTRAM,HOME HEALTH CARE,Medicare,ENGL,CATHOLIC,MARRIED,WHITE,,,BRAIN MASS,0,1
3,24,24,161859,2139-06-06 16:14:00,2139-06-09 12:48:00,,EMERGENCY,TRANSFER FROM HOSP/EXTRAM,HOME,Private,,PROTESTANT QUAKER,SINGLE,WHITE,,,INTERIOR MYOCARDIAL INFARCTION,0,1
4,25,25,129635,2160-11-02 02:06:00,2160-11-05 14:55:00,,EMERGENCY,EMERGENCY ROOM ADMIT,HOME,Private,,UNOBTAINABLE,MARRIED,WHITE,2160-11-02 01:01:00,2160-11-02 04:27:00,ACUTE CORONARY SYNDROME,0,1


# Three parts
- Load Configuration
- Key Component
- Run DomainML

# Load Configuration

Following configurations are loaded:
- ExperimentConfig()
- preprocessing.huawei.HuaweiPreprocessorConfig()
- preprocessing.mimic.MimicPreprocessorConfig()
- sequences.SequenceConfig()
- models.ModelConfig()
- models.TextualPaperModelConfig()
- knowledge.KnowledgeConfig()
- refinement.RefinementConfig()

In [3]:
#@dataclass_cli.add
@dataclasses.dataclass
class ExperimentConfig:
    n_epochs: int = 10
    sequence_type: str = "mimic"
    model_type: str = "gram"
    # NOISE
    noise_to_add: float = 0.0
    noise_to_remove: float = 0.0
    attention_weight_reference_file: Path = Path('data/attention_mimic_gram.json')
    attention_noise_to_remove: float = 0.0
    # DATASET GENERATION
    max_data_size: int = -1
    use_dataset_generator: bool = True
    batch_size: int = 32
    multilabel_classification: bool = False
    # using this will cache dataset accross different runs.
    # don't use this if you change settings for creating the dataset!
    dataset_generator_cache_file: str = ""
    # SEEDING
    dataset_shuffle_buffer: int = 1000
    dataset_shuffle_seed: int = 12345
    random_seed: int = 82379498237
    tensorflow_seed: int = 7796

In [4]:
'''
# preprocessing.huawei.HuaweiPreprocessorConfig()
@dataclass_cli.add
@dataclasses.dataclass
class HuaweiPreprocessorConfig:
    aggregated_log_file: Path = Path("data/logs_aggregated_concurrent.csv")
    traces_root_directory: Path = Path("data/concurrent_data/traces/")
    final_log_file: Path = Path("data/huawei.pkl")
    relevant_aggregated_log_columns: List[str] = dataclasses.field(
        default_factory=lambda: [
            "Hostname",
            "log_level",
            "programname",
            "python_module",
            "http_status",
            "http_method",
        ],
    )
    relevant_trace_columns: List[str] = dataclasses.field(
        default_factory=lambda: [
            "Hostname",
            "trace_name",
            "trace_service",
            "python_module",
            "trace_project",
            "payload",
            "etype",
            "http_method",
            "function",
        ],
    )
    use_trace_data: bool = False
    aggregate_per_trace: bool = False
    aggregate_per_max_number: int = -1
    aggregate_per_time_frequency: str = ""
    log_datetime_column_name: str = "@timestamp"
    log_payload_column_name: str = "Payload"
    use_log_hierarchy: bool = False
    fine_drain_log_depth: int = 10
    fine_drain_log_st: float = 0.75
    coarse_drain_log_depth: int = 4
    coarse_drain_log_st: float = 0.2
    drain_log_depths: List[int] = dataclasses.field(default_factory=lambda: [],)
    drain_log_sts: List[float] = dataclasses.field(default_factory=lambda: [],)
    url_column_name: str = "http_url"
    drain_url_depth: int = 10
    drain_url_st: float = 0.5
    add_log_clusters: bool = True
    min_logs_per_trace: int = 2
    min_causality: float = 0.0
    log_only_causality: bool = False
    relevant_log_column: str = "fine_log_cluster_template"
    log_template_file: Path = Path("data/attention_log_templates.csv")
'''

'\n# preprocessing.huawei.HuaweiPreprocessorConfig()\n@dataclass_cli.add\n@dataclasses.dataclass\nclass HuaweiPreprocessorConfig:\n    aggregated_log_file: Path = Path("data/logs_aggregated_concurrent.csv")\n    traces_root_directory: Path = Path("data/concurrent_data/traces/")\n    final_log_file: Path = Path("data/huawei.pkl")\n    relevant_aggregated_log_columns: List[str] = dataclasses.field(\n        default_factory=lambda: [\n            "Hostname",\n            "log_level",\n            "programname",\n            "python_module",\n            "http_status",\n            "http_method",\n        ],\n    )\n    relevant_trace_columns: List[str] = dataclasses.field(\n        default_factory=lambda: [\n            "Hostname",\n            "trace_name",\n            "trace_service",\n            "python_module",\n            "trace_project",\n            "payload",\n            "etype",\n            "http_method",\n            "function",\n        ],\n    )\n    use_trace_data: bool 

In [5]:
# preprocessing.mimic.MimicPreprocessorConfig()
#@dataclass_cli.add
@dataclasses.dataclass
class MimicPreprocessorConfig:
    admission_file: Path = Path("data/ADMISSIONS.csv")
    diagnosis_file: Path = Path("data/DIAGNOSES_ICD.csv")
    hierarchy_file: Path = Path("data/ccs_multi_dx_tool_2015.csv")
    icd9_file: Path = Path("data/icd9.csv")
    use_icd9_data: bool = True
    min_admissions_per_user: int = 2
    sequence_column_name: str = "icd9_code_converted_3digits"
    add_icd9_info_to_sequences: bool = True
    cluster_file: Path = Path("data/icd9_clusters.csv")
    knowlife_file: Path = Path("data/knowlife_dump.tsv")
    umls_file: Path = Path("data/umls.csv")
    umls_api_key: str = ""
    replace_keys: List[str] = dataclasses.field(default_factory=lambda: [],)
    replace_with_keys: List[str] = dataclasses.field(default_factory=lambda: [],)
    replacement_percentages: List[float] = dataclasses.field(
        default_factory=lambda: [],
    )
    replace_columns: List[str] = dataclasses.field(default_factory=lambda: [],)
    prediction_column: str = ""

In [6]:
# sequences.SequenceConfig()
#@dataclass_cli.add
@dataclasses.dataclass
class SequenceConfig:
    test_percentage: float = 0.1  # how much of the data should be used for testing
    random_test_split: bool = True  # if true, split randomly; if false, split after 1-test_percentage datapoints
    random_state: int = 12345  # seed used for random test split
    flatten_x: bool = True  # if true, produces one mulit-hot encoded vector per timestamp;
    flatten_y: bool = True  #       if false, produces multiple (number of features in timestamp) one-hot encoded vectors per timestamp
    max_window_size: int = 10  # max number of timestamps per prediction input
    min_window_size: int = 2  # min number of timestamps per prediction input
    window_overlap: bool = True  # if true, timestamps for different prediction inputs may overlap
    allow_subwindows: bool = False  # if true, all subsequences of a given sequence are used; if false, resembles sliding window approach
    valid_y_features: List[str] = dataclasses.field(
        default_factory=lambda: [],
    )  # if not empty, only these features are used as prediction goals
    remove_empty_y_vecs: bool = True  # if true, removes (x,y) pairs where y is a zero vector
    remove_empty_x_vecs: bool = True  # if true, removes (x) inputs where x is a zero vector
    x_sequence_column_name: str = ""
    y_sequence_column_name: str = ""
    predict_full_y_sequence: bool = False
    predict_full_y_sequence_wide: bool = False

In [7]:
# models.ModelConfig()
#@dataclass_cli.add
@dataclasses.dataclass
class ModelConfig:
    rnn_type: str = "gru"
    rnn_dim: int = 32
    rnn_dropout: float = 0.0
    embedding_dim: int = 16
    attention_dim: int = 16
    base_feature_embeddings_trainable: bool = True
    base_hidden_embeddings_trainable: bool = True
    feature_embedding_initializer: str = "random_uniform"
    feature_embedding_initializer_seed: int = 12345
    hidden_embedding_initializer: str = "random_uniform"
    hidden_embedding_initializer_seed: int = 67890
    distribute_strategy: str = ""
    best_model_metric: str = "val_loss"
    best_model_metric_minimize: bool = True
    early_stopping_epochs: int = 5
    metrics_num_percentiles: int = 5
    final_activation_function: str = "softmax"
    loss: str = "binary_crossentropy"
    optimizer: str = "adam"
    dropout_rate: float = 0.5
    dropout_seed: int = 12345
    kernel_regularizer_type: str = "l2"
    kernel_regularizer_value: float = 0.001
    kernel_regularizer_scope: List[str] = dataclasses.field(
        default_factory=lambda: []
    )

In [8]:
# models.TextualPaperModelConfig()
#@dataclass_cli.add
@dataclasses.dataclass
class TextualPaperModelConfig:
    num_filters: int = 16
    kernel_sizes: List[int] = dataclasses.field(default_factory=lambda: [2, 3, 4],)

In [9]:
# knowledge.KnowledgeConfig()
#@dataclass_cli.add
@dataclasses.dataclass
class KnowledgeConfig:
    add_causality_prefix: bool = False
    file_knowledge: Path = Path("data/file_knowledge.json")
    combined_knowledge_components: List[str] = dataclasses.field(
        default_factory=lambda: ["gram", "text", "causal",],
    )
    build_text_hierarchy: bool = False

In [10]:
# refinement.RefinementConfig()
#@dataclass_cli.add
@dataclasses.dataclass
class RefinementConfig:
    num_refinements: int = 1
    min_edge_weight: float = 0.8
    max_train_examples: int = 10
    refinement_metric: str = "mean_outlier_score"
    refinement_metric_maxrank: int = -1
    max_edges_to_remove: int = 10
    max_refinement_metric: int = -1
    original_file_knowledge: Path = Path("data/gram_original_file_knowledge.json")
    edges_to_add: float = -1
    reference_file_knowledge: Path = Path("data/gram_without_unknowns.json")
    mlflow_dir: str = "mlruns/1/"

In [11]:
def _log_all_configs_to_mlflow():
    for config in [
        ExperimentConfig(),
        #HuaweiPreprocessorConfig(),
        MimicPreprocessorConfig(),
        SequenceConfig(),
        ModelConfig(),
        TextualPaperModelConfig(),
        KnowledgeConfig(),
        RefinementConfig(),
    ]:
        for config_name, config_value in vars(config).items():
            full_config_name = config.__class__.__name__ + config_name
            mlflow.log_param(full_config_name, str(config_value))

# Key Component

- 1 Generate sequences 
- 2 Preprocessing
- 3 Knowledge
- 4 Genarate models
- 5 Analysis 

## 1 Generate sequences

#### 1.1 transformer 
- NextPartialSequenceTransformer 
- NextSequenceTransformer
- TrainTestSplit
- load_sequence_transformer
- SequenceMetadata 
- _SplittedSequence

#### 1.2 generator 
- generate_test
- generate_train

### 1.1 transformer

In [12]:
# from .config import SequenceConfig
from sklearn.model_selection import train_test_split

class SequenceMetadata:
    def __init__(
        self,
        max_x_length,
        max_sequence_length,
        max_features_per_time,
        max_features_per_sequence,
        x_vocab,
        y_vocab,
        full_y_prediction,
    ):
        self.max_x_length: int = max_x_length
        self.max_sequence_length: int = max_sequence_length
        self.max_features_per_time: int = max_features_per_time
        self.max_features_per_sequence: int = max_features_per_sequence
        self.x_vocab: Dict[str, int] = x_vocab
        self.y_vocab: Dict[str, int] = y_vocab
        self.full_y_prediction: bool = full_y_prediction


class TrainTestSplit:
    def __init__(self, train_x, test_x, train_y, test_y, metadata):
        self.train_x: tf.Tensor = train_x
        self.test_x: tf.Tensor = test_x
        self.train_y: tf.Tensor = train_y
        self.test_y: tf.Tensor = test_y
        self.metadata: SequenceMetadata = metadata


class _SplittedSequence:
    def __init__(self):
        self.x: List[List[str]] = []
        self.y: List[List[str]] = []
        self.x_vecs_stacked: tf.Tensor = None
        self.y_vec: tf.Tensor = None


class NextSequenceTransformer:
    """Split Sequences for next sequence prediction."""

    def __init__(
        self, config: SequenceConfig,
    ):
        self.config = config

    def collect_metadata(
        self, sequence_df: pd.DataFrame, sequence_column_name: str
    ) -> SequenceMetadata:
        (x_vocab, y_vocab) = self._generate_vocabs(sequence_df, sequence_column_name)
        max_sequence_length = sequence_df[sequence_column_name].apply(len).max() - 1
        if (
            not self.config.predict_full_y_sequence
            and not self.config.predict_full_y_sequence_wide
        ):
            max_sequence_length = min(self.config.max_window_size, max_sequence_length)
        max_features_per_time = (
            sequence_df[sequence_column_name]
            .apply(
                lambda list: max([len(sublist) for sublist in list])
                if len(list) > 0
                else 0
            )
            .max()
        )
        max_features_per_sequence = max_sequence_length * max_features_per_time

        return SequenceMetadata(
            max_x_length=(
                max_sequence_length
                if self.config.flatten_x
                else max_features_per_sequence
            ),
            max_sequence_length=max_sequence_length,
            max_features_per_time=max_features_per_time,
            max_features_per_sequence=max_features_per_sequence,
            x_vocab=x_vocab,
            y_vocab=y_vocab,
            full_y_prediction=self.config.predict_full_y_sequence,
        )

    def transform_train_test_split(
        self, sequence_df: pd.DataFrame, sequence_column_name: str
    ) -> TrainTestSplit:
        metadata = self.collect_metadata(sequence_df, sequence_column_name)
        train_sequences, test_sequences = self._split_train_test(
            sequence_df, sequence_column_name
        )

        transformed_train_sequences = self._transform_sequences(
            sequences=train_sequences, metadata=metadata
        )
        transformed_test_sequences = self._transform_sequences(
            sequences=test_sequences, metadata=metadata
        )

        return TrainTestSplit(
            train_x=tf.stack(
                [
                    transformed.x_vecs_stacked
                    for transformed in transformed_train_sequences
                ]
            ),
            test_x=tf.stack(
                [
                    transformed.x_vecs_stacked
                    for transformed in transformed_test_sequences
                ]
            ),
            train_y=tf.stack(
                [transformed.y_vec for transformed in transformed_train_sequences]
            ),
            test_y=tf.stack(
                [transformed.y_vec for transformed in transformed_test_sequences]
            ),
            metadata=metadata,
        )

    def _split_train_test(
        self, sequence_df: pd.DataFrame, sequence_column_name: str
    ) -> Tuple[List[List[List[str]]], List[List[List[str]]]]:
        if len(sequence_df) == 1:
            sequence_list = sequence_df[sequence_column_name].tolist()[0]
            logging.debug(
                "Splitting values of df with only one row and %d items as list",
                len(sequence_list),
            )

            test_size = int(self.config.test_percentage * len(sequence_list))
            split_index = len(sequence_list) - test_size
            train_sequence_list = sequence_list[:split_index]
            test_sequence_list = sequence_list[split_index : len(sequence_list)]
            return ([train_sequence_list], [test_sequence_list])
        elif self.config.random_test_split:
            return train_test_split(
                sequence_df[sequence_column_name],
                test_size=self.config.test_percentage,
                random_state=self.config.random_state,
            )
        else:
            test_size = int(self.config.test_percentage * len(sequence_df))
            split_index = len(sequence_df) - test_size
            train_sequence_df = sequence_df[:split_index]
            test_sequence_df = sequence_df[split_index : len(sequence_df)]
            return (
                train_sequence_df[sequence_column_name].tolist(),
                test_sequence_df[sequence_column_name].tolist(),
            )

    def _transform_sequences(
        self, sequences: List[List[List[str]]], metadata: SequenceMetadata
    ) -> List[_SplittedSequence]:
        splitted_sequences = self._split_sequences(sequences)
        resulting_splits = []
        for splitted in tqdm(
            splitted_sequences, desc="Transforming splitted sequences to tensors"
        ):
            self._translate_and_pad(splitted, metadata)
            resulting_splits.append(splitted)

        return resulting_splits

    def _split_sequences(self, sequences: List[List[List[str]]]):
        for sequence in tqdm(sequences, desc="Splitting sequences into x/y windows"):
            splitted_sequences = self._split_sequence(sequence)
            for splitted_sequence in splitted_sequences:
                yield splitted_sequence

    def _split_sequence(
        self, sequence: List[List[str]]
    ) -> Generator[_SplittedSequence, None, None]:
        if self.config.predict_full_y_sequence_wide:
            return self._split_sequence_full_window_wide(sequence)
        elif self.config.predict_full_y_sequence:
            return self._split_sequence_full_window(sequence)
        elif self.config.window_overlap:
            return self._split_sequence_overlap(sequence)
        else:
            return self._split_sequence_no_overlap(sequence)

    def _split_sequence_full_window(
        self, sequence: List[List[str]]
    ) -> Generator[_SplittedSequence, None, None]:
        splitted_sequence = _SplittedSequence()
        splitted_sequence.x = sequence[: len(sequence) - 1]
        splitted_sequence.y = sequence[1 : len(sequence)]
        yield splitted_sequence

    def _split_sequence_full_window_wide(
        self, sequence: List[List[str]]
    ) -> Generator[_SplittedSequence, None, None]:
        for end_index in range(1, len(sequence)):
            if self.config.flatten_y:
                splitted_sequences = self._split_sequence_y_flat(
                    sequence, start_index=0, end_index=end_index
                )
            else:
                splitted_sequences = self._split_sequence_y_wide(
                    sequence, start_index=0, end_index=end_index
                )
            for splitted_sequence in splitted_sequences:
                yield splitted_sequence

    def _split_sequence_overlap(
        self, sequence: List[List[str]]
    ) -> Generator[_SplittedSequence, None, None]:
        for start_index in range(0, len(sequence)):
            max_end_index = min(
                start_index + self.config.max_window_size + 1, len(sequence)
            )
            min_end_index = (
                start_index + self.config.min_window_size
                if self.config.allow_subwindows
                else max(max_end_index - 1, start_index)
            )
            for end_index in range(min_end_index, max_end_index):
                if self.config.flatten_y:
                    splitted_sequences = self._split_sequence_y_flat(
                        sequence, start_index, end_index
                    )
                else:
                    splitted_sequences = self._split_sequence_y_wide(
                        sequence, start_index, end_index
                    )
                for splitted_sequence in splitted_sequences:
                    yield splitted_sequence

    def _split_sequence_no_overlap(
        self, sequence: List[List[str]]
    ) -> Generator[_SplittedSequence, None, None]:
        start_index = 0
        max_start_index = len(sequence) - 1 - self.config.min_window_size
        while start_index <= max_start_index:
            end_index = start_index + self.config.min_window_size
            if self.config.flatten_y:
                splitted_sequences = self._split_sequence_y_flat(
                    sequence, start_index, end_index
                )
            else:
                splitted_sequences = self._split_sequence_y_wide(
                    sequence, start_index, end_index
                )
            for splitted_sequence in splitted_sequences:
                yield splitted_sequence
            start_index = end_index + 1

    def _split_sequence_y_flat(
        self, sequence: List[List[str]], start_index: int, end_index: int
    ) -> List[_SplittedSequence]:
        splitted_sequence = _SplittedSequence()
        splitted_sequence.x = sequence[start_index:end_index]
        splitted_sequence.y = [sequence[end_index]]
        return [splitted_sequence]

    def _split_sequence_y_wide(
        self, sequence: List[List[str]], start_index: int, end_index: int
    ) -> List[_SplittedSequence]:
        splitted_sequences = []
        y_features = sequence[end_index]
        for feature in set(y_features):
            splitted_sequence = _SplittedSequence()
            splitted_sequence.x = sequence[start_index:end_index]
            splitted_sequence.y = [[feature]]
            splitted_sequences.append(splitted_sequence)

        return splitted_sequences

    def _transform_to_tensor(
        self, active_features: List[str], vocab: Dict[str, int]
    ) -> tf.Tensor:
        feature_vec = np.zeros(len(vocab))
        for active_feature in active_features:
            if active_feature in vocab:
                feature_vec[vocab[active_feature]] = 1
        return tf.convert_to_tensor(feature_vec, dtype="float32")

    def _translate_and_pad_x_flat(
        self,
        x_features: List[List[str]],
        x_vocab: Dict[str, int],
        max_sequence_length: int,
    ) -> tf.Tensor:
        x_vecs = []
        for x in x_features:
            x_vecs.append(self._transform_to_tensor(x, x_vocab))
        for _ in range(max_sequence_length - len(x_features)):
            x_vecs.append(self._transform_to_tensor([], x_vocab))
        return tf.stack(x_vecs)

    def _translate_and_pad_x_wide(
        self,
        x_features: List[List[str]],
        x_vocab: Dict[str, int],
        max_features_per_sequence: int,
    ) -> tf.Tensor:
        all_features = [feature for x in x_features for feature in x]
        x_vecs = []
        for feature in all_features:
            x_vecs.append(self._transform_to_tensor([feature], x_vocab))
        for _ in range(max_features_per_sequence - len(all_features)):
            x_vecs.append(self._transform_to_tensor([], x_vocab))
        return tf.stack(x_vecs)

    def _translate_and_pad_generator(
        self, x: List[List[str]], y: List[List[str]], metadata: SequenceMetadata
    ):
        y_vec = (
            self._translate_and_pad_x_flat(
                y, metadata.y_vocab, metadata.max_sequence_length
            )
            if self.config.predict_full_y_sequence
            else self._transform_to_tensor(y[0], metadata.y_vocab)
        )
        if self.config.flatten_x:
            x_vecs_stacked = self._translate_and_pad_x_flat(
                x, metadata.x_vocab, metadata.max_sequence_length
            )
        else:
            x_vecs_stacked = self._translate_and_pad_x_wide(
                x, metadata.x_vocab, metadata.max_features_per_sequence
            )
        return (x_vecs_stacked, y_vec)

    def _translate_and_pad(
        self, splitted: _SplittedSequence, metadata: SequenceMetadata
    ):
        x_vecs_stacked, y_vec = self._translate_and_pad_generator(
            splitted.x, splitted.y, metadata
        )
        splitted.x_vecs_stacked = x_vecs_stacked
        splitted.y_vec = y_vec

    def _generate_vocabs(
        self, sequence_df: pd.DataFrame, sequence_column_name: str
    ) -> Tuple[Dict[str, int], Dict[str, int]]:
        vocab = self._generate_vocab(sequence_df, sequence_column_name)
        return (vocab, vocab)

    def _generate_vocab(
        self, sequence_df: pd.DataFrame, sequence_column_name: str
    ) -> Dict[str, int]:
        flattened_sequences = (
            sequence_df[sequence_column_name]
            .agg(
                lambda x: [
                    item for sublist in x for item in sublist
                ]  # flatten labels per timestamp to one list
            )
            .tolist()
        )
        flattened_sequences = list(
            set([item for sublist in flattened_sequences for item in sublist])
        )
        return self._generate_vocab_from_list(flattened_sequences)

    def _generate_vocab_from_list(self, features: List[str]) -> Dict[str, int]:
        vocab = {}
        index = 0
        for feature in features:
            if len(feature) == 0 or feature.lower() == "nan":
                continue
            vocab[feature] = index
            index = index + 1

        return vocab


class NextPartialSequenceTransformer(NextSequenceTransformer):
    """Split Sequences for next sequence prediction, but only keep some of the features as prediciton goals."""

    def __init__(self, config: SequenceConfig):
        super().__init__(config=config)
        self.valid_x_features: List[str] = []
        self.valid_y_features: List[str] = config.valid_y_features

    def set_valid_x_features(self, valid_x_features: List[str]):
        self.valid_x_features = valid_x_features

    def set_valid_y_features(self, valid_y_features: List[str]):
        self.valid_y_features = valid_y_features

    def _generate_vocabs(
        self, sequence_df: pd.DataFrame, sequence_column_name: str
    ) -> Tuple[Dict[str, int], Dict[str, int]]:
        x_vocab = (
            self._generate_vocab_from_list(self.valid_x_features)
            if len(self.valid_x_features) > 0
            else self._generate_vocab(sequence_df, sequence_column_name)
        )
        y_vocab = (
            self._generate_vocab_from_list(self.valid_y_features)
            if len(self.valid_y_features) > 0
            else self._generate_vocab(sequence_df, sequence_column_name)
        )

        return (x_vocab, y_vocab)

    def _split_sequence(
        self, sequence: List[List[str]]
    ) -> Generator[_SplittedSequence, None, None]:
        splitted_sequence_generator = super()._split_sequence(sequence)
        should_remove_empty_y_vecs = (
            self.config.remove_empty_y_vecs
            and len(self.valid_y_features) > 0
            and not self.config.predict_full_y_sequence
        )
        should_remove_empty_x_vecs = (
            self.config.remove_empty_x_vecs
            and len(self.valid_x_features) > 0
            and not self.config.predict_full_y_sequence
        )
        for splitted_sequence in splitted_sequence_generator:
            if should_remove_empty_y_vecs and set(splitted_sequence.y[0]).isdisjoint(
                self.valid_y_features
            ):
                continue
            if should_remove_empty_x_vecs:
                splitted_sequence.x = [
                    x
                    for x in splitted_sequence.x
                    if not set(x).isdisjoint(self.valid_x_features)
                ]
            if len(splitted_sequence.x) > 0:
                yield splitted_sequence


class NextPartialSequenceTransformerFromDataframe(NextPartialSequenceTransformer):
    """Split Sequences for next sequence prediction, but only keep some of the features as prediciton goals."""

    def _generate_vocabs(
        self, sequence_df: pd.DataFrame, sequence_column_name: str
    ) -> Tuple[Dict[str, int], Dict[str, int]]:
        x_vocab = self._generate_vocab(
            sequence_df,
            self.config.x_sequence_column_name
            if (
                self.config.x_sequence_column_name is not None
                and len(self.config.x_sequence_column_name) > 0
            )
            else sequence_column_name,
        )
        if len(self.valid_y_features) == 0:
            y_vocab = self._generate_vocab(
                sequence_df,
                self.config.y_sequence_column_name
                if (
                    self.config.y_sequence_column_name is not None
                    and len(self.config.y_sequence_column_name) > 0
                )
                else sequence_column_name,
            )
        else:
            y_vocab = self._generate_vocab_from_list(self.valid_y_features)

        super().set_valid_x_features([x for x in x_vocab.keys()])
        super().set_valid_y_features([y for y in y_vocab.keys()])

        return (x_vocab, y_vocab)


def load_sequence_transformer() -> NextSequenceTransformer:
    config = SequenceConfig()
    if (
        len(config.x_sequence_column_name) > 0 or len(config.y_sequence_column_name) > 0
    ):
        logging.debug(
            "Using only features in column %s as inputs, and features from column %s as prediction goals",
            config.x_sequence_column_name,
            config.y_sequence_column_name,
        )
        return NextPartialSequenceTransformerFromDataframe(config=config)
    elif len(config.valid_y_features) > 0:
        logging.debug(
            "Using only features %s as prediction goals",
            ",".join(config.valid_y_features),
        )
        return NextPartialSequenceTransformer(config=config)
    else:
        return NextSequenceTransformer(config=config)


### 1.2 generator

In [13]:
# from .transformer import load_sequence_transformer

def generate(sequence_df_pickle_path: str, sequence_column_name: str, for_train: bool):
    sequence_df = pd.read_pickle(Path(sequence_df_pickle_path))
    sequence_transformer = load_sequence_transformer()
    sequence_metadata = sequence_transformer.collect_metadata(sequence_df, sequence_column_name)

    train_sequences, test_sequences = sequence_transformer._split_train_test(sequence_df, sequence_column_name)
    relevant_sequences = train_sequences if for_train else test_sequences
    for sequence in relevant_sequences:
        splitted_sequences = sequence_transformer._split_sequence(sequence)
        for splitted_sequence in splitted_sequences:
            sequence_transformer._translate_and_pad(splitted_sequence, sequence_metadata)
            yield splitted_sequence.x_vecs_stacked, splitted_sequence.y_vec

def generate_train(sequence_df_pickle_path: bytes, sequence_column_name: bytes):
    return generate(sequence_df_pickle_path.decode(), sequence_column_name.decode(), for_train=True)

def generate_test(sequence_df_pickle_path: bytes, sequence_column_name: bytes):
    return generate(sequence_df_pickle_path.decode(), sequence_column_name.decode(), for_train=False)

## 2 Preprocessing

#### 2.1 base 
- Preprocessor

#### 2.2 icd9data  
- ICD9DataPreprocessor
- ICD9KnowlifeMatcher

#### 2.3 mimic
- MimicPreprocessor
- MimicPreprocessorConfig
- CCSHierarchyPreprocessor
- ICD9HierarchyPreprocessor
- ICD9DescriptionPreprocessor
- KnowlifePreprocessor


In [14]:
# Preprocessing files, unneeded for gram
'''
from src.preprocessing.huawei import ConcurrentAggregatedLogsPreprocessor, HuaweiPreprocessorConfig, ConcurrentAggregatedLogsDescriptionPreprocessor, ConcurrentAggregatedLogsHierarchyPreprocessor, ConcurrentAggregatedLogsCausalityPreprocessor
from src.preprocessing.c24 import C24FraudPreprocessor, C24HierarchyPreprocessor, C24PreprocessorConfig
from src.preprocessing.drain import Drain, DrainParameters
from src.preprocessing.huawei_traces import HuaweiTracePreprocessor
'''

'\nfrom src.preprocessing.huawei import ConcurrentAggregatedLogsPreprocessor, HuaweiPreprocessorConfig, ConcurrentAggregatedLogsDescriptionPreprocessor, ConcurrentAggregatedLogsHierarchyPreprocessor, ConcurrentAggregatedLogsCausalityPreprocessor\nfrom src.preprocessing.c24 import C24FraudPreprocessor, C24HierarchyPreprocessor, C24PreprocessorConfig\nfrom src.preprocessing.drain import Drain, DrainParameters\nfrom src.preprocessing.huawei_traces import HuaweiTracePreprocessor\n'

### 2.1 base

In [15]:
class Preprocessor:
    def load_data(self):
        """Implement this in the child classes!"""
        raise NotImplementedError()

### 2.2 icd9data 

In [16]:
# from .base import Preprocessor
import urllib.request
import time
import requests
from lxml.html import fromstring

class ICD9DataPreprocessor(Preprocessor):
    icd9data_base_url = "http://www.icd9data.com"

    def __init__(
        self,
        icd9_file: Path,
        icd9_hierarchy_file: Path = Path("data/hierarchy_icd9.csv"),
    ):
        self.icd9_file = icd9_file
        self.icd9_hierarchy_file = icd9_hierarchy_file

    def load_data(self) -> pd.DataFrame:
        logging.info("Trying to read icd9_df from %s", self.icd9_file)
        if not self.icd9_file.is_file():
            icd9_df = self._query_data()
            icd9_df.to_csv(self.icd9_file, index=False)

        return pd.read_csv(self.icd9_file, dtype=str)

    def load_data_as_hierarchy(self) -> pd.DataFrame:
        logging.info(
            "Trying to read icd9_hierarchy_df from %s", self.icd9_hierarchy_file
        )
        if not self.icd9_hierarchy_file.is_file():
            icd9_hierarchy_df = self._generate_icd9_hierarchy()
            icd9_hierarchy_df.to_csv(self.icd9_hierarchy_file, index=False)

        icd9_hierarchy_df = pd.read_csv(self.icd9_hierarchy_file, dtype=str)
        return icd9_hierarchy_df

    def _find_icd9_parents_for_child(
        self, icd9_df: pd.DataFrame, child_code: str
    ) -> List[str]:
        direct_parents = [
            x
            for x in set(icd9_df[icd9_df["child_code"] == child_code]["parent_code"])
            if not x == child_code
        ]
        if len(direct_parents) == 0:
            return []
        if len(direct_parents) > 1:
            logging.warn(
                "Found multiple icd9 parents for child %s: %s",
                child_code,
                ",".join(direct_parents),
            )

        parent_code = direct_parents[0]
        if parent_code == "-1":
            return []

        next_parents = self._find_icd9_parents_for_child(icd9_df, parent_code)
        return [parent_code] + next_parents

    def _generate_icd9_hierarchy(self) -> pd.DataFrame:
        icd9_df = self.load_data()

        logging.info("Converting icd9_df to hierarchy")
        child_codes = set(icd9_df["child_code"])
        children_to_parents = {}
        for child_code in tqdm(child_codes, "Converting icd9 data to hierarchy dict"):
            children_to_parents[child_code] = self._find_icd9_parents_for_child(
                icd9_df, child_code
            )

        max_parents = max([len(x) for x in children_to_parents.values()]) + 1
        child_hierarchy_records = []
        for child_code, parents in tqdm(
            children_to_parents.items(),
            desc="Converting icd9 hierarchy dict to dataframe",
        ):
            parents = [str(x) for x in parents if len(str(x)) > 0]
            while len(parents) < max_parents:
                parents = [child_code] + parents

            child_to_parents: Dict[str, str] = {}
            for parent_idx in range(len(parents)):
                child_to_parents["level_" + str(parent_idx)] = parents[parent_idx]

            child_hierarchy_records.append(child_to_parents)

        return pd.DataFrame.from_records(
            child_hierarchy_records,
            columns=["level_" + str(i) for i in range(max_parents)],
        ).astype(str)

    def _query_data(self) -> pd.DataFrame:
        logging.info("Starting to query ICD9 data")
        return self._query_hierarchy_from(
            "http://www.icd9data.com/2015/Volume1/default.htm", "root", "-1"
        )

    def _open_url(self, url):
        request = urllib.request.Request(url, headers={"User-Agent": "Mozilla/5.0"})
        response = urllib.request.urlopen(request)
        return BeautifulSoup(
            response, "html.parser", from_encoding=response.info().get_param("charset")
        )

    def _open_url_gentle(self, url, max_retries=10, timeout_s=3, error_timeout_s=60):
        try:
            time.sleep(timeout_s)
            return self._open_url(url)
        except urllib.error.HTTPError as error:
            logging.error("Error trying to query URL %s: %s", url, error)
            if max_retries < 0:
                raise error
            else:
                time.sleep(error_timeout_s)
                return self._open_url_gentle(
                    url, max_retries - 1, timeout_s, error_timeout_s
                )

    def _get_direct_parent(self, default_parent: str, child_code: str):
        splitted_child_code = child_code.split(".")
        if len(splitted_child_code) == 1:
            return default_parent

        if len(splitted_child_code) > 2:
            logging.error(
                "ERROR! Code %s was splitted into more than two parts: %s",
                child_code,
                splitted_child_code,
            )
            return default_parent

        code_ending = splitted_child_code[1]
        if len(code_ending) == 1:
            return splitted_child_code[0]
        else:
            return child_code[: len(child_code) - 1]

    def _query_leaf_hierarchy_from(self, parent_url, parent_name, parent_code):
        logging.debug("Querying ICD9 data from %s", parent_url)
        soup = self._open_url_gentle(parent_url)

        hierarchy_df = pd.DataFrame(
            columns=[
                "parent_url",
                "parent_name",
                "parent_code",
                "child_url",
                "child_name",
                "child_code",
            ]
        )
        hierarchy_infos: Dict[str, Dict[str, str]] = {}
        definition_list = soup.find_all(class_="codeHierarchyUL")[0]
        for list_item in definition_list.find_all("li"):
            child_url = self.icd9data_base_url + list_item.a["href"]
            child_name = list_item.find_all(class_="threeDigitCodeListDescription")[
                0
            ].get_text()
            child_code = list_item.a.get_text()
            hierarchy_infos[child_code] = {
                "code": child_code,
                "name": child_name,
                "url": child_url,
            }

            direct_parent = self._get_direct_parent(parent_code, child_code)
            if direct_parent != parent_code and direct_parent in hierarchy_infos:
                hierarchy_df = hierarchy_df.append(
                    {
                        "parent_url": hierarchy_infos[direct_parent]["url"],
                        "parent_name": hierarchy_infos[direct_parent]["name"],
                        "parent_code": hierarchy_infos[direct_parent]["code"],
                        "child_url": child_url,
                        "child_name": child_name,
                        "child_code": child_code,
                    },
                    ignore_index=True,
                )
            else:
                if direct_parent != parent_code:
                    logging.error(
                        "Direct parent for child code %s is %s (default parent: %s), but it wasn't read yet.",
                        child_code,
                        direct_parent,
                        parent_code,
                    )
                hierarchy_df = hierarchy_df.append(
                    {
                        "parent_url": parent_url,
                        "parent_name": parent_name,
                        "parent_code": parent_code,
                        "child_url": child_url,
                        "child_name": child_name,
                        "child_code": child_code,
                    },
                    ignore_index=True,
                )
        return hierarchy_df

    def _query_hierarchy_from(
        self, parent_url, parent_name, parent_code
    ) -> pd.DataFrame:
        logging.debug("Querying ICD9 data from %s", parent_url)
        soup = self._open_url_gentle(parent_url)

        hierarchy_df = pd.DataFrame(
            columns=[
                "parent_url",
                "parent_name",
                "parent_code",
                "child_url",
                "child_name",
                "child_code",
            ]
        )
        definition_list = soup.find_all(class_="definitionList")[0]
        for list_item in tqdm(
            definition_list.find_all("li"),
            desc="Parsing child codes from code " + str(parent_code),
        ):
            child_url = self.icd9data_base_url + list_item.a["href"]
            child_text = list_item.get_text()
            child_code = child_text.split(" ")[0]
            child_name = " ".join(child_text.split(" ")[1:])
            hierarchy_df = hierarchy_df.append(
                {
                    "parent_url": parent_url,
                    "parent_name": parent_name,
                    "parent_code": parent_code,
                    "child_url": child_url,
                    "child_name": child_name,
                    "child_code": child_code,
                },
                ignore_index=True,
            )
            if "-" in child_code:
                hierarchy_df = hierarchy_df.append(
                    self._query_hierarchy_from(child_url, child_name, child_code),
                    ignore_index=True,
                )
            else:
                hierarchy_df = hierarchy_df.append(
                    self._query_leaf_hierarchy_from(child_url, child_name, child_code),
                    ignore_index=True,
                )

        return hierarchy_df


class ICD9KnowlifeMatcher:
    umls_query_endpoint = "https://uts-ws.nlm.nih.gov/rest/content/current/CUI/{cui}/atoms?sabs=ICD9CM,MTHICD9"
    umls_auth_endpoint = "https://utslogin.nlm.nih.gov/cas/v1/api-key"

    def __init__(
        self, umls_file: Path, umls_api_key: str,
    ):
        self.umls_file = umls_file
        self.umls_api_key = umls_api_key

    def _query_data(self, knowlife_df: pd.DataFrame) -> pd.DataFrame:
        knowlife_cuis = self._load_knowlife_cuis(knowlife_df)

        tgt = self._umls_gettgt()
        mapping = {}
        for knowlife_cui in tqdm(
            knowlife_cuis, desc="Querying Knowlife CUI <> ICD9 mapping from UMLS"
        ):
            mapping[knowlife_cui] = self._load_icd9_code_via_umls(knowlife_cui, tgt)

        mapping_df = pd.DataFrame.from_dict(
            {k: [v] for k, v in mapping.items()}, orient="index", columns=["icd9_url"]
        )
        mapping_df["icd9_code"] = mapping_df["icd9_url"].apply(
            lambda x: list(set([u.split("/")[-1] for u in x]))
        )
        mapping_df = mapping_df.explode("icd9_code", ignore_index=False).dropna()
        mapping_df["icd9_code"] = mapping_df["icd9_code"].apply(
            lambda x: x[0 : len(x) - 3]
            if "-" in x and x[len(x) - 3 : len(x)] == ".99"
            else x
        )
        return mapping_df.reset_index(drop=False).rename(columns={"index": "cui"})[
            ["icd9_code", "cui"]
        ]

    def load_data(self, knowlife_df: pd.DataFrame) -> pd.DataFrame:
        logging.info("Trying to read icd9_cui_file from %s", self.umls_file)
        if not self.umls_file.is_file():
            umls_df = self._query_data(knowlife_df)
            umls_df.to_csv(self.umls_file, index=False)

        return pd.read_csv(self.umls_file, dtype=str)

    def _load_knowlife_cuis(self, knowlife_df: pd.DataFrame) -> Set[str]:
        knowlife_cuis = set(knowlife_df["leftfactentity"])
        knowlife_cuis.update(set(knowlife_df["rightfactentity"]))
        return knowlife_cuis

    def _load_icd9_code_via_umls(self, cui: str, tgt) -> List[str]:
        path = self.umls_query_endpoint.format(code=cui)
        try:
            params = {"ticket": self._umls_getst(tgt)}
            response = requests.get(path, params=params)
            items = json.loads(response.text)
            if "result" not in items:
                logging.debug("Unable to find results for CUI %s", cui)
                return []
            else:
                source_atoms = items["result"]
                return [source_atom["code"] for source_atom in source_atoms]

        except:
            logging.error("Error trying to query CUI %s", cui)
            return []

    def _umls_gettgt(self):
        params = {"apikey": self.umls_api_key}
        headers = {
            "Content-type": "application/x-www-form-urlencoded",
            "Accept": "text/plain",
            "User-Agent": "python",
        }
        r = requests.post(self.umls_auth_endpoint, data=params, headers=headers)
        response = fromstring(r.text)
        return response.xpath("//form/@action")[0]

    def _umls_getst(self, tgt):
        params = {"service": "http://umlsks.nlm.nih.gov"}
        headers = {
            "Content-type": "application/x-www-form-urlencoded",
            "Accept": "text/plain",
            "User-Agent": "python",
        }
        response = requests.post(tgt, data=params, headers=headers)
        return response.text

### 2.3 mimic 

In [17]:
# from .base import Preprocessor
# from .icd9data import ICD9DataPreprocessor, ICD9KnowlifeMatcher

from unicodedata import digit

def _convert_to_icd9(dxStr: str):
    if dxStr.startswith("E"):
        if len(dxStr) > 4:
            return dxStr[:4] + "." + dxStr[4:]
        else:
            return dxStr
    else:
        if len(dxStr) > 3:
            return dxStr[:3] + "." + dxStr[3:]
        else:
            return dxStr


def _convert_to_3digit_icd9(dxStr: str):
    if dxStr.startswith("E"):
        if len(dxStr) > 4:
            return dxStr[:4]
        else:
            return dxStr
    else:
        if len(dxStr) > 3:
            return dxStr[:3]
        else:
            return dxStr


class ICD9HierarchyPreprocessor(Preprocessor):
    def __init__(self, config: MimicPreprocessorConfig):
        self.config = config

    def load_data(self) -> pd.DataFrame:
        logging.info("Starting to preprocess ICD9 hierarchy")
        hierarchy_df = self._read_hierarchy_df()
        hierarchy_df = self._transform_hierarchy_df(hierarchy_df)
        if len(self.config.replace_keys) > 0:
            hierarchy_df = self._add_noise_connections(hierarchy_df)
        return hierarchy_df

    def _read_hierarchy_df(self) -> pd.DataFrame:
        return ICD9DataPreprocessor(self.config.icd9_file).load_data()

    def _transform_hierarchy_df(self, hierarchy_df: pd.DataFrame):
        hierarchy_df["parent_id"] = hierarchy_df["parent_code"]
        hierarchy_df["child_id"] = hierarchy_df["child_code"]

        if len(self.config.prediction_column) > 0:
            hierarchy_df["child_id"] = hierarchy_df["child_id"].apply(
                lambda x: self.config.prediction_column + "#" + str(x)
            )
            hierarchy_df["parent_id"] = hierarchy_df["parent_id"].apply(
                lambda x: self.config.prediction_column + "#" + str(x)
            )

        return hierarchy_df[
            ["parent_id", "child_id", "parent_name", "child_name"]
        ]

    def _add_noise_connections(self, hierarchy_df: pd.DataFrame):
        to_replace_keys = [str(x) for x in self.config.replace_keys]
        replacement_keys = [str(x) for x in self.config.replace_with_keys]
        if not len(to_replace_keys) == len(replacement_keys):
            logging.error(
                "Unable to add MIMIC noise connections, different list sizes: %d, %d",
                len(to_replace_keys),
                len(replacement_keys),
            )
            return hierarchy_df

        for idx in tqdm(
            range(len(to_replace_keys)),
            desc="Adding noise connections for MIMIC Hierarchy",
        ):
            to_replace_name = (
                hierarchy_df[hierarchy_df["child_id"] == to_replace_keys[idx]]
                .reset_index(drop=True)["child_name"]
                .to_list()[0]
            )
            hierarchy_df = hierarchy_df.append(
                {
                    "child_id": to_replace_keys[idx],
                    "child_name": to_replace_name,
                    "parent_id": "NOISENODE" + str(idx),
                    "parent_name": "NOISENODE" + str(idx),
                },
                ignore_index=True,
            ).append(
                {
                    "child_id": replacement_keys[idx],
                    "child_name": replacement_keys[idx],
                    "parent_id": "NOISENODE" + str(idx),
                    "parent_name": "NOISENODE" + str(idx),
                },
                ignore_index=True,
            )

        return hierarchy_df


class CCSHierarchyPreprocessor(Preprocessor):
    def __init__(self, config: MimicPreprocessorConfig):
        self.config = config

    def load_data(self) -> pd.DataFrame:
        logging.info("Starting to preprocess CCS hierarchy")
        hierarchy_df = self._read_hierarchy_df()
        return self._transform_hierarchy_df(hierarchy_df)

    def _read_hierarchy_df(self) -> pd.DataFrame:
        logging.info("Reading hierarchy_df from %s", self.config.hierarchy_file)
        return pd.read_csv(self.config.hierarchy_file, quotechar="'", dtype=str)

    def _transform_hierarchy_df(self, hierarchy_df: pd.DataFrame):
        transformed_hierarchy_df = pd.DataFrame(
            columns=["parent_id", "child_id", "parent_name", "child_name"]
        )
        for _, row in tqdm(
            hierarchy_df.iterrows(),
            desc="Building flat hierarchy df",
            total=len(transformed_hierarchy_df),
        ):
            all_parents: List[Tuple[str, str]] = list(
                zip(
                    [
                        row["CCS LVL 1"],
                        row["CCS LVL 2"],
                        row["CCS LVL 3"],
                        row["CCS LVL 4"],
                    ],
                    [
                        row["CCS LVL 1 LABEL"],
                        row["CCS LVL 2 LABEL"],
                        row["CCS LVL 3 LABEL"],
                        row["CCS LVL 4 LABEL"],
                    ],
                )
            )
            all_parents = [(str(id).strip(), name) for (id, name) in all_parents]
            all_parents = [(id, name) for (id, name) in all_parents if len(id) > 0]

            # Labels are sorted from general -> specific

            transformed_hierarchy_df = transformed_hierarchy_df.append(
                pd.DataFrame(
                    data={
                        "parent_id": [id for (id, _) in all_parents],
                        "parent_name": [name for (_, name) in all_parents],
                        "child_id": [id for (id, _) in all_parents[1:]]
                        + [_convert_to_3digit_icd9(row["ICD-9-CM CODE"])],
                        "child_name": [name for (_, name) in all_parents[1:]]
                        + [_convert_to_3digit_icd9(row["ICD-9-CM CODE"])],
                    }
                )
            )

        return transformed_hierarchy_df


class ICD9DescriptionPreprocessor(Preprocessor):
    def __init__(self, config: MimicPreprocessorConfig):
        self.config = config

    def load_data(self) -> pd.DataFrame:
        logging.info("Starting to preprocess ICD9 descriptions")
        description_df = self._read_description_df()
        description_df["label"] = description_df["child_code"]
        if len(self.config.prediction_column) > 0:
            description_df["label"] = description_df["label"].apply(lambda x: self.config.prediction_column + "#" + x)

        description_df["description"] = description_df["child_name"].apply(
            lambda x: x.replace('"', "")
        )
        if len(self.config.replace_keys) > 0:
            description_df = self._add_noise_connections(description_df)
        return description_df[["label", "description"]]

    def _read_description_df(self) -> pd.DataFrame:
        return ICD9DataPreprocessor(self.config.icd9_file).load_data()

    def _add_noise_connections(self, description_df: pd.DataFrame):
        to_replace_keys = [str(x) for x in self.config.replace_keys]
        replacement_keys = [str(x) for x in self.config.replace_with_keys]
        if not len(to_replace_keys) == len(replacement_keys):
            logging.error(
                "Unable to add MIMIC noise connections, different list sizes: %d, %d",
                len(to_replace_keys),
                len(replacement_keys),
            )
            return description_df

        for idx in tqdm(
            range(len(to_replace_keys)),
            desc="Adding noise connections for MIMIC Descriptions",
        ):
            description_df.loc[
                description_df["label"] == to_replace_keys[idx], "description"
            ] = (
                description_df.loc[
                    description_df["label"] == to_replace_keys[idx], "description"
                ]
                + " NOISENODE"
                + str(idx)
            )
            description_df = description_df.append(
                {
                    "label": replacement_keys[idx],
                    "description": "NOISENODE" + str(idx),
                },
                ignore_index=True,
            )

        return description_df


class KnowlifePreprocessor(Preprocessor):
    def __init__(
        self, config: MimicPreprocessorConfig,
    ):
        self.config = config

    def load_data(self) -> pd.DataFrame:
        logging.info("Starting to preprocess Knowlife causality")
        knowlife_df = self._read_knowlife_df()
        knowlife_icd9_matching = (
            self._read_knowlife_icd_mapping(knowlife_df)
            .drop_duplicates()
            .groupby(by="cui")
            .agg({"icd9_code": lambda x: list(x),})
        )
        left_knowlife_df = pd.merge(
            knowlife_df,
            knowlife_icd9_matching,
            left_on="leftfactentity",
            right_on="cui",
            how="left",
        )
        right_knowlife_df = pd.merge(
            knowlife_df,
            knowlife_icd9_matching,
            left_on="rightfactentity",
            right_on="cui",
            how="left",
        )
        knowlife_df["parent_id"] = left_knowlife_df["icd9_code"]
        knowlife_df["child_id"] = right_knowlife_df["icd9_code"]
        knowlife_df = (
            knowlife_df.explode(column="parent_id")
            .explode(column="child_id")
            .dropna(subset=["parent_id", "child_id"],)
            .drop_duplicates(subset=["parent_id", "child_id"],)
            .reset_index(drop=True)
        )

        knowlife_df["parent_name"] = knowlife_df["parent_id"]
        knowlife_df["child_name"] = knowlife_df["child_id"]
        if len(self.config.prediction_column) > 0:
            knowlife_df["child_id"] = knowlife_df["child_id"].apply(
                lambda x: self.config.prediction_column + "#" + str(x)
            )
            knowlife_df["parent_id"] = knowlife_df["parent_id"].apply(
                lambda x: self.config.prediction_column + "#" + str(x)
            )

        if len(self.config.replace_keys) > 0:
            knowlife_df = self._add_noise_connections(knowlife_df)
        return knowlife_df[["parent_id", "child_id", "parent_name", "child_name"]]

    def _read_knowlife_icd_mapping(self, knowlife_df: pd.DataFrame) -> pd.DataFrame:
        return ICD9KnowlifeMatcher(
            self.config.umls_file, self.config.umls_api_key
        ).load_data(knowlife_df)

    def _read_knowlife_df(self) -> pd.DataFrame:
        knowlife_df = pd.read_csv(self.config.knowlife_file, sep="\t")
        knowlife_df = knowlife_df[knowlife_df["relation"] == "causes"].reset_index(
            drop=True
        )
        return knowlife_df

    def _add_noise_connections(self, knowlife_df: pd.DataFrame):
        to_replace_keys = [str(x) for x in self.config.replace_keys]
        replacement_keys = [str(x) for x in self.config.replace_with_keys]
        if not len(to_replace_keys) == len(replacement_keys):
            logging.error(
                "Unable to add MIMIC noise connections, different list sizes: %d, %d",
                len(to_replace_keys),
                len(replacement_keys),
            )
            return knowlife_df

        for idx in tqdm(
            range(len(to_replace_keys)),
            desc="Adding noise connections for MIMIC Knowlife Causality",
        ):
            knowlife_df = knowlife_df.append(
                {
                    "child_id": to_replace_keys[idx],
                    "child_name": to_replace_keys[idx],
                    "parent_id": "NOISENODE" + str(idx),
                    "parent_name": "NOISENODE" + str(idx),
                },
                ignore_index=True,
            ).append(
                {
                    "child_id": replacement_keys[idx],
                    "child_name": replacement_keys[idx],
                    "parent_id": "NOISENODE" + str(idx),
                    "parent_name": "NOISENODE" + str(idx),
                },
                ignore_index=True,
            )

        return knowlife_df


class MimicPreprocessor(Preprocessor):
    def __init__(
        self, config: MimicPreprocessorConfig,
    ):
        self.config = config
        self.aggregation_column_names = set(
            ["icd9_code", "icd9_code_converted", "icd9_code_converted_3digits",]
        )

    def load_data(self) -> pd.DataFrame:
        logging.info("Starting to preprocess MIMIC dataset")
        admission_df = self._read_admission_df()
        diagnosis_df = self._read_diagnosis_df()
        aggregated_df = self._aggregate_codes_per_admission(
            diagnosis_df=diagnosis_df, admission_df=admission_df
        )
        return aggregated_df[
            aggregated_df["num_admissions"] >= self.config.min_admissions_per_user
        ]

    def _read_admission_df(self) -> pd.DataFrame:
        logging.info("Reading admission_df from %s", self.config.admission_file)
        admission_df = pd.read_csv(self.config.admission_file)
        admission_df.columns = [x.lower() for x in admission_df.columns]
        admission_df["admittime"] = pd.to_datetime(admission_df["admittime"])
        admission_df["dischtime"] = pd.to_datetime(admission_df["dischtime"])
        admission_df["deathtime"] = pd.to_datetime(admission_df["deathtime"])
        admission_df["edregtime"] = pd.to_datetime(admission_df["edregtime"])
        admission_df["edouttime"] = pd.to_datetime(admission_df["edouttime"])
        return admission_df

    def _read_diagnosis_df(self) -> pd.DataFrame:
        logging.info("Reading diagnosis_df from %s", self.config.diagnosis_file)
        diagnosis_df = pd.read_csv(self.config.diagnosis_file)
        diagnosis_df.columns = [x.lower() for x in diagnosis_df.columns]

        diagnosis_df["icd9_code"] = diagnosis_df["icd9_code"].fillna("").apply(str)
        diagnosis_df["icd9_code_converted"] = diagnosis_df["icd9_code"].apply(
            _convert_to_icd9
        )
        diagnosis_df["icd9_code_converted_3digits"] = diagnosis_df["icd9_code"].apply(
            _convert_to_3digit_icd9
        )

        if self.config.add_icd9_info_to_sequences:
            diagnosis_df = self._add_icd9_information(diagnosis_df)
        if self.config.cluster_file.exists():
            diagnosis_df = self._add_cluster_information(diagnosis_df)
        if len(self.config.replace_keys) > 0:
            diagnosis_df = self._add_noise(diagnosis_df)
        if len(self.config.prediction_column) > 0:
            for column in self.aggregation_column_names:
                diagnosis_df[column] = diagnosis_df[column].apply(lambda x: str(column) + "#" + str(x))

        diagnosis_df["level_all"] = diagnosis_df[self.aggregation_column_names].apply(lambda x: list(x), axis=1)
        self.aggregation_column_names.add("level_all")
        return diagnosis_df

    def _add_cluster_information(self, diagnosis_df: pd.DataFrame) -> pd.DataFrame:
        cluster_df = pd.read_csv(self.config.cluster_file)
        self.aggregation_column_names.update(cluster_df.columns)
        return pd.merge(
            diagnosis_df,
            cluster_df,
            how="inner",
            left_on="icd9_code_converted",
            right_on="original_level_cluster",
        )

    def _add_noise(self, diagnosis_df: pd.DataFrame) -> pd.DataFrame:
        to_replace_keys = [str(x) for x in self.config.replace_keys]
        replacement_keys = [str(x) for x in self.config.replace_with_keys]
        replacement_percentages = [
            float(x) for x in self.config.replacement_percentages
        ]
        replacement_columns = self.config.replace_columns
        if (
            not len(to_replace_keys) == len(replacement_keys)
            or not len(to_replace_keys) == len(replacement_percentages)
            or not len(to_replace_keys) == len(replacement_columns)
        ):
            logging.error(
                "Unable to add MIMIC noise, different list sizes: %d, %d, %d, %d",
                len(to_replace_keys),
                len(replacement_keys),
                len(replacement_percentages),
                len(replacement_columns),
            )
            return diagnosis_df

        for idx in tqdm(
            range(len(to_replace_keys)), desc="Adding noise to MIMIC dataset"
        ):
            replace_samples = diagnosis_df[
                diagnosis_df[replacement_columns[idx]] == to_replace_keys[idx]
            ].sample(frac=replacement_percentages[idx])

            diagnosis_df.loc[
                replace_samples.index, replacement_columns[idx]
            ] = replacement_keys[idx]

        return diagnosis_df

    def _add_icd9_information(self, diagnosis_df: pd.DataFrame) -> pd.DataFrame:
        icd9_preprocessor = ICD9DataPreprocessor(self.config.icd9_file)

        icd9_df = icd9_preprocessor.load_data()[
            ["child_code", "child_name"]
        ].drop_duplicates()
        diagnosis_df["icd9_code_name"] = pd.merge(
            diagnosis_df,
            icd9_df,
            how="left",
            left_on="icd9_code_converted",
            right_on="child_code",
        )["child_name"].fillna(diagnosis_df["icd9_code_converted"])
        diagnosis_df["icd9_code_name_3digits"] = pd.merge(
            diagnosis_df,
            icd9_df,
            how="left",
            left_on="icd9_code_converted_3digits",
            right_on="child_code",
        )["child_name"].fillna(diagnosis_df["icd9_code_converted_3digits"])
        self.aggregation_column_names.update(
            ["icd9_code_name", "icd9_code_name_3digits",]
        )

        icd9_hierarchy_df = icd9_preprocessor.load_data_as_hierarchy()
        self.aggregation_column_names.update(icd9_hierarchy_df.columns)
        return pd.merge(
            diagnosis_df,
            icd9_hierarchy_df,
            how="inner",
            left_on="icd9_code_converted",
            right_on="level_0",
        )

    def _aggregate_codes_per_admission(
        self, diagnosis_df: pd.DataFrame, admission_df: pd.DataFrame
    ) -> pd.DataFrame:
        codes_per_admission = diagnosis_df.groupby("hadm_id").agg(
            {
                column_name: lambda x: list(x)
                for column_name in self.aggregation_column_names
            }
        )
        if "level_all" in codes_per_admission.columns:
            codes_per_admission["level_all"] = codes_per_admission["level_all"].apply(
                lambda x: [c for sublist in x for c in sublist]
            )

        combined_df = pd.merge(admission_df, codes_per_admission, on=["hadm_id"])

        subject_aggregation_column_names = list(self.aggregation_column_names) + [
            "hadm_id",
            "admittime",
            "diagnosis",
        ]
        admissions_per_subject = (
            combined_df.groupby("subject_id")
            .agg(
                {
                    column_name: lambda x: list(x)
                    for column_name in set(subject_aggregation_column_names)
                }
            )
            .reset_index()
        )
        admissions_per_subject["num_admissions"] = admissions_per_subject[
            "hadm_id"
        ].apply(len)
        return admissions_per_subject


## 3 Knowledge

#### 3.1 base  
BaseKnowledge

#### 3.2 node
Node

#### 3.3 hierarchy  
HierarchyKnowledge

#### 3.4 noise 
NoiseKnowledge

In [18]:
# Knowledge files, unneeded for gram
'''
from src.knowledge.causality import CausalityKnowledge
from src.knowledge.descriptions import DescriptionKnowledge
from src.knowledge.file import FileKnowledge
from src.knowledge.combined import CombinedKnowledge
'''

'\nfrom src.knowledge.causality import CausalityKnowledge\nfrom src.knowledge.descriptions import DescriptionKnowledge\nfrom src.knowledge.file import FileKnowledge\nfrom src.knowledge.combined import CombinedKnowledge\n'

### 3.1 base  

In [19]:
# from .config import KnowledgeConfig

class BaseKnowledge:
    def __init__(self, config: KnowledgeConfig):
        self.config = config
        self.vocab: Dict[str, int] = {}
        self.extended_vocab: Dict[str, int] = {}

    def get_vocab(self) -> Dict[str, int]:
        return self.vocab

    def get_extended_vocab(self) -> Dict[str, int]:
        return self.extended_vocab

    def get_connections_for_idx(self, idx: int) -> Set[int]:
        return set([idx])

    def get_description_vocab(self, ids: Set[int]) -> Dict[int, str]:
        return {}

### 3.2 node

In [20]:
class Node:
    def __init__(self, label_idx: int, label_str: str, label_names: Set[str]):
        self.label_idx = label_idx
        self.label_str = label_str
        self.label_name = self._select_label_name(label_names)
        self.in_nodes: Set['Node'] =set()
        self.out_nodes: Set['Node'] =set()

    def _select_label_name(self, label_names: Set[str]) -> str:
        potential_label_names = [x for x in label_names if len(x) > 0]
        if len(potential_label_names) > 0:
            return potential_label_names[0]
        else:
            return self.label_str

    def is_root(self) -> bool:
        return len(self.in_nodes) == 0

    def is_leaf(self) -> bool:
        return len(self.out_nodes) == 0

    def get_neighbours(self) -> List['Node']:
        return list(set(
            list(self.in_nodes) + list(self.out_nodes) + [self]
        ))

    def get_neighbour_label_idxs(self) -> List[int]:
        neighbours = self.get_neighbours()
        return [neighbour.label_idx for neighbour in neighbours]

    def get_ancestors(self) -> List['Node']:
        ancestors = [self]
        for node in self.in_nodes:
            ancestors = ancestors + node.get_ancestors()
        
        return list(set(ancestors))

    def get_ancestor_label_idxs(self) -> List[int]:
        ancestors = self.get_ancestors()
        return [ancestor.label_idx for ancestor in ancestors]

    def __str__(self):
        return "Node for idx " + str(self.label_idx) + " (label: " + str(self.label_str) + ", name: " + str(self.label_name) + ")" + \
             "\n<-Parent nodes: " + ",".join([str(p.label_idx) + "(" + str(p.label_str) + ")" for p in self.in_nodes]) + \
             "\n->Child nodes: " + ",".join([str(c.label_idx) + "(" + str(c.label_str) + ")" for c in self.out_nodes]) 


### 3.3 hierarchy  

In [21]:
'''
from .node import Node
from .base import BaseKnowledge
from .config import KnowledgeConfig
'''

class HierarchyKnowledge(BaseKnowledge):
    def __init__(
        self,
        config: KnowledgeConfig,
        child_id_col="child_id",
        parent_id_col="parent_id",
        child_name_col="child_name",
        parent_name_col="parent_name",
    ):
        super(HierarchyKnowledge, self).__init__(config=config)
        self.child_id_col = child_id_col
        self.parent_id_col = parent_id_col
        self.child_name_col = child_name_col
        self.parent_name_col = parent_name_col

    def get_connections_for_idx(self, idx: int) -> Set[int]:
        return set(self.nodes[idx].get_ancestor_label_idxs() + [idx])

    def get_description_vocab(self, ids: Set[int]) -> Dict[int, str]:
        return {idx: node.label_name for idx, node in self.nodes.items() if idx in ids}

    def build_hierarchy_from_df(
        self, hierarchy_df: pd.DataFrame, vocab: Dict[str, int]
    ):
        self.vocab: Dict[str, int] = vocab
        self._build_extended_vocab(hierarchy_df, vocab)
        for _, row in tqdm(hierarchy_df.iterrows(), desc="Building Hierarchy from df"):
            child_id = row[self.child_id_col]
            if child_id not in self.extended_vocab:
                logging.debug("Ignoring node %s as not in dataset", child_id)
                continue

            child_node = self.nodes[self.extended_vocab[child_id]]
            parent_node = self.nodes[self.extended_vocab[row[self.parent_id_col]]]

            if child_node is not parent_node:
                child_node.in_nodes.add(parent_node)
                parent_node.out_nodes.add(child_node)

        logging.info("Built hierarchy with %d nodes", len(self.nodes))

    def _build_extended_vocab(self, hierarchy_df: pd.DataFrame, vocab: Dict[str, int]):
        self.extended_vocab: Dict[str, int] = {}
        self.nodes: Dict[int, Node] = {}

        labels_to_handle = list(vocab.keys())
        max_index = max(vocab.values())
        while len(labels_to_handle) > 0:
            label = labels_to_handle.pop()
            if label in self.extended_vocab:
                continue

            if label in vocab:
                self.extended_vocab[label] = vocab[label]
            else:
                self.extended_vocab[label] = max_index + 1
                max_index = max_index + 1

            label_names = set(
                hierarchy_df[hierarchy_df[self.child_id_col] == label][
                    self.child_name_col
                ]
            )
            label_names.update(
                set(
                    hierarchy_df[hierarchy_df[self.parent_id_col] == label][
                        self.parent_name_col
                    ]
                )
            )
            self.nodes[self.extended_vocab[label]] = Node(
                label_idx=self.extended_vocab[label],
                label_str=label,
                label_names=label_names,
            )

            parents_df = hierarchy_df[hierarchy_df[self.child_id_col] == label]
            parents = list(set(parents_df[self.parent_id_col]))
            labels_to_handle = labels_to_handle + parents

        self.extra_vocab: Dict[str, int] = {
            k: v for k, v in self.extended_vocab.items() if k not in self.vocab
        }

    def __str__(self):
        roots = [node for node in self.nodes.values() if node.is_root()]
        all_strings = []
        for root in roots:
            all_strings = all_strings + self._to_string_recursive(root, "")
        return "\n".join(all_strings)

    def _to_string_recursive(self, current_node, current_prefix):
        strings = [current_prefix + current_node.label_str]
        for node in current_node.out_nodes:
            strings = strings + self._to_string_recursive(node, current_prefix + "-")

        return strings

### 3.4 noise 

In [22]:
'''
from src.features.knowledge.base import BaseKnowledge
from .base import BaseKnowledge
'''
class NoiseKnowledge(BaseKnowledge):
    def __init__(self, knowledge: BaseKnowledge):
        self.knowledge = knowledge
        self.vocab: Dict[str, int] = knowledge.vocab
        self.extended_vocab: Dict[str, int] = knowledge.extended_vocab

        self._initialize_connections_from_knowledge(knowledge)
        self.original_num_connections = self.num_connections
        self.original_connections = {k: set(v) for k, v in self.connections.items()}
        self.original_reverse_connections = {
            k: set(v) for k, v in self.reverse_connections.items()
        }

    def get_text_connections(self) -> Tuple[Dict[str, List[str]], Dict[str, List[str]]]:
        reverse_text_vocab: Dict[int, str] = {
            v: k for k, v in self.extended_vocab.items()
        }
        original_connections_text = {
            reverse_text_vocab[k]: [reverse_text_vocab[v] for v in vs]
            for k, vs in self.original_connections.items()
        }
        noise_connections_text = {
            reverse_text_vocab[k]: [reverse_text_vocab[v] for v in vs]
            for k, vs in self.connections.items()
        }
        return (original_connections_text, noise_connections_text)

    def _initialize_connections_from_knowledge(self, knowledge: BaseKnowledge):
        self.num_connections = 0
        self.reverse_connections: Dict[int, Set[int]] = {}
        self.connections: Dict[int, Set[int]] = {}
        for _, idx in knowledge.get_vocab().items():
            connections = knowledge.get_connections_for_idx(idx)
            self.connections[idx] = connections
            for connected_idx in connections:
                self.num_connections += 1
                if idx == connected_idx:
                    continue

                if connected_idx not in self.reverse_connections:
                    self.reverse_connections[connected_idx] = set()
                self.reverse_connections[connected_idx].add(idx)

    def add_random_connections(self, percentage: float = 0.1):
        num_connections_to_add = int(percentage * self.original_num_connections)
        added_connections = 0
        with tqdm(
            total=num_connections_to_add,
            desc="Adding {} random connections to knowledge".format(
                num_connections_to_add
            ),
        ) as pbar:
            while added_connections < num_connections_to_add:
                from_idx = random.choice(list(self.connections.keys()))
                to_idx = random.choice(list(self.reverse_connections.keys()))
                if (from_idx == to_idx) or (to_idx in self.connections[from_idx]):
                    continue

                self.connections[from_idx].add(to_idx)
                self.reverse_connections[to_idx].add(from_idx)
                added_connections += 1
                self.num_connections += 1
                pbar.update(n=1)

    def remove_random_connections(self, percentage: float = 0.1):
        num_connections_to_remove = int(percentage * self.original_num_connections)
        removed_connections = 0
        with tqdm(
            total=num_connections_to_remove,
            desc="Removing {} random connections to knowledge".format(
                num_connections_to_remove
            ),
        ) as pbar:
            while removed_connections < num_connections_to_remove:
                from_idx = random.choice(list(self.connections.keys()))
                to_idx = random.choice(list(self.reverse_connections.keys()))
                if (from_idx == to_idx) or (to_idx not in self.connections[from_idx]):
                    continue

                self.connections[from_idx].remove(to_idx)
                self.reverse_connections[to_idx].remove(from_idx)
                removed_connections += 1
                self.num_connections -= 1
                pbar.update(n=1)

    def remove_lowest_connections(
        self,
        percentage: float = 0.1,
        connections_reference_file: Path = Path("data/attention.json"),
    ):
        if not connections_reference_file.exists():
            logging.error(
                "Cannot read attention reference file from %s",
                connections_reference_file,
            )
            return

        num_connections_to_remove = int(percentage * self.original_num_connections)
        with open(connections_reference_file) as attention_file:
            connections_reference = json.load(attention_file)["attention_weights"]
            flattened_connections = [
                float(connection_weight)
                for from_node, attention_info in connections_reference.items()
                for to_node, connection_weight in attention_info.items()
                if from_node != to_node
            ]
            threshold = sorted(flattened_connections)[num_connections_to_remove + 1]
            self._remove_connections_below(threshold, connections_reference)

    def remove_connections_below(
        self,
        threshold: float = 0.001,
        connections_reference_file: Path = Path("data/attention.json"),
    ):
        if not connections_reference_file.exists():
            logging.error(
                "Cannot read attention reference file from %s",
                connections_reference_file,
            )
            return

        with open(connections_reference_file) as attention_file:
            connections_reference = json.load(attention_file)["attention_weights"]
            self._remove_connections_below(threshold, connections_reference)

    def _remove_connections_below(
        self,
        threshold: float = 0.001,
        connections_reference: Dict[str, Dict[str, float]] = {},
    ):
        removed_connections = 0
        for from_word, connections in tqdm(
            connections_reference.items(),
            total=len(connections_reference),
            desc="Removing connections with weights below {}".format(threshold),
        ):
            if from_word not in self.get_extended_vocab():
                continue
            from_idx = self.get_extended_vocab()[from_word]
            for to_word, connection_weight in connections.items():
                if to_word not in self.get_extended_vocab():
                    continue
                to_idx = self.get_extended_vocab()[to_word]
                if (from_idx == to_idx) or (to_idx not in self.connections[from_idx]):
                    continue

                if float(connection_weight) < threshold:
                    self.connections[from_idx].remove(to_idx)
                    self.reverse_connections[to_idx].remove(from_idx)
                    self.num_connections -= 1
                    removed_connections += 1
        logging.info(
            "Removed %d connections that had weight < %f. %d connections remaining.",
            removed_connections,
            threshold,
            self.num_connections,
        )

    def get_vocab(self) -> Dict[str, int]:
        return self.vocab

    def get_extended_vocab(self) -> Dict[str, int]:
        return self.extended_vocab

    def get_connections_for_idx(self, idx: int) -> Set[int]:
        return self.connections[idx]

    def get_description_vocab(self, ids: Set[int]) -> Dict[int, str]:
        return self.knowledge.get_description_vocab(ids)

## 4 Genarate models

#### 4.1 metrics 
- MulticlassTruePositiveRate
- MulticlassAccuracy
- MulticlassMetric
- MulticlassTrueNegativeRate
- PercentileSubsetMetricHelper

#### 4.2 fasttext
- FastTextInitializer

#### 4.3 base 
- BaseEmbedding
- BaseModel

#### 4.4 knowledge_embedding  
- KnowledgeEmbedding

#### 4.5 gram 
- GramEmbedding
- GramModel

#### 4.6 callback 
- MLFlowCallback
- BestModelRestoreCallback




In [23]:
# Models files, unneeded for gram,
'''
from src.models.simple import SimpleModel, SimpleEmbedding
from src.models.textual import DescriptionModel, DescriptionEmbedding
from src.models.textual_paper import DescriptionPaperModel, DescriptionPaperEmbedding
from src.models.causal import CausalityEmbedding, CausalityModel
from src.models.file import FileEmbedding, FileModel
from src.models.combined import CombinedEmbedding, CombinedModel
'''

'\nfrom src.models.simple import SimpleModel, SimpleEmbedding\nfrom src.models.textual import DescriptionModel, DescriptionEmbedding\nfrom src.models.textual_paper import DescriptionPaperModel, DescriptionPaperEmbedding\nfrom src.models.causal import CausalityEmbedding, CausalityModel\nfrom src.models.file import FileEmbedding, FileModel\nfrom src.models.combined import CombinedEmbedding, CombinedModel\n'

### 4.1 metrics

In [24]:
from numpy.core.numeric import full

class MulticlassMetric(tf.keras.metrics.Metric):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.true_positive_predictions = self.add_weight(
            name="true_positive_predictions", initializer="zeros"
        )
        self.false_positive_predictions = self.add_weight(
            name="false_positive_predictions", initializer="zeros"
        )
        self.true_negative_predictions = self.add_weight(
            name="true_negative_predictions", initializer="zeros"
        )
        self.false_negative_predictions = self.add_weight(
            name="false_negative_predictions", initializer="zeros"
        )

    def update_state(self, y_true, y_pred, sample_weight=None):
        y_pred = tf.round(y_pred)

        correct_positive_predictions = tf.where((y_pred == 1) & (y_true == 1), x=1, y=0)
        wrong_positive_predictions = tf.where((y_pred == 1) & (y_true == 0), x=1, y=0)
        correct_negative_predictions = tf.where((y_pred == 0) & (y_true == 0), x=1, y=0)
        wrong_negative_predictions = tf.where((y_pred == 0) & (y_true == 1), x=1, y=0)

        self.true_positive_predictions.assign_add(
            tf.cast(tf.reduce_sum(correct_positive_predictions), dtype="float32")
        )
        self.false_positive_predictions.assign_add(
            tf.cast(tf.reduce_sum(wrong_positive_predictions), dtype="float32")
        )
        self.true_negative_predictions.assign_add(
            tf.cast(tf.reduce_sum(correct_negative_predictions), dtype="float32")
        )
        self.false_negative_predictions.assign_add(
            tf.cast(tf.reduce_sum(wrong_negative_predictions), dtype="float32")
        )

    def result(self):
        raise NotImplementedError("This should be implemented by subclass!!!!!")

    def reset_states(self):
        self.true_positive_predictions.assign(0.0)
        self.false_positive_predictions.assign(0.0)
        self.true_negative_predictions.assign(0.0)
        self.false_negative_predictions.assign(0.0)


class MulticlassAccuracy(MulticlassMetric):
    def __init__(self, *args, **kwargs):
        super().__init__(name="multiclass_accuracy", *args, **kwargs)

    def result(self):
        return (self.true_positive_predictions + self.true_negative_predictions) / (
            self.true_positive_predictions
            + self.false_positive_predictions
            + self.true_negative_predictions
            + self.false_negative_predictions
        )


class MulticlassTruePositiveRate(MulticlassMetric):
    def __init__(self, *args, **kwargs):
        super().__init__(name="multiclass_true_positive_rate", *args, **kwargs)

    def result(self):
        return self.true_positive_predictions / (
            self.true_positive_predictions + self.false_negative_predictions
        )


class MulticlassTrueNegativeRate(MulticlassMetric):
    def __init__(self, *args, **kwargs):
        super().__init__(name="multiclass_true_negative_rate", *args, **kwargs)

    def result(self):
        return self.true_negative_predictions / (
            self.true_negative_predictions + self.false_positive_predictions
        )


class MultilabelNestedMetric(tf.keras.metrics.Metric):
    def __init__(self, nested_metric: tf.keras.metrics.Metric, full_prediction: bool, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.nested_metric = nested_metric
        self.full_prediction = full_prediction

    def update_state(self, y_true, y_pred, sample_weight=None):
        if self.full_prediction:
            y_true = tf.reshape(y_true, (tf.shape(y_true)[0] * tf.shape(y_true)[1], tf.shape(y_true)[2]))
            y_pred = tf.reshape(y_pred, (tf.shape(y_pred)[0] * tf.shape(y_pred)[1], tf.shape(y_pred)[2]))
        id_tensor = tf.eye(tf.shape(y_true)[1], dtype="int32")
        id_tensor_expanded = tf.reshape(
            tf.broadcast_to(
                tf.expand_dims(id_tensor, axis=0),
                (tf.shape(y_true)[0], tf.shape(y_true)[1], tf.shape(y_true)[1]),
            ),
            (tf.shape(y_true)[0] * tf.shape(y_true)[1], tf.shape(y_true)[1]),
        )
        cleaned_id_tensor = tf.where(
            (id_tensor_expanded == 1)
            & (tf.repeat(y_true, repeats=tf.shape(y_true)[1], axis=0) == 1),
            x=1,
            y=0,
        )

        weights = tf.reduce_sum(cleaned_id_tensor, axis=1,)
        if sample_weight is not None:
            weights = weights * tf.repeat(sample_weight, tf.shape(y_true)[1], axis=0)

        self.nested_metric.update_state(
            y_true=cleaned_id_tensor,
            y_pred=tf.repeat(y_pred, tf.shape(y_true)[1], axis=0),
            sample_weight=weights,
        )

    def result(self):
        return self.nested_metric.result()

    def reset_states(self):
        self.nested_metric.reset_states()


class SubsetMetric(tf.keras.metrics.Metric):
    def __init__(
        self,
        dataset_mask: np.array,
        nested_metric: tf.keras.metrics.Metric,
        full_prediction: bool,
        *args,
        **kwargs
    ):
        super().__init__(*args, **kwargs)
        self.dataset_mask = dataset_mask
        self.nested_metric = nested_metric
        self.full_prediction = full_prediction

    def update_state(self, y_true, y_pred, sample_weight=None):        
        if self.full_prediction:
            y_true = tf.reshape(y_true, (tf.shape(y_true)[0] * tf.shape(y_true)[1], tf.shape(y_true)[2]))
            y_pred = tf.reshape(y_pred, (tf.shape(y_pred)[0] * tf.shape(y_pred)[1], tf.shape(y_pred)[2]))
        weights = tf.reduce_sum(tf.where(self.dataset_mask, x=y_true, y=0), axis=1)
        if sample_weight is not None:
            weights = weights * sample_weight
        self.nested_metric.update_state(
            y_true, y_pred, sample_weight=weights,
        )

    def result(self):
        return self.nested_metric.result()

    def reset_states(self):
        self.nested_metric.reset_states()


class PercentileSubsetMetricHelper:
    def __init__(
        self, dataset: tf.data.Dataset, num_percentiles: int, y_vocab: Dict[str, int], full_prediction: bool,
    ):
        self.dataset = dataset
        self.num_percentiles = num_percentiles
        self.y_vocab = y_vocab
        self.full_prediction = full_prediction
        self._init_percentiles()
        self._log_percentile_mapping_to_mlflow()

    def get_accuracy_at_k_for(
        self, k, is_multilabel: bool, use_cumulative: bool
    ) -> List[tf.keras.metrics.Metric]:
        metrics = []
        for i in range(self.num_percentiles):
            name = (
                "top_"
                + str(k)
                + "_categorical_accuracy_"
                + ("cp" if use_cumulative else "p")
                + str(i)
            )
            mask = self._get_mask_for_percentile(i, use_cumulative=use_cumulative)

            metrics.append(
                self._get_accuracy_at_k_with_mask(k, is_multilabel=is_multilabel, mask=mask, name=name)
            )

        return metrics

    def _get_accuracy_at_k_with_mask(self, k, is_multilabel: bool, mask, name: str) -> tf.keras.metrics.Metric:
        if is_multilabel:
            return MultilabelNestedMetric(
                nested_metric=SubsetMetric(
                    dataset_mask=mask,
                    nested_metric=tf.keras.metrics.TopKCategoricalAccuracy(
                        k=k, name=name
                    ),
                    full_prediction=False,
                ),
                full_prediction=self.full_prediction,
                name=name,
            )
        else:
            return SubsetMetric(
                dataset_mask=mask,
                nested_metric=tf.keras.metrics.TopKCategoricalAccuracy(
                    k=k, name=name
                ),
                full_prediction=self.full_prediction,
                name=name,
            )


    def _get_mask_for_percentile(self, p, use_cumulative: bool):
        if use_cumulative:
            mask = np.where(
                (self.cpercentile_ranks > self.percentile_steps[p])
                & (self.cpercentile_ranks <= self.percentile_steps[p + 1]),
                True,
                False,
            )
        else:
            mask = np.where(
                (self.frequency_ranks > self.percentiles[p])
                & (self.frequency_ranks <= self.percentiles[p + 1]),
                True,
                False,
            )
        if not np.any(mask):
            logging.warn("No class labels in percentile %d", p)

        return mask

    def _init_percentiles(self):
        num_classes = len(self.y_vocab)
        absolute_class_frequencies = np.zeros(shape=(num_classes,), dtype=np.int32)
        for (_, y_true) in tqdm(
            self.dataset.as_numpy_iterator(),
            desc="Calculating percentile frequencies...",
        ):        
            if self.full_prediction:
                y_true = tf.reshape(y_true, (tf.shape(y_true)[0] * tf.shape(y_true)[1], tf.shape(y_true)[2]))
            next_sum = np.sum(y_true, axis=0,)
            absolute_class_frequencies = absolute_class_frequencies + next_sum

        self.frequencies = absolute_class_frequencies / np.sum(
            absolute_class_frequencies
        )
        self.frequency_ranks = np.empty_like(self.frequencies.argsort())
        self.frequency_ranks[self.frequencies.argsort()] = np.arange(
            len(self.frequencies)
        )
        self._init_percentile_values()
        self._init_cpercentiles()

    def _init_percentile_values(self):
        self.percentile_steps = [
            100 * i / self.num_percentiles for i in range(self.num_percentiles + 1)
        ]
        self.percentiles = np.percentile(self.frequency_ranks, self.percentile_steps)
        self.percentiles[0] = -1
        self.percentile_steps[0] = -1

    def _init_cpercentiles(self):
        sorted_frequencies = self.frequencies[self.frequencies.argsort()]
        self.cfrequencies = np.cumsum(sorted_frequencies)[self.frequency_ranks]
        self.cpercentile_ranks = (self.cfrequencies - 0.5 * self.frequencies) * 100

    def _log_percentile_mapping_to_mlflow(self):
        percentile_mapping = self._create_percentile_mapping()
        mlflow.log_dict(percentile_mapping, "percentile_mapping.json")

    def _create_percentile_mapping(self) -> Dict[int, Any]:
        percentile_mapping = {}
        for i in range(self.num_percentiles):
            percentile_mapping[i] = {
                "percentile_steps": [
                    self.percentile_steps[i],
                    self.percentile_steps[i + 1],
                ],
                "percentile_values": [self.percentiles[i], self.percentiles[i + 1]],
                "percentile_classes": [
                    name
                    for (name, idx) in self.y_vocab.items()
                    if self.frequency_ranks[idx] > self.percentiles[i]
                    and self.frequency_ranks[idx] <= self.percentiles[i + 1]
                ],
                "cpercentile_classes": [
                    name
                    for (name, idx) in self.y_vocab.items()
                    if self.cpercentile_ranks[idx] > self.percentile_steps[i]
                    and self.cpercentile_ranks[idx] <= self.percentile_steps[i + 1]
                ],
            }
        return percentile_mapping

### 4.2 fasttext

In [25]:
import fasttext
import fasttext.util
import re

class FastTextInitializer:
    def __init__(self, embedding_dim: int):
        self.embedding_dim = embedding_dim
        self.fasttext_model = self._load_fasttext_model()

    def _load_fasttext_model(self):
        logging.info('(Down)loading fasttext English language model')
        fasttext.util.download_model('en', if_exists='ignore')
        model = fasttext.load_model('cc.en.300.bin')
        if model.get_dimension() > self.embedding_dim:
            logging.info('Reducing dimension of FastText word model from %d to %d', model.get_dimension(), self.embedding_dim)
            fasttext.util.reduce_model(model, self.embedding_dim)

        return model

    def _load_word_embedding(self, description: str) -> tf.Tensor:
        description_words = ' '.join(re.split('[,._-]+', description)).split(' ')
        description_vectors = [
            self.fasttext_model.get_word_vector(word)
            for word in description_words
        ]     
        return tf.convert_to_tensor(np.mean(description_vectors, axis=0))

    def _load_word_embeddings(self, description_vocab: Dict[int, str]) -> Dict[int, tf.Variable]:
        word_embeddings = {}
        for idx, description in tqdm(description_vocab.items(), desc='Initializing word embeddings from model'):
            word_embeddings[idx] = tf.constant(
                tf.expand_dims(
                    self._load_word_embedding(description),
                    axis=0,
                ),
                shape=(1,self.embedding_dim),
            )
        return word_embeddings

    def get_initializer(self, description_vocab: Dict[int, str]) -> tf.keras.initializers.Initializer:
        word_embeddings = self._load_word_embeddings(description_vocab)
        concatenated_word_embeddings = tf.concat(
            [word_embeddings[x] for x in sorted(word_embeddings.keys())],
            axis=1,
        )
        return tf.keras.initializers.Constant(value=concatenated_word_embeddings.numpy())

### 4.3 base

In [26]:
'''
from src.config import ExperimentConfig
from src.features.sequences.transformer import SequenceMetadata
from .metrics import (
    MulticlassAccuracy,
    MulticlassTrueNegativeRate,
    MulticlassTruePositiveRate,
    PercentileSubsetMetricHelper,
    MultilabelNestedMetric,
)
from .config import ModelConfig
from .callbacks import MLFlowCallback, BestModelRestoreCallback
from .initializers import FastTextInitializer
'''

import datetime
from tensorflow.python.keras.layers.core import Dropout, Masking

def full_prediction_binary_accuracy_loss(y_true, y_pred):
    sum = tf.reduce_sum(y_true, axis=-1)
    weights = tf.where(sum > 1, x=1.0, y=sum)
    weights = tf.cast(weights, dtype="float32")
    loss = tf.keras.losses.binary_crossentropy(y_true, y_pred)
    loss = tf.reduce_sum(weights * loss, axis=1) / tf.reduce_sum(weights, axis=1)
    return tf.reduce_mean(loss)


class BaseEmbedding:
    config: ModelConfig
    num_features: int = 0
    num_hidden_features: int = 0
    num_connections: int = 0

    basic_feature_embeddings: tf.Variable  # shape: (num_features, embedding_size)
    basic_hidden_embeddings: tf.Variable  # shape: (num_hidden_features, embedding_size)

    def _final_embedding_matrix(self):
        """Overwrite this in case embedding uses attention mechanism etc"""
        return self.basic_feature_embeddings

    def _get_kernel_regularizer(self, scope: str):
        if scope not in self.config.kernel_regularizer_scope:
            logging.debug("Regularization not enabled for %s", scope)
            return None
        elif self.config.kernel_regularizer_value <= 0.0:
            return None
        elif self.config.kernel_regularizer_type == "l2":
            return tf.keras.regularizers.l2(self.config.kernel_regularizer_value)
        elif self.config.kernel_regularizer_type == "l2":
            return tf.keras.regularizers.l1(self.config.kernel_regularizer_value)
        else:
            return None

    def _get_initializer(
        self,
        initializer_name: str,
        initializer_seed: int,
        description_vocab: Dict[int, str],
    ) -> tf.keras.initializers.Initializer:
        if initializer_name == "random_uniform":
            return tf.keras.initializers.GlorotUniform(seed=initializer_seed)
        elif initializer_name == "random_normal":
            return tf.keras.initializers.GlorotNormal(seed=initializer_seed)
        elif initializer_name == "fasttext":
            initializer = FastTextInitializer(self.config.embedding_dim)
            return initializer.get_initializer(description_vocab)
        else:
            logging.error("Unknown initializer %s", initializer_name)

    def _get_feature_initializer(
        self, description_vocab: Dict[int, str]
    ) -> tf.keras.initializers.Initializer:
        return self._get_initializer(
            self.config.feature_embedding_initializer,
            self.config.feature_embedding_initializer_seed,
            description_vocab,
        )

    def _get_hidden_initializer(
        self, description_vocab: Dict[int, str]
    ) -> tf.keras.initializers.Initializer:
        return self._get_initializer(
            self.config.hidden_embedding_initializer,
            self.config.hidden_embedding_initializer_seed,
            description_vocab,
        )


class BaseModel:
    def __init__(self):
        self.prediction_model: tf.keras.Model = None
        self.embedding_layer: BaseEmbedding = None
        self.metrics: List[tf.keras.metrics.Metric] = []
        self.config = ModelConfig()

    def _get_embedding_layer(
        self, metadata: SequenceMetadata, knowledge: Any
    ) -> BaseEmbedding:
        raise NotImplementedError("This should be implemented by the subclass!!!")

    def _select_distribute_strategy(self) -> tf.distribute.Strategy:
        if self.config.distribute_strategy == "mirrored":
            return tf.distribute.MirroredStrategy()
        elif self.config.distribute_strategy.startswith("/gpu"):
            return tf.distribute.OneDeviceStrategy(
                device=self.config.distribute_strategy
            )
        else:
            return tf.distribute.get_strategy()

    def build(self, metadata: SequenceMetadata, knowledge: Any):
        self.metadata = metadata
        self.strategy = self._select_distribute_strategy()
        logging.info(
            "Using strategy with %d workers", self.strategy.num_replicas_in_sync
        )

        with self.strategy.scope():
            self.embedding_layer = self._get_embedding_layer(metadata, knowledge)
            self._log_embedding_stats()
            self.prediction_model = tf.keras.models.Sequential(
                [
                    tf.keras.layers.Input(
                        shape=(metadata.max_x_length, len(metadata.x_vocab)),
                    ),
                    self.embedding_layer,
                    tf.keras.layers.Masking(mask_value=0),
                    self._get_rnn_layer(),
                    tf.keras.layers.Dropout(
                        rate=self.config.dropout_rate, seed=self.config.dropout_seed
                    ),
                    tf.keras.layers.Dense(
                        len(metadata.y_vocab),
                        activation=self.config.final_activation_function,
                        kernel_regularizer=self.embedding_layer._get_kernel_regularizer(
                            scope="prediction_dense"
                        ),
                    ),
                ]
            )

    def _log_embedding_stats(self):
        mlflow.log_metric("num_features", self.embedding_layer.num_features)
        mlflow.log_metric(
            "num_hidden_features", self.embedding_layer.num_hidden_features
        )
        mlflow.log_metric("num_connections", self.embedding_layer.num_connections)

    def _get_rnn_layer(self):
        if self.config.rnn_type == "rnn":
            return tf.keras.layers.SimpleRNN(
                units=self.config.rnn_dim,
                kernel_regularizer=self.embedding_layer._get_kernel_regularizer(
                    scope="prediction_rnn"
                ),
                return_sequences=self.metadata.full_y_prediction,
                dropout=self.config.rnn_dropout,
            )
        elif self.config.rnn_type == "lstm":
            return tf.keras.layers.LSTM(
                units=self.config.rnn_dim,
                kernel_regularizer=self.embedding_layer._get_kernel_regularizer(
                    scope="prediction_rnn"
                ),
                return_sequences=self.metadata.full_y_prediction,
                dropout=self.config.rnn_dropout,
            )
        elif self.config.rnn_type == "gru":
            return tf.keras.layers.GRU(
                units=self.config.rnn_dim,
                kernel_regularizer=self.embedding_layer._get_kernel_regularizer(
                    scope="prediction_rnn"
                ),
                return_sequences=self.metadata.full_y_prediction,
                dropout=self.config.rnn_dropout,
            )
        else:
            logging.error("Unknown rnn layer type: %s", self.config.rnn_type)

    def train_dataset(
        self,
        train_dataset: tf.data.Dataset,
        test_dataset: tf.data.Dataset,
        multilabel_classification: bool,
        n_epochs: int,
    ):
        with self.strategy.scope():
            if self.metadata.full_y_prediction:
                self._compile_full_prediction(train_dataset)
            elif len(self.metadata.y_vocab) == 1:
                self._compile_singleclass()
            elif multilabel_classification:
                self._compile_multilabel(train_dataset)
            else:
                self._compile_multiclass(train_dataset)

            model_summary = []
            self.prediction_model.summary(print_fn=lambda x: model_summary.append(x))
            mlflow.log_text("\n".join(model_summary), artifact_file="model_summary.txt")

            self.history = self.prediction_model.fit(
                train_dataset,
                validation_data=test_dataset,
                epochs=n_epochs,
                callbacks=[
                    MLFlowCallback(),
                    BestModelRestoreCallback(
                        metric=self.config.best_model_metric,
                        minimize=self.config.best_model_metric_minimize,
                        early_stopping_epochs=self.config.early_stopping_epochs,
                    ),
                ],
            )

    def _compile_singleclass(self):
        self.metrics = [
            tf.keras.metrics.Accuracy(),
            tf.keras.metrics.BinaryAccuracy(),
            tf.keras.metrics.Precision(),
            tf.keras.metrics.Recall(),
            tf.keras.metrics.AUC(),
        ]
        self.prediction_model.compile(
            loss=self.config.loss,
            optimizer=self.config.optimizer,
            metrics=self.metrics,
        )

    def _compile_full_prediction(self, train_dataset: tf.data.Dataset):
        self.metrics = [
            MultilabelNestedMetric(
                nested_metric=tf.keras.metrics.CategoricalAccuracy(),
                name="categorical_accuracy",
                full_prediction=self.metadata.full_y_prediction,
            ),
            MultilabelNestedMetric(
                nested_metric=tf.keras.metrics.TopKCategoricalAccuracy(k=5),
                name="top_5_categorical_accuracy",
                full_prediction=self.metadata.full_y_prediction,
            ),
            MultilabelNestedMetric(
                nested_metric=tf.keras.metrics.TopKCategoricalAccuracy(k=10),
                name="top_10_categorical_accuracy",
                full_prediction=self.metadata.full_y_prediction,
            ),
            MultilabelNestedMetric(
                nested_metric=tf.keras.metrics.TopKCategoricalAccuracy(k=20),
                name="top_20_categorical_accuracy",
                full_prediction=self.metadata.full_y_prediction,
            ),
        ]
        metric_helper = PercentileSubsetMetricHelper(
            train_dataset,
            num_percentiles=self.config.metrics_num_percentiles,
            y_vocab=self.metadata.y_vocab,
            full_prediction=self.metadata.full_y_prediction,
        )
        for k in [5, 10, 20]:
            self.metrics = (
                self.metrics
                + metric_helper.get_accuracy_at_k_for(
                    k=k, is_multilabel=True, use_cumulative=True
                )
                + metric_helper.get_accuracy_at_k_for(
                    k=k, is_multilabel=True, use_cumulative=False
                )
            )

        self.prediction_model.compile(
            loss=full_prediction_binary_accuracy_loss,
            optimizer=self.config.optimizer,
            metrics=self.metrics,
        )

    def _compile_multilabel(self, train_dataset: tf.data.Dataset):
        self.metrics = [
            MultilabelNestedMetric(
                nested_metric=tf.keras.metrics.CategoricalAccuracy(),
                name="categorical_accuracy",
                full_prediction=self.metadata.full_y_prediction,
            ),
            MultilabelNestedMetric(
                nested_metric=tf.keras.metrics.TopKCategoricalAccuracy(k=5),
                name="top_5_categorical_accuracy",
                full_prediction=self.metadata.full_y_prediction,
            ),
            MultilabelNestedMetric(
                nested_metric=tf.keras.metrics.TopKCategoricalAccuracy(k=10),
                name="top_10_categorical_accuracy",
                full_prediction=self.metadata.full_y_prediction,
            ),
            MultilabelNestedMetric(
                nested_metric=tf.keras.metrics.TopKCategoricalAccuracy(k=20),
                name="top_20_categorical_accuracy",
                full_prediction=self.metadata.full_y_prediction,
            ),
        ]
        metric_helper = PercentileSubsetMetricHelper(
            train_dataset,
            num_percentiles=self.config.metrics_num_percentiles,
            y_vocab=self.metadata.y_vocab,
            full_prediction=self.metadata.full_y_prediction,
        )
        for k in [5, 10, 20]:
            self.metrics = (
                self.metrics
                + metric_helper.get_accuracy_at_k_for(
                    k=k, is_multilabel=True, use_cumulative=True
                )
                + metric_helper.get_accuracy_at_k_for(
                    k=k, is_multilabel=True, use_cumulative=False
                )
            )

        self.prediction_model.compile(
            loss=self.config.loss,
            optimizer=self.config.optimizer,
            metrics=self.metrics,
        )

    def _compile_multiclass(self, train_dataset: tf.data.Dataset):
        metric_helper = PercentileSubsetMetricHelper(
            train_dataset,
            num_percentiles=self.config.metrics_num_percentiles,
            y_vocab=self.metadata.y_vocab,
            full_prediction=self.metadata.full_y_prediction,
        )
        self.metrics = [
            tf.keras.metrics.CategoricalAccuracy(),
            tf.keras.metrics.TopKCategoricalAccuracy(
                k=5, name="top_5_categorical_accuracy"
            ),
            tf.keras.metrics.TopKCategoricalAccuracy(
                k=10, name="top_10_categorical_accuracy"
            ),
            tf.keras.metrics.TopKCategoricalAccuracy(
                k=20, name="top_20_categorical_accuracy"
            ),
        ]
        for k in [5, 10, 20]:
            self.metrics = (
                self.metrics
                + metric_helper.get_accuracy_at_k_for(
                    k=k, is_multilabel=False, use_cumulative=True
                )
                + metric_helper.get_accuracy_at_k_for(
                    k=k, is_multilabel=False, use_cumulative=False
                )
            )

        self.prediction_model.compile(
            loss=self.config.loss,
            optimizer=self.config.optimizer,
            metrics=self.metrics,
        )



### 4.3 knowledge_embedding

In [27]:
'''
from src.features.knowledge.base import BaseKnowledget
from .config import ModelConfig
from .base import BaseEmbedding
'''

class KnowledgeEmbedding(BaseEmbedding, tf.keras.Model):
    def __init__(
        self, knowledge: BaseKnowledge, config: ModelConfig, embedding_name: str
    ):
        super(KnowledgeEmbedding, self).__init__()
        self.embedding_name = embedding_name
        self.config = config

        self.num_features = len(knowledge.get_vocab())
        self.num_hidden_features = len(knowledge.get_extended_vocab()) - len(
            knowledge.get_vocab()
        )

        self.w = tf.keras.layers.Dense(
            self.config.attention_dim,
            use_bias=True,
            activation="tanh",
            kernel_regularizer=super()._get_kernel_regularizer(scope="attention"),
        )
        self.u = tf.keras.layers.Dense(
            1, use_bias=False, kernel_regularizer=super()._get_kernel_regularizer(scope="attention")
        )

        self._init_basic_embedding_variables(knowledge)
        self._init_connection_information(knowledge)

    def _init_basic_embedding_variables(self, knowledge: BaseKnowledge):
        logging.info("Initializing %s basic embedding variables", self.embedding_name)
        self.basic_feature_embeddings = self.add_weight(
            initializer=self._get_feature_initializer(
                knowledge.get_description_vocab(set(knowledge.get_vocab().values()))
            ),
            trainable=self.config.base_feature_embeddings_trainable,
            name="{}/basic_feature_embeddings".format(self.embedding_name),
            shape=(self.num_features, self.config.embedding_dim),
            regularizer=super()._get_kernel_regularizer(scope="base_embeddings"),
        )
        self.basic_hidden_embeddings = self.add_weight(
            initializer=self._get_hidden_initializer(
                knowledge.get_description_vocab(set(knowledge.get_vocab().values()))
            ),
            trainable=self.config.base_hidden_embeddings_trainable,
            name="{}/basic_hidden_embeddings".format(self.embedding_name),
            shape=(self.num_hidden_features, self.config.embedding_dim),
            regularizer=super()._get_kernel_regularizer(scope="base_embeddings"),
        )

    def _init_connection_information(self, knowledge: BaseKnowledge):
        logging.info("Initializing %s connection information", self.embedding_name)
        self.connections: Dict[int, List[int]] = {}
        self.connection_partition: List[
            int
        ] = []  # connection_partition[i] = j -> {connection i relevant for j}

        for idx in tqdm(
            range(self.num_features),
            desc="Initializing {} connections".format(self.embedding_name),
        ):
            connected_idxs = knowledge.get_connections_for_idx(idx)
            self.connections[idx] = sorted(list(connected_idxs))
            self.connection_partition = self.connection_partition + [idx] * len(
                connected_idxs
            )

        self.connection_indices = [
            v for _, v in sorted(self.connections.items(), key=lambda x: x[0])
        ]  # connection_indices[i,j] = k -> feature i is connected to feature k
        self.flattened_connection_indices = [
            x for sublist in self.connection_indices for x in sublist
        ]  # connection k is between connection_partition[k] and flattened_connection_indices[k]
        # connection_indices[i,j] = k -> connection_partition[l]=i, flattened_connection_indices[l]=k
        self.num_connections = len(self.flattened_connection_indices)

    def _load_connection_embedding_matrix(self):
        embeddings = tf.concat(
            [self.basic_feature_embeddings, self.basic_hidden_embeddings],
            axis=0,
            name="all_feature_embeddings",
        )  # shape: (num_all_features, embedding_size)
        return tf.gather(
            embeddings,
            self.flattened_connection_indices,
            name="connected_embeddings_per_connection",
        )  # shape: (num_connections, embedding_size)

    def _load_attention_embedding_matrix(self):
        connection_embedding_matrix = self._load_connection_embedding_matrix()
        feature_embedding_matrix = tf.gather(
            self.basic_feature_embeddings,
            self.connection_partition,
            axis=0,
            name="feature_embeddings_per_connection",
        )  # shape: (num_connections, embedding_size)
        return tf.concat(
            [feature_embedding_matrix, connection_embedding_matrix],
            axis=1,
            name="concatenated_connection_embeddings",
        )  # (num_connections, 2*embedding_size)

    def _calculate_attention_embeddings(self):
        connection_embedding_matrix = self._load_connection_embedding_matrix()
        attention_embedding_matrix = self._load_attention_embedding_matrix()

        scores = self.u(
            self.w(attention_embedding_matrix)
        )  # shape: (num_connections, 1)
        scores = tf.math.exp(scores)

        scores_per_feature = tf.ragged.stack_dynamic_partitions(
            scores,
            partitions=self.connection_partition,
            num_partitions=self.num_features,
            name="attention_scores_per_feature",
        )  # shape: (num_features, num_connections per feature)
        score_sum_per_feature = tf.reduce_sum(
            scores_per_feature, axis=1, name="attention_score_sum_per_feature",
        )  # shape: (num_features, 1)
        attention_weights = scores_per_feature / tf.expand_dims(
            score_sum_per_feature,
            axis=1,
            name="normalised_attention_scores_per_feature",
        )  # shape: (num_features, num_connections per feature)

        connections_per_feature = tf.ragged.stack_dynamic_partitions(
            connection_embedding_matrix,
            partitions=self.connection_partition,
            num_partitions=self.num_features,
            name="connection_embeddings_per_feature",
        )  # shape: (num_features, num_connections per feature, embedding_size)
        context_vector = (
            attention_weights * connections_per_feature
        )  # shape: (num_features, num_connections, embedding_size)
        context_vector = tf.reduce_sum(
            context_vector, axis=1, name="context_vector",
        )  # shape: (num_features, embedding_size)

        return (context_vector, attention_weights)

    def _final_embedding_matrix(self):
        context_vector, _ = self._calculate_attention_embeddings()
        return context_vector

    def call(
        self, values
    ):  # values shape: (dataset_size, max_sequence_length, num_features)
        embedding_matrix = self._final_embedding_matrix()
        return tf.linalg.matmul(
            values, embedding_matrix,
        )  # shape: (dataset_size, max_sequence_length, embedding_size)


### 4.4 gram

In [28]:
'''
from src.features.sequences.transformer import SequenceMetadata
from src.features.knowledge import HierarchyKnowledge
from .base import BaseModel
from .knowledge_embedding import KnowledgeEmbedding
from .config import ModelConfig
'''

class GramEmbedding(KnowledgeEmbedding):
    def __init__(self, knowledge: HierarchyKnowledge, config: ModelConfig):
        super(GramEmbedding, self).__init__(knowledge, config, "gram_embedding")


class GramModel(BaseModel):
    def _get_embedding_layer(
        self, metadata: SequenceMetadata, knowledge: HierarchyKnowledge
    ) -> GramEmbedding:
        return GramEmbedding(knowledge, self.config)


### 4.5 callback

In [29]:
class MLFlowCallback(tf.keras.callbacks.Callback):
    def on_epoch_end(self, epoch, logs=None):
        mlflow.log_metric("epoch", epoch)
        for log_key in logs.keys():
            mlflow.log_metric(key=log_key, value=logs[log_key], step=epoch)


class BestModelRestoreCallback(tf.keras.callbacks.Callback):
    def __init__(self, metric="val_loss", minimize=True, early_stopping_epochs=5):
        super(BestModelRestoreCallback, self).__init__()
        self.metric = metric
        self.minimize = minimize
        self.early_stopping_epochs = early_stopping_epochs

    def on_train_begin(self, logs=None):
        self.best_weights = None
        self.best_metric_value = np.Inf if self.minimize else np.NINF
        self.best_epoch = -1

    def on_epoch_end(self, epoch, logs=None):
        current_metric_value = logs.get(self.metric)
        if self._is_better(current_metric_value):
            logging.debug(
                "Model metric %s improved from %f to %f",
                self.metric,
                self.best_metric_value,
                current_metric_value,
            )
            self.best_metric_value = current_metric_value
            self.best_weights = self.model.get_weights()
            self.best_epoch = epoch
        elif self.early_stopping_epochs > -1 and self.best_epoch > -1:
            epochs_without_improvement = epoch - self.best_epoch
            if epochs_without_improvement > self.early_stopping_epochs:
                logging.info("Early stopping at epoch %d after waiting for %d epochs", epoch, epochs_without_improvement)
                self.model.stop_training = True

    def _is_better(self, current_metric_value):
        if self.minimize:
            return np.less(current_metric_value, self.best_metric_value)
        else:
            return np.greater(current_metric_value, self.best_metric_value)

    def on_train_end(self, logs=None):
        if self.best_epoch > -1:
            logging.info(
                "Restoring best model weights with %s: %f from epoch %d",
                self.metric,
                self.best_metric_value,
                self.best_epoch,
            )
            self.model.set_weights(self.best_weights)


## 5 Analysis

#### 5.1 embedding 
- EmbeddingHelper

#### 5.2 plotting 
- MetricPlotter

#### 5.3 confusion 
- ConfusionCalculator

#### 5.4 frequency 
- FrequencyCalculator

#### 5.5 predictions 
- PredictionOutputCalculator

### 5.1 Embedding

In [30]:
# from src.features.knowledge import BaseKnowledge
import io

class EmbeddingHelper:
    def __init__(self, knowledge: BaseKnowledge, embedding: tf.keras.Model):
        self.knowledge = knowledge
        self.embedding = embedding

    def load_base_embeddings(self):
        base_embeddings = {}
        base_embedding_matrix = self.embedding.basic_feature_embeddings
        for word, idx in self.knowledge.get_vocab().items():
            base_embeddings[word + "_base"] = (
                base_embedding_matrix[idx].numpy().flatten()
            )
        base_hidden_embedding_matrix = self.embedding.basic_hidden_embeddings
        hidden_vocab = self._load_hidden_vocab()
        for word, idx in hidden_vocab.items():
            base_embeddings[word + "_hidden"] = (
                base_hidden_embedding_matrix[idx - len(self.knowledge.get_vocab())].numpy().flatten()
            )

        return base_embeddings

    def _load_hidden_vocab(self) -> Dict[str, int]:
        return {
            key: value
            for (key, value) in self.knowledge.get_extended_vocab().items()
            if key not in self.knowledge.get_vocab()
        }

    def load_final_embeddings(self):
        final_embeddings = {}
        final_embedding_matrix = self.embedding._final_embedding_matrix()
        for word, idx in self.knowledge.get_vocab().items():
            final_embeddings[word] = final_embedding_matrix[idx].numpy().flatten()

        return final_embeddings

    def write_embeddings(
        self,
        vec_file_name: str = "data/vecs.tsv",
        meta_file_name: str = "data/meta.tsv",
        include_base_embeddings: bool = True,
    ):
        out_vecs = io.open(vec_file_name, "w", encoding="utf-8")
        out_meta = io.open(meta_file_name, "w", encoding="utf-8")
        embeddings = self.load_final_embeddings()
        if include_base_embeddings:
            embeddings = dict(
                list(embeddings.items()) + list(self.load_base_embeddings().items())
            )

        for word, vec in embeddings.items():
            out_vecs.write("\t".join([str(x) for x in vec]) + "\n")
            out_meta.write(word + "\n")

        out_vecs.close()
        out_meta.close()

    def load_attention_weights(self) -> Dict[str, Dict[str, str]]:
        return self._load_attention_weights(
            self._reverse_vocab(self.knowledge.get_extended_vocab())
        )

    def _reverse_vocab(self, vocab: Dict[str, int]) -> Dict[int, str]:
        return {v: k for k, v in vocab.items()}

    def write_attention_weights(self, file_name: str = "data/attention.json"):
        attention_weights = self.load_attention_weights()
        json_file = io.open(file_name, "w", encoding="utf-8")
        json_file.write(json.dumps({"attention_weights": attention_weights,}))
        json_file.close()

    def _load_attention_weights(
        self, vocab: Dict[int, str]
    ) -> Dict[str, Dict[str, str]]:
        attention_weights: Dict[str, Dict[str, str]] = {}
        _, attention_matrix = self.embedding._calculate_attention_embeddings()
        flattened_attention_matrix = [
            aw[0] for sublist in attention_matrix.numpy() for aw in sublist
        ]
        connection_indices = self.embedding.flattened_connection_indices
        connection_partition = self.embedding.connection_partition

        for connection_idx in range(len(connection_indices)):
            from_idx = connection_partition[connection_idx]
            to_idx = connection_indices[connection_idx]

            from_word = vocab[from_idx]
            to_word = vocab[to_idx]

            if from_word not in attention_weights:
                attention_weights[from_word] = {}

            attention_weights[from_word][to_word] = str(
                flattened_attention_matrix[connection_idx]
            )

        return attention_weights

    def _create_one_hot_vector_for(self, idx: int, total_length: int) -> tf.Tensor:
        vec = np.zeros(total_length)
        vec[idx] = 1
        return tf.expand_dims(tf.convert_to_tensor(vec, dtype="float32"), 0)

### 5.2 Confusion

In [31]:
# from ...features.sequences import SequenceMetadata
from sklearn.metrics import confusion_matrix 

class ConfusionCalculator:
    def __init__(self, metadata: SequenceMetadata, model: tf.keras.Model):
        self.y_vocab = metadata.y_vocab
        self.model = model

    def write_confusion_for_dataset(
        self, dataset: tf.data.Dataset, out_file_name: str = "data/confusion.csv"
    ):
        confusion_df = self._calculate_confusion_df_for_dataset(dataset)
        confusion_df.to_csv(out_file_name, index_label="true_label")

    def _calculate_confusion_df_for_dataset(
        self, dataset: tf.data.Dataset
    ) -> pd.DataFrame:
        confusion_matrix = self._calculate_confusion_matrix_for_dataset(dataset)
        sorted_features = [
            feature_name
            for (feature_name, _) in sorted(self.y_vocab.items(), key=lambda x: x[1])
        ]
        return pd.DataFrame(
            confusion_matrix, index=sorted_features, columns=sorted_features,
        )

    def _calculate_confusion_matrix_for_dataset(
        self, dataset: tf.data.Dataset
    ) -> np.array:
        num_labels = len(self.y_vocab)
        confusion_matrix = np.zeros(shape=(num_labels, num_labels), dtype=np.int32)
        for (x, y_true) in tqdm(
            dataset.as_numpy_iterator(), desc="Calculating confusion matrix..."
        ):
            y_pred = self.model(x).numpy()  # shape: (batch_size, num_labels)
            y_pred = self._convert_to_int_vector(y_pred)
            confusion_matrix = confusion_matrix + self._calculate_confusion_matrix(
                y_true, y_pred
            )

        return confusion_matrix

    def _convert_to_int_vector(self, y_pred: np.array) -> np.array:
        predicted_labels = np.argmax(y_pred, axis=1)  # size: batch_size
        y_vec = np.zeros(shape=y_pred.shape, dtype=np.int8)
        y_vec[np.arange(predicted_labels.size), predicted_labels] = 1
        return y_vec

    def _calculate_confusion_matrix(
        self, y_true: np.array, y_pred: np.array
    ) -> np.array:
        return confusion_matrix(
            y_true.argmax(axis=1), 
            y_pred.argmax(axis=1),
            labels=range(y_true.shape[1])
        )


### 5.3 Predictions

In [32]:
# from ...features.sequences import SequenceMetadata
from datetime import time

class PredictionOutputCalculator:
    def __init__(self, metadata: SequenceMetadata, model: tf.keras.Model):
        self.x_vocab = metadata.x_vocab
        self.x_vocab_reverse = {v: k for k, v in metadata.x_vocab.items()}
        self.y_vocab = metadata.y_vocab
        self.y_vocab_reverse = {v: k for k, v in metadata.y_vocab.items()}
        self.model = model
        self.metadata = metadata

    def write_prediction_output_for_dataset(
        self,
        dataset: tf.data.Dataset,
        out_file_name: str = "data/prediction_output.csv",
    ):
        prediction_output_df = (
            self._calculate_full_prediction_output_for_dataset(dataset)
            if self.metadata.full_y_prediction
            else self._calculate_prediction_output_for_dataset(dataset)
        )
        prediction_output_df.to_csv(out_file_name, index=False)

    def _calculate_prediction_output_for_dataset(
        self, dataset: tf.data.Dataset
    ) -> np.array:
        all_prediction_dfs = []
        for (x, y) in tqdm(
            dataset.as_numpy_iterator(), desc="Calculating prediction outputs..."
        ):
            x_words = self._transform_to_words_x(x)
            y_words = self._transform_to_words_y(y)
            y_pred = self.model(x).numpy()
            predictions = self._transform_to_words_per_prediction(y_pred)
            all_prediction_dfs.append(
                pd.DataFrame(
                    {"input": x_words, "output": y_words, "predictions": predictions,}
                )
            )

        return pd.concat(all_prediction_dfs, ignore_index=True)

    def _calculate_full_prediction_output_for_dataset(
        self, dataset: tf.data.Dataset
    ) -> np.array:
        all_prediction_dfs = []
        for (x, y) in tqdm(
            dataset.as_numpy_iterator(), desc="Calculating prediction outputs..."
        ):
            x_words = self._transform_to_words_x(x)
            y_words = self._transform_to_words_wide(y, self.y_vocab_reverse)
            y_pred = self.model(x).numpy()
            predictions = self._transform_to_words_per_prediction_wide(y_pred, y_true=y)
            all_prediction_dfs.append(
                pd.DataFrame(
                    {"input": x_words, "output": y_words, "predictions": predictions,}
                )
            )

        return pd.concat(all_prediction_dfs, ignore_index=True)

    def _transform_to_words_x(self, x: tf.Tensor) -> List[Dict[int, List[str]]]:
        return self._transform_to_words_wide(x, self.x_vocab_reverse)

    def _transform_to_words_wide(
        self, x: tf.Tensor, reverse_vocab: Dict[int, str]
    ) -> List[Dict[int, List[str]]]:
        words_per_idx: Dict[int, Dict[int, List[str]]] = {
            idx: {} for idx in range(x.shape[0])
        }
        all_indices = np.argwhere(x == 1)
        for idx in range(all_indices.shape[0]):
            indices = all_indices[idx]
            batch_idx = indices[0]
            sequence_idx = indices[1]
            feature_idx = indices[2]

            if batch_idx not in words_per_idx:
                words_per_idx[batch_idx] = {}
            if sequence_idx not in words_per_idx[batch_idx]:
                words_per_idx[batch_idx][sequence_idx] = []

            words_per_idx[batch_idx][sequence_idx].append(reverse_vocab[feature_idx])

        return [words for _, words in sorted(words_per_idx.items(), key=lambda x: x[0])]

    def _transform_to_words_per_prediction_wide(
        self, y_pred: tf.Tensor, y_true: tf.Tensor
    ):
        predictions_per_idx: Dict[int, Dict[int, Dict[str, float]]] = {
            idx: {} for idx in range(y_pred.shape[0])
        }
        for batch_idx in range(y_pred.shape[0]):
            predictions_per_idx[batch_idx] = {idx: {} for idx in range(y_pred.shape[1])}
            for time_idx in range(y_pred.shape[1]):
                has_positive_feature = False
                for feature_idx in range(y_pred.shape[2]):
                    if y_true[batch_idx][time_idx][feature_idx] == 1:
                        has_positive_feature = True
                    predictions_per_idx[batch_idx][time_idx][
                        self.y_vocab_reverse[feature_idx]
                    ] = y_pred[batch_idx][time_idx][feature_idx]
                if not has_positive_feature:
                    predictions_per_idx[batch_idx][time_idx] = {}
                    break

        return [
            predictions
            for _, predictions in sorted(
                predictions_per_idx.items(), key=lambda x: x[0]
            )
        ]

    def _transform_to_words_per_prediction(self, y_pred: tf.Tensor):
        predictions_per_idx: Dict[int, Dict[str, float]] = {
            idx: {} for idx in range(y_pred.shape[0])
        }
        for batch_idx in range(y_pred.shape[0]):
            for feature_idx in range(y_pred.shape[1]):
                predictions_per_idx[batch_idx][
                    self.y_vocab_reverse[feature_idx]
                ] = y_pred[batch_idx][feature_idx]

        return [
            predictions
            for _, predictions in sorted(
                predictions_per_idx.items(), key=lambda x: x[0]
            )
        ]

    def _transform_to_words_y(self, y: tf.Tensor):
        words_per_idx: Dict[int, List[str]] = {idx: [] for idx in range(y.shape[0])}
        all_indices = np.argwhere(y == 1)
        for idx in range(all_indices.shape[0]):
            indices = all_indices[idx]
            batch_idx = indices[0]
            feature_idx = indices[1]

            if batch_idx not in words_per_idx:
                words_per_idx[batch_idx] = []

            words_per_idx[batch_idx].append(self.y_vocab_reverse[feature_idx])

        return [words for _, words in sorted(words_per_idx.items(), key=lambda x: x[0])]



### 5.4 Frequency

In [33]:
# from ...features.sequences import SequenceMetadata

class FrequencyCalculator:
    def __init__(self, metadata: SequenceMetadata):
        self.x_vocab = metadata.x_vocab

    def write_frequency_for_dataset(
        self, dataset: tf.data.Dataset, out_file_name: str = "data/frequency.csv"
    ):
        frequency_df = self._calculate_frequency_df_for_dataset(dataset)
        frequency_df.to_csv(out_file_name, index_label="feature")

    def _calculate_frequency_df_for_dataset(
        self, dataset: tf.data.Dataset
    ) -> pd.DataFrame:
        frequencies = self._calculate_frequencies_for_dataset(dataset)
        sorted_features = [
            feature_name
            for (feature_name, _) in sorted(self.x_vocab.items(), key=lambda x: x[1])
        ]
        return pd.DataFrame(
            frequencies, index=sorted_features, columns=["absolue_frequency"],
        )

    def _calculate_frequencies_for_dataset(self, dataset: tf.data.Dataset) -> np.array:
        num_labels = len(self.x_vocab)
        frequencies = np.zeros(shape=(num_labels,), dtype=np.int32)
        for (x, _) in tqdm(
            dataset.as_numpy_iterator(), desc="Calculating x frequencies..."
        ):
            frequencies = frequencies + self._calculate_frequencies(x)

        return frequencies

    def _calculate_frequencies(
        self, x: np.array  # shape: (batch_size, num_steps, num_features)
    ) -> np.array:
        summed_batch = np.sum(x, axis=1, dtype=np.int32)  # shape: (batch_size, num_features)
        return np.sum(summed_batch, axis=0, dtype=np.int32)  # shape: (num_features,)



### 5.5 Plotting

In [34]:
# from ..models import BaseModel

class MetricPlotter:
    def __init__(self, model: BaseModel, plot_path: str = 'plots/'):
        self.model = model
        self.plot_path = plot_path
    
    def plot_all_metrics(self):
        self._plot_metric('loss')

        for metric in self.model.metrics:
            self._plot_metric(metric.name)

    def _plot_metric(self, metric_name: str):
        history = self.model.history.history

        plt.figure(figsize=(20, 10))
        plt.title(metric_name)
        plt.xlabel('epoch')
        plt.ylabel(metric_name)
        plt.plot(history[metric_name])
        if ('val_' + metric_name) in history:
            plt.plot(history['val_' + metric_name])
            plt.legend(['train', 'val'], loc='upper left')
        plt.savefig(self.plot_path + metric_name + '.png')


# Run DomainML

In [35]:
class ExperimentRunner:
    sequence_df_pkl_file: str = "data/sequences_df.pkl"

    def __init__(self, run_id: str):
        self.run_id = run_id
        self.config = ExperimentConfig()
        self.multilabel_classification = self.config.multilabel_classification

    def run(self):
        logging.info("Starting run %s", self.run_id)
        tf.random.set_seed(self.config.tensorflow_seed)
        random.seed(self.config.random_seed)
        sequence_df = self._load_sequences()
        if self.config.max_data_size > 0 and self.config.max_data_size < len(
            sequence_df
        ):
            logging.info(
                "Only using first %d rows of sequence_df with %d rows",
                self.config.max_data_size,
                len(sequence_df),
            )
            sequence_df = sequence_df[0 : self.config.max_data_size]

        metadata = self._collect_sequence_metadata(sequence_df)
        (train_dataset, test_dataset) = self._create_dataset(sequence_df)
        (knowledge, model) = self._load_model(metadata)
        knowledge = self._build_model(metadata, knowledge, model)

        model.train_dataset(
            train_dataset,
            test_dataset,
            self.multilabel_classification,
            self.config.n_epochs,
        )

        self._log_dataset_info(train_dataset, test_dataset, metadata)
        '''
        self._generate_artifacts(
            metadata, train_dataset, test_dataset, knowledge, model
        )
        '''
        self._set_mlflow_tags(metadata)
        plt.close("all")
        logging.info("Finished run %s", self.run_id)

    def _log_dataset_info(
        self,
        train_dataset: tf.data.Dataset,
        test_dataset: tf.data.Dataset,
        metadata: SequenceMetadata,
    ):
        mlflow.log_metric("train_size", len([x for x in train_dataset]))
        mlflow.log_metric("test_size", len([x for x in test_dataset]))
        mlflow.log_metric("x_vocab_size", len(metadata.x_vocab))
        mlflow.log_metric("y_vocab_size", len(metadata.y_vocab))

    def _set_mlflow_tags(self, metadata: SequenceMetadata):
        mlflow.set_tag("sequence_type", self.config.sequence_type)
        mlflow.set_tag("model_type", self.config.model_type)
        if len(metadata.y_vocab) == 1:
            mlflow.set_tag("task_type", "risk_prediction")
        else:
            mlflow.set_tag("task_type", "sequence_prediction")

    '''
    def _generate_artifacts(
        self,
        metadata: SequenceMetadata,
        train_dataset: tf.data.Dataset,
        test_dataset: tf.data.Dataset,
        knowledge: Any,
        model: BaseModel,
    ):
        artifact_dir = "artifacts/run_{}/".format(self.run_id)
        artifact_path = Path(artifact_dir)
        if not artifact_path.exists():
            artifact_path.mkdir()

        self._generate_metric_artifacts(artifact_dir, model)
        self._generate_embedding_artifacts(artifact_dir, knowledge, model)
        self._generate_confusion_artifacts(artifact_dir, metadata, model, test_dataset)
        self._generate_frequency_artifacts(artifact_dir, metadata, train_dataset)
        mlflow.log_artifacts(artifact_dir)
    '''
    
    def _generate_metric_artifacts(
        self, artifact_dir: str, model: BaseModel,
    ):
        metric_plotter = MetricPlotter(model, plot_path=artifact_dir)
        metric_plotter.plot_all_metrics()

    def _generate_frequency_artifacts(
        self,
        artifact_dir: str,
        metadata: SequenceMetadata,
        train_dataset: tf.data.Dataset,
    ):
        frequency_calculator = FrequencyCalculator(metadata)
        frequency_calculator.write_frequency_for_dataset(
            train_dataset, out_file_name=artifact_dir + "train_frequency.csv"
        )

    def _generate_confusion_artifacts(
        self,
        artifact_dir: str,
        metadata: SequenceMetadata,
        model: BaseModel,
        test_dataset: tf.data.Dataset,
    ):
        prediction_output_calculator = PredictionOutputCalculator(
            metadata, model.prediction_model,
        )
        prediction_output_calculator.write_prediction_output_for_dataset(
            test_dataset, out_file_name=artifact_dir + "prediction_output.csv",
        )

        mlflow.log_dict(metadata.x_vocab, "x_vocab.json")
        mlflow.log_dict(metadata.y_vocab, "y_vocab.json")

    def _generate_embedding_artifacts(
        self,
        artifact_dir: str,
        knowledge: BaseKnowledge,
        model: BaseModel,
    ):
        embedding_helper = EmbeddingHelper(knowledge, model.embedding_layer)
        if self.config.model_type in ["simple", "text_paper"]:
            embedding_helper.write_embeddings(
                vec_file_name=artifact_dir + "vecs.tsv",
                meta_file_name=artifact_dir + "meta.tsv",
                include_base_embeddings=False,
            )
        else:
            embedding_helper.write_embeddings(
                vec_file_name=artifact_dir + "vecs.tsv",
                meta_file_name=artifact_dir + "meta.tsv",
                include_base_embeddings=True,
            )
            embedding_helper.write_attention_weights(
                file_name=artifact_dir + "attention.json",
            )

    def _create_dataset(
        self, sequence_df: pd.DataFrame
    ) -> Tuple[tf.data.Dataset, tf.data.Dataset]:
        if self.config.use_dataset_generator:
            sequence_df.to_pickle(self.sequence_df_pkl_file)
            train_dataset = (
                tf.data.Dataset.from_generator(
                    generate_train,
                    args=(self.sequence_df_pkl_file, self.sequence_column_name),
                    output_types=(tf.float32, tf.float32),
                )
                .cache(self._get_cache_file_name(is_test=False))
                .shuffle(
                    self.config.dataset_shuffle_buffer,
                    seed=self.config.dataset_shuffle_seed,
                    reshuffle_each_iteration=True,
                )
                .batch(self.config.batch_size)
                .prefetch(tf.data.experimental.AUTOTUNE)
            )
            test_dataset = (
                tf.data.Dataset.from_generator(
                    generate_test,
                    args=(self.sequence_df_pkl_file, self.sequence_column_name),
                    output_types=(tf.float32, tf.float32),
                )
                .cache(self._get_cache_file_name(is_test=True))
                .batch(self.config.batch_size)
                .prefetch(tf.data.experimental.AUTOTUNE)
            )

            return (train_dataset, test_dataset)
        else:
            transformer = load_sequence_transformer()
            split = transform_train_test_split(
                sequence_df, self.sequence_column_name
            )
            train_dataset = (
                tf.data.Dataset.from_tensor_slices((split.train_x, split.train_y),)
                .batch(self.config.batch_size)
                .prefetch(tf.data.experimental.AUTOTUNE)
                .cache()
                .shuffle(
                    self.config.dataset_shuffle_buffer,
                    seed=self.config.dataset_shuffle_seed,
                    reshuffle_each_iteration=True,
                )
            )
            test_dataset = (
                tf.data.Dataset.from_tensor_slices((split.test_x, split.test_y),)
                .batch(self.config.batch_size)
                .prefetch(tf.data.experimental.AUTOTUNE)
                .cache()
            )

            return (train_dataset, test_dataset)

    def _get_cache_file_name(self, is_test: bool) -> str:
        if len(self.config.dataset_generator_cache_file) < 1:
            return ""
        else:
            return self.config.dataset_generator_cache_file + (
                "_test" if is_test else "_train"
            )

    def _build_model(
        self,
        metadata: SequenceMetadata,
        base_knowledge: BaseKnowledge,
        model: BaseModel,
    ) -> BaseKnowledge:
        if (
            self.config.noise_to_add > 0
            or self.config.noise_to_remove > 0
            or self.config.attention_noise_to_remove > 0
        ):
            noise_knowledge = NoiseKnowledge(base_knowledge)
            noise_knowledge.remove_lowest_connections(
                percentage=self.config.attention_noise_to_remove,
                connections_reference_file=self.config.attention_weight_reference_file,
            )
            noise_knowledge.add_random_connections(percentage=self.config.noise_to_add)
            noise_knowledge.remove_random_connections(
                percentage=self.config.noise_to_remove
            )

            mlflow.set_tag(
                "noise_type",
                "added{}_removed{}_threshold{}".format(
                    self.config.noise_to_add,
                    self.config.noise_to_remove,
                    self.config.attention_noise_to_remove,
                ),
            )
            (
                original_connections_text,
                noise_connections_text,
            ) = noise_knowledge.get_text_connections()
            mlflow.log_dict(
                original_connections_text, "original_knowledge.json",
            )
            mlflow.log_dict(
                noise_connections_text, "noise_knowledge.json",
            )
            model.build(metadata, noise_knowledge)
            return noise_knowledge
        model.build(metadata, base_knowledge)
        return base_knowledge

    def _load_model(
        self, metadata: SequenceMetadata
    ) -> Tuple[BaseKnowledge, BaseModel]:
        model: BaseModel
        if self.config.model_type == "simple":
            base_knowledge = BaseKnowledge(
                config=KnowledgeConfig(),
            )
            base_knowledge.vocab = metadata.x_vocab
            base_knowledge.extended_vocab = metadata.x_vocab
            model = models.SimpleModel()
            return (base_knowledge, model)

        elif self.config.model_type == "gram" or self.config.model_type == "hierarchy":
            hierarchy = self._load_hierarchy_knowledge(metadata)
            model = GramModel()
            return (hierarchy, model)
        
        else:
            logging.fatal("Unknown model type %s", self.config.model_type)
            raise InputError(
                message="Unknown model type: " + str(self.config.model_type)
            )
    
    def _load_hierarchy_knowledge(
        self, metadata: SequenceMetadata
    ) -> HierarchyKnowledge:
        hierarchy_preprocessor: Preprocessor
        if self.config.sequence_type == "mimic":
            mimic_config = MimicPreprocessorConfig()
            hierarchy_preprocessor = ICD9HierarchyPreprocessor(
                config=mimic_config
            )
            hierarchy_df = hierarchy_preprocessor.load_data()
            hierarchy = HierarchyKnowledge(
                config=KnowledgeConfig(),
            )
            hierarchy.build_hierarchy_from_df(hierarchy_df, metadata.x_vocab)
            return hierarchy
        else:
            logging.fatal(
                "Hierarchy knowledge not available for data type %s",
                self.config.sequence_type,
            )
            raise InputError(
                message="Hierarchy knowledge not available for data type: "
                + str(self.config.sequence_type)
            )

    def _load_sequences(self) -> pd.DataFrame:
        sequence_preprocessor: Preprocessor

        if self.config.sequence_type == "mimic":
            mimic_config = MimicPreprocessorConfig()
            sequence_preprocessor = MimicPreprocessor(
                config=mimic_config,
            )
            self.sequence_column_name = mimic_config.sequence_column_name
            return sequence_preprocessor.load_data()
        else:
            logging.fatal("Unknown data type %s", self.config.sequence_type)
            raise InputError(
                message="Unknown data type: " + str(self.config.sequence_type)
            )

    def _collect_sequence_metadata(
        self, sequence_df: pd.DataFrame
    ) -> SequenceMetadata:
        if self.config.max_data_size > 0:
            logging.debug(
                "Using subset of length %d instead total df of length %d",
                self.config.max_data_size,
                len(sequence_df),
            )
            sequence_df = sequence_df[0 : self.config.max_data_size]

        transformer = load_sequence_transformer()
        if not transformer.config.flatten_y:
            self.multilabel_classification = False
        return transformer.collect_metadata(sequence_df, self.sequence_column_name)


class InputError(Exception):
    """Exception raised for errors in the input."""

    def __init__(self, message):
        self.message = message

In [36]:
logging.basicConfig(level=logging.DEBUG)
logging.getLogger("matplotlib.font_manager").disabled = True
mlflow.set_experiment("Domain Guided Monitoring")
with mlflow.start_run() as run:
    _log_all_configs_to_mlflow()
    runner = ExperimentRunner(run.info.run_id)
    runner.run()
    print(run.info.run_id) 

DEBUG:git.cmd:Popen(['git', 'version'], cwd=E:\Domain-Guided-Monitoring\GRAM, universal_newlines=False, shell=None, istream=None)
DEBUG:git.cmd:Popen(['git', 'version'], cwd=E:\Domain-Guided-Monitoring\GRAM, universal_newlines=False, shell=None, istream=None)
DEBUG:git.util:Failed checking if running in CYGWIN due to: FileNotFoundError(2, '系统找不到指定的文件。', None, 2, None)
INFO:root:Starting run c2954be6e57b4a3398e5beaa334f27f7
INFO:root:Starting to preprocess MIMIC dataset
INFO:root:Reading admission_df from data\ADMISSIONS.csv
INFO:root:Reading diagnosis_df from data\DIAGNOSES_ICD.csv
INFO:root:Trying to read icd9_df from data\icd9.csv
INFO:root:Trying to read icd9_hierarchy_df from data\hierarchy_icd9.csv
INFO:numexpr.utils:Note: NumExpr detected 12 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
INFO:numexpr.utils:NumExpr defaulting to 8 threads.
INFO:root:Starting to preprocess ICD9 hierarchy
INFO:root:Trying to read icd9_df from data\icd9.csv
Building Hierarchy 

DEBUG:root:Ignoring node 011.24 as not in dataset
DEBUG:root:Ignoring node 011.25 as not in dataset
DEBUG:root:Ignoring node 011.26 as not in dataset
DEBUG:root:Ignoring node 011.3 as not in dataset
DEBUG:root:Ignoring node 011.30 as not in dataset
DEBUG:root:Ignoring node 011.31 as not in dataset
DEBUG:root:Ignoring node 011.32 as not in dataset
DEBUG:root:Ignoring node 011.33 as not in dataset
DEBUG:root:Ignoring node 011.34 as not in dataset
DEBUG:root:Ignoring node 011.35 as not in dataset
DEBUG:root:Ignoring node 011.36 as not in dataset
DEBUG:root:Ignoring node 011.4 as not in dataset
DEBUG:root:Ignoring node 011.40 as not in dataset
DEBUG:root:Ignoring node 011.41 as not in dataset
DEBUG:root:Ignoring node 011.42 as not in dataset
DEBUG:root:Ignoring node 011.43 as not in dataset
DEBUG:root:Ignoring node 011.44 as not in dataset
DEBUG:root:Ignoring node 011.45 as not in dataset
DEBUG:root:Ignoring node 011.46 as not in dataset
DEBUG:root:Ignoring node 011.5 as not in dataset
DEB

DEBUG:root:Ignoring node 013.90 as not in dataset
DEBUG:root:Ignoring node 013.91 as not in dataset
DEBUG:root:Ignoring node 013.92 as not in dataset
DEBUG:root:Ignoring node 013.93 as not in dataset
DEBUG:root:Ignoring node 013.94 as not in dataset
DEBUG:root:Ignoring node 013.95 as not in dataset
DEBUG:root:Ignoring node 013.96 as not in dataset
DEBUG:root:Ignoring node 014 as not in dataset
DEBUG:root:Ignoring node 014 as not in dataset
DEBUG:root:Ignoring node 014.0 as not in dataset
DEBUG:root:Ignoring node 014.00 as not in dataset
DEBUG:root:Ignoring node 014.01 as not in dataset
DEBUG:root:Ignoring node 014.02 as not in dataset
DEBUG:root:Ignoring node 014.03 as not in dataset
DEBUG:root:Ignoring node 014.04 as not in dataset
DEBUG:root:Ignoring node 014.05 as not in dataset
DEBUG:root:Ignoring node 014.06 as not in dataset
DEBUG:root:Ignoring node 014.8 as not in dataset
DEBUG:root:Ignoring node 014.80 as not in dataset
DEBUG:root:Ignoring node 014.81 as not in dataset
DEBUG:ro

DEBUG:root:Ignoring node 016.96 as not in dataset
DEBUG:root:Ignoring node 017 as not in dataset
DEBUG:root:Ignoring node 017 as not in dataset
DEBUG:root:Ignoring node 017.0 as not in dataset
DEBUG:root:Ignoring node 017.00 as not in dataset
DEBUG:root:Ignoring node 017.01 as not in dataset
DEBUG:root:Ignoring node 017.02 as not in dataset
DEBUG:root:Ignoring node 017.03 as not in dataset
DEBUG:root:Ignoring node 017.04 as not in dataset
DEBUG:root:Ignoring node 017.05 as not in dataset
DEBUG:root:Ignoring node 017.06 as not in dataset
DEBUG:root:Ignoring node 017.1 as not in dataset
DEBUG:root:Ignoring node 017.10 as not in dataset
DEBUG:root:Ignoring node 017.11 as not in dataset
DEBUG:root:Ignoring node 017.12 as not in dataset
DEBUG:root:Ignoring node 017.13 as not in dataset
DEBUG:root:Ignoring node 017.14 as not in dataset
DEBUG:root:Ignoring node 017.15 as not in dataset
DEBUG:root:Ignoring node 017.16 as not in dataset
DEBUG:root:Ignoring node 017.2 as not in dataset
DEBUG:roo

DEBUG:root:Ignoring node 031.9 as not in dataset
DEBUG:root:Ignoring node 032.0 as not in dataset
DEBUG:root:Ignoring node 032.1 as not in dataset
DEBUG:root:Ignoring node 032.2 as not in dataset
DEBUG:root:Ignoring node 032.3 as not in dataset
DEBUG:root:Ignoring node 032.8 as not in dataset
DEBUG:root:Ignoring node 032.81 as not in dataset
DEBUG:root:Ignoring node 032.82 as not in dataset
DEBUG:root:Ignoring node 032.83 as not in dataset
DEBUG:root:Ignoring node 032.84 as not in dataset
DEBUG:root:Ignoring node 032.85 as not in dataset
DEBUG:root:Ignoring node 032.89 as not in dataset
DEBUG:root:Ignoring node 032.9 as not in dataset
DEBUG:root:Ignoring node 033.0 as not in dataset
DEBUG:root:Ignoring node 033.1 as not in dataset
DEBUG:root:Ignoring node 033.8 as not in dataset
DEBUG:root:Ignoring node 033.9 as not in dataset
DEBUG:root:Ignoring node 034.0 as not in dataset
DEBUG:root:Ignoring node 034.1 as not in dataset
DEBUG:root:Ignoring node 036.0 as not in dataset
DEBUG:root:Ign

DEBUG:root:Ignoring node 053.0 as not in dataset
DEBUG:root:Ignoring node 053.1 as not in dataset
DEBUG:root:Ignoring node 053.10 as not in dataset
DEBUG:root:Ignoring node 053.11 as not in dataset
DEBUG:root:Ignoring node 053.12 as not in dataset
DEBUG:root:Ignoring node 053.13 as not in dataset
DEBUG:root:Ignoring node 053.14 as not in dataset
DEBUG:root:Ignoring node 053.19 as not in dataset
DEBUG:root:Ignoring node 053.2 as not in dataset
DEBUG:root:Ignoring node 053.20 as not in dataset
DEBUG:root:Ignoring node 053.21 as not in dataset
DEBUG:root:Ignoring node 053.22 as not in dataset
DEBUG:root:Ignoring node 053.29 as not in dataset
DEBUG:root:Ignoring node 053.7 as not in dataset
DEBUG:root:Ignoring node 053.71 as not in dataset
DEBUG:root:Ignoring node 053.79 as not in dataset
DEBUG:root:Ignoring node 053.8 as not in dataset
DEBUG:root:Ignoring node 053.9 as not in dataset
DEBUG:root:Ignoring node 054.0 as not in dataset
DEBUG:root:Ignoring node 054.1 as not in dataset
DEBUG:ro

DEBUG:root:Ignoring node 070.53 as not in dataset
DEBUG:root:Ignoring node 070.54 as not in dataset
DEBUG:root:Ignoring node 070.59 as not in dataset
DEBUG:root:Ignoring node 070.6 as not in dataset
DEBUG:root:Ignoring node 070.7 as not in dataset
DEBUG:root:Ignoring node 070.70 as not in dataset
DEBUG:root:Ignoring node 070.71 as not in dataset
DEBUG:root:Ignoring node 070.9 as not in dataset
DEBUG:root:Ignoring node 071 as not in dataset
DEBUG:root:Ignoring node 071 as not in dataset
DEBUG:root:Ignoring node 072 as not in dataset
DEBUG:root:Ignoring node 072 as not in dataset
DEBUG:root:Ignoring node 072.0 as not in dataset
DEBUG:root:Ignoring node 072.1 as not in dataset
DEBUG:root:Ignoring node 072.2 as not in dataset
DEBUG:root:Ignoring node 072.3 as not in dataset
DEBUG:root:Ignoring node 072.7 as not in dataset
DEBUG:root:Ignoring node 072.71 as not in dataset
DEBUG:root:Ignoring node 072.72 as not in dataset
DEBUG:root:Ignoring node 072.79 as not in dataset
DEBUG:root:Ignoring 

DEBUG:root:Ignoring node 090.6 as not in dataset
DEBUG:root:Ignoring node 090.7 as not in dataset
DEBUG:root:Ignoring node 090.9 as not in dataset
DEBUG:root:Ignoring node 091.0 as not in dataset
DEBUG:root:Ignoring node 091.1 as not in dataset
DEBUG:root:Ignoring node 091.2 as not in dataset
DEBUG:root:Ignoring node 091.3 as not in dataset
DEBUG:root:Ignoring node 091.4 as not in dataset
DEBUG:root:Ignoring node 091.5 as not in dataset
DEBUG:root:Ignoring node 091.50 as not in dataset
DEBUG:root:Ignoring node 091.51 as not in dataset
DEBUG:root:Ignoring node 091.52 as not in dataset
DEBUG:root:Ignoring node 091.6 as not in dataset
DEBUG:root:Ignoring node 091.61 as not in dataset
DEBUG:root:Ignoring node 091.62 as not in dataset
DEBUG:root:Ignoring node 091.69 as not in dataset
DEBUG:root:Ignoring node 091.7 as not in dataset
DEBUG:root:Ignoring node 091.8 as not in dataset
DEBUG:root:Ignoring node 091.81 as not in dataset
DEBUG:root:Ignoring node 091.82 as not in dataset
DEBUG:root:I

DEBUG:root:Ignoring node 104.9 as not in dataset
DEBUG:root:Ignoring node 110.0 as not in dataset
DEBUG:root:Ignoring node 110.1 as not in dataset
DEBUG:root:Ignoring node 110.2 as not in dataset
DEBUG:root:Ignoring node 110.3 as not in dataset
DEBUG:root:Ignoring node 110.4 as not in dataset
DEBUG:root:Ignoring node 110.5 as not in dataset
DEBUG:root:Ignoring node 110.6 as not in dataset
DEBUG:root:Ignoring node 110.8 as not in dataset
DEBUG:root:Ignoring node 110.9 as not in dataset
DEBUG:root:Ignoring node 111.0 as not in dataset
DEBUG:root:Ignoring node 111.1 as not in dataset
DEBUG:root:Ignoring node 111.2 as not in dataset
DEBUG:root:Ignoring node 111.3 as not in dataset
DEBUG:root:Ignoring node 111.8 as not in dataset
DEBUG:root:Ignoring node 111.9 as not in dataset
DEBUG:root:Ignoring node 112.0 as not in dataset
DEBUG:root:Ignoring node 112.1 as not in dataset
DEBUG:root:Ignoring node 112.2 as not in dataset
DEBUG:root:Ignoring node 112.3 as not in dataset
DEBUG:root:Ignoring 

DEBUG:root:Ignoring node 131.8 as not in dataset
DEBUG:root:Ignoring node 131.9 as not in dataset
DEBUG:root:Ignoring node 132.0 as not in dataset
DEBUG:root:Ignoring node 132.1 as not in dataset
DEBUG:root:Ignoring node 132.2 as not in dataset
DEBUG:root:Ignoring node 132.3 as not in dataset
DEBUG:root:Ignoring node 132.9 as not in dataset
DEBUG:root:Ignoring node 133.0 as not in dataset
DEBUG:root:Ignoring node 133.8 as not in dataset
DEBUG:root:Ignoring node 133.9 as not in dataset
DEBUG:root:Ignoring node 134 as not in dataset
DEBUG:root:Ignoring node 134 as not in dataset
DEBUG:root:Ignoring node 134.0 as not in dataset
DEBUG:root:Ignoring node 134.1 as not in dataset
DEBUG:root:Ignoring node 134.2 as not in dataset
DEBUG:root:Ignoring node 134.8 as not in dataset
DEBUG:root:Ignoring node 134.9 as not in dataset
DEBUG:root:Ignoring node 136.0 as not in dataset
DEBUG:root:Ignoring node 136.1 as not in dataset
DEBUG:root:Ignoring node 136.2 as not in dataset
DEBUG:root:Ignoring node

DEBUG:root:Ignoring node 160.9 as not in dataset
DEBUG:root:Ignoring node 161.0 as not in dataset
DEBUG:root:Ignoring node 161.1 as not in dataset
DEBUG:root:Ignoring node 161.2 as not in dataset
DEBUG:root:Ignoring node 161.3 as not in dataset
DEBUG:root:Ignoring node 161.8 as not in dataset
DEBUG:root:Ignoring node 161.9 as not in dataset
DEBUG:root:Ignoring node 162.0 as not in dataset
DEBUG:root:Ignoring node 162.2 as not in dataset
DEBUG:root:Ignoring node 162.3 as not in dataset
DEBUG:root:Ignoring node 162.4 as not in dataset
DEBUG:root:Ignoring node 162.5 as not in dataset
DEBUG:root:Ignoring node 162.8 as not in dataset
DEBUG:root:Ignoring node 162.9 as not in dataset
DEBUG:root:Ignoring node 163.0 as not in dataset
DEBUG:root:Ignoring node 163.1 as not in dataset
DEBUG:root:Ignoring node 163.8 as not in dataset
DEBUG:root:Ignoring node 163.9 as not in dataset
DEBUG:root:Ignoring node 164.0 as not in dataset
DEBUG:root:Ignoring node 164.1 as not in dataset
DEBUG:root:Ignoring 

DEBUG:root:Ignoring node 188.3 as not in dataset
DEBUG:root:Ignoring node 188.4 as not in dataset
DEBUG:root:Ignoring node 188.5 as not in dataset
DEBUG:root:Ignoring node 188.6 as not in dataset
DEBUG:root:Ignoring node 188.7 as not in dataset
DEBUG:root:Ignoring node 188.8 as not in dataset
DEBUG:root:Ignoring node 188.9 as not in dataset
DEBUG:root:Ignoring node 189.0 as not in dataset
DEBUG:root:Ignoring node 189.1 as not in dataset
DEBUG:root:Ignoring node 189.2 as not in dataset
DEBUG:root:Ignoring node 189.3 as not in dataset
DEBUG:root:Ignoring node 189.4 as not in dataset
DEBUG:root:Ignoring node 189.8 as not in dataset
DEBUG:root:Ignoring node 189.9 as not in dataset
DEBUG:root:Ignoring node 190 as not in dataset
DEBUG:root:Ignoring node 190 as not in dataset
DEBUG:root:Ignoring node 190.0 as not in dataset
Building Hierarchy from df: 2054it [00:01, 1644.82it/s]DEBUG:root:Ignoring node 190.1 as not in dataset
DEBUG:root:Ignoring node 190.2 as not in dataset
DEBUG:root:Ignorin

DEBUG:root:Ignoring node 200.75 as not in dataset
DEBUG:root:Ignoring node 200.76 as not in dataset
Building Hierarchy from df: 2225it [00:01, 1663.52it/s]DEBUG:root:Ignoring node 200.77 as not in dataset
DEBUG:root:Ignoring node 200.78 as not in dataset
DEBUG:root:Ignoring node 200.8 as not in dataset
DEBUG:root:Ignoring node 200.80 as not in dataset
DEBUG:root:Ignoring node 200.81 as not in dataset
DEBUG:root:Ignoring node 200.82 as not in dataset
DEBUG:root:Ignoring node 200.83 as not in dataset
DEBUG:root:Ignoring node 200.84 as not in dataset
DEBUG:root:Ignoring node 200.85 as not in dataset
DEBUG:root:Ignoring node 200.86 as not in dataset
DEBUG:root:Ignoring node 200.87 as not in dataset
DEBUG:root:Ignoring node 200.88 as not in dataset
DEBUG:root:Ignoring node 201.0 as not in dataset
DEBUG:root:Ignoring node 201.00 as not in dataset
DEBUG:root:Ignoring node 201.01 as not in dataset
DEBUG:root:Ignoring node 201.02 as not in dataset
DEBUG:root:Ignoring node 201.03 as not in datas

DEBUG:root:Ignoring node 202.7 as not in dataset
DEBUG:root:Ignoring node 202.70 as not in dataset
DEBUG:root:Ignoring node 202.71 as not in dataset
Building Hierarchy from df: 2394it [00:01, 1583.72it/s]DEBUG:root:Ignoring node 202.72 as not in dataset
DEBUG:root:Ignoring node 202.73 as not in dataset
DEBUG:root:Ignoring node 202.74 as not in dataset
DEBUG:root:Ignoring node 202.75 as not in dataset
DEBUG:root:Ignoring node 202.76 as not in dataset
DEBUG:root:Ignoring node 202.77 as not in dataset
DEBUG:root:Ignoring node 202.78 as not in dataset
DEBUG:root:Ignoring node 202.8 as not in dataset
DEBUG:root:Ignoring node 202.80 as not in dataset
DEBUG:root:Ignoring node 202.81 as not in dataset
DEBUG:root:Ignoring node 202.82 as not in dataset
DEBUG:root:Ignoring node 202.83 as not in dataset
DEBUG:root:Ignoring node 202.84 as not in dataset
DEBUG:root:Ignoring node 202.85 as not in dataset
DEBUG:root:Ignoring node 202.86 as not in dataset
DEBUG:root:Ignoring node 202.87 as not in datas

DEBUG:root:Ignoring node 209.26 as not in dataset
DEBUG:root:Ignoring node 209.27 as not in dataset
DEBUG:root:Ignoring node 209.29 as not in dataset
DEBUG:root:Ignoring node 209.3 as not in dataset
DEBUG:root:Ignoring node 209.30 as not in dataset
DEBUG:root:Ignoring node 209.31 as not in dataset
DEBUG:root:Ignoring node 209.32 as not in dataset
DEBUG:root:Ignoring node 209.33 as not in dataset
DEBUG:root:Ignoring node 209.34 as not in dataset
DEBUG:root:Ignoring node 209.35 as not in dataset
DEBUG:root:Ignoring node 209.36 as not in dataset
DEBUG:root:Ignoring node 209.4 as not in dataset
DEBUG:root:Ignoring node 209.40 as not in dataset
DEBUG:root:Ignoring node 209.41 as not in dataset
DEBUG:root:Ignoring node 209.42 as not in dataset
DEBUG:root:Ignoring node 209.43 as not in dataset
DEBUG:root:Ignoring node 209.5 as not in dataset
DEBUG:root:Ignoring node 209.50 as not in dataset
DEBUG:root:Ignoring node 209.51 as not in dataset
DEBUG:root:Ignoring node 209.52 as not in dataset
DEB

DEBUG:root:Ignoring node 227.8 as not in dataset
DEBUG:root:Ignoring node 227.9 as not in dataset
DEBUG:root:Ignoring node 228.0 as not in dataset
DEBUG:root:Ignoring node 228.00 as not in dataset
DEBUG:root:Ignoring node 228.01 as not in dataset
DEBUG:root:Ignoring node 228.02 as not in dataset
DEBUG:root:Ignoring node 228.03 as not in dataset
DEBUG:root:Ignoring node 228.04 as not in dataset
DEBUG:root:Ignoring node 228.09 as not in dataset
DEBUG:root:Ignoring node 228.1 as not in dataset
DEBUG:root:Ignoring node 229.0 as not in dataset
DEBUG:root:Ignoring node 229.8 as not in dataset
DEBUG:root:Ignoring node 229.9 as not in dataset
DEBUG:root:Ignoring node 230.0 as not in dataset
DEBUG:root:Ignoring node 230.1 as not in dataset
DEBUG:root:Ignoring node 230.2 as not in dataset
DEBUG:root:Ignoring node 230.3 as not in dataset
DEBUG:root:Ignoring node 230.4 as not in dataset
DEBUG:root:Ignoring node 230.5 as not in dataset
DEBUG:root:Ignoring node 230.6 as not in dataset
DEBUG:root:Ign

DEBUG:root:Ignoring node 246.3 as not in dataset
DEBUG:root:Ignoring node 246.8 as not in dataset
DEBUG:root:Ignoring node 246.9 as not in dataset
DEBUG:root:Ignoring node 249.0 as not in dataset
DEBUG:root:Ignoring node 249.00 as not in dataset
DEBUG:root:Ignoring node 249.01 as not in dataset
DEBUG:root:Ignoring node 249.1 as not in dataset
DEBUG:root:Ignoring node 249.10 as not in dataset
DEBUG:root:Ignoring node 249.11 as not in dataset
DEBUG:root:Ignoring node 249.2 as not in dataset
DEBUG:root:Ignoring node 249.20 as not in dataset
DEBUG:root:Ignoring node 249.21 as not in dataset
DEBUG:root:Ignoring node 249.3 as not in dataset
DEBUG:root:Ignoring node 249.30 as not in dataset
DEBUG:root:Ignoring node 249.31 as not in dataset
DEBUG:root:Ignoring node 249.4 as not in dataset
DEBUG:root:Ignoring node 249.40 as not in dataset
DEBUG:root:Ignoring node 249.41 as not in dataset
DEBUG:root:Ignoring node 249.5 as not in dataset
DEBUG:root:Ignoring node 249.50 as not in dataset
DEBUG:roo

DEBUG:root:Ignoring node 263.9 as not in dataset
DEBUG:root:Ignoring node 264 as not in dataset
DEBUG:root:Ignoring node 264 as not in dataset
DEBUG:root:Ignoring node 264.0 as not in dataset
DEBUG:root:Ignoring node 264.1 as not in dataset
DEBUG:root:Ignoring node 264.2 as not in dataset
DEBUG:root:Ignoring node 264.3 as not in dataset
DEBUG:root:Ignoring node 264.4 as not in dataset
DEBUG:root:Ignoring node 264.5 as not in dataset
DEBUG:root:Ignoring node 264.6 as not in dataset
DEBUG:root:Ignoring node 264.7 as not in dataset
DEBUG:root:Ignoring node 264.8 as not in dataset
DEBUG:root:Ignoring node 264.9 as not in dataset
DEBUG:root:Ignoring node 265.0 as not in dataset
DEBUG:root:Ignoring node 265.1 as not in dataset
DEBUG:root:Ignoring node 265.2 as not in dataset
DEBUG:root:Ignoring node 266.0 as not in dataset
DEBUG:root:Ignoring node 266.1 as not in dataset
DEBUG:root:Ignoring node 266.2 as not in dataset
DEBUG:root:Ignoring node 266.9 as not in dataset
DEBUG:root:Ignoring node

DEBUG:root:Ignoring node 279.49 as not in dataset
DEBUG:root:Ignoring node 279.5 as not in dataset
DEBUG:root:Ignoring node 279.50 as not in dataset
DEBUG:root:Ignoring node 279.51 as not in dataset
DEBUG:root:Ignoring node 279.52 as not in dataset
DEBUG:root:Ignoring node 279.53 as not in dataset
DEBUG:root:Ignoring node 279.8 as not in dataset
DEBUG:root:Ignoring node 279.9 as not in dataset
DEBUG:root:Ignoring node 280.0 as not in dataset
DEBUG:root:Ignoring node 280.1 as not in dataset
DEBUG:root:Ignoring node 280.8 as not in dataset
Building Hierarchy from df: 3377it [00:02, 1614.76it/s]DEBUG:root:Ignoring node 280.9 as not in dataset
DEBUG:root:Ignoring node 281.0 as not in dataset
DEBUG:root:Ignoring node 281.1 as not in dataset
DEBUG:root:Ignoring node 281.2 as not in dataset
DEBUG:root:Ignoring node 281.3 as not in dataset
DEBUG:root:Ignoring node 281.4 as not in dataset
DEBUG:root:Ignoring node 281.8 as not in dataset
DEBUG:root:Ignoring node 281.9 as not in dataset
DEBUG:roo

DEBUG:root:Ignoring node 367.81 as not in dataset
DEBUG:root:Ignoring node 367.89 as not in dataset
DEBUG:root:Ignoring node 367.9 as not in dataset
DEBUG:root:Ignoring node 368.0 as not in dataset
DEBUG:root:Ignoring node 368.00 as not in dataset
DEBUG:root:Ignoring node 368.01 as not in dataset
DEBUG:root:Ignoring node 368.02 as not in dataset
DEBUG:root:Ignoring node 368.03 as not in dataset
DEBUG:root:Ignoring node 368.1 as not in dataset
DEBUG:root:Ignoring node 368.10 as not in dataset
DEBUG:root:Ignoring node 368.11 as not in dataset
DEBUG:root:Ignoring node 368.12 as not in dataset
DEBUG:root:Ignoring node 368.13 as not in dataset
DEBUG:root:Ignoring node 368.14 as not in dataset
DEBUG:root:Ignoring node 368.15 as not in dataset
DEBUG:root:Ignoring node 368.16 as not in dataset
DEBUG:root:Ignoring node 368.2 as not in dataset
DEBUG:root:Ignoring node 368.3 as not in dataset
DEBUG:root:Ignoring node 368.30 as not in dataset
DEBUG:root:Ignoring node 368.31 as not in dataset
DEBUG

DEBUG:root:Ignoring node 371.41 as not in dataset
DEBUG:root:Ignoring node 371.42 as not in dataset
DEBUG:root:Ignoring node 371.43 as not in dataset
DEBUG:root:Ignoring node 371.44 as not in dataset
DEBUG:root:Ignoring node 371.45 as not in dataset
DEBUG:root:Ignoring node 371.46 as not in dataset
DEBUG:root:Ignoring node 371.48 as not in dataset
DEBUG:root:Ignoring node 371.49 as not in dataset
DEBUG:root:Ignoring node 371.5 as not in dataset
DEBUG:root:Ignoring node 371.50 as not in dataset
DEBUG:root:Ignoring node 371.51 as not in dataset
DEBUG:root:Ignoring node 371.52 as not in dataset
DEBUG:root:Ignoring node 371.53 as not in dataset
DEBUG:root:Ignoring node 371.54 as not in dataset
DEBUG:root:Ignoring node 371.55 as not in dataset
DEBUG:root:Ignoring node 371.56 as not in dataset
DEBUG:root:Ignoring node 371.57 as not in dataset
DEBUG:root:Ignoring node 371.58 as not in dataset
DEBUG:root:Ignoring node 371.6 as not in dataset
DEBUG:root:Ignoring node 371.60 as not in dataset
DE

DEBUG:root:Ignoring node 375.15 as not in dataset
DEBUG:root:Ignoring node 375.16 as not in dataset
DEBUG:root:Ignoring node 375.2 as not in dataset
DEBUG:root:Ignoring node 375.20 as not in dataset
DEBUG:root:Ignoring node 375.21 as not in dataset
DEBUG:root:Ignoring node 375.22 as not in dataset
DEBUG:root:Ignoring node 375.3 as not in dataset
DEBUG:root:Ignoring node 375.30 as not in dataset
DEBUG:root:Ignoring node 375.31 as not in dataset
DEBUG:root:Ignoring node 375.32 as not in dataset
DEBUG:root:Ignoring node 375.33 as not in dataset
DEBUG:root:Ignoring node 375.4 as not in dataset
DEBUG:root:Ignoring node 375.41 as not in dataset
DEBUG:root:Ignoring node 375.42 as not in dataset
DEBUG:root:Ignoring node 375.43 as not in dataset
DEBUG:root:Ignoring node 375.5 as not in dataset
DEBUG:root:Ignoring node 375.51 as not in dataset
DEBUG:root:Ignoring node 375.52 as not in dataset
DEBUG:root:Ignoring node 375.53 as not in dataset
DEBUG:root:Ignoring node 375.54 as not in dataset
DEBU

DEBUG:root:Ignoring node 378.56 as not in dataset
DEBUG:root:Ignoring node 378.6 as not in dataset
DEBUG:root:Ignoring node 378.60 as not in dataset
DEBUG:root:Ignoring node 378.61 as not in dataset
DEBUG:root:Ignoring node 378.62 as not in dataset
DEBUG:root:Ignoring node 378.63 as not in dataset
DEBUG:root:Ignoring node 378.7 as not in dataset
DEBUG:root:Ignoring node 378.71 as not in dataset
DEBUG:root:Ignoring node 378.72 as not in dataset
DEBUG:root:Ignoring node 378.73 as not in dataset
DEBUG:root:Ignoring node 378.8 as not in dataset
DEBUG:root:Ignoring node 378.81 as not in dataset
DEBUG:root:Ignoring node 378.82 as not in dataset
DEBUG:root:Ignoring node 378.83 as not in dataset
DEBUG:root:Ignoring node 378.84 as not in dataset
DEBUG:root:Ignoring node 378.85 as not in dataset
DEBUG:root:Ignoring node 378.86 as not in dataset
DEBUG:root:Ignoring node 378.87 as not in dataset
DEBUG:root:Ignoring node 378.9 as not in dataset
DEBUG:root:Ignoring node 379.0 as not in dataset
DEBUG

DEBUG:root:Ignoring node 383.30 as not in dataset
DEBUG:root:Ignoring node 383.31 as not in dataset
DEBUG:root:Ignoring node 383.32 as not in dataset
DEBUG:root:Ignoring node 383.33 as not in dataset
DEBUG:root:Ignoring node 383.8 as not in dataset
DEBUG:root:Ignoring node 383.81 as not in dataset
DEBUG:root:Ignoring node 383.89 as not in dataset
DEBUG:root:Ignoring node 383.9 as not in dataset
DEBUG:root:Ignoring node 384 as not in dataset
DEBUG:root:Ignoring node 384 as not in dataset
DEBUG:root:Ignoring node 384.0 as not in dataset
DEBUG:root:Ignoring node 384.00 as not in dataset
DEBUG:root:Ignoring node 384.01 as not in dataset
DEBUG:root:Ignoring node 384.09 as not in dataset
DEBUG:root:Ignoring node 384.1 as not in dataset
DEBUG:root:Ignoring node 384.2 as not in dataset
DEBUG:root:Ignoring node 384.20 as not in dataset
DEBUG:root:Ignoring node 384.21 as not in dataset
DEBUG:root:Ignoring node 384.22 as not in dataset
DEBUG:root:Ignoring node 384.23 as not in dataset
DEBUG:root:

DEBUG:root:Ignoring node 394.0 as not in dataset
DEBUG:root:Ignoring node 394.1 as not in dataset
DEBUG:root:Ignoring node 394.2 as not in dataset
DEBUG:root:Ignoring node 394.9 as not in dataset
DEBUG:root:Ignoring node 395.0 as not in dataset
DEBUG:root:Ignoring node 395.1 as not in dataset
DEBUG:root:Ignoring node 395.2 as not in dataset
DEBUG:root:Ignoring node 395.9 as not in dataset
DEBUG:root:Ignoring node 396.0 as not in dataset
DEBUG:root:Ignoring node 396.1 as not in dataset
DEBUG:root:Ignoring node 396.2 as not in dataset
DEBUG:root:Ignoring node 396.3 as not in dataset
DEBUG:root:Ignoring node 396.8 as not in dataset
DEBUG:root:Ignoring node 396.9 as not in dataset
DEBUG:root:Ignoring node 397.0 as not in dataset
DEBUG:root:Ignoring node 397.1 as not in dataset
DEBUG:root:Ignoring node 397.9 as not in dataset
DEBUG:root:Ignoring node 398.0 as not in dataset
DEBUG:root:Ignoring node 398.9 as not in dataset
DEBUG:root:Ignoring node 398.90 as not in dataset
DEBUG:root:Ignoring

DEBUG:root:Ignoring node 423.1 as not in dataset
DEBUG:root:Ignoring node 423.2 as not in dataset
DEBUG:root:Ignoring node 423.3 as not in dataset
DEBUG:root:Ignoring node 423.8 as not in dataset
DEBUG:root:Ignoring node 423.9 as not in dataset
DEBUG:root:Ignoring node 424.0 as not in dataset
DEBUG:root:Ignoring node 424.1 as not in dataset
DEBUG:root:Ignoring node 424.2 as not in dataset
DEBUG:root:Ignoring node 424.3 as not in dataset
DEBUG:root:Ignoring node 424.9 as not in dataset
DEBUG:root:Ignoring node 424.90 as not in dataset
DEBUG:root:Ignoring node 424.91 as not in dataset
DEBUG:root:Ignoring node 424.99 as not in dataset
DEBUG:root:Ignoring node 425.0 as not in dataset
DEBUG:root:Ignoring node 425.1 as not in dataset
DEBUG:root:Ignoring node 425.11 as not in dataset
DEBUG:root:Ignoring node 425.18 as not in dataset
DEBUG:root:Ignoring node 425.2 as not in dataset
DEBUG:root:Ignoring node 425.3 as not in dataset
DEBUG:root:Ignoring node 425.4 as not in dataset
DEBUG:root:Igno

DEBUG:root:Ignoring node 438.50 as not in dataset
DEBUG:root:Ignoring node 438.51 as not in dataset
DEBUG:root:Ignoring node 438.52 as not in dataset
DEBUG:root:Ignoring node 438.53 as not in dataset
DEBUG:root:Ignoring node 438.6 as not in dataset
DEBUG:root:Ignoring node 438.7 as not in dataset
DEBUG:root:Ignoring node 438.8 as not in dataset
DEBUG:root:Ignoring node 438.81 as not in dataset
DEBUG:root:Ignoring node 438.82 as not in dataset
DEBUG:root:Ignoring node 438.83 as not in dataset
DEBUG:root:Ignoring node 438.84 as not in dataset
DEBUG:root:Ignoring node 438.85 as not in dataset
DEBUG:root:Ignoring node 438.89 as not in dataset
DEBUG:root:Ignoring node 438.9 as not in dataset
DEBUG:root:Ignoring node 440.0 as not in dataset
DEBUG:root:Ignoring node 440.1 as not in dataset
DEBUG:root:Ignoring node 440.2 as not in dataset
DEBUG:root:Ignoring node 440.20 as not in dataset
DEBUG:root:Ignoring node 440.21 as not in dataset
DEBUG:root:Ignoring node 440.22 as not in dataset
DEBUG:r

DEBUG:root:Ignoring node 455.3 as not in dataset
DEBUG:root:Ignoring node 455.4 as not in dataset
DEBUG:root:Ignoring node 455.5 as not in dataset
DEBUG:root:Ignoring node 455.6 as not in dataset
DEBUG:root:Ignoring node 455.7 as not in dataset
DEBUG:root:Ignoring node 455.8 as not in dataset
DEBUG:root:Ignoring node 455.9 as not in dataset
DEBUG:root:Ignoring node 456.0 as not in dataset
DEBUG:root:Ignoring node 456.1 as not in dataset
DEBUG:root:Ignoring node 456.2 as not in dataset
DEBUG:root:Ignoring node 456.20 as not in dataset
DEBUG:root:Ignoring node 456.21 as not in dataset
DEBUG:root:Ignoring node 456.3 as not in dataset
DEBUG:root:Ignoring node 456.4 as not in dataset
DEBUG:root:Ignoring node 456.5 as not in dataset
DEBUG:root:Ignoring node 456.6 as not in dataset
DEBUG:root:Ignoring node 456.8 as not in dataset
DEBUG:root:Ignoring node 457.0 as not in dataset
DEBUG:root:Ignoring node 457.1 as not in dataset
DEBUG:root:Ignoring node 457.2 as not in dataset
DEBUG:root:Ignorin

DEBUG:root:Ignoring node 482.9 as not in dataset
DEBUG:root:Ignoring node 483.0 as not in dataset
DEBUG:root:Ignoring node 483.1 as not in dataset
DEBUG:root:Ignoring node 483.8 as not in dataset
DEBUG:root:Ignoring node 484.1 as not in dataset
DEBUG:root:Ignoring node 484.3 as not in dataset
DEBUG:root:Ignoring node 484.5 as not in dataset
DEBUG:root:Ignoring node 484.6 as not in dataset
DEBUG:root:Ignoring node 484.7 as not in dataset
DEBUG:root:Ignoring node 484.8 as not in dataset
DEBUG:root:Ignoring node 487.0 as not in dataset
DEBUG:root:Ignoring node 487.1 as not in dataset
DEBUG:root:Ignoring node 487.8 as not in dataset
DEBUG:root:Ignoring node 488.0 as not in dataset
DEBUG:root:Ignoring node 488.01 as not in dataset
DEBUG:root:Ignoring node 488.02 as not in dataset
DEBUG:root:Ignoring node 488.09 as not in dataset
DEBUG:root:Ignoring node 488.1 as not in dataset
DEBUG:root:Ignoring node 488.11 as not in dataset
DEBUG:root:Ignoring node 488.12 as not in dataset
DEBUG:root:Igno

DEBUG:root:Ignoring node 520.5 as not in dataset
DEBUG:root:Ignoring node 520.6 as not in dataset
DEBUG:root:Ignoring node 520.7 as not in dataset
DEBUG:root:Ignoring node 520.8 as not in dataset
DEBUG:root:Ignoring node 520.9 as not in dataset
DEBUG:root:Ignoring node 521.0 as not in dataset
DEBUG:root:Ignoring node 521.00 as not in dataset
DEBUG:root:Ignoring node 521.01 as not in dataset
DEBUG:root:Ignoring node 521.02 as not in dataset
DEBUG:root:Ignoring node 521.03 as not in dataset
DEBUG:root:Ignoring node 521.04 as not in dataset
DEBUG:root:Ignoring node 521.05 as not in dataset
DEBUG:root:Ignoring node 521.06 as not in dataset
DEBUG:root:Ignoring node 521.07 as not in dataset
DEBUG:root:Ignoring node 521.08 as not in dataset
DEBUG:root:Ignoring node 521.09 as not in dataset
DEBUG:root:Ignoring node 521.1 as not in dataset
DEBUG:root:Ignoring node 521.10 as not in dataset
DEBUG:root:Ignoring node 521.11 as not in dataset
DEBUG:root:Ignoring node 521.12 as not in dataset
DEBUG:r

DEBUG:root:Ignoring node 525.23 as not in dataset
DEBUG:root:Ignoring node 525.24 as not in dataset
DEBUG:root:Ignoring node 525.25 as not in dataset
DEBUG:root:Ignoring node 525.26 as not in dataset
DEBUG:root:Ignoring node 525.3 as not in dataset
DEBUG:root:Ignoring node 525.4 as not in dataset
DEBUG:root:Ignoring node 525.40 as not in dataset
DEBUG:root:Ignoring node 525.41 as not in dataset
DEBUG:root:Ignoring node 525.42 as not in dataset
DEBUG:root:Ignoring node 525.43 as not in dataset
DEBUG:root:Ignoring node 525.44 as not in dataset
DEBUG:root:Ignoring node 525.5 as not in dataset
DEBUG:root:Ignoring node 525.50 as not in dataset
DEBUG:root:Ignoring node 525.51 as not in dataset
DEBUG:root:Ignoring node 525.52 as not in dataset
DEBUG:root:Ignoring node 525.53 as not in dataset
DEBUG:root:Ignoring node 525.54 as not in dataset
Building Hierarchy from df: 7197it [00:04, 1533.94it/s]DEBUG:root:Ignoring node 525.6 as not in dataset
DEBUG:root:Ignoring node 525.60 as not in dataset

DEBUG:root:Ignoring node 533.0 as not in dataset
DEBUG:root:Ignoring node 533.00 as not in dataset
DEBUG:root:Ignoring node 533.01 as not in dataset
DEBUG:root:Ignoring node 533.1 as not in dataset
DEBUG:root:Ignoring node 533.10 as not in dataset
DEBUG:root:Ignoring node 533.11 as not in dataset
DEBUG:root:Ignoring node 533.2 as not in dataset
DEBUG:root:Ignoring node 533.20 as not in dataset
DEBUG:root:Ignoring node 533.21 as not in dataset
DEBUG:root:Ignoring node 533.3 as not in dataset
DEBUG:root:Ignoring node 533.30 as not in dataset
DEBUG:root:Ignoring node 533.31 as not in dataset
DEBUG:root:Ignoring node 533.4 as not in dataset
DEBUG:root:Ignoring node 533.40 as not in dataset
DEBUG:root:Ignoring node 533.41 as not in dataset
DEBUG:root:Ignoring node 533.5 as not in dataset
DEBUG:root:Ignoring node 533.50 as not in dataset
DEBUG:root:Ignoring node 533.51 as not in dataset
DEBUG:root:Ignoring node 533.6 as not in dataset
DEBUG:root:Ignoring node 533.60 as not in dataset
DEBUG:r

DEBUG:root:Ignoring node 553.21 as not in dataset
DEBUG:root:Ignoring node 553.29 as not in dataset
DEBUG:root:Ignoring node 553.3 as not in dataset
DEBUG:root:Ignoring node 553.8 as not in dataset
DEBUG:root:Ignoring node 553.9 as not in dataset
DEBUG:root:Ignoring node 555.0 as not in dataset
DEBUG:root:Ignoring node 555.1 as not in dataset
DEBUG:root:Ignoring node 555.2 as not in dataset
DEBUG:root:Ignoring node 555.9 as not in dataset
DEBUG:root:Ignoring node 556.0 as not in dataset
DEBUG:root:Ignoring node 556.1 as not in dataset
DEBUG:root:Ignoring node 556.2 as not in dataset
DEBUG:root:Ignoring node 556.3 as not in dataset
DEBUG:root:Ignoring node 556.4 as not in dataset
DEBUG:root:Ignoring node 556.5 as not in dataset
DEBUG:root:Ignoring node 556.6 as not in dataset
DEBUG:root:Ignoring node 556.8 as not in dataset
DEBUG:root:Ignoring node 556.9 as not in dataset
DEBUG:root:Ignoring node 557.0 as not in dataset
DEBUG:root:Ignoring node 557.1 as not in dataset
DEBUG:root:Ignorin

DEBUG:root:Ignoring node 574.60 as not in dataset
DEBUG:root:Ignoring node 574.61 as not in dataset
DEBUG:root:Ignoring node 574.7 as not in dataset
DEBUG:root:Ignoring node 574.70 as not in dataset
DEBUG:root:Ignoring node 574.71 as not in dataset
DEBUG:root:Ignoring node 574.8 as not in dataset
DEBUG:root:Ignoring node 574.80 as not in dataset
DEBUG:root:Ignoring node 574.81 as not in dataset
DEBUG:root:Ignoring node 574.9 as not in dataset
DEBUG:root:Ignoring node 574.90 as not in dataset
DEBUG:root:Ignoring node 574.91 as not in dataset
DEBUG:root:Ignoring node 575.0 as not in dataset
DEBUG:root:Ignoring node 575.1 as not in dataset
DEBUG:root:Ignoring node 575.10 as not in dataset
DEBUG:root:Ignoring node 575.11 as not in dataset
DEBUG:root:Ignoring node 575.12 as not in dataset
DEBUG:root:Ignoring node 575.2 as not in dataset
DEBUG:root:Ignoring node 575.3 as not in dataset
DEBUG:root:Ignoring node 575.4 as not in dataset
DEBUG:root:Ignoring node 575.5 as not in dataset
DEBUG:roo

DEBUG:root:Ignoring node 597.0 as not in dataset
DEBUG:root:Ignoring node 597.8 as not in dataset
DEBUG:root:Ignoring node 597.80 as not in dataset
DEBUG:root:Ignoring node 597.81 as not in dataset
DEBUG:root:Ignoring node 597.89 as not in dataset
DEBUG:root:Ignoring node 598.0 as not in dataset
DEBUG:root:Ignoring node 598.00 as not in dataset
DEBUG:root:Ignoring node 598.01 as not in dataset
DEBUG:root:Ignoring node 598.1 as not in dataset
DEBUG:root:Ignoring node 598.2 as not in dataset
DEBUG:root:Ignoring node 598.8 as not in dataset
DEBUG:root:Ignoring node 598.9 as not in dataset
DEBUG:root:Ignoring node 599.0 as not in dataset
DEBUG:root:Ignoring node 599.1 as not in dataset
DEBUG:root:Ignoring node 599.2 as not in dataset
DEBUG:root:Ignoring node 599.3 as not in dataset
DEBUG:root:Ignoring node 599.4 as not in dataset
DEBUG:root:Ignoring node 599.5 as not in dataset
DEBUG:root:Ignoring node 599.6 as not in dataset
DEBUG:root:Ignoring node 599.60 as not in dataset
DEBUG:root:Ign

DEBUG:root:Ignoring node 617.3 as not in dataset
DEBUG:root:Ignoring node 617.4 as not in dataset
DEBUG:root:Ignoring node 617.5 as not in dataset
DEBUG:root:Ignoring node 617.6 as not in dataset
DEBUG:root:Ignoring node 617.8 as not in dataset
DEBUG:root:Ignoring node 617.9 as not in dataset
DEBUG:root:Ignoring node 618.0 as not in dataset
DEBUG:root:Ignoring node 618.00 as not in dataset
DEBUG:root:Ignoring node 618.01 as not in dataset
DEBUG:root:Ignoring node 618.02 as not in dataset
DEBUG:root:Ignoring node 618.03 as not in dataset
DEBUG:root:Ignoring node 618.04 as not in dataset
DEBUG:root:Ignoring node 618.05 as not in dataset
DEBUG:root:Ignoring node 618.09 as not in dataset
DEBUG:root:Ignoring node 618.1 as not in dataset
DEBUG:root:Ignoring node 618.2 as not in dataset
DEBUG:root:Ignoring node 618.3 as not in dataset
DEBUG:root:Ignoring node 618.4 as not in dataset
DEBUG:root:Ignoring node 618.5 as not in dataset
DEBUG:root:Ignoring node 618.6 as not in dataset
DEBUG:root:Ig

DEBUG:root:Ignoring node 633.9 as not in dataset
DEBUG:root:Ignoring node 633.90 as not in dataset
DEBUG:root:Ignoring node 633.91 as not in dataset
DEBUG:root:Ignoring node 634.0 as not in dataset
DEBUG:root:Ignoring node 634.00 as not in dataset
DEBUG:root:Ignoring node 634.01 as not in dataset
DEBUG:root:Ignoring node 634.02 as not in dataset
DEBUG:root:Ignoring node 634.1 as not in dataset
DEBUG:root:Ignoring node 634.10 as not in dataset
DEBUG:root:Ignoring node 634.11 as not in dataset
DEBUG:root:Ignoring node 634.12 as not in dataset
DEBUG:root:Ignoring node 634.2 as not in dataset
DEBUG:root:Ignoring node 634.20 as not in dataset
DEBUG:root:Ignoring node 634.21 as not in dataset
DEBUG:root:Ignoring node 634.22 as not in dataset
DEBUG:root:Ignoring node 634.3 as not in dataset
DEBUG:root:Ignoring node 634.30 as not in dataset
DEBUG:root:Ignoring node 634.31 as not in dataset
DEBUG:root:Ignoring node 634.32 as not in dataset
DEBUG:root:Ignoring node 634.4 as not in dataset
DEBUG:

DEBUG:root:Ignoring node 675.8 as not in dataset
DEBUG:root:Ignoring node 675.80 as not in dataset
DEBUG:root:Ignoring node 675.81 as not in dataset
DEBUG:root:Ignoring node 675.82 as not in dataset
DEBUG:root:Ignoring node 675.83 as not in dataset
DEBUG:root:Ignoring node 675.84 as not in dataset
DEBUG:root:Ignoring node 675.9 as not in dataset
DEBUG:root:Ignoring node 675.90 as not in dataset
DEBUG:root:Ignoring node 675.91 as not in dataset
DEBUG:root:Ignoring node 675.92 as not in dataset
DEBUG:root:Ignoring node 675.93 as not in dataset
DEBUG:root:Ignoring node 675.94 as not in dataset
DEBUG:root:Ignoring node 676 as not in dataset
DEBUG:root:Ignoring node 676 as not in dataset
DEBUG:root:Ignoring node 676.0 as not in dataset
DEBUG:root:Ignoring node 676.00 as not in dataset
DEBUG:root:Ignoring node 676.01 as not in dataset
DEBUG:root:Ignoring node 676.02 as not in dataset
DEBUG:root:Ignoring node 676.03 as not in dataset
DEBUG:root:Ignoring node 676.04 as not in dataset
Building 

DEBUG:root:Ignoring node 693.1 as not in dataset
DEBUG:root:Ignoring node 693.8 as not in dataset
DEBUG:root:Ignoring node 693.9 as not in dataset
DEBUG:root:Ignoring node 694.0 as not in dataset
DEBUG:root:Ignoring node 694.1 as not in dataset
DEBUG:root:Ignoring node 694.2 as not in dataset
DEBUG:root:Ignoring node 694.3 as not in dataset
DEBUG:root:Ignoring node 694.4 as not in dataset
DEBUG:root:Ignoring node 694.5 as not in dataset
DEBUG:root:Ignoring node 694.6 as not in dataset
DEBUG:root:Ignoring node 694.60 as not in dataset
DEBUG:root:Ignoring node 694.61 as not in dataset
DEBUG:root:Ignoring node 694.8 as not in dataset
DEBUG:root:Ignoring node 694.9 as not in dataset
DEBUG:root:Ignoring node 695.0 as not in dataset
DEBUG:root:Ignoring node 695.1 as not in dataset
DEBUG:root:Ignoring node 695.10 as not in dataset
Building Hierarchy from df: 9966it [00:06, 1600.60it/s]DEBUG:root:Ignoring node 695.11 as not in dataset
DEBUG:root:Ignoring node 695.12 as not in dataset
DEBUG:roo

DEBUG:root:Ignoring node 711.04 as not in dataset
DEBUG:root:Ignoring node 711.05 as not in dataset
DEBUG:root:Ignoring node 711.06 as not in dataset
DEBUG:root:Ignoring node 711.07 as not in dataset
DEBUG:root:Ignoring node 711.08 as not in dataset
DEBUG:root:Ignoring node 711.09 as not in dataset
DEBUG:root:Ignoring node 711.1 as not in dataset
DEBUG:root:Ignoring node 711.10 as not in dataset
DEBUG:root:Ignoring node 711.11 as not in dataset
DEBUG:root:Ignoring node 711.12 as not in dataset
DEBUG:root:Ignoring node 711.13 as not in dataset
DEBUG:root:Ignoring node 711.14 as not in dataset
DEBUG:root:Ignoring node 711.15 as not in dataset
DEBUG:root:Ignoring node 711.16 as not in dataset
DEBUG:root:Ignoring node 711.17 as not in dataset
DEBUG:root:Ignoring node 711.18 as not in dataset
DEBUG:root:Ignoring node 711.19 as not in dataset
DEBUG:root:Ignoring node 711.2 as not in dataset
DEBUG:root:Ignoring node 711.20 as not in dataset
DEBUG:root:Ignoring node 711.21 as not in dataset
DE

DEBUG:root:Ignoring node 713.4 as not in dataset
DEBUG:root:Ignoring node 713.5 as not in dataset
DEBUG:root:Ignoring node 713.6 as not in dataset
DEBUG:root:Ignoring node 713.7 as not in dataset
DEBUG:root:Ignoring node 713.8 as not in dataset
DEBUG:root:Ignoring node 714.0 as not in dataset
DEBUG:root:Ignoring node 714.1 as not in dataset
DEBUG:root:Ignoring node 714.2 as not in dataset
DEBUG:root:Ignoring node 714.3 as not in dataset
DEBUG:root:Ignoring node 714.30 as not in dataset
DEBUG:root:Ignoring node 714.31 as not in dataset
DEBUG:root:Ignoring node 714.32 as not in dataset
DEBUG:root:Ignoring node 714.33 as not in dataset
DEBUG:root:Ignoring node 714.4 as not in dataset
DEBUG:root:Ignoring node 714.8 as not in dataset
DEBUG:root:Ignoring node 714.81 as not in dataset
DEBUG:root:Ignoring node 714.89 as not in dataset
DEBUG:root:Ignoring node 714.9 as not in dataset
DEBUG:root:Ignoring node 715.0 as not in dataset
DEBUG:root:Ignoring node 715.00 as not in dataset
DEBUG:root:Ig

DEBUG:root:Ignoring node 717.1 as not in dataset
DEBUG:root:Ignoring node 717.2 as not in dataset
DEBUG:root:Ignoring node 717.3 as not in dataset
DEBUG:root:Ignoring node 717.4 as not in dataset
DEBUG:root:Ignoring node 717.40 as not in dataset
DEBUG:root:Ignoring node 717.41 as not in dataset
DEBUG:root:Ignoring node 717.42 as not in dataset
DEBUG:root:Ignoring node 717.43 as not in dataset
DEBUG:root:Ignoring node 717.49 as not in dataset
DEBUG:root:Ignoring node 717.5 as not in dataset
DEBUG:root:Ignoring node 717.6 as not in dataset
DEBUG:root:Ignoring node 717.7 as not in dataset
DEBUG:root:Ignoring node 717.8 as not in dataset
DEBUG:root:Ignoring node 717.81 as not in dataset
DEBUG:root:Ignoring node 717.82 as not in dataset
DEBUG:root:Ignoring node 717.83 as not in dataset
DEBUG:root:Ignoring node 717.84 as not in dataset
DEBUG:root:Ignoring node 717.85 as not in dataset
DEBUG:root:Ignoring node 717.89 as not in dataset
DEBUG:root:Ignoring node 717.9 as not in dataset
DEBUG:roo

DEBUG:root:Ignoring node 719.41 as not in dataset
DEBUG:root:Ignoring node 719.42 as not in dataset
DEBUG:root:Ignoring node 719.43 as not in dataset
DEBUG:root:Ignoring node 719.44 as not in dataset
DEBUG:root:Ignoring node 719.45 as not in dataset
DEBUG:root:Ignoring node 719.46 as not in dataset
DEBUG:root:Ignoring node 719.47 as not in dataset
DEBUG:root:Ignoring node 719.48 as not in dataset
DEBUG:root:Ignoring node 719.49 as not in dataset
DEBUG:root:Ignoring node 719.5 as not in dataset
DEBUG:root:Ignoring node 719.50 as not in dataset
DEBUG:root:Ignoring node 719.51 as not in dataset
DEBUG:root:Ignoring node 719.52 as not in dataset
DEBUG:root:Ignoring node 719.53 as not in dataset
DEBUG:root:Ignoring node 719.54 as not in dataset
DEBUG:root:Ignoring node 719.55 as not in dataset
DEBUG:root:Ignoring node 719.56 as not in dataset
DEBUG:root:Ignoring node 719.57 as not in dataset
DEBUG:root:Ignoring node 719.58 as not in dataset
DEBUG:root:Ignoring node 719.59 as not in dataset
D

DEBUG:root:Ignoring node 726.9 as not in dataset
DEBUG:root:Ignoring node 726.90 as not in dataset
DEBUG:root:Ignoring node 726.91 as not in dataset
DEBUG:root:Ignoring node 727.0 as not in dataset
DEBUG:root:Ignoring node 727.00 as not in dataset
DEBUG:root:Ignoring node 727.01 as not in dataset
DEBUG:root:Ignoring node 727.02 as not in dataset
DEBUG:root:Ignoring node 727.03 as not in dataset
DEBUG:root:Ignoring node 727.04 as not in dataset
DEBUG:root:Ignoring node 727.05 as not in dataset
DEBUG:root:Ignoring node 727.06 as not in dataset
DEBUG:root:Ignoring node 727.09 as not in dataset
DEBUG:root:Ignoring node 727.1 as not in dataset
DEBUG:root:Ignoring node 727.2 as not in dataset
DEBUG:root:Ignoring node 727.3 as not in dataset
DEBUG:root:Ignoring node 727.4 as not in dataset
DEBUG:root:Ignoring node 727.40 as not in dataset
DEBUG:root:Ignoring node 727.41 as not in dataset
DEBUG:root:Ignoring node 727.42 as not in dataset
DEBUG:root:Ignoring node 727.43 as not in dataset
DEBUG:

DEBUG:root:Ignoring node 730.95 as not in dataset
DEBUG:root:Ignoring node 730.96 as not in dataset
DEBUG:root:Ignoring node 730.97 as not in dataset
DEBUG:root:Ignoring node 730.98 as not in dataset
DEBUG:root:Ignoring node 730.99 as not in dataset
DEBUG:root:Ignoring node 731.0 as not in dataset
DEBUG:root:Ignoring node 731.1 as not in dataset
DEBUG:root:Ignoring node 731.2 as not in dataset
DEBUG:root:Ignoring node 731.3 as not in dataset
DEBUG:root:Ignoring node 731.8 as not in dataset
DEBUG:root:Ignoring node 732.0 as not in dataset
DEBUG:root:Ignoring node 732.1 as not in dataset
DEBUG:root:Ignoring node 732.2 as not in dataset
DEBUG:root:Ignoring node 732.3 as not in dataset
DEBUG:root:Ignoring node 732.4 as not in dataset
DEBUG:root:Ignoring node 732.5 as not in dataset
DEBUG:root:Ignoring node 732.6 as not in dataset
DEBUG:root:Ignoring node 732.7 as not in dataset
DEBUG:root:Ignoring node 732.8 as not in dataset
DEBUG:root:Ignoring node 732.9 as not in dataset
DEBUG:root:Igno

DEBUG:root:Ignoring node 739.9 as not in dataset
DEBUG:root:Ignoring node 740 as not in dataset
DEBUG:root:Ignoring node 740 as not in dataset
DEBUG:root:Ignoring node 740.0 as not in dataset
DEBUG:root:Ignoring node 740.1 as not in dataset
DEBUG:root:Ignoring node 740.2 as not in dataset
DEBUG:root:Ignoring node 741.0 as not in dataset
DEBUG:root:Ignoring node 741.00 as not in dataset
DEBUG:root:Ignoring node 741.01 as not in dataset
DEBUG:root:Ignoring node 741.02 as not in dataset
DEBUG:root:Ignoring node 741.03 as not in dataset
DEBUG:root:Ignoring node 741.9 as not in dataset
DEBUG:root:Ignoring node 741.90 as not in dataset
DEBUG:root:Ignoring node 741.91 as not in dataset
DEBUG:root:Ignoring node 741.92 as not in dataset
DEBUG:root:Ignoring node 741.93 as not in dataset
DEBUG:root:Ignoring node 742.0 as not in dataset
DEBUG:root:Ignoring node 742.1 as not in dataset
DEBUG:root:Ignoring node 742.2 as not in dataset
DEBUG:root:Ignoring node 742.3 as not in dataset
DEBUG:root:Ignor

DEBUG:root:Ignoring node 747.41 as not in dataset
DEBUG:root:Ignoring node 747.42 as not in dataset
DEBUG:root:Ignoring node 747.49 as not in dataset
DEBUG:root:Ignoring node 747.5 as not in dataset
DEBUG:root:Ignoring node 747.6 as not in dataset
DEBUG:root:Ignoring node 747.60 as not in dataset
DEBUG:root:Ignoring node 747.61 as not in dataset
DEBUG:root:Ignoring node 747.62 as not in dataset
DEBUG:root:Ignoring node 747.63 as not in dataset
DEBUG:root:Ignoring node 747.64 as not in dataset
DEBUG:root:Ignoring node 747.69 as not in dataset
DEBUG:root:Ignoring node 747.8 as not in dataset
DEBUG:root:Ignoring node 747.81 as not in dataset
DEBUG:root:Ignoring node 747.82 as not in dataset
DEBUG:root:Ignoring node 747.83 as not in dataset
DEBUG:root:Ignoring node 747.89 as not in dataset
DEBUG:root:Ignoring node 747.9 as not in dataset
Building Hierarchy from df: 11383it [00:07, 1560.20it/s]DEBUG:root:Ignoring node 748.0 as not in dataset
DEBUG:root:Ignoring node 748.1 as not in dataset


Building Hierarchy from df: 11541it [00:07, 1485.22it/s]DEBUG:root:Ignoring node 754.43 as not in dataset
DEBUG:root:Ignoring node 754.44 as not in dataset
DEBUG:root:Ignoring node 754.5 as not in dataset
DEBUG:root:Ignoring node 754.50 as not in dataset
DEBUG:root:Ignoring node 754.51 as not in dataset
DEBUG:root:Ignoring node 754.52 as not in dataset
DEBUG:root:Ignoring node 754.53 as not in dataset
DEBUG:root:Ignoring node 754.59 as not in dataset
DEBUG:root:Ignoring node 754.6 as not in dataset
DEBUG:root:Ignoring node 754.60 as not in dataset
DEBUG:root:Ignoring node 754.61 as not in dataset
DEBUG:root:Ignoring node 754.62 as not in dataset
DEBUG:root:Ignoring node 754.69 as not in dataset
DEBUG:root:Ignoring node 754.7 as not in dataset
DEBUG:root:Ignoring node 754.70 as not in dataset
DEBUG:root:Ignoring node 754.71 as not in dataset
DEBUG:root:Ignoring node 754.79 as not in dataset
DEBUG:root:Ignoring node 754.8 as not in dataset
DEBUG:root:Ignoring node 754.81 as not in datase

DEBUG:root:Ignoring node 760.61 as not in dataset
DEBUG:root:Ignoring node 760.62 as not in dataset
DEBUG:root:Ignoring node 760.63 as not in dataset
DEBUG:root:Ignoring node 760.64 as not in dataset
DEBUG:root:Ignoring node 760.7 as not in dataset
DEBUG:root:Ignoring node 760.70 as not in dataset
DEBUG:root:Ignoring node 760.71 as not in dataset
DEBUG:root:Ignoring node 760.72 as not in dataset
DEBUG:root:Ignoring node 760.73 as not in dataset
DEBUG:root:Ignoring node 760.74 as not in dataset
DEBUG:root:Ignoring node 760.75 as not in dataset
DEBUG:root:Ignoring node 760.76 as not in dataset
DEBUG:root:Ignoring node 760.77 as not in dataset
DEBUG:root:Ignoring node 760.78 as not in dataset
DEBUG:root:Ignoring node 760.79 as not in dataset
DEBUG:root:Ignoring node 760.8 as not in dataset
DEBUG:root:Ignoring node 760.9 as not in dataset
DEBUG:root:Ignoring node 761.0 as not in dataset
DEBUG:root:Ignoring node 761.1 as not in dataset
DEBUG:root:Ignoring node 761.2 as not in dataset
DEBUG:

DEBUG:root:Ignoring node 770.12 as not in dataset
DEBUG:root:Ignoring node 770.13 as not in dataset
DEBUG:root:Ignoring node 770.14 as not in dataset
DEBUG:root:Ignoring node 770.15 as not in dataset
DEBUG:root:Ignoring node 770.16 as not in dataset
DEBUG:root:Ignoring node 770.17 as not in dataset
DEBUG:root:Ignoring node 770.18 as not in dataset
DEBUG:root:Ignoring node 770.2 as not in dataset
DEBUG:root:Ignoring node 770.3 as not in dataset
DEBUG:root:Ignoring node 770.4 as not in dataset
DEBUG:root:Ignoring node 770.5 as not in dataset
DEBUG:root:Ignoring node 770.6 as not in dataset
DEBUG:root:Ignoring node 770.7 as not in dataset
DEBUG:root:Ignoring node 770.8 as not in dataset
DEBUG:root:Ignoring node 770.81 as not in dataset
DEBUG:root:Ignoring node 770.82 as not in dataset
DEBUG:root:Ignoring node 770.83 as not in dataset
DEBUG:root:Ignoring node 770.84 as not in dataset
DEBUG:root:Ignoring node 770.85 as not in dataset
DEBUG:root:Ignoring node 770.86 as not in dataset
DEBUG:r

DEBUG:root:Ignoring node 780.71 as not in dataset
DEBUG:root:Ignoring node 780.72 as not in dataset
DEBUG:root:Ignoring node 780.79 as not in dataset
DEBUG:root:Ignoring node 780.8 as not in dataset
DEBUG:root:Ignoring node 780.9 as not in dataset
DEBUG:root:Ignoring node 780.91 as not in dataset
DEBUG:root:Ignoring node 780.92 as not in dataset
DEBUG:root:Ignoring node 780.93 as not in dataset
DEBUG:root:Ignoring node 780.94 as not in dataset
DEBUG:root:Ignoring node 780.95 as not in dataset
DEBUG:root:Ignoring node 780.96 as not in dataset
DEBUG:root:Ignoring node 780.97 as not in dataset
DEBUG:root:Ignoring node 780.99 as not in dataset
DEBUG:root:Ignoring node 781.0 as not in dataset
DEBUG:root:Ignoring node 781.1 as not in dataset
DEBUG:root:Ignoring node 781.2 as not in dataset
DEBUG:root:Ignoring node 781.3 as not in dataset
DEBUG:root:Ignoring node 781.4 as not in dataset
DEBUG:root:Ignoring node 781.5 as not in dataset
DEBUG:root:Ignoring node 781.6 as not in dataset
DEBUG:roo

DEBUG:root:Ignoring node 788.6 as not in dataset
DEBUG:root:Ignoring node 788.61 as not in dataset
DEBUG:root:Ignoring node 788.62 as not in dataset
DEBUG:root:Ignoring node 788.63 as not in dataset
DEBUG:root:Ignoring node 788.64 as not in dataset
DEBUG:root:Ignoring node 788.65 as not in dataset
DEBUG:root:Ignoring node 788.69 as not in dataset
DEBUG:root:Ignoring node 788.7 as not in dataset
DEBUG:root:Ignoring node 788.8 as not in dataset
DEBUG:root:Ignoring node 788.9 as not in dataset
DEBUG:root:Ignoring node 788.91 as not in dataset
DEBUG:root:Ignoring node 788.99 as not in dataset
Building Hierarchy from df: 12282it [00:07, 1425.47it/s]DEBUG:root:Ignoring node 789.0 as not in dataset
DEBUG:root:Ignoring node 789.00 as not in dataset
DEBUG:root:Ignoring node 789.01 as not in dataset
DEBUG:root:Ignoring node 789.02 as not in dataset
DEBUG:root:Ignoring node 789.03 as not in dataset
DEBUG:root:Ignoring node 789.04 as not in dataset
DEBUG:root:Ignoring node 789.05 as not in dataset

DEBUG:root:Ignoring node 795.31 as not in dataset
DEBUG:root:Ignoring node 795.39 as not in dataset
DEBUG:root:Ignoring node 795.4 as not in dataset
DEBUG:root:Ignoring node 795.5 as not in dataset
DEBUG:root:Ignoring node 795.51 as not in dataset
DEBUG:root:Ignoring node 795.52 as not in dataset
DEBUG:root:Ignoring node 795.6 as not in dataset
DEBUG:root:Ignoring node 795.7 as not in dataset
DEBUG:root:Ignoring node 795.71 as not in dataset
DEBUG:root:Ignoring node 795.79 as not in dataset
DEBUG:root:Ignoring node 795.8 as not in dataset
DEBUG:root:Ignoring node 795.81 as not in dataset
DEBUG:root:Ignoring node 795.82 as not in dataset
DEBUG:root:Ignoring node 795.89 as not in dataset
DEBUG:root:Ignoring node 796.0 as not in dataset
DEBUG:root:Ignoring node 796.1 as not in dataset
DEBUG:root:Ignoring node 796.2 as not in dataset
DEBUG:root:Ignoring node 796.3 as not in dataset
DEBUG:root:Ignoring node 796.4 as not in dataset
DEBUG:root:Ignoring node 796.5 as not in dataset
DEBUG:root:

DEBUG:root:Ignoring node 801.10 as not in dataset
DEBUG:root:Ignoring node 801.11 as not in dataset
DEBUG:root:Ignoring node 801.12 as not in dataset
DEBUG:root:Ignoring node 801.13 as not in dataset
DEBUG:root:Ignoring node 801.14 as not in dataset
DEBUG:root:Ignoring node 801.15 as not in dataset
DEBUG:root:Ignoring node 801.16 as not in dataset
DEBUG:root:Ignoring node 801.19 as not in dataset
DEBUG:root:Ignoring node 801.2 as not in dataset
DEBUG:root:Ignoring node 801.20 as not in dataset
DEBUG:root:Ignoring node 801.21 as not in dataset
DEBUG:root:Ignoring node 801.22 as not in dataset
DEBUG:root:Ignoring node 801.23 as not in dataset
DEBUG:root:Ignoring node 801.24 as not in dataset
DEBUG:root:Ignoring node 801.25 as not in dataset
DEBUG:root:Ignoring node 801.26 as not in dataset
DEBUG:root:Ignoring node 801.29 as not in dataset
DEBUG:root:Ignoring node 801.3 as not in dataset
DEBUG:root:Ignoring node 801.30 as not in dataset
DEBUG:root:Ignoring node 801.31 as not in dataset
DE

DEBUG:root:Ignoring node 803.6 as not in dataset
DEBUG:root:Ignoring node 803.60 as not in dataset
DEBUG:root:Ignoring node 803.61 as not in dataset
DEBUG:root:Ignoring node 803.62 as not in dataset
DEBUG:root:Ignoring node 803.63 as not in dataset
DEBUG:root:Ignoring node 803.64 as not in dataset
DEBUG:root:Ignoring node 803.65 as not in dataset
DEBUG:root:Ignoring node 803.66 as not in dataset
DEBUG:root:Ignoring node 803.69 as not in dataset
DEBUG:root:Ignoring node 803.7 as not in dataset
DEBUG:root:Ignoring node 803.70 as not in dataset
DEBUG:root:Ignoring node 803.71 as not in dataset
DEBUG:root:Ignoring node 803.72 as not in dataset
DEBUG:root:Ignoring node 803.73 as not in dataset
DEBUG:root:Ignoring node 803.74 as not in dataset
DEBUG:root:Ignoring node 803.75 as not in dataset
DEBUG:root:Ignoring node 803.76 as not in dataset
DEBUG:root:Ignoring node 803.79 as not in dataset
DEBUG:root:Ignoring node 803.8 as not in dataset
DEBUG:root:Ignoring node 803.80 as not in dataset
DEB

DEBUG:root:Ignoring node 913.8 as not in dataset
DEBUG:root:Ignoring node 913.9 as not in dataset
DEBUG:root:Ignoring node 914.0 as not in dataset
DEBUG:root:Ignoring node 914.1 as not in dataset
DEBUG:root:Ignoring node 914.2 as not in dataset
DEBUG:root:Ignoring node 914.3 as not in dataset
DEBUG:root:Ignoring node 914.4 as not in dataset
DEBUG:root:Ignoring node 914.5 as not in dataset
DEBUG:root:Ignoring node 914.6 as not in dataset
DEBUG:root:Ignoring node 914.7 as not in dataset
DEBUG:root:Ignoring node 914.8 as not in dataset
DEBUG:root:Ignoring node 914.9 as not in dataset
DEBUG:root:Ignoring node 915 as not in dataset
DEBUG:root:Ignoring node 915 as not in dataset
DEBUG:root:Ignoring node 915.0 as not in dataset
DEBUG:root:Ignoring node 915.1 as not in dataset
DEBUG:root:Ignoring node 915.2 as not in dataset
DEBUG:root:Ignoring node 915.3 as not in dataset
DEBUG:root:Ignoring node 915.4 as not in dataset
DEBUG:root:Ignoring node 915.5 as not in dataset
DEBUG:root:Ignoring node

DEBUG:root:Ignoring node 939.0 as not in dataset
DEBUG:root:Ignoring node 939.1 as not in dataset
DEBUG:root:Ignoring node 939.2 as not in dataset
DEBUG:root:Ignoring node 939.3 as not in dataset
DEBUG:root:Ignoring node 939.9 as not in dataset
DEBUG:root:Ignoring node 940 as not in dataset
DEBUG:root:Ignoring node 940 as not in dataset
DEBUG:root:Ignoring node 940.0 as not in dataset
DEBUG:root:Ignoring node 940.1 as not in dataset
DEBUG:root:Ignoring node 940.2 as not in dataset
DEBUG:root:Ignoring node 940.3 as not in dataset
DEBUG:root:Ignoring node 940.4 as not in dataset
DEBUG:root:Ignoring node 940.5 as not in dataset
DEBUG:root:Ignoring node 940.9 as not in dataset
DEBUG:root:Ignoring node 941.0 as not in dataset
DEBUG:root:Ignoring node 941.00 as not in dataset
DEBUG:root:Ignoring node 941.01 as not in dataset
DEBUG:root:Ignoring node 941.02 as not in dataset
DEBUG:root:Ignoring node 941.03 as not in dataset
DEBUG:root:Ignoring node 941.04 as not in dataset
DEBUG:root:Ignoring

DEBUG:root:Ignoring node 943.4 as not in dataset
DEBUG:root:Ignoring node 943.40 as not in dataset
DEBUG:root:Ignoring node 943.41 as not in dataset
DEBUG:root:Ignoring node 943.42 as not in dataset
DEBUG:root:Ignoring node 943.43 as not in dataset
DEBUG:root:Ignoring node 943.44 as not in dataset
DEBUG:root:Ignoring node 943.45 as not in dataset
DEBUG:root:Ignoring node 943.46 as not in dataset
DEBUG:root:Ignoring node 943.49 as not in dataset
DEBUG:root:Ignoring node 943.5 as not in dataset
DEBUG:root:Ignoring node 943.50 as not in dataset
DEBUG:root:Ignoring node 943.51 as not in dataset
DEBUG:root:Ignoring node 943.52 as not in dataset
DEBUG:root:Ignoring node 943.53 as not in dataset
DEBUG:root:Ignoring node 943.54 as not in dataset
DEBUG:root:Ignoring node 943.55 as not in dataset
DEBUG:root:Ignoring node 943.56 as not in dataset
DEBUG:root:Ignoring node 943.59 as not in dataset
DEBUG:root:Ignoring node 944.0 as not in dataset
DEBUG:root:Ignoring node 944.00 as not in dataset
DEB

DEBUG:root:Ignoring node 948.40 as not in dataset
DEBUG:root:Ignoring node 948.41 as not in dataset
DEBUG:root:Ignoring node 948.42 as not in dataset
DEBUG:root:Ignoring node 948.43 as not in dataset
DEBUG:root:Ignoring node 948.44 as not in dataset
DEBUG:root:Ignoring node 948.5 as not in dataset
DEBUG:root:Ignoring node 948.50 as not in dataset
DEBUG:root:Ignoring node 948.51 as not in dataset
DEBUG:root:Ignoring node 948.52 as not in dataset
DEBUG:root:Ignoring node 948.53 as not in dataset
DEBUG:root:Ignoring node 948.54 as not in dataset
DEBUG:root:Ignoring node 948.55 as not in dataset
DEBUG:root:Ignoring node 948.6 as not in dataset
DEBUG:root:Ignoring node 948.60 as not in dataset
DEBUG:root:Ignoring node 948.61 as not in dataset
DEBUG:root:Ignoring node 948.62 as not in dataset
DEBUG:root:Ignoring node 948.63 as not in dataset
DEBUG:root:Ignoring node 948.64 as not in dataset
DEBUG:root:Ignoring node 948.65 as not in dataset
DEBUG:root:Ignoring node 948.66 as not in dataset
DE

DEBUG:root:Ignoring node 959.9 as not in dataset
DEBUG:root:Ignoring node 960 as not in dataset
DEBUG:root:Ignoring node 960 as not in dataset
DEBUG:root:Ignoring node 960.0 as not in dataset
DEBUG:root:Ignoring node 960.1 as not in dataset
DEBUG:root:Ignoring node 960.2 as not in dataset
DEBUG:root:Ignoring node 960.3 as not in dataset
DEBUG:root:Ignoring node 960.4 as not in dataset
DEBUG:root:Ignoring node 960.5 as not in dataset
DEBUG:root:Ignoring node 960.6 as not in dataset
DEBUG:root:Ignoring node 960.7 as not in dataset
DEBUG:root:Ignoring node 960.8 as not in dataset
DEBUG:root:Ignoring node 960.9 as not in dataset
DEBUG:root:Ignoring node 961.0 as not in dataset
DEBUG:root:Ignoring node 961.1 as not in dataset
DEBUG:root:Ignoring node 961.2 as not in dataset
DEBUG:root:Ignoring node 961.3 as not in dataset
DEBUG:root:Ignoring node 961.4 as not in dataset
DEBUG:root:Ignoring node 961.5 as not in dataset
DEBUG:root:Ignoring node 961.6 as not in dataset
DEBUG:root:Ignoring node

DEBUG:root:Ignoring node 976.9 as not in dataset
DEBUG:root:Ignoring node 977.0 as not in dataset
DEBUG:root:Ignoring node 977.1 as not in dataset
DEBUG:root:Ignoring node 977.2 as not in dataset
DEBUG:root:Ignoring node 977.3 as not in dataset
DEBUG:root:Ignoring node 977.4 as not in dataset
DEBUG:root:Ignoring node 977.8 as not in dataset
DEBUG:root:Ignoring node 977.9 as not in dataset
DEBUG:root:Ignoring node 978 as not in dataset
DEBUG:root:Ignoring node 978 as not in dataset
DEBUG:root:Ignoring node 978.0 as not in dataset
DEBUG:root:Ignoring node 978.1 as not in dataset
DEBUG:root:Ignoring node 978.2 as not in dataset
DEBUG:root:Ignoring node 978.3 as not in dataset
DEBUG:root:Ignoring node 978.4 as not in dataset
DEBUG:root:Ignoring node 978.5 as not in dataset
DEBUG:root:Ignoring node 978.6 as not in dataset
DEBUG:root:Ignoring node 978.8 as not in dataset
DEBUG:root:Ignoring node 978.9 as not in dataset
DEBUG:root:Ignoring node 979 as not in dataset
DEBUG:root:Ignoring node 9

DEBUG:root:Ignoring node 995.8 as not in dataset
DEBUG:root:Ignoring node 995.80 as not in dataset
DEBUG:root:Ignoring node 995.81 as not in dataset
DEBUG:root:Ignoring node 995.82 as not in dataset
DEBUG:root:Ignoring node 995.83 as not in dataset
DEBUG:root:Ignoring node 995.84 as not in dataset
DEBUG:root:Ignoring node 995.85 as not in dataset
DEBUG:root:Ignoring node 995.86 as not in dataset
DEBUG:root:Ignoring node 995.89 as not in dataset
DEBUG:root:Ignoring node 995.9 as not in dataset
DEBUG:root:Ignoring node 995.90 as not in dataset
DEBUG:root:Ignoring node 995.91 as not in dataset
DEBUG:root:Ignoring node 995.92 as not in dataset
DEBUG:root:Ignoring node 995.93 as not in dataset
DEBUG:root:Ignoring node 995.94 as not in dataset
DEBUG:root:Ignoring node 996.0 as not in dataset
DEBUG:root:Ignoring node 996.00 as not in dataset
DEBUG:root:Ignoring node 996.01 as not in dataset
DEBUG:root:Ignoring node 996.02 as not in dataset
DEBUG:root:Ignoring node 996.03 as not in dataset
DEB

DEBUG:root:Ignoring node 999.63 as not in dataset
DEBUG:root:Ignoring node 999.69 as not in dataset
DEBUG:root:Ignoring node 999.7 as not in dataset
DEBUG:root:Ignoring node 999.70 as not in dataset
DEBUG:root:Ignoring node 999.71 as not in dataset
DEBUG:root:Ignoring node 999.72 as not in dataset
DEBUG:root:Ignoring node 999.73 as not in dataset
DEBUG:root:Ignoring node 999.74 as not in dataset
DEBUG:root:Ignoring node 999.75 as not in dataset
DEBUG:root:Ignoring node 999.76 as not in dataset
DEBUG:root:Ignoring node 999.77 as not in dataset
DEBUG:root:Ignoring node 999.78 as not in dataset
DEBUG:root:Ignoring node 999.79 as not in dataset
DEBUG:root:Ignoring node 999.8 as not in dataset
DEBUG:root:Ignoring node 999.80 as not in dataset
DEBUG:root:Ignoring node 999.81 as not in dataset
DEBUG:root:Ignoring node 999.82 as not in dataset
DEBUG:root:Ignoring node 999.83 as not in dataset
DEBUG:root:Ignoring node 999.84 as not in dataset
DEBUG:root:Ignoring node 999.85 as not in dataset
DE

DEBUG:root:Ignoring node V10.53 as not in dataset
DEBUG:root:Ignoring node V10.59 as not in dataset
DEBUG:root:Ignoring node V10.6 as not in dataset
DEBUG:root:Ignoring node V10.60 as not in dataset
DEBUG:root:Ignoring node V10.61 as not in dataset
DEBUG:root:Ignoring node V10.62 as not in dataset
DEBUG:root:Ignoring node V10.63 as not in dataset
DEBUG:root:Ignoring node V10.69 as not in dataset
DEBUG:root:Ignoring node V10.7 as not in dataset
DEBUG:root:Ignoring node V10.71 as not in dataset
DEBUG:root:Ignoring node V10.72 as not in dataset
DEBUG:root:Ignoring node V10.79 as not in dataset
DEBUG:root:Ignoring node V10.8 as not in dataset
DEBUG:root:Ignoring node V10.81 as not in dataset
DEBUG:root:Ignoring node V10.82 as not in dataset
DEBUG:root:Ignoring node V10.83 as not in dataset
DEBUG:root:Ignoring node V10.84 as not in dataset
DEBUG:root:Ignoring node V10.85 as not in dataset
DEBUG:root:Ignoring node V10.86 as not in dataset
DEBUG:root:Ignoring node V10.87 as not in dataset
DEB

DEBUG:root:Ignoring node V16.7 as not in dataset
DEBUG:root:Ignoring node V16.8 as not in dataset
DEBUG:root:Ignoring node V16.9 as not in dataset
DEBUG:root:Ignoring node V17.0 as not in dataset
DEBUG:root:Ignoring node V17.1 as not in dataset
DEBUG:root:Ignoring node V17.2 as not in dataset
DEBUG:root:Ignoring node V17.3 as not in dataset
DEBUG:root:Ignoring node V17.4 as not in dataset
DEBUG:root:Ignoring node V17.41 as not in dataset
DEBUG:root:Ignoring node V17.49 as not in dataset
DEBUG:root:Ignoring node V17.5 as not in dataset
DEBUG:root:Ignoring node V17.6 as not in dataset
DEBUG:root:Ignoring node V17.7 as not in dataset
DEBUG:root:Ignoring node V17.8 as not in dataset
DEBUG:root:Ignoring node V17.81 as not in dataset
DEBUG:root:Ignoring node V17.89 as not in dataset
DEBUG:root:Ignoring node V18.0 as not in dataset
DEBUG:root:Ignoring node V18.1 as not in dataset
DEBUG:root:Ignoring node V18.11 as not in dataset
DEBUG:root:Ignoring node V18.19 as not in dataset
DEBUG:root:Ign

DEBUG:root:Ignoring node V28.89 as not in dataset
DEBUG:root:Ignoring node V28.9 as not in dataset
DEBUG:root:Ignoring node V29.0 as not in dataset
DEBUG:root:Ignoring node V29.1 as not in dataset
DEBUG:root:Ignoring node V29.2 as not in dataset
DEBUG:root:Ignoring node V29.3 as not in dataset
DEBUG:root:Ignoring node V29.8 as not in dataset
DEBUG:root:Ignoring node V29.9 as not in dataset
DEBUG:root:Ignoring node V30.0 as not in dataset
DEBUG:root:Ignoring node V30.00 as not in dataset
DEBUG:root:Ignoring node V30.01 as not in dataset
DEBUG:root:Ignoring node V30.1 as not in dataset
DEBUG:root:Ignoring node V30.2 as not in dataset
DEBUG:root:Ignoring node V31.0 as not in dataset
DEBUG:root:Ignoring node V31.00 as not in dataset
DEBUG:root:Ignoring node V31.01 as not in dataset
DEBUG:root:Ignoring node V31.1 as not in dataset
DEBUG:root:Ignoring node V31.2 as not in dataset
DEBUG:root:Ignoring node V32 as not in dataset
DEBUG:root:Ignoring node V32 as not in dataset
DEBUG:root:Ignoring

DEBUG:root:Ignoring node V45.82 as not in dataset
DEBUG:root:Ignoring node V45.83 as not in dataset
DEBUG:root:Ignoring node V45.84 as not in dataset
DEBUG:root:Ignoring node V45.85 as not in dataset
DEBUG:root:Ignoring node V45.86 as not in dataset
DEBUG:root:Ignoring node V45.87 as not in dataset
DEBUG:root:Ignoring node V45.88 as not in dataset
DEBUG:root:Ignoring node V45.89 as not in dataset
DEBUG:root:Ignoring node V46.0 as not in dataset
DEBUG:root:Ignoring node V46.1 as not in dataset
DEBUG:root:Ignoring node V46.11 as not in dataset
DEBUG:root:Ignoring node V46.12 as not in dataset
DEBUG:root:Ignoring node V46.13 as not in dataset
DEBUG:root:Ignoring node V46.14 as not in dataset
DEBUG:root:Ignoring node V46.2 as not in dataset
DEBUG:root:Ignoring node V46.3 as not in dataset
DEBUG:root:Ignoring node V46.8 as not in dataset
DEBUG:root:Ignoring node V46.9 as not in dataset
DEBUG:root:Ignoring node V47 as not in dataset
DEBUG:root:Ignoring node V47 as not in dataset
DEBUG:root:I

DEBUG:root:Ignoring node V57.21 as not in dataset
DEBUG:root:Ignoring node V57.22 as not in dataset
DEBUG:root:Ignoring node V57.3 as not in dataset
DEBUG:root:Ignoring node V57.4 as not in dataset
DEBUG:root:Ignoring node V57.8 as not in dataset
DEBUG:root:Ignoring node V57.81 as not in dataset
DEBUG:root:Ignoring node V57.89 as not in dataset
DEBUG:root:Ignoring node V57.9 as not in dataset
DEBUG:root:Ignoring node V58.0 as not in dataset
DEBUG:root:Ignoring node V58.1 as not in dataset
DEBUG:root:Ignoring node V58.11 as not in dataset
DEBUG:root:Ignoring node V58.12 as not in dataset
DEBUG:root:Ignoring node V58.2 as not in dataset
DEBUG:root:Ignoring node V58.3 as not in dataset
DEBUG:root:Ignoring node V58.30 as not in dataset
DEBUG:root:Ignoring node V58.31 as not in dataset
DEBUG:root:Ignoring node V58.32 as not in dataset
DEBUG:root:Ignoring node V58.4 as not in dataset
DEBUG:root:Ignoring node V58.41 as not in dataset
DEBUG:root:Ignoring node V58.42 as not in dataset
DEBUG:roo

DEBUG:root:Ignoring node V65.49 as not in dataset
DEBUG:root:Ignoring node V65.5 as not in dataset
DEBUG:root:Ignoring node V65.8 as not in dataset
DEBUG:root:Ignoring node V65.9 as not in dataset
DEBUG:root:Ignoring node V66.0 as not in dataset
DEBUG:root:Ignoring node V66.1 as not in dataset
DEBUG:root:Ignoring node V66.2 as not in dataset
DEBUG:root:Ignoring node V66.3 as not in dataset
DEBUG:root:Ignoring node V66.4 as not in dataset
DEBUG:root:Ignoring node V66.5 as not in dataset
DEBUG:root:Ignoring node V66.6 as not in dataset
DEBUG:root:Ignoring node V66.7 as not in dataset
DEBUG:root:Ignoring node V66.9 as not in dataset
DEBUG:root:Ignoring node V67.0 as not in dataset
DEBUG:root:Ignoring node V67.00 as not in dataset
DEBUG:root:Ignoring node V67.01 as not in dataset
DEBUG:root:Ignoring node V67.09 as not in dataset
DEBUG:root:Ignoring node V67.1 as not in dataset
DEBUG:root:Ignoring node V67.2 as not in dataset
DEBUG:root:Ignoring node V67.3 as not in dataset
DEBUG:root:Ignor

DEBUG:root:Ignoring node V77.1 as not in dataset
DEBUG:root:Ignoring node V77.2 as not in dataset
DEBUG:root:Ignoring node V77.3 as not in dataset
DEBUG:root:Ignoring node V77.4 as not in dataset
DEBUG:root:Ignoring node V77.5 as not in dataset
DEBUG:root:Ignoring node V77.6 as not in dataset
DEBUG:root:Ignoring node V77.7 as not in dataset
DEBUG:root:Ignoring node V77.8 as not in dataset
DEBUG:root:Ignoring node V77.9 as not in dataset
DEBUG:root:Ignoring node V77.91 as not in dataset
DEBUG:root:Ignoring node V77.99 as not in dataset
DEBUG:root:Ignoring node V78 as not in dataset
DEBUG:root:Ignoring node V78 as not in dataset
DEBUG:root:Ignoring node V78.0 as not in dataset
DEBUG:root:Ignoring node V78.1 as not in dataset
DEBUG:root:Ignoring node V78.2 as not in dataset
DEBUG:root:Ignoring node V78.3 as not in dataset
DEBUG:root:Ignoring node V78.8 as not in dataset
DEBUG:root:Ignoring node V78.9 as not in dataset
DEBUG:root:Ignoring node V79 as not in dataset
DEBUG:root:Ignoring node

DEBUG:root:Ignoring node V90.81 as not in dataset
DEBUG:root:Ignoring node V90.83 as not in dataset
DEBUG:root:Ignoring node V90.89 as not in dataset
DEBUG:root:Ignoring node V90.9 as not in dataset
DEBUG:root:Ignoring node V91-V91 as not in dataset
DEBUG:root:Ignoring node V91 as not in dataset
DEBUG:root:Ignoring node V91 as not in dataset
DEBUG:root:Ignoring node V91.0 as not in dataset
DEBUG:root:Ignoring node V91.00 as not in dataset
DEBUG:root:Ignoring node V91.01 as not in dataset
DEBUG:root:Ignoring node V91.02 as not in dataset
DEBUG:root:Ignoring node V91.03 as not in dataset
DEBUG:root:Ignoring node V91.09 as not in dataset
DEBUG:root:Ignoring node V91.1 as not in dataset
DEBUG:root:Ignoring node V91.10 as not in dataset
DEBUG:root:Ignoring node V91.11 as not in dataset
DEBUG:root:Ignoring node V91.12 as not in dataset
DEBUG:root:Ignoring node V91.19 as not in dataset
DEBUG:root:Ignoring node V91.2 as not in dataset
DEBUG:root:Ignoring node V91.20 as not in dataset
DEBUG:roo

DEBUG:root:Ignoring node E019 as not in dataset
DEBUG:root:Ignoring node E019.0 as not in dataset
DEBUG:root:Ignoring node E019.1 as not in dataset
DEBUG:root:Ignoring node E019.2 as not in dataset
DEBUG:root:Ignoring node E019.9 as not in dataset
DEBUG:root:Ignoring node E029 as not in dataset
DEBUG:root:Ignoring node E029 as not in dataset
DEBUG:root:Ignoring node E029.0 as not in dataset
DEBUG:root:Ignoring node E029.1 as not in dataset
DEBUG:root:Ignoring node E029.2 as not in dataset
DEBUG:root:Ignoring node E029.9 as not in dataset
DEBUG:root:Ignoring node E030 as not in dataset
DEBUG:root:Ignoring node E030 as not in dataset
DEBUG:root:Ignoring node E800 as not in dataset
DEBUG:root:Ignoring node E800 as not in dataset
DEBUG:root:Ignoring node E800.0 as not in dataset
DEBUG:root:Ignoring node E800.1 as not in dataset
DEBUG:root:Ignoring node E800.2 as not in dataset
DEBUG:root:Ignoring node E800.3 as not in dataset
DEBUG:root:Ignoring node E800.8 as not in dataset
DEBUG:root:Ign

DEBUG:root:Ignoring node E818.5 as not in dataset
DEBUG:root:Ignoring node E818.6 as not in dataset
DEBUG:root:Ignoring node E818.7 as not in dataset
DEBUG:root:Ignoring node E818.8 as not in dataset
DEBUG:root:Ignoring node E818.9 as not in dataset
DEBUG:root:Ignoring node E819.0 as not in dataset
DEBUG:root:Ignoring node E819.1 as not in dataset
DEBUG:root:Ignoring node E819.2 as not in dataset
DEBUG:root:Ignoring node E819.3 as not in dataset
DEBUG:root:Ignoring node E819.4 as not in dataset
DEBUG:root:Ignoring node E819.5 as not in dataset
DEBUG:root:Ignoring node E819.6 as not in dataset
DEBUG:root:Ignoring node E819.7 as not in dataset
DEBUG:root:Ignoring node E819.8 as not in dataset
DEBUG:root:Ignoring node E819.9 as not in dataset
DEBUG:root:Ignoring node E820 as not in dataset
DEBUG:root:Ignoring node E820 as not in dataset
DEBUG:root:Ignoring node E820.0 as not in dataset
DEBUG:root:Ignoring node E820.1 as not in dataset
DEBUG:root:Ignoring node E820.2 as not in dataset
DEBU

NotImplementedError: Cannot convert a symbolic Tensor (gru/strided_slice:0) to a numpy array.

In [None]:
%tb
