In [5]:
import logging
import tensorflow as tf
import mlflow
import random

import sys
import os

# Put the notebook's parent directory on the import path so that the
# project-local `src` package imported in later cells can be resolved.
module_path = os.path.abspath(os.pardir)
if module_path not in sys.path:
    sys.path.append(module_path)

# Replace the process arguments (in place) with a single empty string.
# NOTE(review): presumably this keeps the kernel's own command-line arguments
# away from argument parsing done by the project's config classes -- confirm.
sys.argv[:] = [""]

## Experiment and mlflow configuration

In [6]:
from src import config
from src.features import preprocessing,sequences,knowledge
from src.training import models
from src import refinement

# Experiment-level settings; this notebook runs the "gram" model variant.
experiment_config = config.ExperimentConfig()
experiment_config.model_type = "gram"

# Model settings; use a GRU as the recurrent cell type.
# (A plain assignment is used here: a type annotation on an attribute target
# has no runtime effect and only obscures the intent.)
model_config = models.config.ModelConfig()
model_config.rnn_type = "gru"

# keep all other default configurations
mimic_preprocessor_config = preprocessing.mimic.MimicPreprocessorConfig()
sequence_config = sequences.config.SequenceConfig()
knowledge_config = knowledge.config.KnowledgeConfig()
refinement_config = refinement.config.RefinementConfig()

In [7]:
def log_all_configs_to_mlflow():
    """Log every attribute of each notebook-level config object as an MLflow param.

    The parameter key is the config's class name immediately followed by the
    attribute name (no separator); the value is the attribute converted with
    ``str``. Reads the module-level config objects created earlier in the
    notebook and writes to whichever MLflow run is active when called.
    """
    all_configs = (
        experiment_config,
        mimic_preprocessor_config,
        sequence_config,
        model_config,
        knowledge_config,
        refinement_config,
    )
    for cfg in all_configs:
        prefix = cfg.__class__.__name__
        for attr_name, attr_value in vars(cfg).items():
            mlflow.log_param(f"{prefix}{attr_name}", str(attr_value))

In [8]:
mlflow.set_experiment("Domain Guided Monitoring")

# Make the cell safe to re-run: starting a run while another is still active
# raises, so close any run left open by a previous execution first.
if mlflow.active_run() is not None:
    mlflow.end_run()

# Start the run WITHOUT a `with` block. A context manager would end the run as
# soon as the block exits, and every later `mlflow.log_*` / `mlflow.set_tag`
# call in this notebook would then be recorded in a different, automatically
# created run instead of the one identified by `run_id`.
run = mlflow.start_run()
run_id = run.info.run_id
logging.info("Starting run %s", run_id)

# Record all configuration values on the run that was just opened.
log_all_configs_to_mlflow()

# Seed TensorFlow and Python's `random` module from the experiment config.
tf.random.set_seed(experiment_config.tensorflow_seed)
random.seed(experiment_config.random_seed)

## Load MIMIC sequence

In [10]:
# Load the MIMIC data into a dataframe using the preprocessor configuration.
sequence_preprocessor = preprocessing.MimicPreprocessor(config=mimic_preprocessor_config)
sequence_column_name = mimic_preprocessor_config.sequence_column_name
sequence_df = sequence_preprocessor.load_data()

# Build the sequence transformer and collect metadata for the configured
# sequence column; `metadata` is reused by the dataset generators and the model.
transformer = sequences.transformer.NextPartialSequenceTransformerFromDataframe(
    sequence_config
)
metadata = transformer.collect_metadata(sequence_df, sequence_column_name)

## Generate dataset

In [12]:
# Persist the loaded sequence dataframe next to the notebook.
sequence_df_pkl_file: str = "data/sequences_df.pkl"
# The target directory is a hard-coded relative path; create it if it is
# missing so that `to_pickle` does not fail on a fresh checkout.
os.makedirs(os.path.dirname(sequence_df_pkl_file), exist_ok=True)
sequence_df.to_pickle(sequence_df_pkl_file)

# Split the sequences of the configured column into a train and a test part.
train_sequences, test_sequences = transformer._split_train_test(sequence_df, sequence_column_name)

def generate(for_train):
    """Yield ``(x_vecs_stacked, y_vec)`` pairs from the train or test sequences.

    Args:
        for_train: if truthy, iterate over ``train_sequences``; otherwise over
            ``test_sequences``.

    Each sequence is split by the transformer, and every resulting piece is
    passed through ``_translate_and_pad`` before its vectors are yielded.
    """
    if for_train:
        source = train_sequences
    else:
        source = test_sequences

    for seq in source:
        for part in transformer._split_sequence(seq):
            # Return value is ignored; presumably this fills in the vector
            # attributes on `part` in place -- TODO confirm.
            transformer._translate_and_pad(part, metadata)
            yield part.x_vecs_stacked, part.y_vec

def generate_train():
    """Zero-argument generator factory over the training sequences."""
    return generate(True)

def generate_test():
    """Zero-argument generator factory over the test sequences."""
    return generate(False)

In [13]:
# Training pipeline, built one stage at a time:
# generator -> shuffle -> batch -> prefetch.
train_dataset = tf.data.Dataset.from_generator(
    generate_train,
    output_types=(tf.float32, tf.float32),
)
# Shuffle with the configured buffer size and seed, reshuffling every epoch.
train_dataset = train_dataset.shuffle(
    experiment_config.dataset_shuffle_buffer,
    seed=experiment_config.dataset_shuffle_seed,
    reshuffle_each_iteration=True,
)
train_dataset = train_dataset.batch(experiment_config.batch_size)
train_dataset = train_dataset.prefetch(tf.data.experimental.AUTOTUNE)

# Evaluation pipeline: same source format as the training data, but without
# shuffling (generator -> batch -> prefetch).
test_dataset = tf.data.Dataset.from_generator(
    generate_test,
    output_types=(tf.float32, tf.float32),
)
test_dataset = test_dataset.batch(experiment_config.batch_size)
test_dataset = test_dataset.prefetch(tf.data.experimental.AUTOTUNE)

2022-05-02 00:54:53.820510: W tensorflow/stream_executor/platform/default/dso_loader.cc:55] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2022-05-02 00:54:53.820572: E tensorflow/stream_executor/cuda/cuda_driver.cc:313] failed call to cuInit: UNKNOWN ERROR (303)
2022-05-02 00:54:53.820628: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (hj-ubuntu): /proc/driver/nvidia/version does not exist
2022-05-02 00:54:53.823678: I tensorflow/core/platform/cpu_feature_guard.cc:143] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 FMA
2022-05-02 00:54:53.855411: I tensorflow/core/platform/profile_utils/cpu_utils.cc:102] CPU Frequency: 2599990000 Hz
2022-05-02 00:54:53.857172: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x7efbd0000b60 initialized for platform Host (this does not guarantee that 

## Process knowledge and generate model

In [14]:
# Load the ICD9 hierarchy data and turn it into hierarchy knowledge restricted
# to the input vocabulary collected from the sequences.
hierarchy_preprocessor = preprocessing.ICD9HierarchyPreprocessor(config=mimic_preprocessor_config)
hierarchy_df = hierarchy_preprocessor.load_data()
hierarchy = knowledge.HierarchyKnowledge(config=knowledge_config)
hierarchy.build_hierarchy_from_df(hierarchy_df, metadata.x_vocab)

# Instantiate the GRAM model, then build it from the sequence metadata and the
# hierarchy knowledge prepared above.
model = models.GramModel()
model.build(metadata, hierarchy)

Building Hierarchy from df: 18960it [00:01, 17908.69it/s]
Initializing gram_embedding connections: 100%|█| 939/939 [00:04<00:00, 203.44it/


## Run Experiment

In [15]:
# Train the model on the batched training dataset, passing the test dataset
# alongside it. The last two positional arguments are taken from the
# experiment configuration: the multilabel-classification flag and the number
# of epochs.
# NOTE(review): arguments are positional; their meaning is inferred from the
# config attribute names -- confirm against `train_dataset`'s signature.
model.train_dataset(
    train_dataset,
    test_dataset,
    experiment_config.multilabel_classification,
    experiment_config.n_epochs,
)

Calculating percentile frequencies...: 351it [00:04, 84.94it/s]


Epoch 1/10


2022-05-02 00:55:39.811920: W tensorflow/core/grappler/optimizers/loop_optimizer.cc:906] Skipping loop optimization for Merge node with control input: sequential/gram_embedding/Assert_5/AssertGuard/branch_executed/_29


    350/Unknown - 5s 16ms/step - loss: 0.0807 - categorical_accuracy: 0.0962 - top_5_categorical_accuracy: 0.3244 - top_10_categorical_accuracy: 0.4353 - top_20_categorical_accuracy: 0.4971 - top_5_categorical_accuracy_cp0: 0.2948 - top_5_categorical_accuracy_cp1: 0.3294 - top_5_categorical_accuracy_cp2: 0.3421 - top_5_categorical_accuracy_cp3: 0.3750 - top_5_categorical_accuracy_cp4: 0.4155 - top_5_categorical_accuracy_p0: 0.2766 - top_5_categorical_accuracy_p1: 0.2412 - top_5_categorical_accuracy_p2: 0.2666 - top_5_categorical_accuracy_p3: 0.2922 - top_5_categorical_accuracy_p4: 0.3580 - top_10_categorical_accuracy_cp0: 0.3933 - top_10_categorical_accuracy_cp1: 0.4394 - top_10_categorical_accuracy_cp2: 0.4527 - top_10_categorical_accuracy_cp3: 0.4989 - top_10_categorical_accuracy_cp4: 0.5466 - top_10_categorical_accuracy_p0: 0.2766 - top_10_categorical_accuracy_p1: 0.3306 - top_10_categorical_accuracy_p2: 0.3355 - top_10_categorical_accuracy_p3: 0.3854 - top_10_categorical_accuracy_p

2022-05-02 00:55:52.170477: W tensorflow/core/grappler/optimizers/loop_optimizer.cc:906] Skipping loop optimization for Merge node with control input: sequential/gram_embedding/Assert_5/AssertGuard/branch_executed/_29


Epoch 2/10
Epoch 3/10


Epoch 4/10
Epoch 5/10
Epoch 6/10


Epoch 7/10
Epoch 8/10
Epoch 9/10


Epoch 10/10


In [18]:
# logging dataset info
# Count dataset elements lazily instead of building a list of every batch, so
# the tensors are not all held in memory just to be counted.
# NOTE(review): both datasets are batched, so these values are numbers of
# batches rather than numbers of individual samples.
mlflow.log_metric("train_size", sum(1 for _ in train_dataset))
mlflow.log_metric("test_size", sum(1 for _ in test_dataset))
mlflow.log_metric("x_vocab_size", len(metadata.x_vocab))
mlflow.log_metric("y_vocab_size", len(metadata.y_vocab))

# generate artifacts
# skip

# set mlflow tags
mlflow.set_tag("sequence_type", experiment_config.sequence_type)
mlflow.set_tag("model_type", experiment_config.model_type)
# A single-entry output vocabulary is tagged as risk prediction; anything else
# as sequence prediction.
mlflow.set_tag(
    "task_type",
    "risk_prediction" if len(metadata.y_vocab) == 1 else "sequence_prediction",
)

logging.info("Finished run %s", run_id)

# Close the currently active MLflow run so it is marked as finished instead of
# staying open for as long as the kernel lives (no-op if no run is active).
mlflow.end_run()
